In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the containing directory and file name into one printable path.
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the 2015 events CSV; its EventType column is inspected in the cells below.
events2015 = pd.read_csv('/kaggle/input/google-cloud-ncaa-march-madness-2020-division-1-mens-tournament/MEvents2015.csv')
# Preview the first rows to see which columns are available.
events2015.head()

In [None]:
# Count how often each event type occurs, then show just the distinct
# event-type labels (ordered from most to least frequent).
event_type_counts = events2015['EventType'].value_counts()
event_type_counts.index

In [None]:
# Keep only the rows whose event type is a substitution.
is_substitution = events2015['EventType'] == 'sub'
events2015.loc[is_substitution]

### So you must be wondering why I am trying to look at substitution data

Basketball is a game of lineups and matchups. Usually, the best lineup on the floor has the highest chance of winning, regardless of the amount of coaching. This is probably not a common view in college basketball, where the coach is usually considered the superstar, but we are increasingly seeing players like Ja Morant, Trae Young and Zion Williamson whose exceptional talent has carried their teams.

In [None]:
# Player table; later cells read its FirstName, LastName and TeamID columns.
players = pd.read_csv('/kaggle/input/google-cloud-ncaa-march-madness-2020-division-1-mens-tournament/MPlayers.csv')
# Team table; later cells use it to map a TeamID to its TeamName.
teams = pd.read_csv('/kaggle/input/google-cloud-ncaa-march-madness-2020-division-1-mens-tournament/MDataFiles_Stage1/MTeams.csv')

# (rows, columns) of the player table.
players.shape

In [None]:
# Preview the first rows of the player table.
players.head()

In [None]:
# Preview the first rows of the team table.
teams.head()

Now I want to calculate the PER (Player Efficiency Rating) for each player. https://www.basketball-reference.com/about/per.html

To do this, I need the season stats for each player, which I will get from the sportsreference Python package!

In [None]:
!pip install sportsreference

In [None]:
from sportsreference.ncaab.roster import Player
from sportsreference.ncaab.teams import Teams

In [None]:
# Looking at Kentucky star and No. 1 pick Karl-Anthony Towns.
# Build one combined mask instead of chaining two .loc calls: the chained form
# applied a mask computed on the full frame to an already-filtered frame and
# only worked through implicit index alignment.
is_towns = (players['FirstName'] == 'Karl-Anthony') & (players['LastName'] == 'Towns')
temp_player_df = players.loc[is_towns].iloc[0]  # first matching row, as a Series
print(temp_player_df)

In [None]:
def GetReferenceIDForPlayer(playerdf, year=2015):
    """Look up a player on sportsreference and return their player ID.

    The player's team name is read from the notebook-level ``teams`` table
    using ``playerdf['TeamID']``. Each sportsreference team for ``year`` whose
    name starts with that team name has its roster searched for an entry whose
    name contains both the player's first name and last name. The first match
    has its ID, player efficiency rating and points printed.

    Parameters
    ----------
    playerdf : mapping-like (e.g. one row of ``players``)
        Must provide the keys ``'TeamID'``, ``'FirstName'`` and ``'LastName'``.
    year : int, default 2015
        Season passed to ``Teams(year=...)``.

    Returns
    -------
    The ``player_id`` of the first matching roster entry, or ``None`` when the
    TeamID is not in ``teams``, no team name matches, or no roster entry
    matches.
    """
    players_team = teams.loc[teams['TeamID'] == playerdf['TeamID']]
    if players_team.empty:
        # Bail out before the slow remote team fetch; previously this case
        # surfaced as an IndexError from .iloc[0] inside the loop.
        print('No team found for TeamID', playerdf['TeamID'])
        return None

    # These values do not change per iteration, so resolve them once up front.
    team_name = players_team.iloc[0]['TeamName']
    first_name = playerdf['FirstName']
    last_name = playerdf['LastName']

    for team in Teams(year=year):
        # Prefix match: the two sources may not spell team names identically.
        if not team.name.startswith(team_name):
            continue
        print('Team found')
        print(team.name)
        # Accessing the roster gets this team's players.
        for player in team.roster.players:
            if first_name in player.name and last_name in player.name:
                print(player.player_id)
                print(player.player_efficiency_rating)
                print(player.points)
                return player.player_id
        # Player not on this roster: keep scanning, since another team name
        # may share the same prefix.
    return None

In [None]:
# Run the sportsreference lookup for the player row selected above.
GetReferenceIDForPlayer(temp_player_df)

There are a bunch of caveats with this process:
- Queries to this API are really slow, taking about 20 seconds per request. Obviously we can't keep running the code above at runtime to get the player efficiency rating for every player. The code also isn't the most efficient.
- We find the player with a string comparison, which is not very accurate. We could pre-process the data so that each player has a direct reference to their sportsreference stats.
- Sportsreference does not have PER values for all players. We could fill in the missing values in this database with a random number in the average PER range.