In [169]:
import pandas as pd
import numpy as np

In [170]:
#ALL PRODUCTION CODE GOES HERE
#Test code below is to be used to build the function in question.
def call_game_logs(key_list: list, year: int, name_map: dict = None) -> dict:
    """
    Download the game-log table for each player key from basketball-reference.com.

    Parameters
    ----------
    key_list : list
        Player keys as used in basketball-reference URLs (e.g. "garlada01").
    year : int
        Season identifier inserted at the end of the game-log URL.
    name_map : dict, optional
        Mapping {player_key: label}. When a key is present, its label is used as
        the key of the returned dict. When omitted, a module-level ``player_dict``
        is used if one exists (the behaviour this function originally relied on);
        keys without a label are stored under the player key itself.

    Returns
    -------
    dict
        {label_or_player_key: game_log_dataframe}. A player whose page contains no
        table whose first column is "Rk" is left out of the result.
    """
    if name_map is None:
        # Backward compatibility: the original body read the global `player_dict`
        # directly and raised NameError when it had not been defined yet.
        name_map = globals().get('player_dict', {})

    gamelog_dfs = {}
    for key in key_list:
        # Player pages are filed under the first letter of the key, so derive the
        # directory from the key instead of hard-coding "g" for every player.
        url = f'https://www.basketball-reference.com/players/{key[0]}/{key}/gamelog/{year}'
        for df in pd.read_html(url):
            # The page yields several tables; the game log is the one whose
            # first column is "Rk". A later match overwrites an earlier one.
            if df.columns[0] == "Rk":
                gamelog_dfs[name_map.get(key, key)] = df
    return gamelog_dfs

def standardize_game_log(func_df: pd.DataFrame, rank_as_index: bool = False) -> pd.DataFrame:
    """
    Convert a raw basketball-reference game-log table into typed, analysis-ready data.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    func_df : pd.DataFrame
        Raw game log. Must contain the columns 'Rk', 'Date', 'Age', 'MP',
        'Unnamed: 5' (the "@" away marker) and 'Unnamed: 7' (result such as
        "W (+32)"). Stat columns and '+/-' are converted when present.
    rank_as_index : bool, default False
        When True, 'Rk' becomes the index of the result. The default keeps 'Rk'
        as a regular column, which is what this function has always returned
        (its earlier ``set_index`` call discarded its result).

    Returns
    -------
    pd.DataFrame
        The input columns with these changes:
        * repeated header rows (where Rk == 'Rk') removed;
        * 'Unnamed: 5' -> 'home_bool' (False for "@", True otherwise);
        * 'Unnamed: 7' -> 'win_bool' (True for W, False for L) and
          'margin_of_victory' (signed float), appended at the end;
        * '+/-' -> 'plus_minus';
        * 'MP' as timedelta64 and a new 'MP_floats' column in minutes;
        * 'Age' reduced to whole years (float);
        * stat columns as float, with non-numeric markers such as "Inactive"
          or "Did Not Play" turned into NaN;
        * 'Date' as datetime64.
    """
    # Markers that fill the stat cells of a game the player missed. "Inactive" is
    # the only one the original handled; the numeric coercion below catches any
    # other non-numeric marker in the stat columns as well.
    non_playing = ['Inactive', 'Did Not Play', 'Did Not Dress',
                   'Not With Team', 'Player Suspended']

    # Drop the header rows the site repeats inside the table body. Comparing on
    # the string form also works when 'Rk' was parsed as a numeric column.
    raw = func_df[func_df['Rk'].astype(str) != 'Rk'].copy()
    raw = raw.replace(non_playing, np.nan)

    # "W (+32)" / "L (-3)" -> result letter and signed margin (the "+" is dropped).
    result = raw['Unnamed: 7'].astype(str).str.extract(r'^\s*([WL])\s*\(?\+?(-?\d+)\)?\s*$')

    cleaned = (
        raw.rename(columns={'+/-': 'plus_minus', 'Unnamed: 5': 'home_bool'})
           .drop(columns='Unnamed: 7')
    )
    cleaned['win_bool'] = result[0].map({'W': True, 'L': False})
    cleaned['margin_of_victory'] = result[1]

    # Away games are marked "@"; home games are blank (NaN), so "not @" is home.
    cleaned['home_bool'] = cleaned['home_bool'].ne('@')

    # A leading "+" is stripped before numeric conversion; the column is skipped
    # when the source table has no '+/-' column.
    if 'plus_minus' in cleaned.columns:
        cleaned['plus_minus'] = cleaned['plus_minus'].astype(str).str.replace('+', '', regex=False)

    # Parse "MM:SS" once. Anything that is not a clock value yields NaN / NaT.
    clock = cleaned['MP'].astype(str).str.extract(r'^\s*(\d+):(\d{1,2})\s*$').astype(float)
    cleaned['MP_floats'] = clock[0] + clock[1] / 60
    cleaned['MP'] = pd.to_timedelta(clock[0] * 60 + clock[1], unit='s')

    # "22-266" (years-days) -> 22.0
    cleaned['Age'] = cleaned['Age'].astype(str).str.extract(r'^(\d+)', expand=False).astype(float)

    num_cols = ['G', 'GS', 'FG', 'FGA', '3P', '3PA', 'FT', 'FTA', 'ORB', 'DRB', 'TRB',
                'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'plus_minus', 'margin_of_victory',
                'FG%', '3P%', 'FT%', 'GmSc']
    for col in num_cols:
        # Columns missing from the source table are skipped rather than raising.
        if col in cleaned.columns:
            cleaned[col] = pd.to_numeric(cleaned[col], errors='coerce').astype(float)

    cleaned['Date'] = pd.to_datetime(cleaned['Date'])

    if rank_as_index:
        cleaned = cleaned.set_index('Rk')
    return cleaned

In [171]:
#TEST CODE IGNORE
# Checks that the plan for fetching player game logs works end to end.
cavs_keys = ["garlada01", "mitchdo01", "allenja01", "mobleev01", "mobleis01"]  # Test key set
call_str = f'https://www.basketball-reference.com/players/g/{cavs_keys[0]}/gamelog/2023'  # Example of the URL handed to pd.read_html()

player_dict = {  # Pairs of the form key(str) : name(str), used to label each fetched frame
    'garlada01' : 'Garland',
    'mitchdo01' : 'Mitchell',
    'allenja01' : 'Jarrett Allen',
    'mobleev01' : 'Evan Mobley',
    'mobleis01' : 'Isaiah Mobley',
}

# Fetch through the production function instead of repeating its loop here, so the
# test exercises the code that will actually ship. call_game_logs labels each frame
# via player_dict, which therefore has to be defined before this call.
gamelog_dfs = call_game_logs(cavs_keys, 2023)
# gamelog_dfs now maps each name in player_dict to the game-log frame of that player.

In [172]:
#ALL CODE BELOW IS EXPLORATORY ANALYSIS AND NOT TO BE USED IN PRODUCTION
# Work on the raw 'Garland' frame stored in gamelog_dfs and preview its first rows.
func_df = gamelog_dfs['Garland']
func_df.head()
#DATA CLEANING FUNCTION REQUIREMENTS
# Remove unnecessary Rk column
# Split "Unnamed: 7" into "win_bool" and "point_differential" columns. Map "L" : 0 and "W" : 1.
# Convert "Unnamed: 5" to "home_bool" and map "@" : 0 and NaN : 1.
# Rename "+/-" to "plus_minus"
# Remove rows reiterating the column name; ought to be 20th, 41st, and 62nd rows.
# Replace all positive entries in the plus_minus column, such that "+5" now reads "5".
# Replace "Inactive" with null values

Unnamed: 0,Rk,G,Date,Age,Tm,Unnamed: 5,Opp,Unnamed: 7,GS,MP,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,GmSc,+/-
0,1,1.0,2022-10-19,22-266,CLE,@,TOR,L (-3),1,13:18,...,1,1,3,2,1,5,1,4,-1.1,-10
1,2,,2022-10-22,22-269,CLE,@,CHI,W (+32),Inactive,Inactive,...,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive
2,3,,2022-10-23,22-270,CLE,,WAS,W (+10),Inactive,Inactive,...,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive
3,4,,2022-10-26,22-273,CLE,,ORL,W (+11),Inactive,Inactive,...,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive
4,5,,2022-10-28,22-275,CLE,@,BOS,W (+9),Inactive,Inactive,...,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive,Inactive


In [158]:
# Split "Unnamed: 7" into "win_bool" and "margin_of_victory" columns. Map "L" : 0 and "W" : 1.
# Rename "+/-" to "plus_minus"
# NOTE: the draft below is disabled by wrapping it in a string literal. It refers to
# `step_1`, which is not defined in any visible cell, so it would not run as written.
# Because the string is the last expression of the cell, Jupyter echoes it as output.
"""
func_df = func_df.rename(columns={'+/-' : 'plus_minus', 'Unnamed: 5' : 'Home?', 'Unnamed: 7' : 'Win?'})
func_df = pd.concat([step_1, step_1['Win?'].str.split(' ', expand=True).rename(columns={0 : "win_bool", 1 : "margin_of_victory"})], axis=1)
func_df.head()
"""

'\nfunc_df = func_df.rename(columns={\'+/-\' : \'plus_minus\', \'Unnamed: 5\' : \'Home?\', \'Unnamed: 7\' : \'Win?\'})\nfunc_df = pd.concat([step_1, step_1[\'Win?\'].str.split(\' \', expand=True).rename(columns={0 : "win_bool", 1 : "margin_of_victory"})], axis=1)\nfunc_df.head()\n'

In [159]:
# Step 1: break the result column ("W (+32)") into its two parts and give the
# remaining columns readable names.
result_parts = func_df['Unnamed: 7'].str.split(' ', expand=True).rename(
    columns={0 : "win_bool", 1 : "margin_of_victory"}
)
renamed = func_df.rename(columns={'+/-' : 'plus_minus', 'Unnamed: 5' : 'home_bool'}).drop(labels='Unnamed: 7', axis=1)

# Step 2: glue both pieces back together and blank out the "Inactive" markers.
func_df = pd.concat([renamed, result_parts], axis=1).replace(to_replace='Inactive', value=np.nan)

# Step 3: discard the header rows repeated inside the table (expected at the 20th,
# 41st and 62nd rows), then turn L / W / @ into False / True / False.
is_data_row = func_df['Rk'] != 'Rk'
func_df = func_df[is_data_row].replace(to_replace=['L','W','@'], value=[False, True, False])

# Step 4: a missing home marker means the game was played at home.
func_df['home_bool'] = func_df['home_bool'].replace(to_replace=np.nan, value=True)

# Step 5: strip the characters that would block numeric conversion later on.
func_df['margin_of_victory'] = func_df['margin_of_victory'].str.translate(str.maketrans('', '', '()+'))
func_df['plus_minus'] = func_df['plus_minus'].str.translate(str.maketrans('', '', '+'))

func_df['Date'] = pd.to_datetime(func_df['Date'])

# Step 6: minutes played as a plain float (minutes + seconds / 60); the 'MP'
# column itself stays a "MM:SS" string in this cell.
mp_parts = func_df['MP'].str.split(':', expand=True).astype(float)
func_df['MP_floats'] = mp_parts[0] + mp_parts[1] / 60

# Step 7: keep only the leading two characters (the years) of the age string.
func_df['Age'] = func_df['Age'].str[:2].astype(float)

# Display only: set_index returns a new frame, so func_df keeps 'Rk' as a column.
func_df.set_index('Rk')

Unnamed: 0_level_0,G,Date,Age,Tm,home_bool,Opp,GS,MP,FG,FGA,...,STL,BLK,TOV,PF,PTS,GmSc,plus_minus,win_bool,margin_of_victory,MP_floats
Rk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,2022-10-19,22.0,CLE,False,TOR,1.0,13:18,2.0,8.0,...,2.0,1.0,5.0,1.0,4.0,-1.1,-10.0,False,-3,13.3
2,,2022-10-22,22.0,CLE,False,CHI,,,,,...,,,,,,,,True,32,
3,,2022-10-23,22.0,CLE,True,WAS,,,,,...,,,,,,,,True,10,
4,,2022-10-26,22.0,CLE,True,ORL,,,,,...,,,,,,,,True,11,
5,,2022-10-28,22.0,CLE,False,BOS,,,,,...,,,,,,,,True,9,
6,,2022-10-30,22.0,CLE,True,NYK,,,,,...,,,,,,,,True,13,
7,2.0,2022-11-02,22.0,CLE,True,BOS,1.0,42:21,9.0,20.0,...,3.0,0.0,1.0,3.0,29.0,28.9,10.0,True,1,42.35
8,,2022-11-04,22.0,CLE,False,DET,,,,,...,,,,,,,,True,24,
9,3.0,2022-11-06,22.0,CLE,False,LAL,1.0,38:57,7.0,18.0,...,0.0,0.0,5.0,4.0,24.0,14.1,6.0,True,14,38.95
10,4.0,2022-11-07,22.0,CLE,False,LAC,1.0,37:47,8.0,17.0,...,1.0,0.0,5.0,1.0,19.0,14.2,-2.0,False,-2,37.783333


In [160]:
# Columns grouped by the dtype they should end up with.
num_cols = [
    'G', 'GS', 'FG', 'FGA', '3P', '3PA', 'FT', 'FTA', 'ORB', 'DRB', 'TRB',
    'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'plus_minus', 'margin_of_victory',
    'FG%', '3P%', 'FT%', 'GmSc',
]
str_cols = ['Tm', 'Opp']
# Datetime already set
# Bools already set

# Cast every numerical, non-datetime column to float with a single astype mapping.
func_df = func_df.astype({name: float for name in num_cols})

# 'MP' holds "MM:SS" strings; prefixing "00:" yields "HH:MM:SS" for to_timedelta.
func_df['MP'] = pd.to_timedelta('00:' + func_df['MP'])

In [162]:
# Show the dtype of every column to check the conversions made in the cells above.
func_df.dtypes

Rk                            object
G                            float64
Date                  datetime64[ns]
Age                          float64
Tm                            object
home_bool                       bool
Opp                           object
GS                           float64
MP                   timedelta64[ns]
FG                           float64
FGA                          float64
FG%                          float64
3P                           float64
3PA                          float64
3P%                          float64
FT                           float64
FTA                          float64
FT%                          float64
ORB                          float64
DRB                          float64
TRB                          float64
AST                          float64
STL                          float64
BLK                          float64
TOV                          float64
PF                           float64
PTS                          float64
G

In [173]:
# Run the production cleaning function on the raw 'Garland' frame and summarise
# the result with describe().
play_df = standardize_game_log(gamelog_dfs['Garland'])
play_df.describe()

Unnamed: 0,G,Age,GS,MP,FG,FGA,FG%,3P,3PA,3P%,...,AST,STL,BLK,TOV,PF,PTS,GmSc,plus_minus,margin_of_victory,MP_floats
count,49.0,59.0,49.0,49,49.0,49.0,49.0,49.0,49.0,49.0,...,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0,59.0,49.0
mean,25.0,22.169492,1.0,0 days 00:35:07.918367346,7.469388,16.163265,0.468367,2.591837,6.183673,0.408245,...,8.020408,1.244898,0.102041,2.938776,2.081633,21.755102,17.332653,6.142857,5.779661,35.131973
std,14.28869,0.378406,0.0,0 days 00:05:37.610342254,2.916205,5.063542,0.134831,1.790232,2.751314,0.18721,...,2.749923,1.26706,0.305839,2.014615,1.288357,8.511097,8.585486,13.012814,12.324727,5.626839
min,1.0,22.0,1.0,0 days 00:13:18,1.0,6.0,0.111,0.0,1.0,0.0,...,2.0,0.0,0.0,0.0,0.0,4.0,-1.1,-26.0,-15.0,13.3
25%,13.0,22.0,1.0,0 days 00:33:07,6.0,12.0,0.4,1.0,4.0,0.273,...,6.0,0.0,0.0,1.0,1.0,18.0,11.2,-3.0,-4.0,33.116667
50%,25.0,22.0,1.0,0 days 00:35:30,7.0,17.0,0.45,2.0,6.0,0.4,...,8.0,1.0,0.0,2.0,2.0,21.0,17.9,6.0,8.0,35.5
75%,37.0,22.0,1.0,0 days 00:38:23,9.0,19.0,0.563,3.0,8.0,0.556,...,10.0,2.0,0.0,4.0,3.0,24.0,22.1,16.0,14.0,38.383333
max,49.0,23.0,1.0,0 days 00:48:13,16.0,31.0,0.727,10.0,15.0,0.75,...,14.0,5.0,1.0,8.0,5.0,51.0,43.3,27.0,32.0,48.216667
