In [1]:
import ruamel.yaml as yaml
import os
import sys
import pandas as pd
import numpy as np


NO_CONFIG_ERR_MSG = """No config file found. Root directory is determined by presence of "config.yaml" file."""

original_wd = os.getcwd()

# Number of times to move back in directory
num_retries = 10
for x in range(0, num_retries):
    # try to load config file
    try:
        with open("config.yaml", 'r') as stream:
            cfg = yaml.safe_load(stream)
    # If not found move back one directory level
    except FileNotFoundError:
        os.chdir('../')
        # If reached the max number of directory levels change to original wd and print error msg
        if x+1 == num_retries:
            os.chdir(original_wd)
            print(NO_CONFIG_ERR_MSG)

# Add directory to PATH
path = os.getcwd()

if path not in sys.path:
    sys.path.append(path)


ModuleNotFoundError: No module named 'ruamel'

## Load Data

In [168]:
RPM_df = pd.read_csv('data/interim/player_RPM_stats.csv')
box_score_df = pd.read_csv('data/raw/Box_Scores.csv')

## Joining RPM to players

Need to join on composite key made up of Player Name + Season

season_id appears to be calendar year the season started in 

Based off of the latest season being 2017 and not 2018

In [178]:
box_score_df['season_id'].unique()

array([2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013,
       2014, 2015, 2016, 2017], dtype=int64)

In [179]:
RPM_df.head()

Unnamed: 0,DRPM,GP,MPG,ORPM,RPM,Season_End_Year,Season_Start_Year,WINS,names,name_length_post_split
0,1.95,62,35.0,6.03,7.98,2014,2013,14.47,Chris Paul,2
1,0.15,81,38.5,6.27,6.42,2014,2013,17.63,Kevin Durant,2
2,-0.38,78,36.5,6.62,6.24,2014,2013,15.48,Stephen Curry,2
3,0.35,68,22.8,5.07,5.42,2014,2013,7.68,Manu Ginobili,2
4,3.23,69,36.2,1.89,5.12,2014,2013,11.64,LaMarcus Aldridge,2


In [180]:
# most number of whitespaces 
RPM_df['names'].str.split(' ').apply(len).max()

4

In [181]:
# Who has more than one space?

RPM_df['name_length_post_split'] = RPM_df['names'].str.split(' ').apply(len)
RPM_df.query("name_length_post_split > 2").names.value_counts().head()

# Mostly Jrs and some multiple names

Tim Hardaway Jr.        5
Otto Porter Jr.         5
Luc Mbah a Moute        5
James Michael McAdoo    4
Glenn Robinson III      4
Name: names, dtype: int64

What's the RPM of these players?

In [182]:
RPM_df.query("name_length_post_split > 2").RPM.describe()

count    51.000000
mean     -1.338627
std       2.072766
min      -6.390000
25%      -2.375000
50%      -1.620000
75%      -0.440000
max       4.960000
Name: RPM, dtype: float64

In [183]:
first_name_to_search = 'Tim'
box_score_df[box_score_df.First_Name.str.contains(first_name_to_search)].head()

Unnamed: 0,Game_id,Person_id,Team_id,First_Name,Last_Name,minutes,Field_Goals,Field_Goals_Attempted,Field_Goal_Percentage,Three_Pointers,...,Fast_Break_Points,Triple_Doubles,Double_Doubles,actual_minutes,actual_seconds,Plus_Minus,Blocks_Against,PTS_OFF_TO,Second_Chance_PTS,Total_Rebounds
28,20300002,1495,1610612759,Timothy,Duncan,40,8,13,0.615,0,...,0.0,0.0,1.0,40,0,-4,1.0,2,6,12.0
173,20300009,1501,1610612749,Timothy,Thomas,30,5,17,0.294,0,...,0.0,0.0,0.0,30,0,-19,2.0,4,4,3.0
210,20300011,1495,1610612759,Timothy,Duncan,34,7,22,0.318,0,...,2.0,0.0,1.0,34,0,-19,2.0,2,6,21.0
408,20300021,1501,1610612749,Timothy,Thomas,32,7,12,0.583,2,...,5.0,0.0,0.0,32,0,17,0.0,2,7,6.0
636,20300032,1501,1610612749,Timothy,Thomas,29,6,15,0.4,2,...,2.0,0.0,0.0,29,0,13,2.0,4,0,4.0


#### For now, will join to get RPM on a season/team/player level and fix this later if necessary

In [184]:
potential_keys = [
    'First_Name',
    'Last_Name',
    'season_id',
]

#### Steps:

1) RPM: Seperate names into first and last (start by splitting on a space)

2) Match season_id to either the calendar year the season ended or began. Map to RPM data

3) Join on First, Last, Season

In [142]:
game_mapping_df = pd.read_csv('data/raw/Game_Mapping.csv')
game_mapping_df.head()
game_mapping_df['Calendar_Year'] = game_mapping_df.Date_EST.str.split('/').str.get(-1)
calendar_year_to_df_map = game_mapping_df.groupby(by=['Calendar_Year', 'Season']).size().to_frame('count').reset_index()

In [143]:
team_tricodes_df = pd.read_csv('data/raw/Team_Tricodes.csv')
team_tricodes_df.head()

Unnamed: 0,Abbr,NBA_Flag,Team_Name
0,ATL,1,Atlanta Hawks
1,BRK,1,Brooklyn Nets
2,BOS,1,Boston Celtics
3,CHA,1,Charlotte Hornets
4,CHI,1,Chicago Bulls


In [144]:
team_mapping = pd.read_csv('data/raw/Team_Mapping.csv')
team_mapping.head(20)

Unnamed: 0,Team_id,Season_id,Arena_id,Conference_id,Division_id,Short_Name,Nickname,City,State,Country
0,1610612737,22003,135.0,East,Central,Atlanta,Hawks,Atlanta,GA,USA
1,1610612737,22004,135.0,East,Southeast,Atlanta,Hawks,Atlanta,GA,USA
2,1610612737,22005,135.0,East,Southeast,Atlanta,Hawks,Atlanta,GA,USA
3,1610612737,22006,135.0,East,Southeast,Atlanta,Hawks,Atlanta,GA,USA
4,1610612737,22007,135.0,East,Southeast,Atlanta,Hawks,Atlanta,GA,USA
5,1610612737,22008,135.0,East,Southeast,Atlanta,Hawks,Atlanta,GA,USA
6,1610612737,22009,135.0,East,Southeast,Atlanta,Hawks,Atlanta,GA,USA
7,1610612737,22010,135.0,East,Southeast,Atlanta,Hawks,Atlanta,GA,USA
8,1610612737,22011,135.0,East,Southeast,Atlanta,Hawks,Atlanta,GA,USA
9,1610612737,22012,135.0,East,Southeast,Atlanta,Hawks,Atlanta,GA,USA


In [145]:
calendar_year_to_df_map

Unnamed: 0,Calendar_Year,Season,count
0,2003,22003,450
1,2004,22003,739
2,2004,22004,426
3,2004,42003,105
4,2005,22004,804
5,2005,22005,434
6,2005,42004,105
7,2006,22005,796
8,2006,22006,456
9,2006,42005,107


In [146]:
# Only use season codes that start with 2
season_codes = [season_code for season_code in calendar_year_to_df_map.Season.unique() if str(season_code).startswith('2')]
season_code_starting_year = [int(str(season_code)[1:]) for season_code in season_codes]
season_code_ending_year = [season_code+1 for season_code in season_code_starting_year]

In [147]:
season_id_records = {
    'season_id':season_codes,
    'Season_Start_Year':season_code_starting_year,
    'Season_End_Year':season_code_ending_year,
    
}

In [148]:
season_id_map = pd.DataFrame.from_dict(season_id_records)
season_id_map.head()

Unnamed: 0,Season_End_Year,Season_Start_Year,season_id
0,2004,2003,22003
1,2005,2004,22004
2,2006,2005,22005
3,2007,2006,22006
4,2008,2007,22007


#### Merge to RPM data and check for nulls

In [149]:
RPM_df = pd.merge(RPM_df,
        season_id_map,
        on=['Season_Start_Year', 'Season_End_Year'],
                 how='left')

RPM_df['season_id'].isnull().sum()


0

In [150]:
RPM_df.head()

Unnamed: 0,DRPM,GP,MPG,ORPM,RPM,Season_End_Year,Season_Start_Year,WINS,names,name_length_post_split,season_id
0,1.95,62,35.0,6.03,7.98,2014,2013,14.47,Chris Paul,2,22013
1,0.15,81,38.5,6.27,6.42,2014,2013,17.63,Kevin Durant,2,22013
2,-0.38,78,36.5,6.62,6.24,2014,2013,15.48,Stephen Curry,2,22013
3,0.35,68,22.8,5.07,5.42,2014,2013,7.68,Manu Ginobili,2,22013
4,3.23,69,36.2,1.89,5.12,2014,2013,11.64,LaMarcus Aldridge,2,22013


#### Split Names First and Last

In [151]:
RPM_df['First_Name'] = RPM_df['names'].str.split(' ').str.get(0)
RPM_df['Last_Name'] = RPM_df['names'].str.split(' ').str.get(-1)

## Merge onto Box score

In [110]:
RPM_df[potential_keys].dtypes

First_Name    object
Last_Name     object
season_id      int64
dtype: object

In [111]:
box_score_df[potential_keys].dtypes

First_Name    object
Last_Name     object
season_id      int64
dtype: object

Max should be number of seasons since 2013 (when RPM was introduced)

In [157]:
RPM_df[potential_keys].groupby(by=['First_Name', 'Last_Name']).count().sort_values(by='season_id', ascending=False).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,season_id
First_Name,Last_Name,Unnamed: 2_level_1
Manu,Ginobili,5
Russell,Westbrook,5
Rudy,Gobert,5
Rudy,Gay,5
DeAndre,Jordan,5


In [160]:
box_score_df[potential_keys]

Unnamed: 0,First_Name,Last_Name,season_id
0,Eddie,Jones,2003
1,Aaron,McKie,2003
2,Brian,Grant,2003
3,Eric,Snow,2003
4,Vernell,Coles,2003
5,Derrick,Coleman,2003
6,Allen,Iverson,2003
7,Samaki,Walker,2003
8,John,Wallace,2003
9,Amal,McCaskill,2003


In [112]:
test_merge = pd.merge(RPM_df, box_score_df,
                     on=['season_id', 'First_Name', 'Last_Name'],
                     )

test_merge.head()

Unnamed: 0,DRPM,GP,MPG,ORPM,RPM,Season_End_Year,Season_Start_Year,WINS,names,name_length_post_split,...,Fast_Break_Points,Triple_Doubles,Double_Doubles,actual_minutes,actual_seconds,Plus_Minus,Blocks_Against,PTS_OFF_TO,Second_Chance_PTS,Total_Rebounds
