# Data Preparation

The metrics that will be used to evaluate this stage are:


- **Quality** — assessment of the data quality dimensions;

- **Feature Engineering and Selection** — deriving features from tabular data using complex methods (e.g. aggregation) and domain knowledge (e.g. business concepts), and correctly combining filter-based and wrapper-based selection methods;

- **Sampling** — for domain-specific purposes, focus on the appropriate subset of the population; for development, start with a very small sample and scale up to a significant sample;

- **Unbalanced data** — correct use of advanced methods (e.g. SMOTE).

## 0 - Loading

In [53]:
import pandas as pd
import os

# Read every raw CSV exactly once, straight into a name -> DataFrame registry.
# The rest of the notebook works through `tables` so that whole-dataset steps
# (cleaning, saving) can simply iterate over it.
tables = {
    'awards_players': pd.read_csv('data/awards_players.csv'),
    'coaches': pd.read_csv('data/coaches.csv'),
    'players_teams': pd.read_csv('data/players_teams.csv'),
    'players': pd.read_csv('data/players.csv'),
    'series_post': pd.read_csv('data/series_post.csv'),
    'teams_post': pd.read_csv('data/teams_post.csv'),
    'teams': pd.read_csv('data/teams.csv'),
}

# Module-level aliases to the freshly loaded frames (same objects as in
# `tables`); later cells read the raw `teams` frame through its alias.
awards_players = tables['awards_players']
coaches = tables['coaches']
players_teams = tables['players_teams']
players = tables['players']
series_post = tables['series_post']
teams_post = tables['teams_post']
teams = tables['teams']

## 1 - Cleaning

In [54]:
# Build a tmID -> franchID lookup from the teams table so that every row of a
# franchise ends up under a single team key. When a tmID appears on several
# rows, the franchID of the last such row wins.
mapTeam = dict(zip(teams['tmID'], teams['franchID']))

# Every (table, column) pair that stores a team ID and must be rewritten.
team_id_columns = (
    ('teams', 'tmID'),
    ('coaches', 'tmID'),
    ('players_teams', 'tmID'),
    ('series_post', 'tmIDWinner'),
    ('series_post', 'tmIDLoser'),
    ('teams_post', 'tmID'),
)
for table_name, column in team_id_columns:
    # replace() leaves IDs that are not in the lookup untouched.
    tables[table_name][column] = tables[table_name][column].replace(mapTeam)

# Strip columns that carry no information from every table.
for name, frame in tables.items():
    # Columns in which every value is missing.
    frame = frame.dropna(axis='columns', how='all')
    # Columns holding a single distinct (non-null) value.
    distinct_counts = frame.nunique()
    constant_columns = distinct_counts[distinct_counts == 1].index
    tables[name] = frame.drop(columns=constant_columns)

# Remove columns that are not carried forward into the prepared data.
# DataFrame.drop() returns a new frame, so every result has to be assigned
# back; the original awards_players line discarded it and was a no-op.
# Only the number of awards per (playerID, year) is used downstream, so the
# award name itself is not needed.
tables['awards_players'] = tables['awards_players'].drop(columns=['award'])
tables['coaches'] = tables['coaches'].drop(columns=['stint'])
tables['players'] = tables['players'].drop(
    columns=['college', 'collegeOther', 'birthDate', 'deathDate', 'height', 'weight']
)
tables['players_teams'] = tables['players_teams'].drop(columns=['stint'])
# Keep only player rows with a non-zero `minutes` value.
tables['players_teams'] = tables['players_teams'][tables['players_teams']['minutes'] != 0]
tables['teams'] = tables['teams'].drop(
    columns=['franchID', 'firstRound', 'semis', 'finals', 'attend', 'name', 'arena']
)

## 3 - Integration

In [55]:
# Attach the per-player attributes from players.csv to every players_teams row.
# players.csv keys players by `bioID`; align it with the `playerID` key first.
player_bios = tables['players'].rename(columns={'bioID': 'playerID'})
tables['players_teams'] = tables['players_teams'].merge(
    player_bios, on='playerID', how='left'
)
# The standalone players table has been folded in and is no longer needed.
del tables['players']

# join awards_players.csv with players_teams.csv and coaches.csv
#
# Count awards per (person, year) directly from awards_players. Counting the
# rows of an inner merge against players_teams / coaches instead multiplies
# every award by the number of rows the person has in that year (several rows
# per season are possible once `stint` has been dropped), inflating `awards`.
awards_per_person = (
    tables['awards_players']
    .groupby(['playerID', 'year'])
    .size()
    .reset_index(name='awards')
)

# Players: a left merge keeps every player row; no match means zero awards.
tables['players_teams'] = pd.merge(
    tables['players_teams'], awards_per_person, on=['playerID', 'year'], how='left'
)
tables['players_teams']['awards'] = tables['players_teams']['awards'].fillna(0).astype(int)

# Coaches: awards_players identifies the recipient in `playerID`, which is
# matched against `coachID` for coaches.
coach_awards = awards_per_person.rename(columns={'playerID': 'coachID'})
tables['coaches'] = pd.merge(
    tables['coaches'], coach_awards, on=['coachID', 'year'], how='left'
)
tables['coaches']['awards'] = tables['coaches']['awards'].fillna(0).astype(int)

# The award counts now live on players_teams and coaches.
tables.pop('awards_players')

# Fold the post-season win/loss record into the teams table.
# `W` / `L` are renamed so they stay distinguishable from the teams columns.
postseason = tables['teams_post'].rename(columns={'W': 'wonPost', 'L': 'lostPost'})
teams_with_post = tables['teams'].merge(postseason, on=['tmID', 'year'], how='left')
# Team seasons without a teams_post row get 0 post-season wins and losses.
for post_column in ('wonPost', 'lostPost'):
    teams_with_post[post_column] = teams_with_post[post_column].fillna(0).astype(int)
tables['teams'] = teams_with_post
del tables['teams_post']

# Add per-team, per-year award totals to the teams table: one column summing
# the `awards` of the team's players and one summing those of its coaches.
award_sources = (
    ('players_teams', 'awards_players'),
    ('coaches', 'awards_coaches'),
)
for source_table, total_column in award_sources:
    team_totals = (
        tables[source_table]
        .groupby(['tmID', 'year'])['awards']
        .sum()
        .reset_index()
        .rename(columns={'awards': total_column})
    )
    tables['teams'] = tables['teams'].merge(team_totals, on=['tmID', 'year'], how='left')
    # Team seasons with no matching rows in the source table count as zero.
    tables['teams'][total_column] = tables['teams'][total_column].fillna(0).astype(int)

## 4 - Feature Engineering

In [56]:
# Derive team-level ratio and rating features from the season totals.
# `team_df` is the same object as tables['teams'], so every column assigned
# below is added to the table in place.
team_df = tables['teams']

team_df["perW"] = team_df["won"] / team_df["GP"]  # Winning percentage
team_df["perL"] = team_df["lost"] / team_df["GP"]  # Losing percentage

# Offensive shooting efficiency: made / attempted.
team_df["o_fgEf"] = team_df["o_fgm"] / team_df["o_fga"]  # Field goals percentage
team_df["o_ftEf"] = team_df["o_ftm"] / team_df["o_fta"]  # Free throws percentage
team_df["o_3pEf"] = team_df["o_3pm"] / team_df["o_3pa"]  # Three-pointers percentage

# Performance score.
# NOTE(review): o_pts enters twice (weights 1 and 1.5). This is kept exactly
# as written; confirm whether a different statistic was meant for the 1.5 term.
team_df["o_per"] = (
    team_df["o_pts"]
    + team_df["o_pts"] * 1.5
    + team_df["o_oreb"] * 1.2
    - team_df["o_to"] * 1.5
    - team_df["o_pf"]
)

# Shooting efficiency allowed to opponents: made / attempted.
team_df["d_fgEf"] = team_df["d_fgm"] / team_df["d_fga"]  # Field goals by opponents percentage
team_df["d_ftEf"] = team_df["d_ftm"] / team_df["d_fta"]  # Free throws by opponents percentage
team_df["d_3pEf"] = team_df["d_3pm"] / team_df["d_3pa"]  # Three-pointers by opponents percentage

team_df["pts_diff"] = team_df["o_pts"] - team_df["d_pts"]  # Points scored minus points conceded
team_df["pts_mpg"] = team_df["o_pts"] / team_df["GP"]  # Average points scored per game
team_df["pts_tmp"] = team_df["d_pts"] / team_df["GP"]  # Average points conceded per game

# Effective field goal percentage: (FGM + 0.5 * 3PM) / FGA.
# The numerator must be field goals made; the previous version used
# `pts_tmp` (points conceded per game) there by mistake.
team_df["effec_fg"] = (team_df["o_fgm"] + 0.5 * team_df["o_3pm"]) / team_df["o_fga"]

# Possessions: mean of the team's and the opponents' estimate, each computed as
#   FGA + 0.4 * FTA - 1.07 * (ORB / (ORB + opposing DRB)) * (FGA - FGM) + TOV.
# The opponents' free-throw term uses d_fta; the previous version used d_fga.
own_possessions = (
    team_df["o_fga"]
    + 0.4 * team_df["o_fta"]
    - 1.07
    * (team_df["o_oreb"] / (team_df["o_oreb"] + team_df["d_dreb"]))
    * (team_df["o_fga"] - team_df["o_fgm"])
    + team_df["o_to"]
)
opponent_possessions = (
    team_df["d_fga"]
    + 0.4 * team_df["d_fta"]
    - 1.07
    * (team_df["d_oreb"] / (team_df["d_oreb"] + team_df["o_dreb"]))
    * (team_df["d_fga"] - team_df["d_fgm"])
    + team_df["d_to"]
)
team_df["poss"] = 0.5 * (own_possessions + opponent_possessions)

team_df["off_rtg"] = (team_df["o_pts"] / team_df["poss"]) * 100  # Offensive rating: points scored per 100 possessions
team_df["def_rtg"] = (team_df["d_pts"] / team_df["poss"]) * 100  # Defensive rating: points conceded per 100 possessions
# NOTE(review): dividing `min` by 5 presumably converts summed player minutes
# into team minutes (five players on court) — confirm against the data source.
team_df["minutes_team"] = team_df['min'] / 5

## 5 - Feature Selection

In [57]:
# Drop raw columns that are not kept as model features.
# NOTE(review): o_reb / d_reb (and rebounds / PostRebounds) are presumably
# totals of the offensive/defensive rebound columns that remain — confirm.
# GP is removed only here because it serves as the denominator of the
# per-game team features.
tables['teams'] = tables['teams'].drop(columns=['o_reb', 'd_reb', 'GP'])
tables['players_teams'] = tables['players_teams'].drop(columns=['rebounds', 'PostRebounds'])

## 6 - Transformation

In [58]:
# Lag every team statistic by one row within each team, so that a row keeps its
# own `tmID`, `year` and `playoff` but carries the statistics of the team's
# previous row. The statement order below matters: the snapshot of the last
# rows must be taken before the shift, and the first rows dropped after it.

# Sort so that shift() walks each team's rows in year order.
tables['teams'] = tables['teams'].sort_values(by=['tmID', 'year','playoff'])
# Every column except the identifiers and `playoff` gets lagged.
attributes = tables['teams'].drop(columns=['tmID', 'year','playoff']).columns
# Snapshot each team's latest, still unshifted statistics.
# NOTE(review): GroupBy.last() takes the last non-null value per column, so it
# can differ from the literal last row if that row contains NaN — confirm.
last_year_team = tables['teams'].groupby('tmID').last().reset_index()
last_year_lines = last_year_team.copy()
# Relabel the snapshot as the following year: it becomes the row whose
# statistics come from the team's last observed year.
# NOTE(review): `playoff` in these rows is carried over unchanged from the last
# observed year; presumably it is a placeholder to be predicted — confirm.
last_year_lines['year'] += 1
# Move each team's statistics down one row; the first row per team gets 0.
tables['teams'][attributes] = tables['teams'].groupby('tmID')[attributes].shift(fill_value=0)
tables['teams'] = tables['teams'].sort_values(by=['tmID', 'year'])
# The first row of each team has no previous row (all zeros after the shift),
# so it is removed.
first_year_index = tables['teams'].groupby('tmID').head(1).index
tables['teams'] = tables['teams'].drop(first_year_index).reset_index(drop=True)
# Append the year+1 rows built from the snapshot and restore team/year order.
tables['teams'] = pd.concat([tables['teams'], last_year_lines], ignore_index=True)
tables['teams'] = tables['teams'].sort_values(by=['tmID', 'year']).reset_index(drop=True)

# Lag every player statistic by one row within each player, so that a row keeps
# its own `playerID`, `year` and `tmID` but carries the values of the player's
# previous row.
roster = tables['players_teams'].sort_values(by=['playerID', 'year'])
lagged_columns = roster.columns.drop(['playerID', 'year', 'tmID'])
# Move the values down one row per player; the first row per player gets 0.
roster[lagged_columns] = roster.groupby('playerID')[lagged_columns].shift(fill_value=0)

# Remove, for each player, the row holding the earliest year: it has no
# previous row to take values from.
earliest_rows = roster.groupby('playerID')['year'].idxmin()
roster = roster.drop(earliest_rows).reset_index(drop=True)
roster = roster.sort_values(by=['playerID', 'year']).reset_index(drop=True)


def _zero_fill_numeric(column):
    # Replace missing values with 0 in float64/int64 columns only.
    if column.dtype in ['float64', 'int64']:
        return column.fillna(0)
    return column


tables['players_teams'] = roster.apply(_zero_fill_numeric)

# Attach the team row to every player row of the same team and year; the inner
# join discards player rows that have no matching team row.
roster_with_team = tables['players_teams'].merge(
    tables['teams'], on=['year', 'tmID'], how='inner'
)
# Within each team and year, order players by `minutes` (highest first) and
# keep only the top five rows.
ranked = roster_with_team.sort_values(
    by=['tmID', 'year', 'minutes'], ascending=[True, True, False]
)
tables['players_teams'] = ranked.groupby(['tmID', 'year']).head(5).reset_index(drop=True)

## 7 - Saving

In [59]:
def table_to_csv(name):
    """Write ``tables[name]`` to a CSV file under ``data_prepared``.

    The file name is ``name`` lower-cased with spaces replaced by
    underscores, plus the ``.csv`` extension. The output directory is
    created when it does not exist, and the DataFrame index is not written.
    """
    output_dir = 'data_prepared'
    os.makedirs(output_dir, exist_ok=True)
    file_name = name.lower().replace(' ', '_') + '.csv'
    destination = os.path.join(output_dir, file_name)
    tables[name].to_csv(destination, index=False)

# Persist every table still registered in `tables`.
for table_name in tables:
    table_to_csv(table_name)