# Data Preparation

The metrics that will be used to evaluate this stage are:


- **Quality**, assessment of dimensions;

- **Feature Engineering and Selection** from tabular data, using complex methods (e.g. aggregation) and domain knowledge (e.g. business concepts), with correct, combined use of filter-based and wrapper-based methods;

- **Sampling**: for domain-specific purposes, focus on the appropriate subset of the population; for development, start with a very small sample and scale up to a significant one;

- **Unbalanced** data, correct use of advanced methods (e.g. SMOTE).

## 0 - Loading

In [None]:
import pandas as pd
import os

# Raw training tables: one DataFrame per CSV file under data/.
awards_players = pd.read_csv('data/awards_players.csv')
coaches = pd.read_csv('data/coaches.csv')
players_teams = pd.read_csv('data/players_teams.csv')
players = pd.read_csv('data/players.csv')
series_post = pd.read_csv('data/series_post.csv')
teams_post = pd.read_csv('data/teams_post.csv')
teams = pd.read_csv('data/teams.csv')

# Working copies keyed by table name. The module-level names above keep
# pointing at the raw frames (the cleaning cell reads franchID from `teams`).
tables = {
    'awards_players': awards_players,
    'coaches': coaches,
    'players_teams': players_teams,
    'players': players,
    'series_post': series_post,
    'teams_post': teams_post,
    'teams': teams
}

# Column pruning. DataFrame.drop returns a new frame, so every result has to
# be stored back into `tables` (the 'award' drop used to be discarded).
tables['awards_players'] = tables['awards_players'].drop(['award'], axis=1)
tables['coaches'] = tables['coaches'].drop(columns=['stint'])
tables['players'] = tables['players'].drop(['college', 'collegeOther', 'birthDate', 'deathDate', 'height', 'weight'], axis=1)
tables['players_teams'] = tables['players_teams'].drop(columns=['stint'])
# Keep only player rows with recorded playing time.
tables['players_teams'] = tables['players_teams'][tables['players_teams']['minutes'] != 0]
tables['teams'] = tables['teams'].drop(['franchID', 'firstRound', 'semis', 'finals', 'attend', 'name', 'arena'], axis=1)

# Files under data_test/ are outer-merged into the training tables on their
# key columns, so rows that exist only in the test files are appended.
# Any non-key column present on both sides comes out with _x/_y suffixes.
test_coaches = pd.read_csv('data_test/coaches.csv')
test_players_teams = pd.read_csv('data_test/players_teams.csv')
test_teams = pd.read_csv('data_test/teams.csv')

# NOTE(review): unlike the two merges below, this result is stored under a new
# 'test_coaches' key, so tables['coaches'] never receives the test rows and the
# later tmID -> franchID mapping is not applied to 'test_coaches'. This looks
# unintended but is left unchanged here - confirm before switching the key.
tables['test_coaches'] = pd.merge(tables['coaches'], test_coaches, on=['coachID','year','tmID'], how='outer')
tables['players_teams'] = pd.merge(tables['players_teams'], test_players_teams, on=['playerID','year','tmID'], how='outer')
tables['teams'] = pd.merge(tables['teams'], test_teams, on=['tmID','year','confID'], how='outer')


## 1 - Cleaning

In [None]:

# Replace every team identifier by the franchID of that team, using the
# tmID -> franchID pairs of the raw `teams` frame (later rows win on repeats).
mapTeam = dict(zip(teams['tmID'], teams['franchID']))

# Columns holding a team identifier, per table.
team_id_columns = (
    ('teams', 'tmID'),
    ('coaches', 'tmID'),
    ('players_teams', 'tmID'),
    ('series_post', 'tmIDWinner'),
    ('series_post', 'tmIDLoser'),
    ('teams_post', 'tmID'),
)
for table_name, column in team_id_columns:
    tables[table_name][column] = tables[table_name][column].replace(mapTeam)

for name in tables:
    # drop columns in which every value is missing
    frame = tables[name].dropna(axis=1, how='all')
    # drop columns that carry a single distinct value
    distinct_counts = frame.nunique()
    constant_columns = distinct_counts[distinct_counts == 1].index
    tables[name] = frame.drop(constant_columns, axis=1)


# These columns are present in `teams` again after the merge with the test file.
tables['teams'] = tables['teams'].drop(columns=['franchID', 'name', 'arena'])

## 3 - Integration

In [None]:
# Attach the player attributes (players.csv) to every players_teams row.
# players.csv names its key 'bioID'; it is renamed so both sides share 'playerID'.
player_info = tables['players'].rename(columns={'bioID': 'playerID'})
tables['players_teams'] = tables['players_teams'].merge(player_info, on='playerID', how='left')
# The standalone players table is not needed once merged.
del tables['players']

# join awards_players.csv with players_teams.csv and coaches.csv
#
# Awards are counted straight from the awards table, one count per
# (playerID, year). Counting the rows of an inner merge with players_teams /
# coaches instead multiplies the count by the number of rows the person has in
# that season (e.g. a player listed with two teams in one year).
awards_count = (
    tables['awards_players']
    .groupby(['playerID', 'year'])
    .size()
    .reset_index(name='awards')
)

# Players: rows without any award get 0.
tables['players_teams'] = pd.merge(tables['players_teams'], awards_count, on=['playerID', 'year'], how='left')
tables['players_teams']['awards'] = tables['players_teams']['awards'].fillna(0).astype(int)

# Coaches: the awards table stores coach identifiers in its 'playerID' column,
# so the same counts are matched against 'coachID'.
coach_awards_count = awards_count.rename(columns={'playerID': 'coachID'})
tables['coaches'] = pd.merge(tables['coaches'], coach_awards_count, on=['coachID', 'year'], how='left')
tables['coaches']['awards'] = tables['coaches']['awards'].fillna(0).astype(int)

tables.pop('awards_players')

# Post-season totals per team and year (teams_post.csv): W/L become
# wonPost/lostPost; teams without a matching row get 0.
post_totals = tables['teams_post'].rename(columns={'W': 'wonPost', 'L': 'lostPost'})
tables['teams'] = tables['teams'].merge(post_totals, on=['tmID', 'year'], how='left')
for column in ('wonPost', 'lostPost'):
    tables['teams'][column] = tables['teams'][column].fillna(0).astype(int)
del tables['teams_post']

# Series results (series_post.csv): sum W over the series a team won and L over
# the series it lost, then attach both sums to the team row of that year.
series = tables['series_post']
wins = (
    series.groupby(['year', 'tmIDWinner'], as_index=False)['W'].sum()
    .rename(columns={'tmIDWinner': 'tmID'})
)
losses = (
    series.groupby(['year', 'tmIDLoser'], as_index=False)['L'].sum()
    .rename(columns={'tmIDLoser': 'tmID'})
)
results_post = (
    wins.merge(losses, on=['year', 'tmID'], how='outer')
    .fillna(0)
    .rename(columns={'W': 'wonPointsPost', 'L': 'lostPointsPost'})
)
tables['teams'] = tables['teams'].merge(results_post, on=['tmID', 'year'], how='left')
for column in ('wonPointsPost', 'lostPointsPost'):
    tables['teams'][column] = tables['teams'][column].fillna(0).astype(int)
del tables['series_post']

# Sum the per-person 'awards' column by team and year, once for players and
# once for coaches, and attach each total to teams (0 where nothing matches).
award_sources = (
    ('players_teams', 'awards_players'),
    ('coaches', 'awards_coaches'),
)
for source_table, total_column in award_sources:
    team_totals = (
        tables[source_table]
        .groupby(['tmID', 'year'], as_index=False)['awards']
        .sum()
        .rename(columns={'awards': total_column})
    )
    tables['teams'] = tables['teams'].merge(team_totals, on=['tmID', 'year'], how='left')
    tables['teams'][total_column] = tables['teams'][total_column].fillna(0).astype(int)

# Give every player row the columns of its team for that year.
tables['players_teams'] = pd.merge(tables['players_teams'], tables['teams'], on=['year', 'tmID'], how='left')

## 4 - Feature Engineering

In [None]:
# Aliases for the frames stored in `tables`; new columns are added in place.
team_stats = tables['teams']
player_stats = tables['players_teams']

# --- Team efficiency ratios (rounded to 2 decimals) -------------------------
team_stats['offensive_efficiency'] = (
    team_stats['o_pts'] / (team_stats['o_fgm'] + 0.44 * team_stats['o_fta'] - team_stats['o_reb'])
).round(2)
team_stats['defensive_efficiency'] = (
    team_stats['d_pts'] / (team_stats['d_fgm'] + 0.44 * team_stats['d_fta'] - team_stats['d_reb'])
).round(2)
team_stats['play_percent'] = (
    team_stats['o_fgm'] / (team_stats['o_fga'] + team_stats['o_reb'] + team_stats['o_to'])
).round(2)

# --- Weighted "four factors" score ------------------------------------------
# Shooting factor is effective field-goal percentage, (FGM + 0.5 * 3PM) / FGA.
# The whole numerator must be divided by FGA; without the parentheses only the
# three-point term was divided, leaving a raw FGM count in a sum of ratios.
factor_sh = 0.40 * ((team_stats['o_fgm'] + 0.5 * team_stats['o_3pm']) / team_stats['o_fga'])
factor_tu = 0.25 * (team_stats['o_to'] / (team_stats['o_fga'] + 0.44 * team_stats['o_fta'] + team_stats['o_to']))
factor_re = 0.20 * (team_stats['o_oreb'] / (team_stats['o_oreb'] + team_stats['d_dreb']))
factor_th = 0.15 * (team_stats['o_ftm'] / team_stats['o_fga'])
team_stats['factors4'] = (factor_sh + factor_tu + factor_re + factor_th).round(2)

# --- Possessions and pace ---------------------------------------------------
team_stats['possession'] = (
    0.96 * (team_stats['o_fga'] + team_stats['o_to'] + 0.44 * team_stats['o_fta'] - team_stats['o_oreb'])
).round(2)
team_stats['opponent_possession'] = (
    0.96 * (team_stats['d_fga'] + team_stats['d_to'] + 0.44 * team_stats['d_fta'] - team_stats['d_oreb'])
).round(2)
team_stats['pace'] = (
    (200 / team_stats['min']) * (team_stats['possession'] + team_stats['opponent_possession']) / 2
).round(2)

# League pace: mean pace of all teams in the same year, repeated on every row.
team_stats['lg_pace'] = team_stats.groupby('year')['pace'].transform('mean')

# --- Player Impact Estimate (PIE) -------------------------------------------
# Published form, used for numerator and denominator alike:
#   PTS + FGM + FTM - FGA - FTA + DREB + 0.5*OREB + AST + STL + 0.5*BLK - PF - TO
# Previously the player side omitted "- FTA" and counted full blocks, while the
# team side added FTA instead of subtracting it.
# NOTE(review): the denominator uses the team's own totals (o_* columns) rather
# than game totals of both teams - confirm this is the intended variant.
player_metric = (
    player_stats['points'] + player_stats['fgMade'] + player_stats['ftMade']
    - player_stats['fgAttempted'] - player_stats['ftAttempted']
    + player_stats['dRebounds'] + player_stats['oRebounds'] / 2
    + player_stats['assists'] + player_stats['steals'] + player_stats['blocks'] / 2
    - player_stats['PF'] - player_stats['turnovers']
)
team_metric = (
    player_stats['o_pts'] + player_stats['o_fgm'] + player_stats['o_ftm']
    - player_stats['o_fga'] - player_stats['o_fta']
    + player_stats['o_dreb'] + player_stats['o_oreb'] / 2
    + player_stats['o_asts'] + player_stats['o_stl'] + player_stats['o_blk'] / 2
    - player_stats['o_pf'] - player_stats['o_to']
)
player_stats['pie'] = (player_metric / team_metric).round(2)

## 5 - Feature Selection

In [None]:
# Remove the total-rebound columns and games played from the team table.
dropped_team_columns = ['o_reb', 'd_reb', 'GP']
tables['teams'] = tables['teams'].drop(columns=dropped_team_columns)
# tables['players_teams'].drop(columns=['rebounds','PostRebounds'], inplace=True)

## 6 - Integration Test

In [None]:
#test_coaches = pd.read_csv('data_test/coaches.csv')
#test_players_teams = pd.read_csv('data_test/players_teams.csv')
#test_teams = pd.read_csv('data_test/teams.csv')
#
#mapTeam = {}
#for index, row in test_teams[['tmID', 'franchID']].iterrows():
#    mapTeam [row['tmID']] = row['franchID']
#    
#test_coaches['tmID'] = test_coaches['tmID'].replace(mapTeam) 
#test_players_teams['tmID'] = test_players_teams['tmID'].replace(mapTeam) 
#test_teams['tmID'] = test_teams['tmID'].replace(mapTeam)
#
#test_coaches = test_coaches.drop(['lgID', 'stint'], axis=1)
#test_players_teams = test_players_teams.drop(['lgID', 'stint'], axis=1)
#test_teams = test_teams.drop(['lgID', 'confID', 'name', 'arena'], axis=1)
#
#tables['teams'] = pd.concat([tables['teams'], test_teams], axis=0).fillna(0)
#tables['players_teams'] = pd.concat([tables['players_teams'], test_players_teams], axis=0).fillna(0)

def _with_post(frame, regular, post):
    """Return the sum of a regular-season column plus its post-season column."""
    return frame[regular].sum() + frame[post].sum()


# Unadjusted PER ('uper') per player, team and year, plus the yearly league
# mean ('lg_uper'). Every total is regular season + post season.
stats = tables['players_teams']

for year, year_data in stats.groupby("year"):
    # League-wide totals for this year.
    lg_ast = _with_post(year_data, 'assists', 'PostAssists')
    lg_fg = _with_post(year_data, 'fgMade', 'PostfgMade')
    lg_ft = _with_post(year_data, 'ftMade', 'PostftMade')
    lg_fga = _with_post(year_data, 'fgAttempted', 'PostfgAttempted')
    lg_fta = _with_post(year_data, 'ftAttempted', 'PostftAttempted')
    lg_trb = _with_post(year_data, 'rebounds', 'PostRebounds')
    lg_orb = _with_post(year_data, 'oRebounds', 'PostoRebounds')
    lg_drb = _with_post(year_data, 'dRebounds', 'PostdRebounds')
    lg_to = _with_post(year_data, 'turnovers', 'PostTurnovers')
    lg_pf = _with_post(year_data, 'PF', 'PostPF')
    lg_pts = _with_post(year_data, 'points', 'PostPoints')

    # League constants of the uPER formula.
    factor = (2 / 3) - (0.5 * (lg_ast / lg_fg)) / (2 * (lg_fg / lg_ft))
    vop = lg_pts / (lg_fga - lg_orb + lg_to + 0.44 * lg_fta)  # points per possession
    drbP = lg_drb / lg_trb  # defensive share of all rebounds

    # Row mask for the year, computed once instead of once per player.
    in_year = stats['year'] == year

    for tmID, tm_data in year_data.groupby('tmID'):
        # Team totals; only the assist-to-field-goal ratio enters the formula.
        # (A team points total was computed here from the league frame by
        # mistake and never used, so it has been removed.)
        tm_ast = _with_post(tm_data, 'assists', 'PostAssists')
        tm_fg = _with_post(tm_data, 'fgMade', 'PostfgMade')

        in_team = in_year & (stats['tmID'] == tmID)

        for player, player_data in tm_data.groupby('playerID'):
            player_min = _with_post(player_data, 'minutes', 'PostMinutes')
            player_3p = _with_post(player_data, 'threeMade', 'PostthreeMade')
            player_ast = _with_post(player_data, 'assists', 'PostAssists')
            player_fg = _with_post(player_data, 'fgMade', 'PostfgMade')
            player_ft = _with_post(player_data, 'ftMade', 'PostftMade')
            player_fga = _with_post(player_data, 'fgAttempted', 'PostfgAttempted')
            player_fta = _with_post(player_data, 'ftAttempted', 'PostftAttempted')
            player_to = _with_post(player_data, 'turnovers', 'PostTurnovers')
            player_trb = _with_post(player_data, 'rebounds', 'PostRebounds')
            player_orb = _with_post(player_data, 'oRebounds', 'PostoRebounds')
            player_stl = _with_post(player_data, 'steals', 'PostSteals')
            player_blk = _with_post(player_data, 'blocks', 'PostBlocks')
            player_pf = _with_post(player_data, 'PF', 'PostPF')

            if player_min == 0:
                # No playing time: the per-minute rate is defined as 0 here.
                uper = 0
            else:
                uper = (1 / player_min) * ( player_3p
                    + (2/3) * player_ast
                    + (2 - factor * (tm_ast / tm_fg)) * player_fg
                    + (player_ft * 0.5 * (2 - (1/3) * (tm_ast / tm_fg)))
                    - vop * player_to
                    - vop * drbP * (player_fga - player_fg)
                    - vop * 0.44 * (0.44 + (0.56 * drbP)) * (player_fta - player_ft)
                    + vop * (1 - drbP) * (player_trb - player_orb)
                    + vop * drbP * player_orb
                    + vop * player_stl
                    + vop * drbP * player_blk
                    - player_pf * ((lg_ft / lg_pf) - 0.44 * (lg_fta / lg_pf) * vop)
                )

            # Write the value on every row of this player for this team and year.
            stats.loc[in_team & (stats['playerID'] == player), 'uper'] = uper

    # League mean of uPER over all player rows of the year.
    lg_uper = stats.loc[in_year, 'uper'].mean()
    stats.loc[in_year, 'lg_uper'] = lg_uper

# Bring each team's pace and the league pace onto the player rows, then scale
# uPER by (league pace / team pace) and by 15 / (league mean uPER).
teams_copy = tables['teams'][["year", "tmID", "pace", "lg_pace"]].copy()
with_pace = tables['players_teams'].merge(teams_copy, on=['year', 'tmID'], how='left')

pace_ratio = with_pace["lg_pace"] / with_pace["pace"]
league_scale = 15 / with_pace["lg_uper"]
with_pace["per"] = (with_pace["uper"] * pace_ratio) * league_scale

# Only lg_pace is removed again; the merged 'pace' column stays on the table.
tables['players_teams'] = with_pace.drop(columns=['lg_pace'])

# avg_pie: for every team and year, the mean previous-year PIE of the five
# players with the most minutes in that year.
previous_pie = tables['players_teams'][['year', 'playerID', 'pie']].copy()
# Moving the year forward by one lets a row of year Y match the value of Y-1.
previous_pie['year'] = previous_pie['year'] + 1
with_previous = tables['players_teams'].merge(
    previous_pie, on=['year', 'playerID'], how='left', suffixes=('', '_prev')
)

by_minutes = with_previous[['year', 'playerID', 'tmID', 'pie_prev', 'minutes']].sort_values(
    by=['year', 'tmID', 'minutes'], ascending=[True, True, False]
)
top_five = by_minutes.groupby(['year', 'tmID']).head(5)

team_avg_pie = (
    top_five.groupby(['year', 'tmID'])['pie_prev']
    .mean()
    .reset_index()
    .rename(columns={'pie_prev': 'avg_pie'})
)
# Remaining missing values in the team table are replaced by 0.
tables['teams'] = tables['teams'].merge(team_avg_pie, on=['year', 'tmID'], how='left').fillna(0)

# avg_per: for every team and year, the mean previous-year PER of the five
# players with the most minutes in that year.
previous_per = tables['players_teams'][['year', 'playerID', 'per']].copy()
# Moving the year forward by one lets a row of year Y match the value of Y-1.
previous_per['year'] = previous_per['year'] + 1
with_previous = tables['players_teams'].merge(
    previous_per, on=['year', 'playerID'], how='left', suffixes=('', '_prev')
)

by_minutes = with_previous[['year', 'playerID', 'tmID', 'per_prev', 'minutes']].sort_values(
    by=['year', 'tmID', 'minutes'], ascending=[True, True, False]
)
top_five = by_minutes.groupby(['year', 'tmID']).head(5)

team_avg_per = (
    top_five.groupby(['year', 'tmID'])['per_prev']
    .mean()
    .reset_index()
    .rename(columns={'per_prev': 'avg_per'})
)
# Remaining missing values in the team table are replaced by 0.
tables['teams'] = tables['teams'].merge(team_avg_per, on=['year', 'tmID'], how='left').fillna(0)


## 6 - Transformation

In [None]:
# Shift the team statistics by one row inside each team, so that a row keeps
# its own tmID/year/playoff but carries the statistics of the team's previous row.
# Rows are ordered by team and year first, which makes "previous row" the
# previous season recorded for that team.
tables['teams'] = tables['teams'].sort_values(by=['tmID', 'year','playoff'])
# Columns to shift: everything except the keys and the 'playoff' column.
attributes = tables['teams'].drop(columns=['tmID', 'year','playoff']).columns
# Snapshot taken BEFORE the shift: per team, the last value of every column in
# the sort order above (GroupBy.last skips nulls per column by default, so this
# need not be one physical row). It is re-labelled as the following year and
# appended below, so its statistics stay unshifted and its 'playoff' value is
# the one from the snapshot.
last_year_team = tables['teams'].groupby('tmID').last().reset_index()
last_year_lines = last_year_team.copy()
last_year_lines['year'] += 1
# Per-team shift by one row; the first row of each team has no predecessor and is filled with 0.
tables['teams'][attributes] = tables['teams'].groupby('tmID')[attributes].shift(fill_value=0)
tables['teams'] = tables['teams'].sort_values(by=['tmID', 'year'])
# Drop each team's first row (the one that only holds the 0 fill values).
first_year_index = tables['teams'].groupby('tmID').head(1).index
tables['teams'] = tables['teams'].drop(first_year_index).reset_index(drop=True)
# Append the "following year" rows built from the snapshot and restore the order.
tables['teams'] = pd.concat([tables['teams'], last_year_lines], ignore_index=True)
tables['teams'] = tables['teams'].sort_values(by=['tmID', 'year']).reset_index(drop=True)

# Disabled: the equivalent per-player shift and the top-5-by-minutes merge with teams.
# shift for statistics from one year to be associated with those of the following year
# tables['players_teams'] = tables['players_teams'].sort_values(by=['playerID', 'year'])
# attributes = tables['players_teams'].drop(columns=['playerID', 'year', 'tmID']).columns
# tables['players_teams'][attributes] = tables['players_teams'].groupby('playerID')[attributes].shift(fill_value=0)
# first_year_index = tables['players_teams'].groupby('playerID')['year'].idxmin()
# tables['players_teams'] = tables['players_teams'].drop(first_year_index).reset_index(drop=True)
# tables['players_teams'] = tables['players_teams'].sort_values(by=['playerID', 'year']).reset_index(drop=True)
# tables['players_teams'] = tables['players_teams'].apply(lambda x: x.fillna(0) if x.dtype in ['float64', 'int64'] else x)
# 
# # merge teams.csv with players_teams.csv
# tables['players_teams'] = pd.merge(tables['players_teams'], tables['teams'], on=['year', 'tmID'], how='inner')
# tables['players_teams'] = tables['players_teams'].sort_values(by=['tmID', 'year', 'minutes'], ascending=[True, True, False])
# tables['players_teams'] = tables['players_teams'].groupby(['tmID', 'year']).head(5).reset_index(drop=True)

# Remove rows labelled year 12 - presumably the rows the append step created
# one year past the last season in the data; TODO confirm.
tables['teams'] = tables['teams'][tables['teams']['year'] != 12]
# The pace helper columns are not kept in the saved team table.
tables['teams'] = tables['teams'].drop(columns=['lg_pace','pace'], axis=1)
# Remove the single row for team 'SAC' in year 11.
# NOTE(review): presumably an appended row for a team with no season in year 11 - verify against the data.
tables['teams'] = tables['teams'][~((tables['teams']['year'] == 11) & (tables['teams']['tmID'] == 'SAC'))]

## 7 - Saving

In [None]:
from sklearn.calibration import LabelEncoder



def table_to_csv(name):
    """Write ``tables[name]`` to ``data_prepared/<name>.csv`` without the index.

    The file name is the table name in lower case with spaces replaced by
    underscores. The output directory is created when it does not exist.
    """
    output_dir = 'data_prepared'
    os.makedirs(output_dir, exist_ok=True)
    file_name = name.lower().replace(' ', '_') + '.csv'
    tables[name].to_csv(os.path.join(output_dir, file_name), index=False)


# Save every table that is still registered in `tables`.
for table in tables:
    table_to_csv(table)