___
This notebook decides on the intermediate variables being used

In [1]:
import pandas as pd
import numpy as np
import scipy
from sklearn import *



## Read data
- regularseason detailed results
- ~~cities~~
- teams
- coaches
    - there is a problem if a coach is new, so to prevent this from happening
    - coach will have a proxy variables of
        1. number of years of experience up to that year
        1. number of championship
        1. number of playoffs made

In [2]:
raw_data_regularseason = pd.read_csv("data/DataFiles/RegularSeasonDetailedResults.csv")

In [3]:
raw_data_teams = pd.read_csv("data/DataFiles/Teams.csv")

In [4]:
raw_data_coaches = pd.read_csv("data/DataFiles/TeamCoaches.csv")

In [5]:
raw_data_teams_coaches = (
    raw_data_teams
    .merge(raw_data_coaches, how='left', on=['TeamID'])
)

In [6]:
raw_data_regularseason.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


In [7]:
raw_data_regularseason.dtypes

Season      int64
DayNum      int64
WTeamID     int64
WScore      int64
LTeamID     int64
LScore      int64
WLoc       object
NumOT       int64
WFGM        int64
WFGA        int64
WFGM3       int64
WFGA3       int64
WFTM        int64
WFTA        int64
WOR         int64
WDR         int64
WAst        int64
WTO         int64
WStl        int64
WBlk        int64
WPF         int64
LFGM        int64
LFGA        int64
LFGM3       int64
LFGA3       int64
LFTM        int64
LFTA        int64
LOR         int64
LDR         int64
LAst        int64
LTO         int64
LStl        int64
LBlk        int64
LPF         int64
dtype: object

## Features to be included
- Season year
- winning/losing teamid
- winning/losing score
- winning/losing field goal percentage
- winning/losing field goal 3 point percentage
- winning/losing free throw percentage
- overall win rate

In [8]:
winning_teams_score_up_to_2013 = (
    raw_data_regularseason
    .pipe(lambda x:x.assign(winning_num_counts=1))
    .query("Season <= 2013")
    .groupby(['Season','WTeamID'])
    .agg({"WScore":"sum","WFGM":"sum","WFGA":"sum","WFGM3":"sum","WFGA3":"sum","WFTM":"sum","WFTA":"sum","LScore":"sum","winning_num_counts":"sum"})
    .reset_index()
    .rename(columns={"LScore":"losing_opponent_score"})
)

In [9]:
winning_teams_score_up_to_2013.head()

Unnamed: 0,Season,WTeamID,WScore,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,losing_opponent_score,winning_num_counts
0,2003,1102,825,271,480,120,259,163,249,638,12
1,2003,1103,1141,390,720,71,187,290,402,1019,13
2,2003,1104,1270,439,992,120,354,272,383,1046,17
3,2003,1105,556,179,433,64,157,134,180,465,7
4,2003,1106,888,322,700,76,207,168,270,753,13


In [10]:
losing_teams_score_up_to_2013 = (
    raw_data_regularseason
    .pipe(lambda x:x.assign(losing_num_counts=1))
    .query("Season <= 2013")
    .groupby(['Season','LTeamID'])
    .agg({"WScore":"sum","LScore":"sum","LFGM":"sum","LFGA":"sum","LFGM3":"sum","LFGA3":"sum","LFTM":"sum","LFTA":"sum","losing_num_counts":"sum"})
    .reset_index()
    .rename(columns={"WScore":"winning_opponent_score"})
)

In [11]:
losing_teams_score_up_to_2013.head()

Unnamed: 0,Season,LTeamID,winning_opponent_score,LScore,LFGM,LFGA,LFGM3,LFGA3,LFTM,LFTA,losing_num_counts
0,2003,1102,958,778,265,634,99,324,149,230,16
1,2003,1103,1091,986,343,788,76,247,224,296,14
2,2003,1104,774,670,234,609,58,202,144,203,11
3,2003,1105,1528,1310,455,1169,133,383,267,388,19
4,2003,1106,1032,893,334,848,95,287,130,191,15


In [12]:
combine_winning_losing_stats_for_year = (
    winning_teams_score_up_to_2013
    .merge(losing_teams_score_up_to_2013, how='left',left_on=['Season','WTeamID'],right_on=['Season','LTeamID'])
    .pipe(lambda x:x.assign(total_score = x.WScore + x.LScore))
    .pipe(lambda x:x.assign(total_opponent_score = x.winning_opponent_score + x.losing_opponent_score))
    .pipe(lambda x:x.assign(total_fgm = x.WFGM + x.LFGM))
    .pipe(lambda x:x.assign(total_fga = x.WFGA + x.LFGA))
    .pipe(lambda x:x.assign(total_fg3m = x.WFGM3 + x.LFGM3))
    .pipe(lambda x:x.assign(total_fg3a = x.WFGA3 + x.LFGA3))
    .pipe(lambda x:x.assign(total_ftm = x.WFTM + x.LFTM))
    .pipe(lambda x:x.assign(total_fta = x.WFTA + x.LFTA))
    .pipe(lambda x:x.assign(win_rate = x.winning_num_counts/(x.winning_num_counts + x.losing_num_counts)))
    .sort_values(['WTeamID','Season'])
)

In [13]:
combine_winning_losing_stats_for_year.head()
combine_winning_losing_stats_for_year.dtypes

Season                      int64
WTeamID                     int64
WScore                      int64
WFGM                        int64
WFGA                        int64
WFGM3                       int64
WFGA3                       int64
WFTM                        int64
WFTA                        int64
losing_opponent_score       int64
winning_num_counts          int64
LTeamID                     int64
winning_opponent_score      int64
LScore                      int64
LFGM                        int64
LFGA                        int64
LFGM3                       int64
LFGA3                       int64
LFTM                        int64
LFTA                        int64
losing_num_counts           int64
total_score                 int64
total_opponent_score        int64
total_fgm                   int64
total_fga                   int64
total_fg3m                  int64
total_fg3a                  int64
total_ftm                   int64
total_fta                   int64
win_rate      

In [14]:
cumulative_stats_for_team_each_year = (
    combine_winning_losing_stats_for_year
    .sort_values(['WTeamID','Season'])
    .groupby(['WTeamID'])
    .cumsum()
    .pipe(lambda x:x.assign(Season = combine_winning_losing_stats_for_year.Season.values))
    .pipe(lambda x:x.assign(TeamID = combine_winning_losing_stats_for_year.WTeamID.values))
    .drop(['LTeamID','win_rate'],1)
    .pipe(lambda x:x.assign(win_rate = x.winning_num_counts/(x.winning_num_counts + x.losing_num_counts)))
    .pipe(lambda x:x.assign(WFGP = x.WFGM/x.WFGA))
    .pipe(lambda x:x.assign(WFG3P = x.WFGM3/x.WFGA3))
    .pipe(lambda x:x.assign(WFTP = x.WFTM/x.WFTA))
    .pipe(lambda x:x.assign(LFGP = x.LFGM/x.LFGA))
    .pipe(lambda x:x.assign(LFG3P = x.LFGM3/x.LFGA3))
    .pipe(lambda x:x.assign(LFTP = x.LFTM/x.LFTA))
    .pipe(lambda x:x.assign(fgp = x.total_fgm/x.total_fga))
    .pipe(lambda x:x.assign(fg3p = x.total_fg3m/x.total_fg3a))
    .pipe(lambda x:x.assign(ftp = x.total_ftm/x.total_fta))
)

In [15]:
cumulative_stats_for_team_each_year.head()

Unnamed: 0,Season,WScore,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,losing_opponent_score,winning_num_counts,...,win_rate,WFGP,WFG3P,WFTP,LFGP,LFG3P,LFTP,fgp,fg3p,ftp
0,2003,825,271,480,120,259,163,249,638,12,...,0.428571,0.564583,0.46332,0.654618,0.417981,0.305556,0.647826,0.481149,0.375643,0.651357
327,2004,2229,737,1393,312,734,443,636,1709,34,...,0.607143,0.529074,0.425068,0.696541,0.408686,0.305732,0.639576,0.481886,0.378423,0.678999
653,2005,3326,1115,2180,458,1110,638,902,2533,51,...,0.6,0.511468,0.412613,0.707317,0.408027,0.318804,0.661616,0.469388,0.373236,0.693374
982,2006,4756,1604,3171,659,1588,889,1234,3676,73,...,0.646018,0.505834,0.414987,0.720421,0.412921,0.317597,0.669456,0.47243,0.378968,0.706192
1316,2007,6347,2135,4205,870,2061,1207,1652,4844,95,...,0.664336,0.507729,0.422125,0.73063,0.412256,0.315093,0.688119,0.475389,0.384158,0.719221


In [16]:
cumulative_stats_for_team_each_year.dtypes

Season                      int64
WScore                      int64
WFGM                        int64
WFGA                        int64
WFGM3                       int64
WFGA3                       int64
WFTM                        int64
WFTA                        int64
losing_opponent_score       int64
winning_num_counts          int64
winning_opponent_score      int64
LScore                      int64
LFGM                        int64
LFGA                        int64
LFGM3                       int64
LFGA3                       int64
LFTM                        int64
LFTA                        int64
losing_num_counts           int64
total_score                 int64
total_opponent_score        int64
total_fgm                   int64
total_fga                   int64
total_fg3m                  int64
total_fg3a                  int64
total_ftm                   int64
total_fta                   int64
TeamID                      int64
win_rate                  float64
WFGP          

## Some variations to try for features
- separate winning and losing
    - reconcilation of winning and losing will have to be done later
    - could be diff between percentage --> this might give an insight of when they are losing/winning?

## Intermediate Variables
- Coach stats
    - number of years till that season
    - number of championship till that season
    - number of playoffs made till that season
    - win rate of total games till that season
        - consider regular or playoff only?
- ~~win rate for home court~~
- ~~win rate for away court~~
- ~~win rate for neutral court~~
- offensive stats
    - offensive rebounds
    - points scored
    - might try play by play later?
- defensive stats
    - defensive rebounds
    - points scored by opponents
    - turn over from play by play???
    - might try play by play later?
- blocks, steals and personal fouls
- ~~expectation to win by how many points in a game~~
- 


#### reconcilation of intermediate variables
- relative scoring method
     - will have a score of between 0 to 1


#### features being throw into prediction model
- test out raw intermediate variables
    - then test out difference in values
    - or something else

In [17]:
#win rate for home court
#need to ensure that the joining is from a bigger table
raw_data_regularseason.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


In [18]:
win_test = (
    raw_data_regularseason
    .groupby(['Season','WTeamID','WLoc'])
    .count()
    .reset_index()
    [['Season','WTeamID','WLoc','DayNum']]
)

In [19]:
lose_test = (
    raw_data_regularseason
    .groupby(['Season','LTeamID','WLoc'])
    .count()
    .reset_index()
    [['Season','LTeamID','WLoc','DayNum']]
)

In [20]:
win_test.head()

Unnamed: 0,Season,WTeamID,WLoc,DayNum
0,2003,1102,A,3
1,2003,1102,H,9
2,2003,1103,A,4
3,2003,1103,H,9
4,2003,1104,A,1


In [21]:
lose_test.head()

Unnamed: 0,Season,LTeamID,WLoc,DayNum
0,2003,1102,A,4
1,2003,1102,H,10
2,2003,1102,N,2
3,2003,1103,A,5
4,2003,1103,H,9


In [22]:
test = (
    lose_test
    .drop(['DayNum'],1)
    .append(win_test.rename(columns={"WTeamID":"LTeamID"}).drop(['DayNum'],1))
    .groupby(['Season','LTeamID','WLoc'])
    .count()
    .reset_index()
)

win_rate_type_of_court = (
    test
    .merge(win_test,how='left',left_on=['Season','LTeamID','WLoc'], right_on=['Season','WTeamID','WLoc'])
    .merge(lose_test,how='left',left_on=['Season','LTeamID','WLoc'],right_on=['Season','LTeamID','WLoc'])
    .fillna(0)
    .rename(columns={"LTeamID":"TeamID","DayNum_x":"games_won","DayNum_y":"games_lost"})
    .drop(['WTeamID'],1)
    .pipe(lambda x:x.assign(win_rate = x.games_won/(x.games_won + x.games_lost)))
)


win_rate_type_of_court.head()

Unnamed: 0,Season,TeamID,WLoc,games_won,games_lost,win_rate
0,2003,1102,A,3.0,4.0,0.428571
1,2003,1102,H,9.0,10.0,0.473684
2,2003,1102,N,0.0,2.0,0.0
3,2003,1103,A,4.0,5.0,0.444444
4,2003,1103,H,9.0,9.0,0.5


In [23]:
win_rate_away = (
    win_rate_type_of_court
    .query("WLoc == 'A'")
    .rename(columns={"win_rate":"win_rate_away"})
    [['Season','TeamID','win_rate_away']]
)

win_rate_neutral = (
    win_rate_type_of_court
    .query("WLoc == 'N'")
    .rename(columns={"win_rate":"win_rate_neutral"})
    [['Season','TeamID','win_rate_neutral']]
)

win_rate_home = (
    win_rate_type_of_court
    .query("WLoc == 'H'")
    .rename(columns={"win_rate":"win_rate_home"})
    [['Season','TeamID','win_rate_home']]
)

more_testing = win_rate_type_of_court.sort_values(['TeamID','Season']).query("WLoc=='A'").head().groupby(['TeamID']).cumsum()

whatever = win_rate_away.sort_values(['TeamID','Season']).head()

more_testing.pipe(lambda x:x.assign(TeamID = whatever.TeamID.values))

Unnamed: 0,Season,games_won,games_lost,win_rate,TeamID
10783,2014,0.0,5.0,0.0,1101
11801,4029,1.0,11.0,0.142857,1101
12821,6045,3.0,16.0,0.428571,1101
13839,8062,7.0,21.0,0.873016,1101
0,2003,3.0,4.0,0.428571,1102


In [24]:
# combine back with cumulative table
cumulative_stats_for_team_each_year.head()

Unnamed: 0,Season,WScore,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,losing_opponent_score,winning_num_counts,...,win_rate,WFGP,WFG3P,WFTP,LFGP,LFG3P,LFTP,fgp,fg3p,ftp
0,2003,825,271,480,120,259,163,249,638,12,...,0.428571,0.564583,0.46332,0.654618,0.417981,0.305556,0.647826,0.481149,0.375643,0.651357
327,2004,2229,737,1393,312,734,443,636,1709,34,...,0.607143,0.529074,0.425068,0.696541,0.408686,0.305732,0.639576,0.481886,0.378423,0.678999
653,2005,3326,1115,2180,458,1110,638,902,2533,51,...,0.6,0.511468,0.412613,0.707317,0.408027,0.318804,0.661616,0.469388,0.373236,0.693374
982,2006,4756,1604,3171,659,1588,889,1234,3676,73,...,0.646018,0.505834,0.414987,0.720421,0.412921,0.317597,0.669456,0.47243,0.378968,0.706192
1316,2007,6347,2135,4205,870,2061,1207,1652,4844,95,...,0.664336,0.507729,0.422125,0.73063,0.412256,0.315093,0.688119,0.475389,0.384158,0.719221


In [25]:
intermediate_combine_stats_for_team_each_year = (
    cumulative_stats_for_team_each_year
    .merge(win_rate_away,how='left',on=['Season','TeamID'])
    .merge(win_rate_home,how='left',on=['Season','TeamID'])
    .merge(win_rate_neutral,how='left',on=['Season','TeamID'])
)

intermediate_combine_stats_for_team_each_year.head()

Unnamed: 0,Season,WScore,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,losing_opponent_score,winning_num_counts,...,WFTP,LFGP,LFG3P,LFTP,fgp,fg3p,ftp,win_rate_away,win_rate_home,win_rate_neutral
0,2003,825,271,480,120,259,163,249,638,12,...,0.654618,0.417981,0.305556,0.647826,0.481149,0.375643,0.651357,0.428571,0.473684,0.0
1,2004,2229,737,1393,312,734,443,636,1709,34,...,0.696541,0.408686,0.305732,0.639576,0.481886,0.378423,0.678999,1.0,0.722222,0.5
2,2005,3326,1115,2180,458,1110,638,902,2533,51,...,0.707317,0.408027,0.318804,0.661616,0.469388,0.373236,0.693374,0.8,0.55,0.5
3,2006,4756,1604,3171,659,1588,889,1234,3676,73,...,0.720421,0.412921,0.317597,0.669456,0.47243,0.378968,0.706192,1.0,0.736842,0.666667
4,2007,6347,2135,4205,870,2061,1207,1652,4844,95,...,0.73063,0.412256,0.315093,0.688119,0.475389,0.384158,0.719221,0.857143,0.722222,0.6


## offensive stats

In [26]:
# scored 
# offensive rebounds
# percentage of offensive rebounds to total rebounds
# offensive rebounding percentage, field goal missed
# defensive rebounds

In [27]:
# block % from opponent field goal attempted
# assist / turnover ratio
# assist per fgm

# win by how many points
# lose by how many points

In [28]:
# normalization on variables

In [29]:
raw_data_regularseason.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


In [30]:
#win and lose by how many points

In [31]:
combine_winning_losing_stats_for_year.dtypes

Season                      int64
WTeamID                     int64
WScore                      int64
WFGM                        int64
WFGA                        int64
WFGM3                       int64
WFGA3                       int64
WFTM                        int64
WFTA                        int64
losing_opponent_score       int64
winning_num_counts          int64
LTeamID                     int64
winning_opponent_score      int64
LScore                      int64
LFGM                        int64
LFGA                        int64
LFGM3                       int64
LFGA3                       int64
LFTM                        int64
LFTA                        int64
losing_num_counts           int64
total_score                 int64
total_opponent_score        int64
total_fgm                   int64
total_fga                   int64
total_fg3m                  int64
total_fg3a                  int64
total_ftm                   int64
total_fta                   int64
win_rate      

In [32]:
win_rate_df = (
    combine_winning_losing_stats_for_year
    [['Season','WTeamID','winning_num_counts','losing_num_counts','WScore','losing_opponent_score','LScore','winning_opponent_score']]
    .pipe(lambda x:x.assign(win_rate = x.winning_num_counts/(x.winning_num_counts + x.losing_num_counts)))
    .pipe(lambda x:x.assign(lose_rate = 1-x.win_rate))
    .pipe(lambda x:x.assign(win_score_by = x.WScore - x.losing_opponent_score))
    .pipe(lambda x:x.assign(lose_score_by = x.LScore - x.winning_opponent_score))
    .pipe(lambda x:x.assign(expectation_per_game = x.win_rate * x.win_score_by/x.winning_num_counts + x.lose_rate * x.lose_score_by/x.losing_num_counts))
    .pipe(lambda x:x.assign(avg_win_score_by = x.win_score_by/x.winning_num_counts))
    .pipe(lambda x:x.assign(avg_lose_score_by = x.lose_score_by/x.losing_num_counts))
    .rename(columns={"WTeamID":"TeamID"})
)

win_rate_df.head()

Unnamed: 0,Season,TeamID,winning_num_counts,losing_num_counts,WScore,losing_opponent_score,LScore,winning_opponent_score,win_rate,lose_rate,win_score_by,lose_score_by,expectation_per_game,avg_win_score_by,avg_lose_score_by
0,2003,1102,12,16,825,638,778,958,0.428571,0.571429,187,-180,0.25,15.583333,-11.25
327,2004,1102,22,6,1404,1071,281,341,0.785714,0.214286,333,-60,9.75,15.136364,-10.0
653,2005,1102,17,12,1097,824,679,775,0.586207,0.413793,273,-96,6.103448,16.058824,-8.0
982,2006,1102,22,6,1430,1143,348,385,0.785714,0.214286,287,-37,8.928571,13.045455,-6.166667
1316,2007,1102,22,8,1591,1168,464,539,0.733333,0.266667,423,-75,11.6,19.227273,-9.375


In [33]:
win_rate_cum_df = (
    cumulative_stats_for_team_each_year
    [['Season','TeamID','winning_num_counts','losing_num_counts','WScore','losing_opponent_score','LScore','winning_opponent_score']]
    .pipe(lambda x:x.assign(win_rate = x.winning_num_counts/(x.winning_num_counts + x.losing_num_counts)))
    .pipe(lambda x:x.assign(lose_rate = 1-x.win_rate))
    .pipe(lambda x:x.assign(win_score_by = x.WScore - x.losing_opponent_score))
    .pipe(lambda x:x.assign(lose_score_by = x.LScore - x.winning_opponent_score))
    .pipe(lambda x:x.assign(expectation_per_game = x.win_rate * x.win_score_by/x.winning_num_counts + x.lose_rate * x.lose_score_by/x.losing_num_counts))
    .pipe(lambda x:x.assign(avg_win_score_by = x.win_score_by/x.winning_num_counts))
    .pipe(lambda x:x.assign(avg_lose_score_by = x.lose_score_by/x.losing_num_counts))
)

win_rate_cum_df.head()

Unnamed: 0,Season,TeamID,winning_num_counts,losing_num_counts,WScore,losing_opponent_score,LScore,winning_opponent_score,win_rate,lose_rate,win_score_by,lose_score_by,expectation_per_game,avg_win_score_by,avg_lose_score_by
0,2003,1102,12,16,825,638,778,958,0.428571,0.571429,187,-180,0.25,15.583333,-11.25
327,2004,1102,34,22,2229,1709,1059,1299,0.607143,0.392857,520,-240,5.0,15.294118,-10.909091
653,2005,1102,51,34,3326,2533,1738,2074,0.6,0.4,793,-336,5.376471,15.54902,-9.882353
982,2006,1102,73,40,4756,3676,2086,2459,0.646018,0.353982,1080,-373,6.256637,14.794521,-9.325
1316,2007,1102,95,48,6347,4844,2550,2998,0.664336,0.335664,1503,-448,7.377622,15.821053,-9.333333


In [34]:
# rebounds
raw_data_regularseason.dtypes

Season      int64
DayNum      int64
WTeamID     int64
WScore      int64
LTeamID     int64
LScore      int64
WLoc       object
NumOT       int64
WFGM        int64
WFGA        int64
WFGM3       int64
WFGA3       int64
WFTM        int64
WFTA        int64
WOR         int64
WDR         int64
WAst        int64
WTO         int64
WStl        int64
WBlk        int64
WPF         int64
LFGM        int64
LFGA        int64
LFGM3       int64
LFGA3       int64
LFTM        int64
LFTA        int64
LOR         int64
LDR         int64
LAst        int64
LTO         int64
LStl        int64
LBlk        int64
LPF         int64
dtype: object

In [35]:
rebounds_winning_teams_score_up_to_2013 = (
    raw_data_regularseason
    .pipe(lambda x:x.assign(winning_num_counts=1))
    .query("Season <= 2013")
    .groupby(['Season','WTeamID'])
    .agg({"WOR":"sum","WDR":"sum","WFGA":"sum","WFGM":"sum","LFGM":"sum","LFGA":"sum"})
    .reset_index()
    .pipe(lambda x:x.assign(total_winning_rebounds = x.WOR + x.WDR))
    .pipe(lambda x:x.assign(winning_off_rebounds_percent = x.WOR/x.total_winning_rebounds))
    .pipe(lambda x:x.assign(winning_def_rebounds_percent = x.WDR/x.total_winning_rebounds))
    .pipe(lambda x:x.assign(team_missed_attempts = x.WFGA - x.WFGM))
    .pipe(lambda x:x.assign(opp_team_missed_attempts = x.LFGA - x.LFGM))
    .pipe(lambda x:x.assign(winning_rebound_possession_percent = x.WOR/x.team_missed_attempts))
    .pipe(lambda x:x.assign(winning_rebound_possessiongain_percent = x.WDR/x.opp_team_missed_attempts))
)

In [36]:
rebounds_winning_teams_score_up_to_2013.head()

Unnamed: 0,Season,WTeamID,WOR,WDR,WFGA,WFGM,LFGM,LFGA,total_winning_rebounds,winning_off_rebounds_percent,winning_def_rebounds_percent,team_missed_attempts,opp_team_missed_attempts,winning_rebound_possession_percent,winning_rebound_possessiongain_percent
0,2003,1102,46,232,480,271,228,560,278,0.165468,0.834532,209,332,0.220096,0.698795
1,2003,1103,122,279,720,390,358,780,401,0.304239,0.695761,330,422,0.369697,0.661137
2,2003,1104,230,449,992,439,376,978,679,0.338733,0.661267,553,602,0.415913,0.745847
3,2003,1105,102,181,433,179,161,403,283,0.360424,0.639576,254,242,0.401575,0.747934
4,2003,1106,166,364,700,322,244,702,530,0.313208,0.686792,378,458,0.439153,0.79476


In [37]:
rebounds_losing_teams_score_up_to_2013 = (
    raw_data_regularseason
    .pipe(lambda x:x.assign(losing_num_counts=1))
    .query("Season <= 2013")
    .groupby(['Season','LTeamID'])
    .agg({"LOR":"sum","LDR":"sum","LFGM":"sum","LFGA":"sum","WFGA":"sum","WFGM":"sum"})
    .reset_index()
    .pipe(lambda x:x.assign(total_losing_rebounds = x.LOR + x.LDR))
    .pipe(lambda x:x.assign(losing_off_rebounds_percent = x.LOR/x.total_losing_rebounds))
    .pipe(lambda x:x.assign(losing_def_rebounds_percent = x.LDR/x.total_losing_rebounds))
    .pipe(lambda x:x.assign(losing_team_missed_attempts = x.LFGA - x.LFGM))
    .pipe(lambda x:x.assign(winning_opp_team_missed_attempts = x.WFGA - x.WFGM))
    .pipe(lambda x:x.assign(losing_rebound_possession_percent = x.LOR/x.losing_team_missed_attempts))
    .pipe(lambda x:x.assign(losing_rebound_possessiongain_percent = x.LDR/x.winning_opp_team_missed_attempts))
)

rebounds_losing_teams_score_up_to_2013.head()

Unnamed: 0,Season,LTeamID,LOR,LDR,LFGM,LFGA,WFGA,WFGM,total_losing_rebounds,losing_off_rebounds_percent,losing_def_rebounds_percent,losing_team_missed_attempts,winning_opp_team_missed_attempts,losing_rebound_possession_percent,losing_rebound_possessiongain_percent
0,2003,1102,71,239,265,634,628,312,310,0.229032,0.770968,369,316,0.192412,0.756329
1,2003,1103,142,259,343,788,759,392,401,0.354115,0.645885,445,367,0.319101,0.705722
2,2003,1104,150,221,234,609,576,275,371,0.404313,0.595687,375,301,0.4,0.734219
3,2003,1105,249,420,455,1169,1130,541,669,0.372197,0.627803,714,589,0.348739,0.713073
4,2003,1106,178,304,334,848,793,364,482,0.369295,0.630705,514,429,0.346304,0.708625


In [38]:
combine_winning_losing_rebounds_stats_for_year = (
    rebounds_winning_teams_score_up_to_2013
    .merge(rebounds_losing_teams_score_up_to_2013, how='left',left_on=['Season','WTeamID'],right_on=['Season','LTeamID'])
    .pipe(lambda x:x.assign(total_rebounds = x.total_winning_rebounds + x.total_losing_rebounds))
    .pipe(lambda x:x.assign(total_off_rebounds = x.WOR + x.LOR))
    .pipe(lambda x:x.assign(total_def_rebounds = x.WDR + x.LDR))
    .pipe(lambda x:x.assign(total_off_rebounds_percent = x.total_off_rebounds/x.total_rebounds))
    .pipe(lambda x:x.assign(total_def_rebounds_percent = x.total_def_rebounds/x.total_rebounds))
    .pipe(lambda x:x.assign(total_team_missed_attempts = x.team_missed_attempts + x.losing_team_missed_attempts))
    .pipe(lambda x:x.assign(total_opp_team_missed_attempts = x.opp_team_missed_attempts + x.winning_opp_team_missed_attempts))
    .pipe(lambda x:x.assign(total_rebound_possession_percent = x.total_off_rebounds/x.total_team_missed_attempts))
    .pipe(lambda x:x.assign(total_rebound_possessiongain_percent = x.total_def_rebounds/x.total_opp_team_missed_attempts))
    .rename(columns={"WTeamID":"TeamID"})
    [['Season','TeamID','total_rebounds','total_off_rebounds','total_def_rebounds','total_def_rebounds_percent',
      'total_off_rebounds_percent','total_rebound_possession_percent','total_rebound_possessiongain_percent',
      'total_team_missed_attempts','total_opp_team_missed_attempts']]
)

In [39]:
combine_winning_losing_rebounds_stats_for_year.head()

Unnamed: 0,Season,TeamID,total_rebounds,total_off_rebounds,total_def_rebounds,total_def_rebounds_percent,total_off_rebounds_percent,total_rebound_possession_percent,total_rebound_possessiongain_percent,total_team_missed_attempts,total_opp_team_missed_attempts
0,2003,1102,588,117,471,0.80102,0.19898,0.202422,0.726852,578,648
1,2003,1103,802,264,538,0.670823,0.329177,0.340645,0.681876,775,789
2,2003,1104,1050,380,670,0.638095,0.361905,0.409483,0.741971,928,903
3,2003,1105,952,351,601,0.631303,0.368697,0.362603,0.723225,968,831
4,2003,1106,1012,344,668,0.660079,0.339921,0.38565,0.7531,892,887


In [40]:
cumulative_winning_losing_rebounds_stats = (
    combine_winning_losing_rebounds_stats_for_year
    .sort_values(['TeamID','Season'])
    .groupby(['TeamID'])
    .cumsum()
    .pipe(lambda x:x.assign(total_def_rebounds_percent = x.total_def_rebounds/x.total_rebounds))
    .pipe(lambda x:x.assign(total_off_rebounds_percent = x.total_off_rebounds/x.total_rebounds))
    .pipe(lambda x:x.assign(total_rebound_possession_percent = x.total_off_rebounds/x.total_team_missed_attempts))
    .pipe(lambda x:x.assign(total_rebound_possessiongain_percent = x.total_def_rebounds/x.total_opp_team_missed_attempts))
    .pipe(lambda x:x.assign(Season = combine_winning_losing_stats_for_year.Season.values))
    .pipe(lambda x:x.assign(TeamID = combine_winning_losing_stats_for_year.WTeamID.values))
)

In [41]:
# blocks, steals, assists


In [42]:
raw_data_regularseason.dtypes

Season      int64
DayNum      int64
WTeamID     int64
WScore      int64
LTeamID     int64
LScore      int64
WLoc       object
NumOT       int64
WFGM        int64
WFGA        int64
WFGM3       int64
WFGA3       int64
WFTM        int64
WFTA        int64
WOR         int64
WDR         int64
WAst        int64
WTO         int64
WStl        int64
WBlk        int64
WPF         int64
LFGM        int64
LFGA        int64
LFGM3       int64
LFGA3       int64
LFTM        int64
LFTA        int64
LOR         int64
LDR         int64
LAst        int64
LTO         int64
LStl        int64
LBlk        int64
LPF         int64
dtype: object

In [43]:
bl_sl_topf_winning_teams_score_up_to_2013 = (
    raw_data_regularseason
    .pipe(lambda x:x.assign(winning_num_counts=1))
    .query("Season <= 2013")
    .groupby(['Season','WTeamID'])
    .agg({"WAst":"sum","WTO":"sum","WStl":"sum","WBlk":"sum","WPF":"sum","LFGA":"sum","WFGM":"sum"})
    .reset_index()
    .pipe(lambda x:x.assign(winning_block_opp_FGA_percent = x.WBlk/x.LFGA))
    .pipe(lambda x:x.assign(winning_assist_per_fgm = x.WAst/x.WFGM))
    .pipe(lambda x:x.assign(winning_assist_turnover_ratio = x.WAst/x.WTO))
)

bl_sl_topf_winning_teams_score_up_to_2013.head()

Unnamed: 0,Season,WTeamID,WAst,WTO,WStl,WBlk,WPF,LFGA,WFGM,winning_block_opp_FGA_percent,winning_assist_per_fgm,winning_assist_turnover_ratio
0,2003,1102,203,133,88,34,193,560,271,0.060714,0.749077,1.526316
1,2003,1103,230,163,95,24,266,780,390,0.030769,0.589744,1.411043
2,2003,1104,238,222,123,71,280,978,439,0.072597,0.542141,1.072072
3,2003,1105,111,126,79,14,136,403,179,0.034739,0.620112,0.880952
4,2003,1106,169,230,122,49,239,702,322,0.069801,0.524845,0.734783


In [44]:
bl_sl_topf_losing_teams_score_up_to_2013 = (
    raw_data_regularseason
    .pipe(lambda x:x.assign(losing_num_counts=1))
    .query("Season <= 2013")
    .groupby(['Season','LTeamID'])
    .agg({"LAst":"sum","LTO":"sum","LStl":"sum","LBlk":"sum","LPF":"sum","WFGA":"sum","LFGM":"sum"})
    .reset_index()
    .pipe(lambda x:x.assign(losing_block_opp_FGA_percent = x.LBlk/x.WFGA))
    .pipe(lambda x:x.assign(losing_assist_per_fgm = x.LAst/x.LFGM))
    .pipe(lambda x:x.assign(losing_assist_turnover_ratio = x.LAst/x.LTO))
)

bl_sl_topf_losing_teams_score_up_to_2013.head()

Unnamed: 0,Season,LTeamID,LAst,LTO,LStl,LBlk,LPF,WFGA,LFGM,losing_block_opp_FGA_percent,losing_assist_per_fgm,losing_assist_turnover_ratio
0,2003,1102,161,187,79,16,332,628,265,0.025478,0.607547,0.860963
1,2003,1103,181,178,101,39,270,759,343,0.051383,0.527697,1.016854
2,2003,1104,101,150,62,35,225,576,234,0.060764,0.431624,0.673333
3,2003,1105,267,359,163,40,390,1130,455,0.035398,0.586813,0.743733
4,2003,1106,158,247,112,39,270,793,334,0.04918,0.473054,0.639676


In [45]:
combine_winning_losing_other_stats_for_year = (
    bl_sl_topf_winning_teams_score_up_to_2013
    .merge(bl_sl_topf_losing_teams_score_up_to_2013, how='left',left_on=['Season','WTeamID'],right_on=['Season','LTeamID'])
    .pipe(lambda x:x.assign(total_blocks = x.WBlk + x.LBlk))
    .pipe(lambda x:x.assign(total_assists = x.WAst + x.LAst))
    .pipe(lambda x:x.assign(total_steals = x.WStl + x.LStl))
    .pipe(lambda x:x.assign(total_turnover = x.WTO + x.LTO))
    .pipe(lambda x:x.assign(total_personalfoul = x.WPF + x.LPF))
    .pipe(lambda x:x.assign(total_opp_fga = x.LFGA + x.WFGA))
    .pipe(lambda x:x.assign(total_fgm = x.WFGM + x.LFGM))
    .pipe(lambda x:x.assign(total_block_opp_FGA_percent = x.total_blocks/x.total_opp_fga))
    .pipe(lambda x:x.assign(total_assist_per_fgm = x.total_assists/x.total_fgm))
    .pipe(lambda x:x.assign(total_assist_turnover_ratio = x.total_assists/x.total_turnover))
    .rename(columns={"WTeamID":"TeamID"})
    [['Season','TeamID','total_blocks','total_assists','total_steals','total_turnover','total_personalfoul','total_block_opp_FGA_percent','total_assist_per_fgm','total_assist_turnover_ratio','total_opp_fga','total_fgm']]
)

In [46]:
combine_winning_losing_other_stats_for_year.head()

Unnamed: 0,Season,TeamID,total_blocks,total_assists,total_steals,total_turnover,total_personalfoul,total_block_opp_FGA_percent,total_assist_per_fgm,total_assist_turnover_ratio,total_opp_fga,total_fgm
0,2003,1102,50,364,167,320,525,0.042088,0.679104,1.1375,1188,536
1,2003,1103,63,411,196,341,536,0.040936,0.560709,1.205279,1539,733
2,2003,1104,106,339,185,372,505,0.068211,0.503715,0.91129,1554,673
3,2003,1105,54,378,242,485,526,0.035225,0.596215,0.779381,1533,634
4,2003,1106,88,327,234,477,509,0.058863,0.498476,0.685535,1495,656


In [47]:
combine_winning_losing_other_stats_for_year.dtypes

Season                           int64
TeamID                           int64
total_blocks                     int64
total_assists                    int64
total_steals                     int64
total_turnover                   int64
total_personalfoul               int64
total_block_opp_FGA_percent    float64
total_assist_per_fgm           float64
total_assist_turnover_ratio    float64
total_opp_fga                    int64
total_fgm                        int64
dtype: object

In [48]:
cumulative_winning_losing_rebounds_stats = (
    combine_winning_losing_other_stats_for_year
    .sort_values(['TeamID','Season'])
    .groupby(['TeamID'])
    .cumsum()
    .pipe(lambda x:x.assign(total_block_opp_FGA_percent = x.total_blocks/x.total_opp_fga))
    .pipe(lambda x:x.assign(total_assist_per_fgm = x.total_assists/x.total_fgm))
    .pipe(lambda x:x.assign(total_assist_turnover_ratio = x.total_assists/x.total_turnover))
    .pipe(lambda x:x.assign(Season = combine_winning_losing_stats_for_year.Season.values))
    .pipe(lambda x:x.assign(TeamID = combine_winning_losing_stats_for_year.WTeamID.values))
)

In [49]:
cumulative_winning_losing_rebounds_stats.head()

Unnamed: 0,Season,total_blocks,total_assists,total_steals,total_turnover,total_personalfoul,total_block_opp_FGA_percent,total_assist_per_fgm,total_assist_turnover_ratio,total_opp_fga,total_fgm,TeamID
0,2003,50,364,167,320,525,0.042088,0.679104,1.1375,1188,536,1102
327,2004,116,735,385,622,991,0.050043,0.665761,1.181672,2318,1104,1102
653,2005,166,1131,640,914,1469,0.047715,0.655652,1.237418,3479,1725,1102
982,2006,225,1528,864,1220,1900,0.047249,0.653271,1.252459,4762,2339,1102
1316,2007,269,1979,1052,1525,2355,0.043255,0.654648,1.297705,6219,3023,1102


In [50]:
#min max standardization
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
minmax_scale = scaler.fit(combine_winning_losing_other_stats_for_year[['total_assists']])
df_minmax = minmax_scale.transform(combine_winning_losing_other_stats_for_year[['total_assists']])

In [51]:
winning_games_up_to_2013 = (
    raw_data_regularseason
    .pipe(lambda x:x.assign(winning_num_counts = 1))
    .query("Season <= 2013")
    .groupby(['Season','WTeamID'])
    .agg({"WScore":"sum","WFGM":"sum","WFGA":"sum","WFGM3":"sum","WFGA3":"sum","WFTM":"sum","WFTA":"sum","LScore":"sum","winning_num_counts":"sum",
          "WOR":"sum","WDR":"sum","LFGM":"sum","LFGA":"sum",
          "WAst":"sum","WTO":"sum","WStl":"sum","WBlk":"sum","WPF":"sum"})
    .reset_index()
    .rename(columns={"LScore":"losing_opponent_score"})
    # rebounds
    .pipe(lambda x:x.assign(total_winning_rebounds = x.WOR + x.WDR))
    .pipe(lambda x:x.assign(winning_off_rebounds_percent = x.WOR/x.total_winning_rebounds))
    .pipe(lambda x:x.assign(winning_def_rebounds_percent = x.WDR/x.total_winning_rebounds))
    .pipe(lambda x:x.assign(team_missed_attempts = x.WFGA - x.WFGM))
    .pipe(lambda x:x.assign(opp_team_missed_attempts = x.LFGA - x.LFGM))
    .pipe(lambda x:x.assign(winning_rebound_possession_percent = x.WOR/x.team_missed_attempts))
    .pipe(lambda x:x.assign(winning_rebound_possessiongain_percent = x.WDR/x.opp_team_missed_attempts))
    # blocks, steals, assists and turnovers
    .pipe(lambda x:x.assign(winning_block_opp_FGA_percent = x.WBlk/x.LFGA))
    .pipe(lambda x:x.assign(winning_assist_per_fgm = x.WAst/x.WFGM))
    .pipe(lambda x:x.assign(winning_assist_turnover_ratio = x.WAst/x.WTO))
    # rename columns to prevent duplication when joining with losing stats. example: WFGM_x
    .rename(columns={"LFGA":"LFGA_opp","LFGM":"LFGM_opp"})
)

In [52]:
winning_games_up_to_2013.head()

Unnamed: 0,Season,WTeamID,WScore,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,losing_opponent_score,...,total_winning_rebounds,winning_off_rebounds_percent,winning_def_rebounds_percent,team_missed_attempts,opp_team_missed_attempts,winning_rebound_possession_percent,winning_rebound_possessiongain_percent,winning_block_opp_FGA_percent,winning_assist_per_fgm,winning_assist_turnover_ratio
0,2003,1102,825,271,480,120,259,163,249,638,...,278,0.165468,0.834532,209,332,0.220096,0.698795,0.060714,0.749077,1.526316
1,2003,1103,1141,390,720,71,187,290,402,1019,...,401,0.304239,0.695761,330,422,0.369697,0.661137,0.030769,0.589744,1.411043
2,2003,1104,1270,439,992,120,354,272,383,1046,...,679,0.338733,0.661267,553,602,0.415913,0.745847,0.072597,0.542141,1.072072
3,2003,1105,556,179,433,64,157,134,180,465,...,283,0.360424,0.639576,254,242,0.401575,0.747934,0.034739,0.620112,0.880952
4,2003,1106,888,322,700,76,207,168,270,753,...,530,0.313208,0.686792,378,458,0.439153,0.79476,0.069801,0.524845,0.734783


In [53]:
losing_games_up_to_2013 = (
    raw_data_regularseason
    .pipe(lambda x:x.assign(losing_num_counts=1))
    .query("Season <= 2013")
    .groupby(['Season','LTeamID'])
    .agg({"WScore":"sum","LScore":"sum","LFGM":"sum","LFGA":"sum","LFGM3":"sum","LFGA3":"sum","LFTM":"sum","LFTA":"sum","losing_num_counts":"sum",
          "LOR":"sum","LDR":"sum","WFGA":"sum","WFGM":"sum",
          "LAst":"sum","LTO":"sum","LStl":"sum","LBlk":"sum","LPF":"sum"})
    .reset_index()
    .rename(columns={"WScore":"winning_opponent_score"})
    # rebounds
    .pipe(lambda x:x.assign(total_losing_rebounds = x.LOR + x.LDR))
    .pipe(lambda x:x.assign(losing_off_rebounds_percent = x.LOR/x.total_losing_rebounds))
    .pipe(lambda x:x.assign(losing_def_rebounds_percent = x.LDR/x.total_losing_rebounds))
    .pipe(lambda x:x.assign(losing_team_missed_attempts = x.LFGA - x.LFGM))
    .pipe(lambda x:x.assign(winning_opp_team_missed_attempts = x.WFGA - x.WFGM))
    .pipe(lambda x:x.assign(losing_rebound_possession_percent = x.LOR/x.losing_team_missed_attempts))
    .pipe(lambda x:x.assign(losing_rebound_possessiongain_percent = x.LDR/x.winning_opp_team_missed_attempts))
    # blocks, steals, assists and turnovers
    .pipe(lambda x:x.assign(losing_block_opp_FGA_percent = x.LBlk/x.WFGA))
    .pipe(lambda x:x.assign(losing_assist_per_fgm = x.LAst/x.LFGM))
    .pipe(lambda x:x.assign(losing_assist_turnover_ratio = x.LAst/x.LTO))
    # rename columns to prevent duplication when joining with losing stats. example: WFGM_x
    .rename(columns={"WFGA":"WFGA_opp","WFGM":"WFGM_opp"})
)

losing_games_up_to_2013.head()

Unnamed: 0,Season,LTeamID,winning_opponent_score,LScore,LFGM,LFGA,LFGM3,LFGA3,LFTM,LFTA,...,total_losing_rebounds,losing_off_rebounds_percent,losing_def_rebounds_percent,losing_team_missed_attempts,winning_opp_team_missed_attempts,losing_rebound_possession_percent,losing_rebound_possessiongain_percent,losing_block_opp_FGA_percent,losing_assist_per_fgm,losing_assist_turnover_ratio
0,2003,1102,958,778,265,634,99,324,149,230,...,310,0.229032,0.770968,369,316,0.192412,0.756329,0.025478,0.607547,0.860963
1,2003,1103,1091,986,343,788,76,247,224,296,...,401,0.354115,0.645885,445,367,0.319101,0.705722,0.051383,0.527697,1.016854
2,2003,1104,774,670,234,609,58,202,144,203,...,371,0.404313,0.595687,375,301,0.4,0.734219,0.060764,0.431624,0.673333
3,2003,1105,1528,1310,455,1169,133,383,267,388,...,669,0.372197,0.627803,714,589,0.348739,0.713073,0.035398,0.586813,0.743733
4,2003,1106,1032,893,334,848,95,287,130,191,...,482,0.369295,0.630705,514,429,0.346304,0.708625,0.04918,0.473054,0.639676


In [54]:
combine_both_winning_losing_games_stats = (
    winning_games_up_to_2013
    .merge(losing_games_up_to_2013, how='left',left_on=['Season','WTeamID'],right_on=['Season','LTeamID'])
    # on field goal percentage and winning counts
    .pipe(lambda x:x.assign(total_score = x.WScore + x.LScore))
    .pipe(lambda x:x.assign(total_opponent_score = x.winning_opponent_score + x.losing_opponent_score))
    .pipe(lambda x:x.assign(total_fgm = x.WFGM + x.LFGM))
    .pipe(lambda x:x.assign(total_fga = x.WFGA + x.LFGA))
    .pipe(lambda x:x.assign(total_fg3m = x.WFGM3 + x.LFGM3))
    .pipe(lambda x:x.assign(total_fg3a = x.WFGA3 + x.LFGA3))
    .pipe(lambda x:x.assign(total_ftm = x.WFTM + x.LFTM))
    .pipe(lambda x:x.assign(total_fta = x.WFTA + x.LFTA))
    .pipe(lambda x:x.assign(win_rate = x.winning_num_counts/(x.winning_num_counts + x.losing_num_counts)))
    .sort_values(['WTeamID','Season'])
    # on offensive and defensive rebounds
    .pipe(lambda x:x.assign(total_rebounds = x.total_winning_rebounds + x.total_losing_rebounds))
    .pipe(lambda x:x.assign(total_off_rebounds = x.WOR + x.LOR))
    .pipe(lambda x:x.assign(total_def_rebounds = x.WDR + x.LDR))
    .pipe(lambda x:x.assign(total_off_rebounds_percent = x.total_off_rebounds/x.total_rebounds))
    .pipe(lambda x:x.assign(total_def_rebounds_percent = x.total_def_rebounds/x.total_rebounds))
    .pipe(lambda x:x.assign(total_team_missed_attempts = x.team_missed_attempts + x.losing_team_missed_attempts))
    .pipe(lambda x:x.assign(total_opp_team_missed_attempts = x.opp_team_missed_attempts + x.winning_opp_team_missed_attempts))
    .pipe(lambda x:x.assign(total_rebound_possession_percent = x.total_off_rebounds/x.total_team_missed_attempts))
    .pipe(lambda x:x.assign(total_rebound_possessiongain_percent = x.total_def_rebounds/x.total_opp_team_missed_attempts))
    # on steals, turnovers, assists, blocks and personal fouls
    .pipe(lambda x:x.assign(total_blocks = x.WBlk + x.LBlk))
    .pipe(lambda x:x.assign(total_assists = x.WAst + x.LAst))
    .pipe(lambda x:x.assign(total_steals = x.WStl + x.LStl))
    .pipe(lambda x:x.assign(total_turnover = x.WTO + x.LTO))
    .pipe(lambda x:x.assign(total_personalfoul = x.WPF + x.LPF))
    .pipe(lambda x:x.assign(total_opp_fga = x.LFGA_opp + x.WFGA_opp))
    .pipe(lambda x:x.assign(total_fgm = x.WFGM + x.LFGM))
    .pipe(lambda x:x.assign(total_block_opp_FGA_percent = x.total_blocks/x.total_opp_fga))
    .pipe(lambda x:x.assign(total_assist_per_fgm = x.total_assists/x.total_fgm))
    .pipe(lambda x:x.assign(total_assist_turnover_ratio = x.total_assists/x.total_turnover))
)

combine_both_winning_losing_games_stats.head()

Unnamed: 0,Season,WTeamID,WScore,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,losing_opponent_score,...,total_rebound_possessiongain_percent,total_blocks,total_assists,total_steals,total_turnover,total_personalfoul,total_opp_fga,total_block_opp_FGA_percent,total_assist_per_fgm,total_assist_turnover_ratio
0,2003,1102,825,271,480,120,259,163,249,638,...,0.726852,50,364,167,320,525,1188,0.042088,0.679104,1.1375
327,2004,1102,1404,466,913,192,475,280,387,1071,...,0.691318,66,371,218,302,466,1130,0.058407,0.653169,1.228477
653,2005,1102,1097,378,787,146,376,195,266,824,...,0.740066,50,396,255,292,478,1161,0.043066,0.637681,1.356164
982,2006,1102,1430,489,991,201,478,251,332,1143,...,0.683128,59,397,224,306,431,1283,0.045986,0.64658,1.297386
1316,2007,1102,1591,531,1034,211,473,318,418,1168,...,0.758788,44,451,188,305,455,1457,0.030199,0.659357,1.478689


In [55]:
cumulative_stats_for_team_each_year.dtypes[0:33]

Season                      int64
WScore                      int64
WFGM                        int64
WFGA                        int64
WFGM3                       int64
WFGA3                       int64
WFTM                        int64
WFTA                        int64
losing_opponent_score       int64
winning_num_counts          int64
winning_opponent_score      int64
LScore                      int64
LFGM                        int64
LFGA                        int64
LFGM3                       int64
LFGA3                       int64
LFTM                        int64
LFTA                        int64
losing_num_counts           int64
total_score                 int64
total_opponent_score        int64
total_fgm                   int64
total_fga                   int64
total_fg3m                  int64
total_fg3a                  int64
total_ftm                   int64
total_fta                   int64
TeamID                      int64
win_rate                  float64
WFGP          

In [56]:
cumulative_stats_for_team_each_year.dtypes[34:67]

LFTP    float64
fgp     float64
fg3p    float64
ftp     float64
dtype: object

In [57]:
cumulative_stats_for_team_each_year.dtypes[68:100]

Series([], dtype: object)

In [58]:
cumulative_stats_for_team_each_year = (
    combine_both_winning_losing_games_stats
    .sort_values(['WTeamID','Season'])
    .groupby(['WTeamID'])
    .cumsum()
    .pipe(lambda x:x.assign(Season = combine_both_winning_losing_games_stats.Season.values))
    .pipe(lambda x:x.assign(TeamID = combine_both_winning_losing_games_stats.WTeamID.values))
    .drop(['LTeamID','win_rate'],1)
    .pipe(lambda x:x.assign(win_rate = x.winning_num_counts/(x.winning_num_counts + x.losing_num_counts)))
    .pipe(lambda x:x.assign(WFGP = x.WFGM/x.WFGA))
    .pipe(lambda x:x.assign(WFG3P = x.WFGM3/x.WFGA3))
    .pipe(lambda x:x.assign(WFTP = x.WFTM/x.WFTA))
    .pipe(lambda x:x.assign(LFGP = x.LFGM/x.LFGA))
    .pipe(lambda x:x.assign(LFG3P = x.LFGM3/x.LFGA3))
    .pipe(lambda x:x.assign(LFTP = x.LFTM/x.LFTA))
    .pipe(lambda x:x.assign(fgp = x.total_fgm/x.total_fga))
    .pipe(lambda x:x.assign(fg3p = x.total_fg3m/x.total_fg3a))
    .pipe(lambda x:x.assign(ftp = x.total_ftm/x.total_fta))
    # rebounds cumsum stats
    .pipe(lambda x:x.assign(total_def_rebounds_percent = x.total_def_rebounds/x.total_rebounds))
    .pipe(lambda x:x.assign(total_off_rebounds_percent = x.total_off_rebounds/x.total_rebounds))
    .pipe(lambda x:x.assign(total_rebound_possession_percent = x.total_off_rebounds/x.total_team_missed_attempts))
    .pipe(lambda x:x.assign(total_rebound_possessiongain_percent = x.total_def_rebounds/x.total_opp_team_missed_attempts))
    # assists, turnovers, steals, blocks and personal fouls
    .pipe(lambda x:x.assign(total_block_opp_FGA_percent = x.total_blocks/x.total_opp_fga))
    .pipe(lambda x:x.assign(total_assist_per_fgm = x.total_assists/x.total_fgm))
    .pipe(lambda x:x.assign(total_assist_turnover_ratio = x.total_assists/x.total_turnover))
    # win or lose by how many points
    .pipe(lambda x:x.assign(lose_rate = 1-x.win_rate))
    .pipe(lambda x:x.assign(win_score_by = x.WScore - x.losing_opponent_score))
    .pipe(lambda x:x.assign(lose_score_by = x.LScore - x.winning_opponent_score))
    .pipe(lambda x:x.assign(expectation_per_game = x.win_rate * x.win_score_by/x.winning_num_counts + x.lose_rate * x.lose_score_by/x.losing_num_counts))
    .pipe(lambda x:x.assign(avg_win_score_by = x.win_score_by/x.winning_num_counts))
    .pipe(lambda x:x.assign(avg_lose_score_by = x.lose_score_by/x.losing_num_counts))
)

In [59]:
cumulative_stats_for_team_each_year.head()

Unnamed: 0,Season,WScore,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,losing_opponent_score,winning_num_counts,...,LFTP,fgp,fg3p,ftp,lose_rate,win_score_by,lose_score_by,expectation_per_game,avg_win_score_by,avg_lose_score_by
0,2003,825,271,480,120,259,163,249,638,12,...,0.647826,0.481149,0.375643,0.651357,0.571429,187,-180,0.25,15.583333,-11.25
327,2004,2229,737,1393,312,734,443,636,1709,34,...,0.639576,0.481886,0.378423,0.678999,0.392857,520,-240,5.0,15.294118,-10.909091
653,2005,3326,1115,2180,458,1110,638,902,2533,51,...,0.661616,0.469388,0.373236,0.693374,0.4,793,-336,5.376471,15.54902,-9.882353
982,2006,4756,1604,3171,659,1588,889,1234,3676,73,...,0.669456,0.47243,0.378968,0.706192,0.353982,1080,-373,6.256637,14.794521,-9.325
1316,2007,6347,2135,4205,870,2061,1207,1652,4844,95,...,0.688119,0.475389,0.384158,0.719221,0.335664,1503,-448,7.377622,15.821053,-9.333333


In [60]:
from aggregate_function import build_features_table, win_rate_type_of_location

In [61]:
test_features = build_features_table.BuildFeaturesTable("data/DataFiles/RegularSeasonDetailedResults.csv")

In [62]:
win_rate_location_test = win_rate_type_of_location.WinRateTypeLocation("data/DataFiles/RegularSeasonDetailedResults.csv")

In [63]:
win_rate_location_test.processed_cumulative_win_rate_df.head()

Unnamed: 0,Season,TeamID,win_rate_away,win_rate_home,win_rate_neutral
0,2014,1101,0.0,0.125,0.0
1,2015,1101,0.083333,0.176471,0.666667
2,2016,1101,0.157895,0.230769,0.6
3,2017,1101,0.25,0.25,0.0
4,2003,1102,0.428571,0.473684,0.0


In [64]:
test = test_features.processed_overall

In [65]:
test.head()

Unnamed: 0,Season,TeamID,win_rate,total_score,total_opponent_score,fgp,fg3p,ftp,total_rebounds,total_off_rebounds,...,total_assists,total_steals,total_turnover,total_personalfoul,total_block_opp_FGA_percent,total_assist_per_fgm,total_assist_turnover_ratio,expectation_per_game,avg_lose_score_by,avg_win_score_by
0,2003,1102,0.428571,1603,1596,0.481149,0.375643,0.651357,588,117,...,364,167,320,525,0.042088,0.679104,1.1375,0.25,-11.25,15.583333
327,2004,1102,0.785714,1685,1412,0.482583,0.381029,0.709091,602,172,...,371,218,302,466,0.058407,0.653169,1.228477,9.75,-10.0,15.136364
653,2005,1102,0.586207,1776,1599,0.448699,0.364407,0.728232,668,221,...,396,255,292,478,0.043066,0.637681,1.356164,6.103448,-8.0,16.058824
982,2006,1102,0.785714,1778,1528,0.481191,0.397035,0.746377,688,190,...,397,224,306,431,0.045986,0.64658,1.297386,8.928571,-6.166667,13.045455
1316,2007,1102,0.733333,2055,1707,0.485795,0.403561,0.760073,820,194,...,451,188,305,455,0.030199,0.659357,1.478689,11.6,-9.375,19.227273


In [66]:
# this combines type of win rate to build features table
win_rate_features_combine = (
    test
    .merge(win_rate_location_test.processed_win_rate_df, how='left',on=['Season','TeamID'])
    .fillna(0)
)

In [67]:
win_rate_features_combine.head()

Unnamed: 0,Season,TeamID,win_rate,total_score,total_opponent_score,fgp,fg3p,ftp,total_rebounds,total_off_rebounds,...,total_personalfoul,total_block_opp_FGA_percent,total_assist_per_fgm,total_assist_turnover_ratio,expectation_per_game,avg_lose_score_by,avg_win_score_by,win_rate_away,win_rate_home,win_rate_neutral
0,2003,1102,0.428571,1603,1596,0.481149,0.375643,0.651357,588,117,...,525,0.042088,0.679104,1.1375,0.25,-11.25,15.583333,0.428571,0.473684,0.0
1,2004,1102,0.785714,1685,1412,0.482583,0.381029,0.709091,602,172,...,466,0.058407,0.653169,1.228477,9.75,-10.0,15.136364,1.0,0.722222,0.5
2,2005,1102,0.586207,1776,1599,0.448699,0.364407,0.728232,668,221,...,478,0.043066,0.637681,1.356164,6.103448,-8.0,16.058824,0.8,0.55,0.5
3,2006,1102,0.785714,1778,1528,0.481191,0.397035,0.746377,688,190,...,431,0.045986,0.64658,1.297386,8.928571,-6.166667,13.045455,1.0,0.736842,0.666667
4,2007,1102,0.733333,2055,1707,0.485795,0.403561,0.760073,820,194,...,455,0.030199,0.659357,1.478689,11.6,-9.375,19.227273,0.857143,0.722222,0.6


In [68]:
coach_file = 'data/DataFiles/TeamCoaches.csv'
regularseason_file = 'data/DataFiles/RegularSeasonDetailedResults.csv'
postseason_file = 'data/DataFiles/NCAATourneyCompactResults.csv'

In [69]:
from aggregate_function import coach_stats
testing_df = coach_stats.CoachStats(coach_file,regularseason_file,postseason_file)

In [70]:
testing_df.cumulative_final_coach_stats_table.head()

Unnamed: 0,Season,TeamID,daysexp,season_max_days,num_season,is_playoff,is_champion,post_games_lost,post_games_won,win_rate_post,games_lost,games_won,win_rate_regular,overall_games_won,overall_games_lost,win_rate_overall,CoachName
0,1985,1224,154,154,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,a_b_williamson
1,1986,1224,308,308,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,a_b_williamson
2,1987,1224,462,462,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,a_b_williamson
3,1988,1224,616,616,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,a_b_williamson
4,1989,1224,770,770,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,a_b_williamson


In [71]:
final_table = (
    win_rate_features_combine
    .merge(testing_df.cumulative_final_coach_stats_table[['Season','TeamID','num_season',
                                               'is_playoff','is_champion','win_rate_post',
                                               'win_rate_regular','win_rate_overall','CoachName']],
          how='left',on=['Season','TeamID'])
)
final_table.head()

Unnamed: 0,Season,TeamID,win_rate,total_score,total_opponent_score,fgp,fg3p,ftp,total_rebounds,total_off_rebounds,...,win_rate_away,win_rate_home,win_rate_neutral,num_season,is_playoff,is_champion,win_rate_post,win_rate_regular,win_rate_overall,CoachName
0,2003,1102,0.428571,1603,1596,0.481149,0.375643,0.651357,588,117,...,0.428571,0.473684,0.0,3.0,0.0,0.0,0.0,0.0,0.0,joe_scott
1,2004,1102,0.785714,1685,1412,0.482583,0.381029,0.709091,602,172,...,1.0,0.722222,0.5,4.0,1.0,0.0,0.0,0.785714,0.758621,joe_scott
2,2005,1102,0.586207,1776,1599,0.448699,0.364407,0.728232,668,221,...,0.8,0.55,0.5,1.0,0.0,0.0,0.0,0.0,0.0,chris_mooney
3,2006,1102,0.785714,1778,1528,0.481191,0.397035,0.746377,688,190,...,1.0,0.736842,0.666667,3.0,1.0,0.0,0.0,0.785714,0.758621,jeff_bzdelik
4,2007,1102,0.733333,2055,1707,0.485795,0.403561,0.760073,820,194,...,0.857143,0.722222,0.6,4.0,1.0,0.0,0.0,0.785714,0.758621,jeff_bzdelik


In [72]:
final_table_copy = final_table.drop(['Season','TeamID','CoachName','win_rate','fgp','fg3p','ftp',
                                     'total_off_rebounds_percent','total_def_rebounds_percent',
                                     'total_rebound_possession_percent','total_rebound_possessiongain_percent',
                                     'total_block_opp_FGA_percent','win_rate_away','win_rate_home','win_rate_neutral',
                                     'win_rate_post','win_rate_regular','win_rate_overall'],1)
final_table_copy.dtypes

total_score                      int64
total_opponent_score             int64
total_rebounds                   int64
total_off_rebounds               int64
total_def_rebounds               int64
total_blocks                     int64
total_assists                    int64
total_steals                     int64
total_turnover                   int64
total_personalfoul               int64
total_assist_per_fgm           float64
total_assist_turnover_ratio    float64
expectation_per_game           float64
avg_lose_score_by              float64
avg_win_score_by               float64
num_season                     float64
is_playoff                     float64
is_champion                    float64
dtype: object

In [73]:
final_table_copy

Unnamed: 0,total_score,total_opponent_score,total_rebounds,total_off_rebounds,total_def_rebounds,total_blocks,total_assists,total_steals,total_turnover,total_personalfoul,total_assist_per_fgm,total_assist_turnover_ratio,expectation_per_game,avg_lose_score_by,avg_win_score_by,num_season,is_playoff,is_champion
0,1603,1596,588,117,471,50,364,167,320,525,0.679104,1.137500,0.250000,-11.250000,15.583333,3.000000,0.0,0.0
1,1685,1412,602,172,430,66,371,218,302,466,0.653169,1.228477,9.750000,-10.000000,15.136364,4.000000,1.0,0.0
2,1776,1599,668,221,447,50,396,255,292,478,0.637681,1.356164,6.103448,-8.000000,16.058824,1.000000,0.0,0.0
3,1778,1528,688,190,498,59,397,224,306,431,0.646580,1.297386,8.928571,-6.166667,13.045455,3.000000,1.0,0.0
4,2055,1707,820,194,626,44,451,188,305,455,0.659357,1.478689,11.600000,-9.375000,19.227273,4.000000,1.0,0.0
5,1612,1605,745,147,598,50,334,173,355,481,0.643545,0.940845,0.250000,-11.571429,12.071429,1.000000,0.0,0.0
6,1665,1807,807,201,606,63,345,147,393,516,0.620504,0.877863,-4.896552,-12.523810,15.125000,2.000000,0.0,0.0
7,1613,1826,772,196,576,44,356,140,371,509,0.613793,0.959569,-7.344828,-15.238095,13.375000,3.000000,0.0,0.0
8,1799,1814,792,164,628,71,432,174,334,488,0.687898,1.293413,-0.535714,-12.500000,11.428571,4.000000,0.0,0.0
9,1638,1708,760,158,602,80,361,169,351,459,0.640071,1.028490,-2.592593,-11.937500,11.000000,0.350649,0.0,0.0


In [74]:
scaler = MinMaxScaler()
minmax_scale = scaler.fit(final_table_copy)
df_minmax = minmax_scale.transform(final_table_copy)

In [75]:
test_out = pd.DataFrame(df_minmax)

In [76]:
test_out.columns = ['total_score', 'total_opponent_score', 'total_rebounds',
       'total_off_rebounds', 'total_def_rebounds', 'total_blocks',
       'total_assists', 'total_steals', 'total_turnover',
       'total_personalfoul', 'total_assist_per_fgm',
       'total_assist_turnover_ratio', 'expectation_per_game',
       'avg_lose_score_by', 'avg_win_score_by', 'num_season', 'is_playoff',
       'is_champion']

In [77]:
pd.DataFrame(final_table_copy.dtypes).index.values

array(['total_score', 'total_opponent_score', 'total_rebounds',
       'total_off_rebounds', 'total_def_rebounds', 'total_blocks',
       'total_assists', 'total_steals', 'total_turnover',
       'total_personalfoul', 'total_assist_per_fgm',
       'total_assist_turnover_ratio', 'expectation_per_game',
       'avg_lose_score_by', 'avg_win_score_by', 'num_season', 'is_playoff',
       'is_champion'], dtype=object)

In [78]:
test_out.columns = pd.DataFrame(final_table_copy.dtypes).index.values

In [79]:
from aggregate_function import combine_features_table

In [80]:
combine_features_table.CombineFeaturesTable(test_features,win_rate_location_test,testing_df)

AttributeError: 'DataFrame' object has no attribute 'Season'