___
This notebook decides on the intermediate variables being used

In [2]:
import pandas as pd
import numpy as np
import scipy
from sklearn import *

## Read data
- regularseason detailed results
- ~~cities~~
- teams
- coaches
    - there is a problem if a coach is new, so to prevent this from happening
    - coach will have a proxy variables of
        1. number of years of experience up to that year
        1. number of championship
        1. number of playoffs made

In [11]:
raw_data_regularseason = pd.read_csv("data/DataFiles/RegularSeasonDetailedResults.csv")

In [13]:
raw_data_teams = pd.read_csv("data/DataFiles/Teams.csv")

In [25]:
raw_data_coaches = pd.read_csv("data/DataFiles/TeamCoaches.csv")

In [29]:
raw_data_teams_coaches = (
    raw_data_teams
    .merge(raw_data_coaches, how='left', on=['TeamID'])
)

In [38]:
raw_data_regularseason.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


In [41]:
raw_data_regularseason.dtypes

Season      int64
DayNum      int64
WTeamID     int64
WScore      int64
LTeamID     int64
LScore      int64
WLoc       object
NumOT       int64
WFGM        int64
WFGA        int64
WFGM3       int64
WFGA3       int64
WFTM        int64
WFTA        int64
WOR         int64
WDR         int64
WAst        int64
WTO         int64
WStl        int64
WBlk        int64
WPF         int64
LFGM        int64
LFGA        int64
LFGM3       int64
LFGA3       int64
LFTM        int64
LFTA        int64
LOR         int64
LDR         int64
LAst        int64
LTO         int64
LStl        int64
LBlk        int64
LPF         int64
dtype: object

## Features to be included
- Season year
- winning/losing teamid
- winning/losing score
- winning/losing field goal percentage
- winning/losing field goal 3 point percentage
- winning/losing free throw percentage
- win rate

In [80]:
winning_teams_score_up_to_2013 = (
    raw_data_regularseason
    .pipe(lambda x:x.assign(winning_num_counts=1))
    .query("Season <= 2013")
    .groupby(['Season','WTeamID'])
    .agg({"WScore":"sum","WFGM":"sum","WFGA":"sum","WFGM3":"sum","WFGA3":"sum","WFTM":"sum","WFTA":"sum","LScore":"sum","winning_num_counts":"sum"})
    .reset_index()
    .rename(columns={"LScore":"losing_opponent_score"})
)

In [81]:
winning_teams_score_up_to_2013.head()

Unnamed: 0,Season,WTeamID,WScore,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,losing_opponent_score,winning_num_counts
0,2003,1102,825,271,480,120,259,163,249,638,12
1,2003,1103,1141,390,720,71,187,290,402,1019,13
2,2003,1104,1270,439,992,120,354,272,383,1046,17
3,2003,1105,556,179,433,64,157,134,180,465,7
4,2003,1106,888,322,700,76,207,168,270,753,13


In [82]:
losing_teams_score_up_to_2013 = (
    raw_data_regularseason
    .pipe(lambda x:x.assign(losing_num_counts=1))
    .query("Season <= 2013")
    .groupby(['Season','LTeamID'])
    .agg({"WScore":"sum","LScore":"sum","LFGM":"sum","LFGA":"sum","LFGM3":"sum","LFGA3":"sum","LFTM":"sum","LFTA":"sum","losing_num_counts":"sum"})
    .reset_index()
    .rename(columns={"WScore":"winning_opponent_score"})
)

In [83]:
losing_teams_score_up_to_2013.head()

Unnamed: 0,Season,LTeamID,winning_opponent_score,LScore,LFGM,LFGA,LFGM3,LFGA3,LFTM,LFTA,losing_num_counts
0,2003,1102,958,778,265,634,99,324,149,230,16
1,2003,1103,1091,986,343,788,76,247,224,296,14
2,2003,1104,774,670,234,609,58,202,144,203,11
3,2003,1105,1528,1310,455,1169,133,383,267,388,19
4,2003,1106,1032,893,334,848,95,287,130,191,15


In [125]:
combine_winning_losing_stats_for_year = (
    winning_teams_score_up_to_2013
    .merge(losing_teams_score_up_to_2013, how='left',left_on=['Season','WTeamID'],right_on=['Season','LTeamID'])
    .pipe(lambda x:x.assign(total_score = x.WScore + x.LScore))
    .pipe(lambda x:x.assign(total_opponent_score = x.winning_opponent_score + x.losing_opponent_score))
    .pipe(lambda x:x.assign(total_fgm = x.WFGM + x.LFGM))
    .pipe(lambda x:x.assign(total_fga = x.WFGA + x.LFGA))
    .pipe(lambda x:x.assign(total_ftm = x.WFTM + x.LFTM))
    .pipe(lambda x:x.assign(total_fta = x.WFTA + x.LFTA))
    .pipe(lambda x:x.assign(win_rate = x.winning_num_counts/(x.winning_num_counts + x.losing_num_counts)))
    .sort_values(['WTeamID','Season'])
)

In [126]:
combine_winning_losing_stats_for_year.head()

Unnamed: 0,Season,WTeamID,WScore,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,losing_opponent_score,...,LFTM,LFTA,losing_num_counts,total_score,total_opponent_score,total_fgm,total_fga,total_ftm,total_fta,win_rate
0,2003,1102,825,271,480,120,259,163,249,638,...,149,230,16,1603,1596,536,1114,312,479,0.428571
327,2004,1102,1404,466,913,192,475,280,387,1071,...,32,53,6,1685,1412,568,1177,312,440,0.785714
653,2005,1102,1097,378,787,146,376,195,266,824,...,81,113,12,1776,1599,621,1384,276,379,0.586207
982,2006,1102,1430,489,991,201,478,251,332,1143,...,58,82,6,1778,1528,614,1276,309,414,0.785714
1316,2007,1102,1591,531,1034,211,473,318,418,1168,...,97,128,8,2055,1707,684,1408,415,546,0.733333


In [145]:
cumulative_stats_for_team_each_year = (
    combine_winning_losing_stats_for_year
    .sort_values(['WTeamID','Season'])
    .groupby(['WTeamID'])
    .cumsum()
    .pipe(lambda x:x.assign(Season = combine_winning_losing_stats_for_year.Season.values))
    .pipe(lambda x:x.assign(TeamID = combine_winning_losing_stats_for_year.WTeamID.values))
    .drop(['LTeamID','win_rate'],1)
    .pipe(lambda x:x.assign(win_rate = x.winning_num_counts/(x.winning_num_counts + x.losing_num_counts)))
    .pipe(lambda x:x.assign(WFGP = x.WFGM/x.WFGA))
    .pipe(lambda x:x.assign(WFG3P = x.WFGM3/x.WFGA3))
    .pipe(lambda x:x.assign(WFTP = x.WFTM/x.WFTA))
    .pipe(lambda x:x.assign())
)

In [144]:
cumulative_stats_for_team_each_year.head()

Unnamed: 0,Season,WScore,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,losing_opponent_score,winning_num_counts,...,losing_num_counts,total_score,total_opponent_score,total_fgm,total_fga,total_ftm,total_fta,TeamID,win_rate,WFGP
0,2003,825,271,480,120,259,163,249,638,12,...,16,1603,1596,536,1114,312,479,1102,0.428571,0.564583
327,2004,2229,737,1393,312,734,443,636,1709,34,...,22,3288,3008,1104,2291,624,919,1102,0.607143,0.529074
653,2005,3326,1115,2180,458,1110,638,902,2533,51,...,34,5064,4607,1725,3675,900,1298,1102,0.6,0.511468
982,2006,4756,1604,3171,659,1588,889,1234,3676,73,...,40,6842,6135,2339,4951,1209,1712,1102,0.646018,0.505834
1316,2007,6347,2135,4205,870,2061,1207,1652,4844,95,...,48,8897,7842,3023,6359,1624,2258,1102,0.664336,0.507729


In [142]:
combine_winning_losing_stats_for_year.dtypes

Season                      int64
WTeamID                     int64
WScore                      int64
WFGM                        int64
WFGA                        int64
WFGM3                       int64
WFGA3                       int64
WFTM                        int64
WFTA                        int64
losing_opponent_score       int64
winning_num_counts          int64
LTeamID                     int64
winning_opponent_score      int64
LScore                      int64
LFGM                        int64
LFGA                        int64
LFGM3                       int64
LFGA3                       int64
LFTM                        int64
LFTA                        int64
losing_num_counts           int64
total_score                 int64
total_opponent_score        int64
total_fgm                   int64
total_fga                   int64
total_ftm                   int64
total_fta                   int64
win_rate                  float64
dtype: object

## Some variations to try for features
- separate winning and losing
- 

## Intermediate Variables
- Coach stats
- win rate for home court
- win rate for away court
- win rate for neutral court
- offensive stats
- defensive stats
- blocks, steals and personal fouls

In [48]:
train_df = prediction_df.query("Season <= 2013")
test_df = prediction_df.query("Season >= 2014")

In [82]:
train_data_x = train_df[['diff_seed']]
train_data_y = train_df['yhat']

test_data_x = test_df[['diff_seed']]
test_data_y = test_df['yhat']

## Initializing Logistics Regression

In [83]:
logreg = linear_model.LogisticRegression()

In [84]:
logreg.fit(train_data_x,train_data_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [85]:
#logreg.predict(test_df[['diff_seed']])

array([1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 1,

In [86]:
test_results = pd.DataFrame(logreg.predict(test_df[['diff_seed']])).rename(columns={0:"prediction_result"})

In [87]:
logreg.score(test_data_x,test_data_y)

0.70895522388059706

In [93]:
test_df['prediction_results'] = test_results.prediction_result.values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [96]:
test_df.tail(20)

Unnamed: 0,Season,WTeamID,W_seed,LTeamID,L_seed,diff_seed,yhat,prediction_results
2097,2017,1276,7,1257,2,5,0,1
2098,2017,1314,1,1116,8,-7,0,0
2099,2017,1332,3,1348,11,-8,0,0
2100,2017,1376,7,1181,2,5,0,1
2101,2017,1417,3,1153,6,-3,0,0
2102,2017,1211,1,1452,4,-3,0,0
2103,2017,1242,1,1345,4,-3,0,0
2104,2017,1332,3,1276,7,-4,0,0
2105,2017,1462,11,1112,2,9,0,1
2106,2017,1196,4,1458,8,-4,0,0


In [97]:
metrics.confusion_matrix(test_df.yhat,test_df.prediction_results)

array([[180,  88],
       [ 68, 200]])

http://blog.yhat.com/posts/roc-curves.html

In [100]:
pd.read_csv("data/DataFiles/RegularSeasonDetailedResults.csv").head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


In [101]:
pd.read_csv("data/DataFiles/RegularSeasonCompactResults.csv").head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,20,1228,81,1328,64,N,0
1,1985,25,1106,77,1354,70,H,0
2,1985,25,1112,63,1223,56,H,0
3,1985,25,1165,70,1432,54,H,0
4,1985,25,1192,86,1447,74,H,0


## Features selected
- season
- region --> perhaps encode to a number. example: west - east = 1001. west = victor, east = loser
- wteamid
- wscore
- lteamid
- lscore
- wloc
- winning field goal percentage
- winning three point percentage
- winning free throw percentage
- transformed variable for rebounds (offensive and defensive)
- transformed assist
- transformed turnovers
- transformed steals
- transformed blocks
- transformed personal fouls
- repeat for losing team

*transformed variables exclude first

In [102]:
pd.read_csv("data/DataFiles/RegularSeasonDetailedResults.csv").dtypes

Season      int64
DayNum      int64
WTeamID     int64
WScore      int64
LTeamID     int64
LScore      int64
WLoc       object
NumOT       int64
WFGM        int64
WFGA        int64
WFGM3       int64
WFGA3       int64
WFTM        int64
WFTA        int64
WOR         int64
WDR         int64
WAst        int64
WTO         int64
WStl        int64
WBlk        int64
WPF         int64
LFGM        int64
LFGA        int64
LFGM3       int64
LFGA3       int64
LFTM        int64
LFTA        int64
LOR         int64
LDR         int64
LAst        int64
LTO         int64
LStl        int64
LBlk        int64
LPF         int64
dtype: object