In [2]:
import pandas as pd
import re
import statistics
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import log_loss
from sklearn.feature_selection import chi2
from sklearn.metrics import r2_score
import numpy as np
from sklearn.model_selection import train_test_split



In [3]:
%run ../functions.ipynb

### Import data

In [6]:
data_path = '../../data/MDataFiles_Stage1/'

#raw data
rankings = pd.read_csv(data_path + 'MMasseyOrdinals.csv')
detailed_reg_season = pd.read_csv(data_path +'MRegularSeasonDetailedResults.csv')
team_coaches = pd.read_csv(data_path + 'MTeamCoaches.csv')
detailed_tourney_results = pd.read_csv(data_path + 'MNCAATourneyDetailedResults.csv')
conf_tourney = pd.read_csv(data_path + 'MConferenceTourneyGames.csv')
team_conferences = pd.read_csv(data_path + 'MTeamConferences.csv')
tourney_seeds = pd.read_csv(data_path + 'MNCAATourneySeeds.csv')
teams = pd.read_csv(data_path + 'MTeams.csv')

#processed data
all_games = pd.read_csv('../processing/output/all_games.csv')

#sample submission
sub = pd.read_csv('../../data/MSampleSubmissionStage1_2020.csv')
#sub = process_sample_sub(test)


### Get conf features

In [7]:
team_conferences['conf'] = np.where(team_conferences['ConfAbbrev'].isin(['acc', 'aac', 'sec','big_twelve', \
                        'pac_twelve', 'big_east', 'pac_ten', 'big_ten']), team_conferences['ConfAbbrev'], 
                                        'other')

team_conferences.rename(columns = {'TeamID': 'Team1'}, inplace = True)


In [8]:
tourney_games = all_games[(all_games.TourneyGame == 1)]

In [9]:
conf_me = tourney_games.merge(team_conferences, on = ['Team1', 'Season'])

In [10]:
conf_me = conf_me.groupby(['Season', 'conf']).agg({'Outcome': np.mean}).sort_values(by = 'Outcome', ascending = False). \
                            reset_index()   



In [11]:
conf_me.sort_values(by = 'Season', inplace = True)

In [12]:
pd.set_option('max_rows', 10000)

In [13]:
conf_me['prior_conf_win_rate'] = conf_me.groupby(['conf'])['Outcome'].apply(lambda x: x.rolling(7).mean().shift())

In [14]:
conf_me[['Outcome', 'prior_conf_win_rate']].corr()

Unnamed: 0,Outcome,prior_conf_win_rate
Outcome,1.0,0.623148
prior_conf_win_rate,0.623148,1.0


### Get conf tourney

In [15]:
wins = conf_tourney.groupby(['Season', 'WTeamID'])['LTeamID'].count().reset_index()

wins.rename(columns = {'WTeamID': 'Team1',
                           'LTeamID': 'num_wins'}, inplace = True)

losses = conf_tourney.groupby(['Season', 'LTeamID'])['WTeamID'].count().reset_index()

losses.rename(columns = {'LTeamID': 'Team1',
                           'WTeamID': 'num_losses'}, inplace = True)

ct = wins.merge(losses, on = ['Season', 'Team1'], how = 'outer')

ct = ct.fillna(0)

ct['ct_win_rate'] = ct['num_wins'] / (ct['num_wins'] + ct['num_losses'])

### Get Seeds

In [16]:
tourney_seeds.rename(columns = {'TeamID': 'Team1'}, inplace = True)
tourney_seeds['Seed'] = tourney_seeds['Seed'].apply(lambda x: re.sub('[^0-9]','', x)).apply(int)

#join_key = ['Season', 'Team1']
#all_games = all_games.merge(tourney_seeds[join_key + ['Seed']], on = join_key, how = 'left')
#all_games['Seed'] = all_games['Seed'].fillna('9999').apply(lambda x: re.sub('[^0-9]','', x)).apply(int)


### Get Pre-Season Rankings

In [17]:
rankings.sort_values(by = ['RankingDayNum', 'OrdinalRank'], inplace = True)


In [18]:
ap_rankings = rankings[(rankings.SystemName == 'AP')]
first_day = ap_rankings.groupby('Season').agg({'RankingDayNum':'min'}).rename(
                            columns = {'RankingDayNum': 'first_day'})

ap_rankings = ap_rankings.join(first_day, on = 'Season')
ap_rankings = ap_rankings[ap_rankings.RankingDayNum == ap_rankings.first_day]
ap_rankings.rename(columns = {'TeamID':'Team1'}, inplace = True)
ap_rankings['pre_season_top_25_flag'] = 1

### Get end of season rankings

In [19]:
all_rankings = list(rankings.SystemName.unique())

In [20]:

''' 
feature_list = ['t1_final_rank', 't2_final_rank']

from tqdm import tqdm
system_rank = {}

for system in tqdm(rankings.SystemName.unique()):
    
    end_rankings = get_rankings(rankings, [system])
    system_rank[system] = full_pipeline(feature_list)
    
'''    

" \nfeature_list = ['t1_final_rank', 't2_final_rank']\n\nfrom tqdm import tqdm\nsystem_rank = {}\n\nfor system in tqdm(rankings.SystemName.unique()):\n    \n    end_rankings = get_rankings(rankings, [system])\n    system_rank[system] = full_pipeline(feature_list)\n    \n"

In [21]:

'''
df = pd.DataFrame.from_dict(system_rank, orient = 'index')
df.columns = ['score']
df = df.sort_values(by = 'score').reset_index()
df.columns = ['system', 'score']
'''

"\ndf = pd.DataFrame.from_dict(system_rank, orient = 'index')\ndf.columns = ['score']\ndf = df.sort_values(by = 'score').reset_index()\ndf.columns = ['system', 'score']\n"

In [22]:
def get_rankings(rankings, system):
    
    end_rankings = rankings[(rankings.RankingDayNum == 133) & (rankings.SystemName.isin(system))]

    end_rankings = end_rankings.groupby(['TeamID', 'Season']).agg({'OrdinalRank':np.mean})
    end_rankings.reset_index(inplace = True)
    end_rankings.columns = ['Team1', 'Season', 'avg_rank']
    end_rankings['final_rank'] = 100-4*np.log(end_rankings['avg_rank']+1)-end_rankings['avg_rank']/22
    
    return end_rankings
    

In [23]:
end_rankings = get_rankings(rankings, ['WLK'])

In [24]:
#end_rankings = rankings[(rankings.RankingDayNum == 133)]
#end_rankings = end_rankings.groupby(['TeamID', 'Season']).agg({'OrdinalRank':np.mean})
#end_rankings.reset_index(inplace = True)
#end_rankings.columns = ['Team1', 'Season', 'avg_rank']
#end_rankings['final_rank'] = 100-4*np.log(end_rankings['avg_rank']+1)-end_rankings['avg_rank']/22

### Field goal efficiency

In [25]:
all_games.columns

Index(['Unnamed: 0', 'TourneyGame', 'Season', 'DayNum', 'Team1', 'Team2',
       'Team1_score', 'Team2_score', 'WLoc', 'NumOT', 'Team1_FGM', 'Team1_FGA',
       'Team1_FGM3', 'Team1_FGA3', 'Team1_FTM', 'Team1_FTA', 'Team1_OR',
       'Team1_DR', 'Team1_Ast', 'Team1_TO', 'Team1_Stl', 'Team1_Blk',
       'Team1_PF', 'Team2_FGM', 'Team2_FGA', 'Team2_FGM3', 'Team2_FGA3',
       'Team2_FTM', 'Team2_FTA', 'Team2_OR', 'Team2_DR', 'Team2_Ast',
       'Team2_TO', 'Team2_Stl', 'Team2_Blk', 'Team2_PF', 'Outcome', 'Loc'],
      dtype='object')

In [26]:
all_games['fg_eff'] = all_games['Team1_FGM3']/all_games['Team1_FGA3']
fg_eff = all_games.groupby(['Team1', 'Season']).agg(mean_fg_eff=('fg_eff', np.mean), 
                                          std_fg_eff=('fg_eff', np.std), 
                                          ).reset_index()

### 14 day feature

In [27]:
reg_season_games = all_games[all_games.TourneyGame == 0]

In [28]:
recent = reg_season_games[all_games.DayNum > 0]

  """Entry point for launching an IPython kernel.


In [29]:
recent = recent.groupby(['Team1', 'Season']).agg(
                                          total_wins=('Outcome', sum), 
                                          total_games=('Outcome', 'count'), 
                                          ).reset_index()

In [30]:
recent['WinRatio14d'] = recent['total_wins'] / recent['total_games']

### Get efficiency metrics

In [31]:
all_games = get_def_off(all_games)


In [32]:
all_games.head()

Unnamed: 0.1,Unnamed: 0,TourneyGame,Season,DayNum,Team1,Team2,Team1_score,Team2_score,WLoc,NumOT,...,Team2_TO,Team2_Stl,Team2_Blk,Team2_PF,Outcome,Loc,fg_eff,Pos,OffRtg,DefRtg
0,0,1,2003,134,1421,1411,92,84,N,1,...,15,5,0,22,1,N,0.37931,75.3024,122.174061,111.550229
1,1,1,2003,136,1112,1436,80,51,N,0,...,17,10,3,15,1,N,0.304348,74.0736,108.000691,68.850441
2,2,1,2003,136,1113,1272,84,71,N,0,...,12,2,5,18,1,N,0.428571,64.9728,129.28487,109.276497
3,3,1,2003,136,1141,1166,79,73,N,0,...,21,6,6,21,1,N,0.428571,68.16,115.903756,107.100939
4,4,1,2003,136,1143,1301,76,74,N,1,...,14,5,8,19,1,N,0.35,66.3552,114.535108,111.521026


In [33]:
all_games['OffRtg'] = np.where(all_games['Loc'] == 'H', all_games['OffRtg'] * (1 - 0),
         np.where(all_games['Loc'] == 'A', all_games['OffRtg'] * (1 + 0),
                  all_games['OffRtg']))
         
         
all_games['DefRtg'] = np.where(all_games['Loc'] == 'H', all_games['DefRtg'] * (1 + 0),
         np.where(all_games['Loc'] == 'A', all_games['DefRtg'] * (1 - 0),
                  all_games['DefRtg']))

In [34]:
all_games.sort_values(by = ['Season', 'Team1', 'DayNum'], inplace = True)
all_games.reset_index(drop=True, inplace = True)

In [35]:
all_games.head()

Unnamed: 0.1,Unnamed: 0,TourneyGame,Season,DayNum,Team1,Team2,Team1_score,Team2_score,WLoc,NumOT,...,Team2_TO,Team2_Stl,Team2_Blk,Team2_PF,Outcome,Loc,fg_eff,Pos,OffRtg,DefRtg
0,133,0,2003,19,1102,1257,47,65,H,0,...,14,8,4,23,0,A,0.285714,58.5984,80.206968,110.92453
1,246,0,2003,22,1102,1391,72,43,H,0,...,11,5,0,12,1,H,0.535714,50.9568,141.296157,84.385205
2,339,0,2003,25,1102,1117,57,52,A,0,...,8,3,0,19,1,A,0.35,50.5728,112.708808,102.82207
3,473,0,2003,27,1102,1399,47,60,H,0,...,14,4,1,13,0,A,0.526316,49.2288,95.472569,121.879875
4,612,0,2003,31,1102,1410,65,44,H,0,...,13,3,0,13,1,H,0.545455,43.2768,150.195948,101.671103


In [41]:
all_games['GameEfficiency'] = all_games['OffRtg'] - all_games['DefRtg']

In [43]:
all_games = all_games[['Season', 'DayNum','Team1', 'Team2', 'GameEfficiency', 'Outcome']]

In [46]:
all_games.iloc[26:,:].head(5)

Unnamed: 0,Season,DayNum,Team1,Team2,GameEfficiency,Outcome
26,2003,124,1102,1307,14.230419,1
27,2003,129,1102,1428,-8.608815,0
28,2003,19,1103,1460,-4.609145,0
29,2003,26,1103,1324,5.597349,1
30,2003,30,1103,1156,-4.65584,0


In [44]:
all_games.to_csv('sample_basketball_data.csv')

In [36]:
pd.set_option('max_rows', 1000)


all_games['test'] = all_games.groupby(['Season', 'Team1'])['OffRtg'].shift(1).expanding().mean()

all_games[['Team1', 'test']].head(100)

Unnamed: 0,Team1,test
0,1102,
1,1102,80.206968
2,1102,110.751562
3,1102,111.403978
4,1102,107.421125
5,1102,115.97609
6,1102,118.953602
7,1102,124.515169
8,1102,118.815041
9,1102,117.729593


In [58]:
type(all_games.groupby(['Season', 'Team1'])[['OffRtg']])

pandas.core.groupby.generic.DataFrameGroupBy

In [60]:
all_games['test'] =  all_games.groupby(['Season', 'Team1'])[['OffRtg']].transform(lambda x: x.shift(1).expanding().mean())

all_games[['Team1', 'test']].head(100)


Unnamed: 0,Team1,test
0,1102,
1,1102,80.206968
2,1102,110.751562
3,1102,111.403978
4,1102,107.421125
5,1102,115.97609
6,1102,118.953602
7,1102,124.515169
8,1102,118.815041
9,1102,117.729593


In [46]:
all_games.groupby(['Season', 'Team1'])['OffRtg'].shift(1).expanding().mean()


0                NaN
1          80.206968
2         110.751562
3         111.403978
4         107.421125
             ...    
177233    107.659910
177234    107.659976
177235    107.659862
177236    107.659842
177237    107.659807
Name: OffRtg, Length: 177238, dtype: float64

In [42]:
all_games['OffRtg'].shift(1).expanding().mean()


0                NaN
1          80.206968
2         110.751562
3         111.403978
4         107.421125
             ...    
177233    107.410610
177234    107.410676
177235    107.410567
177236    107.410548
177237    107.410516
Name: OffRtg, Length: 177238, dtype: float64

In [38]:
all_games.head()

Unnamed: 0.1,Unnamed: 0,TourneyGame,Season,DayNum,Team1,Team2,Team1_score,Team2_score,WLoc,NumOT,...,Team2_Stl,Team2_Blk,Team2_PF,Outcome,Loc,fg_eff,Pos,OffRtg,DefRtg,avg_oe
0,133,0,2003,19,1102,1257,47,65,H,0,...,8,4,23,0,A,0.285714,58.5984,80.206968,110.92453,
1,246,0,2003,22,1102,1391,72,43,H,0,...,5,0,12,1,H,0.535714,50.9568,141.296157,84.385205,80.206968
2,339,0,2003,25,1102,1117,57,52,A,0,...,3,0,19,1,A,0.35,50.5728,112.708808,102.82207,110.751562
3,473,0,2003,27,1102,1399,47,60,H,0,...,4,1,13,0,A,0.526316,49.2288,95.472569,121.879875,111.403978
4,612,0,2003,31,1102,1410,65,44,H,0,...,3,0,13,1,H,0.545455,43.2768,150.195948,101.671103,107.421125


In [66]:
import numpy as np

def get_adj_eff(all_games, location = ['H', 'A', 'N'], away_bonus = 0, start_num = 0):
    
    
    all_games['OffRtg'] = np.where(all_games['Loc'] == 'H', all_games['OffRtg'] * (1 - away_bonus),
         np.where(all_games['Loc'] == 'A', all_games['OffRtg'] * (1 + away_bonus),
                  all_games['OffRtg']))
         
         
    all_games['DefRtg'] = np.where(all_games['Loc'] == 'H', all_games['DefRtg'] * (1 + away_bonus),
         np.where(all_games['Loc'] == 'A', all_games['DefRtg'] * (1 - away_bonus),
                  all_games['DefRtg'])) 
        
    #sort values for rolling
    all_games.sort_values(by = ['Season', 'Team1', 'DayNum'], inplace = True)
    all_games.reset_index(drop=True, inplace = True)

    all_games['avg_oe'] = all_games.groupby(['Season', 'Team1'])['OffRtg'].transform(lambda x: x.shift(1).expanding().mean())
    all_games['avg_de'] = all_games.groupby(['Season', 'Team1'])['DefRtg'].transform(lambda x: x.shift(1).expanding().mean())

    #get opponents rolling averages "at that point in the season"
    all_games2 = all_games.rename(columns = {'Team1': 'Team2', 'Team2': 'Team1',
                                             'avg_oe': 'opp_avg_oe', 'avg_de':'opp_avg_de'})
    join_key = ['Team2', 'Season', 'DayNum']
    all_games3 = all_games.merge(all_games2[join_key + ['opp_avg_oe', 'opp_avg_de']], on = join_key, how = 'left')

    #get league's rolling averages "at that point in the season"
    all_games3.sort_values(by = ['Season', 'DayNum'], inplace = True)

    all_games3['league_avg_oe'] = all_games3.groupby(['Season'])['OffRtg'].transform(lambda x: x.shift(1).expanding().mean())
    all_games3['league_avg_de'] = all_games3.groupby(['Season'])['DefRtg'].transform(lambda x: x.shift(1).expanding().mean())

    #adj. oe and de based on opponents
    all_games3.sort_values(by = ['Season', 'Team1', 'DayNum'], inplace = True)
    all_games3['adj_oe'] = (1 - (all_games3['opp_avg_de']/all_games3['league_avg_de'] - 1) ) * all_games3['OffRtg']
    all_games3['adj_de'] = (1 - (all_games3['opp_avg_oe']/all_games3['league_avg_oe'] - 1) ) * all_games3['DefRtg']

    all_games3 = all_games3[all_games3['Loc'].isin(location)]
    
    all_games3 = all_games3[all_games3.DayNum > start_num]
    
    season_adj_oe_de = all_games3.groupby(['Season', 'Team1']).agg(
                
                adj_oe=('adj_oe', np.mean), 
                adj_de=('adj_de', np.mean), 
                #std_adj_oe=('adj_oe', np.std),
                #std_adj_de=('adj_de', np.std),
        
                ).reset_index()


    return season_adj_oe_de


In [67]:
reg_season_games = all_games[all_games.TourneyGame == 0]

In [68]:
def get_adj_eff_time_limited(df, start, end, location = ['H', 'A', 'N'], away_bonus = 0, start_num =0):
    
    df = df[ (df.DayNum >= start) & (df.DayNum <= end)]
    
    df = get_adj_eff(df, location = location, away_bonus = away_bonus, start_num = start_num)
    
    location_string = '_'.join(location)
    
    #df.rename(columns = {'adj_oe': 'adj_oe_' + str(start) + '_' + str(end),'adj_de': 'adj_de_' + str(start) + '_' + str(end)}, inplace = True)
    
    df.rename(columns = {'adj_oe': 'adj_oe_' + str(start_num),
                         'adj_de': 'adj_de_' + str(start_num)}, inplace = True)

    
    return df
    

In [69]:
season_adj_oe_de_0_end = get_adj_eff_time_limited(reg_season_games, 0, 999, away_bonus = 0)

In [239]:
for x in range(0, 143, 3):
    for y in range(0, 143, 3):
        print(x,y)

0 0
0 3
0 6
0 9
0 12
0 15
0 18
0 21
0 24
0 27
0 30
0 33
0 36
0 39
0 42
0 45
0 48
0 51
0 54
0 57
0 60
0 63
0 66
0 69
0 72
0 75
0 78
0 81
0 84
0 87
0 90
0 93
0 96
0 99
0 102
0 105
0 108
0 111
0 114
0 117
0 120
0 123
0 126
0 129
0 132
0 135
0 138
0 141
3 0
3 3
3 6
3 9
3 12
3 15
3 18
3 21
3 24
3 27
3 30
3 33
3 36
3 39
3 42
3 45
3 48
3 51
3 54
3 57
3 60
3 63
3 66
3 69
3 72
3 75
3 78
3 81
3 84
3 87
3 90
3 93
3 96
3 99
3 102
3 105
3 108
3 111
3 114
3 117
3 120
3 123
3 126
3 129
3 132
3 135
3 138
3 141
6 0
6 3
6 6
6 9
6 12
6 15
6 18
6 21
6 24
6 27
6 30
6 33
6 36
6 39
6 42
6 45
6 48
6 51
6 54
6 57
6 60
6 63
6 66
6 69
6 72
6 75
6 78
6 81
6 84
6 87
6 90
6 93
6 96
6 99
6 102
6 105
6 108
6 111
6 114
6 117
6 120
6 123
6 126
6 129
6 132
6 135
6 138
6 141
9 0
9 3
9 6
9 9
9 12
9 15
9 18
9 21
9 24
9 27
9 30
9 33
9 36
9 39
9 42
9 45
9 48
9 51
9 54
9 57
9 60
9 63
9 66
9 69
9 72
9 75
9 78
9 81
9 84
9 87
9 90
9 93
9 96
9 99
9 102
9 105
9 108
9 111
9 114
9 117
9 120
9 123
9 126
9 129
9 132
9 135
9 138
9 141


141 18
141 21
141 24
141 27
141 30
141 33
141 36
141 39
141 42
141 45
141 48
141 51
141 54
141 57
141 60
141 63
141 66
141 69
141 72
141 75
141 78
141 81
141 84
141 87
141 90
141 93
141 96
141 99
141 102
141 105
141 108
141 111
141 114
141 117
141 120
141 123
141 126
141 129
141 132
141 135
141 138
141 141


In [70]:
#season_adj_oe_de_0_end = get_adj_eff_time_limited(reg_season_games, 0, 999, away_bonus = 0)
#season_adj_oe_de_67_end = get_adj_eff_time_limited(reg_season_games, 0, 999, away_bonus = 0, start_num = 67)
#season_adj_oe_de_100_end = get_adj_eff_time_limited(reg_season_games, 0, 999, away_bonus = 0, start_num = 100)
#season_adj_oe_de_118_end = get_adj_eff_time_limited(reg_season_games, 0, 999, away_bonus = 0, start_num = 118)

In [71]:
#season_adj_oe_de_0_32 = get_adj_eff_time_limited(reg_season_games, 0, 32, away_bonus = 0)
#season_adj_oe_de_33_65 = get_adj_eff_time_limited(reg_season_games, 33, 65, away_bonus = 0)
#season_adj_oe_de_66_98 = get_adj_eff_time_limited(reg_season_games, 66, 98, away_bonus = 0)
#season_adj_oe_de_99_999 = get_adj_eff_time_limited(reg_season_games, 99, 999, away_bonus = 0)

In [72]:
#season_adj_oe_de_118_end = get_adj_eff_time_limited(reg_season_games, 118, 999)


In [73]:
#season_adj_oe_de_0_end = get_adj_eff_time_limited(reg_season_games, 0, 999)
#season_adj_oe_de_0_50 = get_adj_eff_time_limited(reg_season_games, 0, 50)
#season_adj_oe_de_50_100 = get_adj_eff_time_limited(reg_season_games, 50, 100)
#season_adj_oe_de_0_100 = get_adj_eff_time_limited(reg_season_games, 0, 100)

#season_adj_oe_de_80_end = get_adj_eff_time_limited(reg_season_games, 80, 999)
#season_adj_oe_de_90_end = get_adj_eff_time_limited(reg_season_games, 90, 999)
#season_adj_oe_de_100_end = get_adj_eff_time_limited(reg_season_games, 100, 999)



### Set up data

In [74]:
def setup(all_games, sub):
    
    tourney_games = all_games[(all_games.TourneyGame == 1)]

    tourney_games['ID'] = tourney_games['Season'].astype(str) + '_' + \
                             tourney_games['Team1'].astype(str) + '_' + \
                             tourney_games['Team2'].astype(str)

    tourney_games['Pred'] = None

    tourney_games['type'] = 0

    sub['Outcome'] = None
    sub['type'] = 1

    tourney_games = tourney_games[['type', 'ID', 'Pred', 'Season', 'Team1', 'Team2', 'Outcome']]

    sub = sub[['type', 'ID', 'Pred', 'Season', 'Team1', 'Team2', 'Outcome']]

    tourney_games = pd.concat([tourney_games, sub], axis = 0)
    
    return tourney_games

In [75]:
tourney_games = setup(all_games, sub)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


### Add features

In [76]:
def add_all_features(tourney_games):
    
    #end rankings
    tourney_games = add_features(tourney_games, end_rankings, ['final_rank'])
    
    #seed diff
    tourney_games = add_features(tourney_games, tourney_seeds, ['Seed'])
    tourney_games['seed_diff'] = tourney_games.t1_Seed - tourney_games.t2_Seed
    
    #pre season ranks
    tourney_games = add_features(tourney_games, ap_rankings[['Season','Team1','OrdinalRank', 'pre_season_top_25_flag']], ['OrdinalRank', 'pre_season_top_25_flag'])
    tourney_games['t1_OrdinalRank'] = tourney_games['t1_OrdinalRank'].fillna(25)
    tourney_games['t2_OrdinalRank'] = tourney_games['t2_OrdinalRank'].fillna(25)
    
    #adj oe and de 
    tourney_games = add_features(tourney_games, season_adj_oe_de_0_end, ['adj_oe_0', 'adj_de_0'])
    tourney_games['t1_adj_margin'] = tourney_games['t1_adj_oe_0'] - tourney_games['t1_adj_de_0']
    tourney_games['t2_adj_margin'] = tourney_games['t2_adj_oe_0'] - tourney_games['t2_adj_de_0']


    return tourney_games
    

### Run Model

In [228]:
def build_model(df, feature_list):
    
    df = df[(df.Season < 2015)]
    df = df.fillna(0)
    
    X = df[feature_list].values
    df['Outcome']=df['Outcome'].astype('int') 
    y = df['Outcome'].values.ravel()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    
    #Grid search to get best params
    clf = LogisticRegression(random_state = 0)
    params = {'C': np.logspace(start=-5, stop=3, num=9), 'penalty': ['l2', 'l1']}
    clf = GridSearchCV(clf, params, scoring='neg_log_loss', refit=True)
    clf.fit(X_train, y_train)
    best_param = clf.best_params_

    #Use best params to train final model
    logreg = LogisticRegression(**best_param)
    logreg.fit(X_train, y_train)
    
    #Cross validation
    seasons = list(df.Season.unique())

    log_loss_list = []

    for test_season in seasons:

        train_seasons = seasons.copy()
        train_seasons.remove(test_season)

        X_train = df[df['Season'].isin(train_seasons)][feature_list].values
        X_test = df[df.Season == test_season][feature_list].values

        y_train = df[df['Season'].isin(train_seasons)]['Outcome'].values.ravel()
        y_test = df[df.Season == test_season]['Outcome'].values.ravel()

        logreg = LogisticRegression(**best_param)
        logreg.fit(X_train, y_train)

        #Evaluate score on test set 
        y_pred = logreg.predict_proba(X_test)

        ll = log_loss(y_test, y_pred)
        log_loss_list.append(ll) 
        
    avg_log_loss = sum(log_loss_list) / len(log_loss_list)

    #print('avg. log loss: {}'.format(avg_log_loss))
    #print('min log loss: {}'.format(min(log_loss_list)))
    #print('max log loss: {}'.format(max(log_loss_list)))
    #print('std dev log loss: {}'.format(statistics.stdev(log_loss_list)))
    #print(log_loss_list)
    
    return avg_log_loss


### export

In [229]:
tourney_games = setup(all_games, sub)
tourney_games = add_all_features(tourney_games)

In [230]:
tourney_games.to_csv('output/all_games.csv')


### Complete pipeline

In [117]:
feature_list = ['seed_diff',  
                't1_adj_margin','t2_adj_margin',
                't1_final_rank', 't2_final_rank',
                't1_OrdinalRank', 't2_OrdinalRank']

def full_pipeline(feature_list):

    tourney_games = setup(all_games, sub)
    tourney_games = add_all_features(tourney_games)
    avg_log_loss = build_model(tourney_games, feature_list)
    
    return avg_log_loss
    

In [118]:
import warnings
warnings.filterwarnings('ignore') 

In [231]:
full_pipeline(feature_list)

0.5357415850410575

In [665]:
tourney_games

Unnamed: 0,type,ID,Pred,Season,Team1,Team2,Outcome,avg_rank_x,t1_final_rank,avg_rank_y,...,t2_Seed,seed_diff,t1_OrdinalRank,t1_pre_season_top_25_flag,t2_OrdinalRank,t2_pre_season_top_25_flag,t1_adj_oe_0,t1_adj_de_0,t2_adj_oe_0,t2_adj_de_0
0,0,2003_1421_1411,,2003,1421,1411,1,,,,...,16,0,25.0,,25.0,,104.526528,119.437414,106.509926,111.907479
1,0,2003_1112_1436,,2003,1112,1436,1,,,,...,16,-15,1.0,1.0,25.0,,119.071254,93.286794,106.884988,101.850357
2,0,2003_1113_1272,,2003,1113,1272,1,,,,...,7,3,25.0,,25.0,,121.219248,101.541410,114.009765,96.248862
3,0,2003_1141_1166,,2003,1141,1166,1,,,,...,6,5,25.0,,23.0,1.0,113.038016,105.370558,117.078846,97.251061
4,0,2003_1143_1301,,2003,1143,1301,1,,,,...,9,-1,25.0,,25.0,,111.121076,101.087582,116.019889,102.254106
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13615,1,2019_1449_1459,0.5,2019,1449,1459,,32.0,84.559424,28.0,...,7,2,25.0,,25.0,,112.209691,97.002177,121.896135,100.694552
13616,1,2019_1449_1463,0.5,2019,1449,1463,,32.0,84.559424,60.0,...,14,-5,25.0,,25.0,,112.209691,97.002177,116.598684,107.806743
13617,1,2019_1458_1459,0.5,2019,1458,1459,,25.0,85.831250,28.0,...,7,-2,25.0,,25.0,,114.171157,93.066219,121.896135,100.694552
13618,1,2019_1458_1463,0.5,2019,1458,1463,,25.0,85.831250,60.0,...,14,-9,25.0,,25.0,,114.171157,93.066219,116.598684,107.806743


### Legacy feature adding process

In [621]:
end_rankings.head()

tourney_games = add_features(tourney_games, end_rankings, ['final_rank'])


In [622]:
tourney_games = add_features(tourney_games, recent, ['WinRatio14d'])

tourney_games.head()

Unnamed: 0,type,ID,Pred,Season,Team1,Team2,Outcome,avg_rank_x,t1_final_rank,avg_rank_y,t2_final_rank,total_wins_x,total_games_x,t1_WinRatio14d,total_wins_y,total_games_y,t2_WinRatio14d
0,0,2003_1421_1411,,2003,1421,1411,1,,,,,13,29,0.448276,18,30,0.6
1,0,2003_1112_1436,,2003,1112,1436,1,,,,,25,28,0.892857,19,29,0.655172
2,0,2003_1113_1272,,2003,1113,1272,1,,,,,18,29,0.62069,23,29,0.793103
3,0,2003_1141_1166,,2003,1141,1166,1,,,,,23,29,0.793103,29,33,0.878788
4,0,2003_1143_1301,,2003,1143,1301,1,,,,,21,29,0.724138,18,30,0.6


In [623]:
tourney_games = add_features(tourney_games, fg_eff, ['mean_fg_eff','std_fg_eff'])


In [624]:
tourney_games = add_features(tourney_games, tourney_seeds, ['Seed'])


In [625]:
tourney_games['seed_diff'] = tourney_games.t1_Seed - tourney_games.t2_Seed

In [626]:
tourney_games = add_features(tourney_games, ct, 
                             ['num_wins', 'num_losses', 'ct_win_rate'])


In [627]:
tourney_games = add_features(tourney_games, ap_rankings[['Season','Team1','OrdinalRank', 'pre_season_top_25_flag']], ['OrdinalRank', 'pre_season_top_25_flag'])

tourney_games['t1_OrdinalRank'] = tourney_games['t1_OrdinalRank'].fillna(25)
tourney_games['t2_OrdinalRank'] = tourney_games['t2_OrdinalRank'].fillna(25)

tourney_games['t1_pre_season_top_25_flag'] = tourney_games['t1_pre_season_top_25_flag'].fillna(0)
tourney_games['t2_pre_season_top_25_flag'] = tourney_games['t2_pre_season_top_25_flag'].fillna(0)


In [628]:
tourney_games.head()

Unnamed: 0,type,ID,Pred,Season,Team1,Team2,Outcome,avg_rank_x,t1_final_rank,avg_rank_y,...,t1_num_wins,t1_num_losses,t1_ct_win_rate,t2_num_wins,t2_num_losses,t2_ct_win_rate,t1_OrdinalRank,t1_pre_season_top_25_flag,t2_OrdinalRank,t2_pre_season_top_25_flag
0,0,2003_1421_1411,,2003,1421,1411,1,,,,...,3.0,0.0,1.0,3.0,0.0,1.0,25.0,0.0,25.0,0.0
1,0,2003_1112_1436,,2003,1112,1436,1,,,,...,0.0,1.0,0.0,3.0,0.0,1.0,1.0,1.0,25.0,0.0
2,0,2003_1113_1272,,2003,1113,1272,1,,,,...,0.0,1.0,0.0,1.0,1.0,0.5,25.0,0.0,25.0,0.0
3,0,2003_1141_1166,,2003,1141,1166,1,,,,...,3.0,0.0,1.0,3.0,0.0,1.0,25.0,0.0,23.0,1.0
4,0,2003_1143_1301,,2003,1143,1301,1,,,,...,1.0,1.0,0.5,2.0,1.0,0.666667,25.0,0.0,25.0,0.0


In [629]:
#Mean encoding for top 25
tourney_games1 = tourney_games [~tourney_games['Outcome'].isna()]
tourney_games1['Outcome'] = tourney_games1['Outcome'].astype(float)
me_top25 = tourney_games1.groupby('t1_OrdinalRank').agg({'Outcome':np.mean}).reset_index()
me_top25.rename(columns = {'Outcome': 'top_25'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [630]:
me_top25.head()

Unnamed: 0,t1_OrdinalRank,top_25
0,1.0,0.782609
1,2.0,0.754098
2,3.0,0.686275
3,4.0,0.793103
4,5.0,0.740741


In [631]:
tourney_games = tourney_games.merge(me_top25, on = ['t1_OrdinalRank'])
tourney_games.rename(columns = {'top_25': 't1_top_25'}, inplace = True)

me_top25.rename(columns = {'t1_OrdinalRank':'t2_OrdinalRank'}, inplace = True)
tourney_games = tourney_games.merge(me_top25, on = ['t2_OrdinalRank'])
tourney_games.rename(columns = {'top_25': 't2_top_25'}, inplace = True)


In [632]:
tourney_games.head()

Unnamed: 0,type,ID,Pred,Season,Team1,Team2,Outcome,avg_rank_x,t1_final_rank,avg_rank_y,...,t1_ct_win_rate,t2_num_wins,t2_num_losses,t2_ct_win_rate,t1_OrdinalRank,t1_pre_season_top_25_flag,t2_OrdinalRank,t2_pre_season_top_25_flag,t1_top_25,t2_top_25
0,0,2003_1421_1411,,2003,1421,1411,1,,,,...,1.0,3.0,0.0,1.0,25.0,0.0,25.0,0.0,0.376592,0.376592
1,0,2003_1113_1272,,2003,1113,1272,1,,,,...,0.0,1.0,1.0,0.5,25.0,0.0,25.0,0.0,0.376592,0.376592
2,0,2003_1143_1301,,2003,1143,1301,1,,,,...,0.5,2.0,1.0,0.666667,25.0,0.0,25.0,0.0,0.376592,0.376592
3,0,2003_1211_1153,,2003,1211,1153,1,,,,...,0.5,0.0,1.0,0.0,25.0,0.0,25.0,0.0,0.376592,0.376592
4,0,2003_1458_1451,,2003,1458,1451,1,,,,...,0.0,2.0,0.0,1.0,25.0,0.0,25.0,0.0,0.376592,0.376592


In [633]:
#tourney_games = add_features(tourney_games, season_adj_oe_de_0_end, ['adj_oe_0_999_H_A_N', 'adj_de_0_999_H_A_N'])
#tourney_games = add_features(tourney_games, season_adj_oe_de_0_end_H, ['adj_oe_0_999_H', 'adj_de_0_999_H'])
#tourney_games = add_features(tourney_games, season_adj_oe_de_0_end_A, ['adj_oe_0_999_A_N', 'adj_de_0_999_A_N'])


In [634]:
#tourney_games = add_features(tourney_games, season_adj_oe_de_0_end, ['adj_oe_0_999', 'adj_de_0_999',
            #'std_adj_oe', 'std_adj_de'])
    
tourney_games = add_features(tourney_games, season_adj_oe_de_0_end, ['adj_oe_0', 'adj_de_0'])
#tourney_games = add_features(tourney_games, season_adj_oe_de_67_end, ['adj_oe_67', 'adj_de_67'])
#tourney_games = add_features(tourney_games, season_adj_oe_de_100_end, ['adj_oe_100', 'adj_de_100'])
#tourney_games = add_features(tourney_games, season_adj_oe_de_118_end, ['adj_oe_118', 'adj_de_118'])


#tourney_games = add_features(tourney_games, season_adj_oe_de_118_end, ['adj_oe_118_999', 'adj_de_118_999'])
#tourney_games = add_features(tourney_games, season_adj_oe_de2, ['adj_oe', 'adj_de'])
#tourney_games = add_features(tourney_games, season_adj_oe_de_0_50, ['adj_oe_0_50', 'adj_de_0_50'])
#tourney_games = add_features(tourney_games, season_adj_oe_de_50_100, ['adj_oe_50_100', 'adj_de_50_100'])
#tourney_games = add_features(tourney_games, season_adj_oe_de_0_100, ['adj_oe_0_100', 'adj_de_0_100'])
#tourney_games = add_features(tourney_games, season_adj_oe_de_80_end, ['adj_oe_80_999', 'adj_de_80_999'])
#tourney_games = add_features(tourney_games, season_adj_oe_de_90_end, ['adj_oe_90_999', 'adj_de_90_999'])
#tourney_games = add_features(tourney_games, season_adj_oe_de_100_end, ['adj_oe_100_999', 'adj_de_100_999'])
#tourney_games = add_features(tourney_games, season_adj_oe_de_110_end, ['adj_oe_110_999', 'adj_de_110_999'])
#tourney_games = add_features(tourney_games, season_adj_oe_de_120_end, ['adj_oe_120_999', 'adj_de_120_999'])



In [635]:
#tourney_games = add_features(tourney_games, season_adj_oe_de_0_32, ['adj_oe_0_32', 'adj_de_0_32'])
#tourney_games = add_features(tourney_games, season_adj_oe_de_33_65, ['adj_oe_33_65', 'adj_de_33_65'])
#tourney_games = add_features(tourney_games, season_adj_oe_de_66_98, ['adj_oe_66_98', 'adj_de_66_98'])
#tourney_games = add_features(tourney_games, season_adj_oe_de_99_999, ['adj_oe_99_999', 'adj_de_99_999'])

In [636]:
#tourney_games['t1_oe_improvement'] = tourney_games['t1_adj_oe_100_999'] / tourney_games['t1_adj_oe_0_100']
#tourney_games['t2_oe_improvement'] = tourney_games['t2_adj_oe_100_999'] / tourney_games['t2_adj_oe_0_100']
#tourney_games['t1_de_improvement'] = tourney_games['t1_adj_de_100_999'] / tourney_games['t1_adj_de_0_100']
#tourney_games['t2_de_improvement'] = tourney_games['t2_adj_de_100_999'] / tourney_games['t2_adj_de_0_100']
#tourney_games['oe_de_ratio'] = tourney_games['t1_adj_oe_0_999_H_A_N'] / tourney_games['t2_adj_de_0_999_H_A_N']
#tourney_games['de_oe_ratio'] = tourney_games['t1_adj_de_0_999_H_A_N'] / tourney_games['t2_adj_oe_0_999_H_A_N']


In [637]:
tourney_games.head()

Unnamed: 0,type,ID,Pred,Season,Team1,Team2,Outcome,avg_rank_x,t1_final_rank,avg_rank_y,...,t2_adj_oe_67,t2_adj_de_67,t1_adj_oe_100,t1_adj_de_100,t2_adj_oe_100,t2_adj_de_100,t1_adj_oe_118,t1_adj_de_118,t2_adj_oe_118,t2_adj_de_118
0,0,2003_1421_1411,,2003,1421,1411,1,,,,...,105.341005,109.981801,108.203685,118.849998,107.962741,118.069851,112.254852,110.323822,114.157146,109.160068
1,0,2003_1113_1272,,2003,1113,1272,1,,,,...,116.817293,100.233367,123.597248,110.374428,119.357181,95.841234,121.700858,101.788465,121.974701,104.864738
2,0,2003_1143_1301,,2003,1143,1301,1,,,,...,115.568826,106.065366,109.219208,105.255664,115.236653,106.766659,100.029999,100.540329,122.619398,108.788385
3,0,2003_1211_1153,,2003,1211,1153,1,,,,...,112.859192,100.771307,116.116042,102.458978,104.913123,101.183426,112.919073,98.561573,109.003994,100.152815
4,0,2003_1458_1451,,2003,1458,1451,1,,,,...,125.893964,110.260054,116.405184,98.13755,121.376063,106.780195,109.57657,104.948527,117.262063,99.762571


In [638]:
tourney_games.to_csv('output/all_games.csv')
#season_adj_oe_de.to_csv('output/pred.csv')

#### Add team names 

In [None]:
teams.rename(columns = {'TeamID': 'Team1', 'TeamName': 'TeamName1'}, inplace = True)
join_key = ['Team1']
season_adj_oe_de = season_adj_oe_de.merge(teams[join_key + ['TeamName']], on = join_key)

In [37]:
tourney_games.head()

Unnamed: 0.1,Unnamed: 0,TourneyGame,Season,DayNum,Team1,Team2,Team1_score,Team2_score,WLoc,NumOT,...,t1_adj_oe,t1_adj_de,TeamName_x,t2_adj_oe,t2_adj_de,TeamName_y,t1_adj_oe_100_y,t1_adj_de_100_y,t2_adj_oe_100_y,t2_adj_de_100_y
0,20,1,2003,137,1104,1231,62,67,N,0,...,113.680842,99.834794,Alabama,117.478661,103.478134,Indiana,110.296428,106.965071,118.374155,111.785162
1,1,1,2003,136,1112,1436,80,51,N,0,...,119.490384,93.288067,Arizona,105.836663,101.753053,Vermont,116.41834,95.983003,109.981695,97.981018
2,33,1,2003,138,1112,1211,96,95,N,2,...,119.490384,93.288067,Arizona,119.624007,103.464641,Gonzaga,116.41834,95.983003,121.797517,105.091126
3,49,1,2003,143,1112,1323,88,71,N,0,...,119.490384,93.288067,Arizona,121.933163,98.983228,Notre Dame,116.41834,95.983003,113.952873,111.759944
4,57,1,2003,145,1112,1242,75,78,N,0,...,119.490384,93.288067,Arizona,122.03704,89.102885,Kansas,116.41834,95.983003,116.510857,93.701459


In [19]:
features = ['adj_oe', 'adj_de']
season_adj_oe_de1 = season_adj_oe_de.rename(columns = {f: 't1_' + f for f in features})
season_adj_oe_de2 = season_adj_oe_de.rename(columns ={f: 't2_' + f for f in features})
season_adj_oe_de2 = season_adj_oe_de2.rename(columns ={'Team1': 'Team2'})

In [149]:
tourney_games = tourney_games.merge(season_adj_oe_de1, on = ['Team1', 'Season'], how = 'left')
tourney_games = tourney_games.merge(season_adj_oe_de2, on = ['Team2', 'Season'], how = 'left')

In [154]:
tourney_games.to_csv('output/all_games.csv')
season_adj_oe_de.to_csv('output/pred.csv')

#### Add seeds