In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the regular-season detailed results (one row per game, with winner "W*"
# and loser "L*" columns) from the working directory.
RegSD = pd.read_csv('RegularSeasonDetailedResults.csv')
# Preview the first rows to sanity-check the schema.
RegSD.head()

Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot,Wfgm,Wfga,...,Lfga3,Lftm,Lfta,Lor,Ldr,Last,Lto,Lstl,Lblk,Lpf
0,2003,10,1104,68,1328,62,N,0,27,58,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,1272,70,1393,63,N,0,26,62,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,1266,73,1437,61,N,0,24,58,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,1296,56,1457,50,N,0,18,38,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,1400,77,1208,71,N,0,30,61,...,16,17,27,21,15,12,10,7,1,14


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
# Plot styling: seaborn dark grid, and one uniform 10pt size for every text
# element (base font, axes title/labels, tick labels, legend, figure title).
sns.set_style('darkgrid')
matplotlib.rcParams.update({
    'font.size': 10,
    'axes.titlesize': 10,
    'axes.labelsize': 10,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'legend.fontsize': 10,
    'figure.titlesize': 10,
})
%matplotlib inline

In [5]:
# Winner-first view of every game: team1 beat team2, so the label is 1.
df1 = (
    RegSD[["Wteam", "Lteam"]]
    .rename(columns={"Wteam": "team1", "Lteam": "team2"})
    .assign(pred=1)
)

# Mirrored, loser-first view of the same games: team1 lost to team2, label 0.
df2 = (
    RegSD[["Lteam", "Wteam"]]
    .rename(columns={"Lteam": "team1", "Wteam": "team2"})
    .assign(pred=0)
)

# Stack both views row-wise (the original row labels are kept, so each label
# appears twice) and count the distinct teams seen in the team1 slot.
final1 = pd.concat([df1, df2], axis=0)
n = final1["team1"].nunique()
n

355

In [6]:
# Map every raw team id to a dense 0-based index, numbered in order of first
# appearance in the team1 column.
team_ids = final1["team1"].unique()
unique_team = dict(zip(team_ids, range(len(team_ids))))

# Re-encode both team columns with the dense index (raises KeyError for any
# id that never occurs in team1).
final1["team1"] = final1["team1"].apply(lambda team_id: unique_team[team_id])
final1["team2"] = final1["team2"].apply(lambda team_id: unique_team[team_id])
final1.head()

Unnamed: 0,team1,team2,pred
0,0,67,1
1,1,164,1
2,2,93,1
3,3,221,1
4,4,110,1


In [19]:
# Tournament seeds: one row per (Season, Team) with a string seed code.
t_seeds = pd.read_csv("TourneySeeds.csv")
# Tournament game results in compact form (winner/loser ids and scores).
t = pd.read_csv("TourneyCompactResults.csv")
# Preview the tournament results.
t.head()

Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot
0,1985,136,1116,63,1234,54,N,0
1,1985,136,1120,59,1345,58,N,0
2,1985,136,1207,68,1250,43,N,0
3,1985,136,1229,58,1425,55,N,0
4,1985,136,1242,49,1325,38,N,0


In [20]:
# Keep only the columns needed to join seeds onto games (Season, Wteam, Lteam).
# Rebinding `t` with errors='ignore' instead of dropping in place keeps this
# cell idempotent: re-running it after the columns are already gone is a no-op
# rather than a KeyError.
t = t.drop(labels=['Daynum', 'Wscore', 'Lscore', 'Wloc', 'Numot'], axis=1, errors='ignore')
t.head()

Unnamed: 0,Season,Wteam,Lteam
0,1985,1116,1234
1,1985,1120,1345
2,1985,1207,1250
3,1985,1229,1425
4,1985,1242,1325


In [21]:
# Preview the raw seed table before converting the seed codes to integers.
t_seeds.head()

Unnamed: 0,Season,Seed,Team
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374


In [22]:
def convert_to_seed(seed):
    """Return the numeric part of a seed code as an int.

    The code is expected to carry a one-character prefix followed by a
    two-digit seed number (e.g. ``"W01"`` -> ``1``); anything after the third
    character is ignored.

    Raises:
        ValueError: if characters 1-2 of ``seed`` are not an integer literal.
    """
    seed_digits = seed[1:3]
    return int(seed_digits)
# Replace the string "Seed" column with its integer value in "Seed (INT)".
# The guard makes the cell safe to re-run: once "Seed" has been dropped, a
# second execution leaves the table untouched instead of failing on the
# missing column.
if "Seed" in t_seeds.columns:
    t_seeds["Seed (INT)"] = t_seeds["Seed"].apply(convert_to_seed)
    t_seeds = t_seeds.drop("Seed", axis=1)
t_seeds.head(10)

Unnamed: 0,Season,Team,Seed (INT)
0,1985,1207,1
1,1985,1210,2
2,1985,1228,3
3,1985,1260,4
4,1985,1374,5
5,1985,1208,6
6,1985,1393,7
7,1985,1396,8
8,1985,1439,9
9,1985,1177,10


In [24]:
# Two renamed views of the seed table so it can be joined once on the winning
# team and once on the losing team.
df_winseeds = t_seeds.rename(columns={'Team': 'Wteam', 'Seed (INT)': 'win_seed'})
df_lossseeds = t_seeds.rename(columns={'Team': 'Lteam', 'Seed (INT)': 'loss_seed'})

# Attach the winner's seed (left join: every game row is kept) ...
df_dummy = t.merge(df_winseeds, how='left', on=['Season', 'Wteam'])
# ... then the loser's seed (default inner join: games whose loser has no
# seed row are dropped).
df_concat = df_dummy.merge(df_lossseeds, on=['Season', 'Lteam'])

# Winner's seed minus loser's seed; negative when the winner held the
# numerically lower seed.
df_concat['seed_diff'] = df_concat['win_seed'] - df_concat['loss_seed']
df_concat.head()

Unnamed: 0,Season,Wteam,Lteam,win_seed,loss_seed,seed_diff
0,1985,1116,1234,9,8,1
1,1985,1120,1345,11,6,5
2,1985,1207,1250,1,16,-15
3,1985,1229,1425,9,8,1
4,1985,1242,1325,3,14,-11


In [25]:
# Winner's perspective: seed_diff as computed (winner - loser), label 1.
df_wins = df_concat[['seed_diff']].assign(result=1)

# Loser's perspective: the sign of seed_diff is flipped, label 0.
df_losses = (-df_concat[['seed_diff']]).assign(result=0)

# Stack both perspectives so each game contributes one positive and one
# negative training example.
df_for_predictions = pd.concat([df_wins, df_losses])
df_for_predictions.head()

Unnamed: 0,seed_diff,result
0,1,1
1,5,1
2,-15,1
3,1,1
4,-11,1


In [26]:
# Single feature column (seed difference) as an (n_samples, 1) matrix, and the
# matching 0/1 labels.
X_train = df_for_predictions.seed_diff.values.reshape(-1,1)
y_train = df_for_predictions.result.values
# The rows arrive as all wins followed by all losses, so shuffle them jointly.
# A fixed random_state makes the row order -- and therefore the CV folds and
# the selected C -- reproducible across kernel restarts.
X_train, y_train = shuffle(X_train, y_train, random_state=42)

In [29]:
# Display the shuffled label vector as a quick check that 0s and 1s are mixed.
y_train

array([0, 1, 1, ..., 1, 1, 0])

In [31]:
# Logistic regression with library defaults; only the inverse regularization
# strength C is tuned.
logreg = LogisticRegression()
# Seven candidate values for C, log-spaced from 1e-4 to 1e5.
params = {'C': np.logspace(-4, 5, num=7)}
# Select C by cross-validated log loss, then refit on the full training set.
clf = GridSearchCV(
    estimator=logreg,
    param_grid=params,
    scoring='neg_log_loss',
    refit=True,
)

In [32]:
# Run the grid search over C; because refit=True was requested, the best
# setting is then refit and used by clf for later predictions.
clf.fit(X_train,y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': array([  1.00000e-04,   3.16228e-03,   1.00000e-01,   3.16228e+00,
         1.00000e+02,   3.16228e+03,   1.00000e+05])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_log_loss', verbose=0)

In [34]:
# Submission template: one row per matchup to predict, with an `id` column
# that is parsed later and a `pred` column that is overwritten with the
# model's probabilities.
df_sample_sub = pd.read_csv('sample_submission.csv')
# Number of matchups to predict (row count of the template).
n_test_games = df_sample_sub.shape[0]
def required_format(id):
    """Split an underscore-separated id into its integer fields.

    Returns a generator yielding ``int(field)`` for each ``'_'``-separated
    piece of ``id``, in order (e.g. ``"2013_1103_1107"`` yields 2013, 1103,
    1107). The string is split immediately; the int conversion happens lazily
    as the generator is consumed, so a non-numeric field raises ValueError at
    that point.
    """
    fields = id.split('_')
    return (int(field) for field in fields)

In [35]:
# Build the (season, team) -> integer seed lookup once. The previous version
# filtered the whole seed table with a boolean mask twice for every submission
# row; a dict lookup gives the same values in constant time per row.
# setdefault keeps the FIRST seed row for a (season, team) pair, matching the
# old `.values[0]` behaviour if duplicates exist.
seed_lookup = {}
for season, team, seed in zip(t_seeds["Season"], t_seeds["Team"], t_seeds["Seed (INT)"]):
    seed_lookup.setdefault((season, team), seed)

# One feature per matchup: seed of the first team minus seed of the second,
# the same orientation as the training feature.
X_test = np.zeros(shape=(n_test_games, 1))
# enumerate gives a true positional row number, so the fill is correct even if
# the submission frame does not carry a default 0..n-1 index.
for i, game_id in enumerate(df_sample_sub["id"]):
    year, t1, t2 = required_format(game_id)
    # A (season, team) pair missing from the seed table raises KeyError here.
    t1_seed = seed_lookup[(year, t1)]
    t2_seed = seed_lookup[(year, t2)]
    diff_seed = t1_seed - t2_seed
    X_test[i, 0] = diff_seed

In [37]:
# predict_proba returns one column per class, ordered as in clf.classes_; with
# the 0/1 labels used for training, column 1 is the probability of label 1
# (the first team in the id winning).
preds = clf.predict_proba(X_test)[:,1]

In [38]:
# Bound the probabilities to [0.05, 0.95] so that no single confident miss can
# incur an extreme log-loss penalty.
clipped_preds = np.clip(preds, 0.05, 0.95)
# Assign with bracket syntax: attribute-style `df.pred = ...` only updates a
# column that already exists and would otherwise silently set a plain Python
# attribute, leaving the written CSV without the predictions.
df_sample_sub["pred"] = clipped_preds
df_sample_sub.head()

Unnamed: 0,id,pred
0,2013_1103_1107,0.621578
1,2013_1103_1112,0.270419
2,2013_1103_1125,0.45874
3,2013_1103_1129,0.54126
4,2013_1103_1137,0.45874


In [39]:
# Write the submission file; index=False keeps the row index out of the CSV so
# only the frame's own columns are emitted.
df_sample_sub.to_csv('logreg_on_seed.csv', index=False)