In [None]:
import cupy as cp
import cuml, cudf
from sklearn.model_selection import train_test_split 
from cuml.linear_model import LogisticRegression
from cuml.neighbors import KNeighborsClassifier
from cuml.svm import SVC
from cuml.ensemble import RandomForestClassifier
from cuml.preprocessing.TargetEncoder import TargetEncoder
from sklearn.model_selection import GroupKFold, KFold
from cuml.metrics import log_loss
from tqdm.notebook import tqdm

In [None]:
# Load the competition CSVs straight into GPU DataFrames with cuDF.
# Tournament game results (compact form).
train_cr = cudf.read_csv('../input/ncaaw-march-mania-2021/WNCAATourneyCompactResults.csv')
# Tournament seeds per season/team.
train_seeds = cudf.read_csv('../input/ncaaw-march-mania-2021/WNCAATourneySeeds.csv')
# Stage-1 sample submission; its rows define the matchups to predict.
submission = cudf.read_csv('../input/ncaaw-march-mania-2021/WSampleSubmissionStage1.csv')
# Regular-season game results (compact form), used for win/loss features.
train_rs_cr = cudf.read_csv('../input/ncaaw-march-mania-2021/WRegularSeasonCompactResults.csv')

In [None]:
# Preview the first rows of the regular-season results.
train_rs_cr.head()

In [None]:
# Preview the first rows of the tournament results.
train_cr.head()

In [None]:

def _count_games(games, w_loc, team_col, out_col):
    """Count regular-season games per (Season, team) for one winner location.

    Parameters
    ----------
    games : cudf.DataFrame
        Game results with `Season`, `WLoc`, `WTeamID` and `LTeamID` columns.
    w_loc : str
        Value of `WLoc` (location of the *winning* team) to keep.
    team_col : str
        `'WTeamID'` to count wins, `'LTeamID'` to count losses.
    out_col : str
        Name of the resulting count column.

    Returns
    -------
    cudf.DataFrame
        One column `out_col`, indexed by (`Season`, `team_col`).
    """
    return (
        games[games.WLoc == w_loc]
        .groupby(['Season', team_col])[team_col].count().to_frame()
        .rename(columns={team_col: out_col})
    )


# Wins split by where the winner played (Away / Neutral / Home).
win = (
    _count_games(train_rs_cr, 'A', 'WTeamID', 'win_A')
    .join(_count_games(train_rs_cr, 'N', 'WTeamID', 'win_N'), how='outer')
    .join(_count_games(train_rs_cr, 'H', 'WTeamID', 'win_H'), how='outer')
)

# Losses split by where the *loser* played. WLoc describes the winner, so a
# home win (WLoc == 'H') is an away loss and an away win is a home loss.
lost = (
    _count_games(train_rs_cr, 'H', 'LTeamID', 'lost_A')
    .join(_count_games(train_rs_cr, 'N', 'LTeamID', 'lost_N'), how='outer')
    .join(_count_games(train_rs_cr, 'A', 'LTeamID', 'lost_H'), how='outer')
)

# Give both tables the same index names so they can be joined on them.
win.index = win.index.rename(['Season', 'TeamID'])
lost.index = lost.index.rename(['Season', 'TeamID'])

# A (Season, TeamID) pair missing from one side simply has zero games of that
# kind, so fill with 0 *after* the final outer join. Filling only `win` and
# `lost` separately leaves undefeated / winless teams with missing counts and
# therefore missing win percentages.
wl = win.join(lost, how='outer').fillna(0).reset_index()

# Win percentages per location and overall. A team with no games at a given
# location has a zero denominator, so that percentage is left undefined.
wl['win_pct_A'] = wl['win_A'] / (wl['win_A'] + wl['lost_A'])
wl['win_pct_N'] = wl['win_N'] / (wl['win_N'] + wl['lost_N'])
wl['win_pct_H'] = wl['win_H'] / (wl['win_H'] + wl['lost_H'])
total_wins = wl['win_A'] + wl['win_N'] + wl['win_H']
total_losses = wl['lost_A'] + wl['lost_N'] + wl['lost_H']
wl['win_pct_All'] = total_wins / (total_wins + total_losses)

# Free the intermediates; only `wl` is used afterwards.
del win, lost, total_wins, total_losses

In [None]:
# Inspect the first ten rows of the per-season, per-team win/loss table.
wl.head(10)

In [None]:
# Impute missing away-win counts with the team's own mean across seasons,
# truncated to a whole number of games (`// 1`; counts are non-negative).
# `wl` is keyed by 'TeamID' -- it has no 'WTeamID' column -- and
# transform('mean') returns a row-aligned Series, so a team whose values are
# all missing stays missing instead of crashing on int(NaN).
team_mean_win_A = wl.groupby('TeamID')['win_A'].transform('mean') // 1
wl['win_A'] = wl['win_A'].fillna(team_mean_win_A)
wl.head()

In [None]:
# Impute missing home-win counts with the team's own mean across seasons,
# truncated to a whole number of games (`// 1`; counts are non-negative).
# `wl` is keyed by 'TeamID' -- it has no 'WTeamID' column -- and
# transform('mean') returns a row-aligned Series, so a team whose values are
# all missing stays missing instead of crashing on int(NaN).
team_mean_win_H = wl.groupby('TeamID')['win_H'].transform('mean') // 1
wl['win_H'] = wl['win_H'].fillna(team_mean_win_H)
wl.head()

In [None]:
# Impute missing neutral-site win counts with the team's own mean across
# seasons, truncated to a whole number of games (`// 1`; counts are
# non-negative). `wl` is keyed by 'TeamID' -- it has no 'WTeamID' column --
# and transform('mean') returns a row-aligned Series, so a team whose values
# are all missing stays missing instead of crashing on int(NaN).
team_mean_win_N = wl.groupby('TeamID')['win_N'].transform('mean') // 1
wl['win_N'] = wl['win_N'].fillna(team_mean_win_N)
wl.head()

In [None]:
# Impute undefined win percentages with the team's own mean across seasons.
# `wl` is keyed by 'TeamID' (there is no 'WTeamID' column in this frame).
for pct_col in ['win_pct_A', 'win_pct_N', 'win_pct_H', 'win_pct_All']:
    # NOTE(review): a 0/0 division presumably yields NaN rather than a cuDF
    # null; convert so the group mean treats those rows as missing -- confirm.
    wl[pct_col] = wl[pct_col].nans_to_nulls()
    team_mean_pct = wl.groupby('TeamID')[pct_col].transform('mean')
    wl[pct_col] = wl[pct_col].fillna(team_mean_pct)

wl.head()

In [None]:
# Extract the numeric seed: characters 1-2 of the seed string (the leading
# character at position 0 and anything after position 2 are dropped).
# Done as one vectorised string op on the GPU instead of a Python loop that
# fetches every element from the device individually.
train_seeds['seed_int'] = train_seeds['Seed'].str.slice(1, 3).astype('int64')


In [None]:
# Columns of the tournament results that the seed-difference model never uses.
drop_lbls = ['DayNum', 'WScore', 'LScore', 'WLoc', 'NumOT']

# Remove them in place, and drop the raw seed string now that `seed_int` exists.
train_cr.drop(columns=drop_lbls, inplace=True)
train_seeds.drop(columns=['Seed'], inplace=True)

In [None]:
# Check the tournament results after the unused columns were dropped.
train_cr.head()

In [None]:
# Check the seeds table, which now carries the integer seed column.
train_seeds.head()

In [None]:
# Column renames that turn the generic seeds table into winner-side (ren1)
# and loser-side (ren2) lookup tables for merging onto game results.
ren1 = dict(TeamID='WTeamID', seed_int='WS')
ren2 = dict(TeamID='LTeamID', seed_int='LS')

In [None]:
# Attach the winner's seed (WS) and the loser's seed (LS) to every tournament
# game. Both merges are inner joins: a game whose winner or loser has no seed
# cannot produce a seed difference, and a left join would carry a null seed
# through into the feature column.
df1 = cudf.merge(left=train_cr, right=train_seeds.rename(columns=ren1), how='inner', on=['Season', 'WTeamID'])
df2 = cudf.merge(left=df1, right=train_seeds.rename(columns=ren2), how='inner', on=['Season', 'LTeamID'])

# Winner's perspective: feature is (winner seed - loser seed), label 1.
df_w = cudf.DataFrame()
df_w['dff'] = df2.WS - df2.LS
df_w['rsl'] = 1

# Loser's perspective: the same game mirrored, label 0. This makes the
# training set symmetric in the seed difference.
df_l = cudf.DataFrame()
df_l['dff'] = -df_w['dff']
df_l['rsl'] = 0

# Stack both perspectives; ignore_index avoids duplicated row labels.
df_prd = cudf.concat((df_w, df_l), ignore_index=True)

In [None]:
# Preview the tournament games with winner (WS) and loser (LS) seeds attached.
df2.head()

In [None]:
# Renames that turn the per-team win/loss table `wl` into winner-side features.
# 'TeamID' is mapped to 'WTeamID' as well: `wl` is keyed by 'TeamID', and the
# merge onto the game table joins on 'WTeamID', so without this entry the join
# key would not exist in the renamed frame.
ren3 = {'TeamID': 'WTeamID',
        'win_A': 'W_win_A', 'win_N': 'W_win_N', 'win_H': 'W_win_H',
        'win_pct_A': 'W_win_pct_A', 'win_pct_N': 'W_win_pct_N', 'win_pct_H': 'W_win_pct_H',
        'win_pct_All': 'W_win_pct_All'}

In [None]:
# Attach the regular-season win/loss features (columns renamed via ren3) to
# each tournament game, joining on the season and the winning team's ID.
# NOTE(review): the join key 'WTeamID' must exist in the renamed `wl` frame;
# `wl` itself is built with a 'TeamID' column, so confirm ren3 maps it.
df3 = cudf.merge(left=df2, right=wl.rename(columns=ren3), on=['Season', 'WTeamID'])

In [None]:
# Preview the tournament games enriched with the winner's season features.
df3.head()

In [None]:
# Training arrays built from the mirrored game table.
# Target: 1 for the winner's perspective, 0 for the loser's.
y = df_prd['rsl'].values.astype('float32')
# Single feature: the seed difference, shaped as an (n_samples, 1) matrix.
seed_diff = df_prd['dff'].values.astype('float32')
X = seed_diff.reshape(-1, 1)

In [None]:
# Build the test feature matrix: one seed difference per submission row.
# Submission IDs have the form "<season>_<team1>_<team2>".
#
# The seed table is copied to the host once and turned into a dictionary, so
# each matchup costs two dict lookups instead of two boolean-mask filters over
# the whole GPU table per row.
seeds_host = train_seeds.to_pandas()
seed_of = {}
for season, team, seed in zip(seeds_host['Season'], seeds_host['TeamID'], seeds_host['seed_int']):
    # setdefault keeps the first seed seen for a (season, team) pair.
    seed_of.setdefault((int(season), int(team)), int(seed))

seed_diffs = []
for matchup_id in submission['ID'].to_pandas():
    yr, o, t = (int(part) for part in matchup_id.split('_'))
    # A team without a seed for that season raises KeyError here.
    seed_diffs.append(seed_of[(yr, o)] - seed_of[(yr, t)])

# float32 matches the dtype of the training matrix X.
X_test = cp.asarray(seed_diffs, dtype='float32').reshape(-1, 1)

In [None]:
# Accumulators for the logistic-regression cross-validation run.
# Running fold-average of the test predictions (scalar 0 until the first fold).
rr_test_preds = 0
# One out-of-fold prediction slot per training row.
rr_train_oof = cp.zeros(X.shape[0])
rr_train_oof.shape

In [None]:
# K-fold cross-validation of a logistic regression on the seed difference.
NUM_FOLDS = 10
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=0)

# Start the test-prediction average from zero inside this cell so that
# re-running it does not add onto the sum left by a previous run.
rr_test_preds = 0

for f, (train_ind, val_ind) in tqdm(enumerate(kf.split(X, y)), total=NUM_FOLDS):
    train, val = X[train_ind], X[val_ind]
    train_target, val_target = y[train_ind], y[val_ind]

    model = LogisticRegression()
    model.fit(train, train_target)

    # Use the positive-class probability for the out-of-fold predictions.
    # predict() returns hard class labels, which makes the log-loss
    # meaningless and is inconsistent with the probabilities used for the
    # test predictions and for blending with other models.
    temp_oof = model.predict_proba(val)[:, 1]
    temp_test = model.predict_proba(X_test)[:, 1]

    rr_train_oof[val_ind] = temp_oof
    # Each fold contributes 1/NUM_FOLDS of the final test prediction.
    rr_test_preds += temp_test / NUM_FOLDS

    # Per-fold validation log-loss.
    print(log_loss(val_target, temp_oof))
        

In [None]:
# Overall log-loss of the logistic-regression out-of-fold predictions.
print(log_loss(y, rr_train_oof))

In [None]:
# Display the fold-averaged logistic-regression test predictions.
rr_test_preds

In [None]:
# Accumulators for the k-nearest-neighbours cross-validation run.
# Running fold-average of the test predictions (scalar 0 until the first fold).
knn_test_preds = 0
# One out-of-fold prediction slot per training row.
knn_train_oof = cp.zeros(X.shape[0])
knn_train_oof.shape

In [None]:
# K-fold cross-validation of a k-nearest-neighbours classifier on the seed
# difference, using the same fold split (same seed) as the other model.
NUM_FOLDS = 10
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=0)

# Start the test-prediction average from zero inside this cell so that
# re-running it does not add onto the sum left by a previous run.
knn_test_preds = 0

for f, (train_ind, val_ind) in tqdm(enumerate(kf.split(X, y)), total=NUM_FOLDS):
    train, val = X[train_ind], X[val_ind]
    train_target, val_target = y[train_ind], y[val_ind]

    model = KNeighborsClassifier(n_neighbors=200)
    model.fit(train, train_target)

    # Positive-class probabilities for the held-out fold and the test set.
    temp_oof = model.predict_proba(val)[:, 1]
    temp_test = model.predict_proba(X_test)[:, 1]

    knn_train_oof[val_ind] = temp_oof
    # Each fold contributes 1/NUM_FOLDS of the final test prediction.
    knn_test_preds += temp_test / NUM_FOLDS

    # Per-fold validation log-loss.
    print(log_loss(val_target, temp_oof))

In [None]:
# Overall log-loss of the KNN out-of-fold predictions.
print(log_loss(y, knn_train_oof))

In [None]:
# Log-loss of a weighted blend of the out-of-fold predictions:
# 90% KNN and 10% logistic regression.
print(log_loss(y, 0.9*knn_train_oof+0.1*rr_train_oof))

In [None]:
# Write the logistic-regression predictions as a submission file.
# Item assignment always sets the DataFrame column; attribute assignment only
# does so when a 'Pred' column already exists.
submission['Pred'] = rr_test_preds
submission.to_csv('submission.csv', index=False)

In [None]:
# Write the KNN predictions as a submission file.
# Item assignment always sets the DataFrame column; attribute assignment only
# does so when a 'Pred' column already exists.
submission['Pred'] = knn_test_preds
submission.to_csv('knn_submission.csv', index=False)

In [None]:
# Write the blended predictions (90% KNN, 10% logistic regression) as a
# submission file, using the same weights as the out-of-fold blend score.
# Item assignment always sets the DataFrame column; attribute assignment only
# does so when a 'Pred' column already exists.
submission['Pred'] = 0.9 * knn_test_preds + 0.1 * rr_test_preds
submission.to_csv('blend_submission.csv', index=False)