# March Machine Learning Mania 2021 - NCAAW prediction (LightGBM)

## Introduction: 
  Competition URL: [March Machine Learning Mania 2021 - NCAAW](https://www.kaggle.com/c/ncaaw-march-mania-2021/overview)


## Task:
March Machine Learning Mania challenges data scientists to predict winners and losers of the women's 2021 NCAA basketball tournament.

## Data
1. Team IDs and team names
2. Tournament seeds since 1997-98 season
3. Final scores of all regular season, conference tournament, and NCAA® tournament games since 1997-98 season
4. **Game-by-game stats at a team level (free throws attempted, defensive rebounds, turnovers, etc.) for all regular season, conference tournament, and NCAA® tournament games since the 2009-10 season.**
5. Example submission 

## Evaluation
Submissions are scored on the log loss: <p>
LogLoss = $-\frac{1}{n}\sum_{i=1}^n[y_i\log{(\hat{y_i})}+(1-y_i)\log{(1-\hat{y_i})}]$
    
where

 $n$ is the number of games played <p>
 $\hat{y_i}$ is the predicted probability of team 1 beating team 2<p>
 ${y_i}$ is 1 if team 1 wins, 0 if team 2 wins<p>
 $\log$ is the natural logarithm<p>

## Modeling idea:
**Label:**  Game results of NCAA tournaments (ground truth) <p>
**Basic Features:**  Average regular season stats grouping by team and year<p>
**Advanced Features:**  Helpful features for prediction generated from Basic Features<p>
**Model:**  LightGBM
    

~-------------------------------------------------------------------------------------------~
## Code

### Load data

Feature engineering: WRegularSeasonDetailedResults.csv & WNCAATourneySeeds.csv <p>
Ground Truth(Labels): WNCAATourneyDetailedResults.csv <p>
Submission format: WSampleSubmissionStage2.csv

In [None]:
import pandas as pd
import numpy as np

# Never elide columns when a frame is displayed in the notebook.
pd.options.display.max_columns = None

# Feature sources: regular-season box scores and tournament seeds.
df_R = pd.read_csv('../input/ncaaw-march-mania-2021/WDataFiles_Stage2/WRegularSeasonDetailedResults.csv')
df_seed = pd.read_csv('../input/ncaaw-march-mania-2021/WDataFiles_Stage2/WNCAATourneySeeds.csv')

# Label source: detailed tournament results.
df_T = pd.read_csv('../input/ncaaw-march-mania-2021/WDataFiles_Stage2/WNCAATourneyDetailedResults.csv')

# Template listing the matchups that have to be predicted.
df_sub = pd.read_csv('../input/ncaaw-march-mania-2021/WDataFiles_Stage2/WSampleSubmissionStage2.csv')

### Data viewing

In [None]:
# Preview the first five regular-season box-score rows.
df_R.head(n=5)

In [None]:
# Preview the first five tournament box-score rows.
df_T.head(n=5)

In [None]:
# Preview the first five rows of the seed table.
df_seed.head(n=5)

In [None]:
# Preview the first five rows of the sample submission.
df_sub.head(n=5)

### Feature engineering

#### 1. Seed

In [None]:
# Keep characters 1-2 of each seed string and store them as an integer.
# NOTE(review): assumes seeds look like a one-letter region code followed by a
# two-digit number (e.g. "W01") - confirm against the seeds file.
df_seed['Seed'] = df_seed['Seed'].str[1:3].astype('int64')
df_seed.head()

#### 2. Basic features

In [None]:
# Each game is viewed twice - once with the winner as T1 and once with the
# loser as T1 - so every team gets one row per game it played.
box_stats = ['FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR', 'Ast', 'TO']

# Output schema shared by both orientations.
regular_cols = (
    ["Season", "DayNum", "T1", "T1_Points", "T2", "T2_Points"]
    + ['T1_' + stat.lower() for stat in box_stats]
    + ['T2_' + stat.lower() for stat in box_stats]
)


def _oriented_games(team, opponent):
    """Return the regular-season rows with `team` ('W' or 'L') as T1 and `opponent` as T2."""
    source_cols = (
        ["Season", "DayNum", team + "TeamID", team + "Score", opponent + "TeamID", opponent + "Score"]
        + [team + stat for stat in box_stats]
        + [opponent + stat for stat in box_stats]
    )
    oriented = df_R[source_cols]
    oriented.columns = regular_cols
    return oriented


r1 = _oriented_games('W', 'L')
r2 = _oriented_games('L', 'W')

# Stack both orientations (winner rows first, then loser rows).
df_regular = pd.concat([r1, r2])
df_regular.head()

#### 3. Adding advanced features

In [None]:
# True shooting percentage for both sides: points / (2 * (FGA + 0.475 * FTA)).
for side in ('T1', 'T2'):
    shot_attempts = df_regular[side + '_fga'] + 0.475 * df_regular[side + '_fta']
    df_regular[side + '_ts'] = df_regular[side + '_Points'] / (2 * shot_attempts)

# Flag games played after day 118 (the "last 14 days" window) ...
df_regular['last14'] = (df_regular['DayNum'] > 118).astype('int64')

# ... and the subset of those games that T1 won.
t1_won = df_regular['T1_Points'] > df_regular['T2_Points']
df_regular['Wlast14'] = np.where(df_regular['last14'] & t1_won, 1, 0)

# Scoring margin from T1's point of view.
df_regular['Pdiff'] = df_regular['T1_Points'] - df_regular['T2_Points']

df_regular.head()

#### 4. Grouping by team and season

In [None]:
# One row per (season, team): T1 is the team being described, T2 stands for
# whoever it played. Points get a std as well as a mean to capture consistency.
mean_only = ['T1_or', 'T1_dr', 'T1_ast', 'T1_to', 'T1_ts',
             'T2_or', 'T2_dr', 'T2_ast', 'T2_to', 'T2_ts', 'Pdiff']

# Insertion order matters: the next cell renames the columns by position.
agg_spec = {'T1_Points': ['mean', 'std'], 'T2_Points': ['mean', 'std']}
agg_spec.update({column: 'mean' for column in mean_only})
agg_spec.update({'last14': 'sum', 'Wlast14': 'sum'})

group = df_regular.groupby(['Season', 'T1'])
df_features = group.agg(agg_spec)

# Share of the late-season games (DayNum > 118) that the team won.
late_wins = df_features['Wlast14']
late_games = df_features['last14']
df_features['14winrate'] = late_wins / late_games
df_features.head()

In [None]:
# Replace the two-level (column, aggregation) header with flat names, in the
# same order the aggregation produced them.
flat_names = [
    'T1_Points', 'T1_PointsStd', 'T2_Points', 'T2_PointsStd',
    'T1_or', 'T1_dr', 'T1_ast', 'T1_to', 'T1_ts',
    'T2_or', 'T2_dr', 'T2_ast', 'T2_to', 'T2_ts',
    'Pdiff', 'last14', 'Wlast14', '14winrate',
]
df_features.columns = flat_names

# The raw counts were only needed to compute the late-season win rate.
df_features = df_features.drop(columns=['last14', 'Wlast14'])
df_features

#### 5. Merging with tournament table

In [None]:
# Two copies of the regular-season feature table, one labelled for the T1 side
# of a matchup and one for the T2 side, ready to be merged onto game rows.
def _season_feature_names(team):
    """Return the 16 feature names for `team` ('T1' or 'T2'); 'op' marks its opponents' stats."""
    box = ('or', 'dr', 'ast', 'to', 'ts')
    return (
        [team + '_Points', team + '_Pstd', team + 'op_Points', team + 'op_Pstd']
        + [team + '_' + stat for stat in box]
        + [team + 'op_' + stat for stat in box]
        + [team + '_Pdiff', team + '_14winrate']
    )


df_train1 = df_features.copy()
df_train1.columns = _season_feature_names('T1')

df_train2 = df_features.copy()
df_train2.columns = _season_feature_names('T2')

In [None]:
# Tournament games, again in both orientations: winner as T1, then loser as T1.
t1 = df_T[["Season", "DayNum", "WTeamID", "LTeamID", "WScore", "LScore"]].rename(
    columns={"WTeamID": "T1", "LTeamID": "T2", "WScore": "T1_Points", "LScore": "T2_Points"}
)
t2 = df_T[["Season", "DayNum", "LTeamID", "WTeamID", "LScore", "WScore"]].rename(
    columns={"LTeamID": "T1", "WTeamID": "T2", "LScore": "T1_Points", "WScore": "T2_Points"}
)

# Winner-first rows on top, loser-first rows below.
df_tourney = pd.concat([t1, t2])
df_tourney

In [None]:
# Attach each side's tournament seed to the game rows.
def _with_seed(frame, team_col, seed_col):
    """Left-join df_seed onto `frame` by (Season, team_col), storing the seed as `seed_col`."""
    merged = pd.merge(frame, df_seed, how='left',
                      left_on=['Season', team_col], right_on=['Season', 'TeamID'])
    return merged.rename(columns={'Seed': seed_col}).drop(columns='TeamID')


df_seed1 = _with_seed(df_tourney, 'T1', 'Seed1')
df_seed2 = _with_seed(df_seed1, 'T2', 'Seed2')

# Negative when T1 carries the lower seed number.
df_seed2['Seed_diff'] = df_seed2['Seed1'] - df_seed2['Seed2']

#### 6. Final table

In [None]:
# Join the regular-season profile of T1, then of T2, onto every tournament row.
# Columns that exist on both sides of a merge receive pandas' _x/_y suffixes;
# the label below reads the _x (left-hand, i.e. game-row) score columns.
df_final = df_seed2.merge(df_train1, how='left',
                          left_on=['Season', 'T1'], right_on=['Season', 'T1'])
df_final = df_final.merge(df_train2, how='left',
                          left_on=['Season', 'T2'], right_on=['Season', 'T1'])

# Label is 1 when T1 outscored T2 in the game, otherwise 0.
score_margin = df_final['T1_Points_x'] - df_final['T2_Points_x']
df_final['Label'] = (score_margin > 0).astype('int64')
df_final.head()

#### Modeling

In [None]:
import numpy as np
import pandas as pd
import pickle
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

In [None]:
# train & test set split
#
# df_final holds every tournament game twice (once per team orientation). A
# plain random row split can place one copy in the training set and its mirror
# image in the test set, which leaks the outcome and inflates the test score.
# Splitting on a per-game key keeps both copies on the same side.
from sklearn.model_selection import GroupShuffleSplit

# Drop the in-game scores (they define the label), identifiers and the label.
X = df_final.drop(['T1_Points_x','T2_Points_x','Season','DayNum','T1','T2','Label'], axis=1)
y = df_final['Label']

# Orientation-independent game key: season, day, and the two team ids sorted.
team_low = df_final[['T1', 'T2']].min(axis=1)
team_high = df_final[['T1', 'T2']].max(axis=1)
game_id = (
    df_final['Season'].astype(str) + '_'
    + df_final['DayNum'].astype(str) + '_'
    + team_low.astype(str) + '_'
    + team_high.astype(str)
)

# test_size=0.25 matches the share train_test_split holds out by default.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.25, random_state=0)
train_rows, test_rows = next(splitter.split(X, y, groups=game_id))
X_train, X_test = X.iloc[train_rows], X.iloc[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

In [None]:
# Grid search for the best hyper-parameters.
#
# Two choices here differ from a default GridSearchCV setup:
#   * scoring='neg_log_loss' - the competition is scored on log loss, so the
#     grid is ranked on that instead of the default accuracy;
#   * GroupKFold on a per-game key - each game appears twice in the data (once
#     per team orientation), and both copies must stay in the same fold or the
#     validation folds would contain mirrored copies of training rows.
from sklearn.metrics import log_loss
from sklearn.model_selection import GroupKFold

model_lgb = lgb.LGBMClassifier(
    class_weight='balanced',
    num_leaves=50,
    learning_rate=0.02,
    n_estimators=100,
    max_depth=3,
    min_child_samples=20,
    min_child_weight=0.001,
)

# NOTE(review): max_depth presumably caps a tree at 2**max_depth leaves, so the
# larger num_leaves values may have no effect at small depths - confirm against
# the LightGBM parameter documentation.
parameters = {'max_depth': range(3, 8, 2), 'num_leaves': range(50, 170, 30)}

# Orientation-independent game key for the training rows only.
train_games = df_final.loc[X_train.index, ['Season', 'DayNum', 'T1', 'T2']]
cv_groups = (
    train_games['Season'].astype(str) + '_'
    + train_games['DayNum'].astype(str) + '_'
    + train_games[['T1', 'T2']].min(axis=1).astype(str) + '_'
    + train_games[['T1', 'T2']].max(axis=1).astype(str)
)
cv_splits = list(GroupKFold(n_splits=5).split(X_train, y_train, groups=cv_groups))

clf = GridSearchCV(estimator=model_lgb, param_grid=parameters, scoring='neg_log_loss',
                   cv=cv_splits, verbose=0, n_jobs=4)
clf.fit(X_train, y_train)

print(clf.best_params_)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

# Report the competition metric on the held-out rows as well.
test_proba = clf.predict_proba(X_test)[:, 1]
print('held-out log loss:', log_loss(y_test, test_proba, labels=[0, 1]))


In [None]:
# save model
# pickle.dump(clf, open('LGB.weight', 'wb'))

# load model
# clf = pickle.load(open('LGB.weight', 'rb'))

#### Submission

In [None]:
# Split each submission ID on '_' into three pieces and store them as the
# integer columns Season, Team1 and Team2.
id_parts = df_sub['ID'].str.split('_', expand=True)
for position, column in enumerate(['Season', 'Team1', 'Team2']):
    df_sub[column] = id_parts[position].astype('int64')
df_sub

In [None]:
# Rebuild, for every requested matchup, the same feature layout used for
# training: both seeds, their difference, then each team's season profile.
df_sub1 = (
    df_sub.merge(df_seed, how='left', left_on=['Season', 'Team1'], right_on=['Season', 'TeamID'])
    .rename(columns={'Seed': 'Seed1'})
    .drop(columns='TeamID')
)
df_sub2 = (
    df_sub1.merge(df_seed, how='left', left_on=['Season', 'Team2'], right_on=['Season', 'TeamID'])
    .rename(columns={'Seed': 'Seed2'})
    .drop(columns='TeamID')
)
df_sub2['Seed_diff'] = df_sub2['Seed1'] - df_sub2['Seed2']

# Regular-season profiles for Team1 and Team2, looked up by (Season, team id).
df_sub3 = df_sub2.merge(df_train1, how='left',
                        left_on=['Season', 'Team1'], right_on=['Season', 'T1'])
df_sub4 = df_sub3.merge(df_train2, how='left',
                        left_on=['Season', 'Team2'], right_on=['Season', 'T1'])

df_sub4.head()

In [None]:
# Everything from the sixth column onward is fed to the model as a plain array.
# NOTE(review): this assumes the first five columns are exactly ID, Pred,
# Season, Team1 and Team2, and that the remaining columns are in the same order
# as the training features - confirm before changing any upstream cell.
predict_features = df_sub4.iloc[:, 5:].values

# df_final_sub is another name for df_sub4, not a copy.
df_final_sub = df_sub4

# Column 1 of predict_proba is the probability of label 1 (Team1 wins).
team1_win_proba = clf.predict_proba(predict_features)[:, 1]
df_final_sub['Pred'] = team1_win_proba
df_final_sub.head()

In [None]:
# Write the two submission columns (ID, Pred) to disk without an extra index column.
submission = df_final_sub[['ID', 'Pred']]
submission.to_csv('submissionW_stage2_LGB.csv', index=False)