# *__Make March less Mad 2020__*

## Import libraries

In [31]:
# Libraries
import numpy as np
import pandas as pd
pd.set_option('max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn')
%matplotlib inline
import copy
import datetime
import lightgbm as lgb
from scipy import stats
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, cross_val_score, GridSearchCV, RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler
import os
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import xgboost as xgb
import lightgbm as lgb
from sklearn import model_selection
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss, classification_report, confusion_matrix
import json
import ast
import time
from sklearn import linear_model

import warnings
warnings.filterwarnings('ignore')

import os
import glob

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder

In [0]:
class LGBWrapper(object):
    """
    A wrapper for lightgbm model so that we will have a single api for various models.
    """

    def __init__(self):
        self.model = lgb.LGBMClassifier()

    def fit(self, X_train, y_train, X_valid=None, y_valid=None, X_holdout=None, y_holdout=None, params=None):

        eval_set = [(X_train, y_train)]
        eval_names = ['train']
        self.model = self.model.set_params(**params)

        if X_valid is not None:
            eval_set.append((X_valid, y_valid))
            eval_names.append('valid')

        if X_holdout is not None:
            eval_set.append((X_holdout, y_holdout))
            eval_names.append('holdout')

        if 'cat_cols' in params.keys():
            cat_cols = [col for col in params['cat_cols'] if col in X_train.columns]
            if len(cat_cols) > 0:
                categorical_columns = params['cat_cols']
            else:
                categorical_columns = 'auto'
        else:
            categorical_columns = 'auto'

        self.model.fit(X=X_train, y=y_train,
                       eval_set=eval_set, eval_names=eval_names,
                       verbose=params['verbose'], early_stopping_rounds=params['early_stopping_rounds'])

        self.best_score_ = self.model.best_score_
        self.feature_importances_ = self.model.feature_importances_

    def predict_proba(self, X_test):
        if self.model.objective == 'binary':
            return self.model.predict_proba(X_test, num_iteration=self.model.best_iteration_)[:, 1]
        else:
            return self.model.predict_proba(X_test, num_iteration=self.model.best_iteration_)

In [0]:
class MainTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, convert_cyclical: bool = False, create_interactions: bool = False, n_interactions: int = 20):
        """
        Main transformer for the data. Can be used for processing on the whole data.

        :param convert_cyclical: convert cyclical features into continuous
        :param create_interactions: create interactions between features
        """

        self.convert_cyclical = convert_cyclical
        self.create_interactions = create_interactions
        self.feats_for_interaction = None
        self.n_interactions = n_interactions

    def fit(self, X, y=None):

        if self.create_interactions:
            pass
        return self

    def transform(self, X, y=None):
        data = copy.deepcopy(X)

        # data['installation_event_code_count_mean'] = data.groupby(['installation_id'])['sum_event_code_count'].transform('mean')

        return data

    def fit_transform(self, X, y=None, **fit_params):
        data = copy.deepcopy(X)
        self.fit(data)
        return self.transform(data)


class FeatureTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, main_cat_features: list = None, num_cols: list = None):
        """

        :param main_cat_features:
        :param num_cols:
        """
        self.main_cat_features = main_cat_features
        self.num_cols = num_cols

    def fit(self, X, y=None):

        self.num_cols = [col for col in X.columns if 'sum' in col or 'mean' in col or 'max' in col or 'std' in col
                         or 'attempt' in col]

        return self

    def transform(self, X, y=None):
        data = copy.deepcopy(X)
#         for col in self.num_cols:
#             data[f'{col}_to_mean'] = data[col] / data.groupby('installation_id')[col].transform('mean')
#             data[f'{col}_to_std'] = data[col] / data.groupby('installation_id')[col].transform('std')

        return data

    def fit_transform(self, X, y=None, **fit_params):
        data = copy.deepcopy(X)
        self.fit(data)
        return self.transform(data)

In [0]:
class ClassifierModel(object):
    """
    A wrapper class for classification models.
    It can be used for training and prediction.
    Can plot feature importance and training progress (if relevant for model).

    """

    def __init__(self, columns: list = None, model_wrapper=None):
        """

        :param original_columns:
        :param model_wrapper:
        """
        self.columns = columns
        self.model_wrapper = model_wrapper
        self.result_dict = {}
        self.train_one_fold = False
        self.preprocesser = None

    def fit(self, X: pd.DataFrame, y,
            X_holdout: pd.DataFrame = None, y_holdout=None,
            folds=None,
            params: dict = None,
            eval_metric='auc',
            cols_to_drop: list = None,
            preprocesser=None,
            transformers: dict = None,
            adversarial: bool = False,
            plot: bool = True):
        """
        Training the model.

        :param X: training data
        :param y: training target
        :param X_holdout: holdout data
        :param y_holdout: holdout target
        :param folds: folds to split the data. If not defined, then model will be trained on the whole X
        :param params: training parameters
        :param eval_metric: metric for validataion
        :param cols_to_drop: list of columns to drop (for example ID)
        :param preprocesser: preprocesser class
        :param transformers: transformer to use on folds
        :param adversarial
        :return:
        """
        self.cols_to_drop = cols_to_drop

        if folds is None:
            folds = KFold(n_splits=3, random_state=42)
            self.train_one_fold = True

        self.columns = X.columns if self.columns is None else self.columns
        self.feature_importances = pd.DataFrame(columns=['feature', 'importance'])
        self.trained_transformers = {k: [] for k in transformers}
        self.transformers = transformers
        self.models = []
        self.folds_dict = {}
        self.eval_metric = eval_metric
        n_target = 1 if len(set(y.values)) == 2 else len(set(y.values))
        self.oof = np.zeros((len(X), n_target))
        self.n_target = n_target

        X = X[self.columns]
        if X_holdout is not None:
            X_holdout = X_holdout[self.columns]

        if preprocesser is not None:
            self.preprocesser = preprocesser
            self.preprocesser.fit(X, y)
            X = self.preprocesser.transform(X, y)
            self.columns = X.columns.tolist()
            if X_holdout is not None:
                X_holdout = self.preprocesser.transform(X_holdout)
            # y = X['accuracy_group']

        for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
            if X_holdout is not None:
                X_hold = X_holdout.copy()
            else:
                X_hold = None
            self.folds_dict[fold_n] = {}
            if params['verbose']:
                print(f'Fold {fold_n + 1} started at {time.ctime()}')
            self.folds_dict[fold_n] = {}

            X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
            if self.train_one_fold:
                X_train = X[self.original_columns]
                y_train = y
                X_valid = None
                y_valid = None

            datasets = {'X_train': X_train, 'X_valid': X_valid, 'X_holdout': X_hold, 'y_train': y_train}
            X_train, X_valid, X_hold = self.transform_(datasets, cols_to_drop)

            self.folds_dict[fold_n]['columns'] = X_train.columns.tolist()

            model = copy.deepcopy(self.model_wrapper)

            if adversarial:
                X_new1 = X_train.copy()
                if X_valid is not None:
                    X_new2 = X_valid.copy()
                elif X_holdout is not None:
                    X_new2 = X_holdout.copy()
                X_new = pd.concat([X_new1, X_new2], axis=0)
                y_new = np.hstack((np.zeros((X_new1.shape[0])), np.ones((X_new2.shape[0]))))
                X_train, X_valid, y_train, y_valid = train_test_split(X_new, y_new)

            model.fit(X_train, y_train, X_valid, y_valid, X_hold, y_holdout, params=params)

            self.folds_dict[fold_n]['scores'] = model.best_score_
            if self.oof.shape[0] != len(X):
                self.oof = np.zeros((X.shape[0], self.oof.shape[1]))
            if not adversarial:
                self.oof[valid_index] = model.predict_proba(X_valid).reshape(-1, n_target)

            fold_importance = pd.DataFrame(list(zip(X_train.columns, model.feature_importances_)),
                                           columns=['feature', 'importance'])
            self.feature_importances = self.feature_importances.append(fold_importance)
            self.models.append(model)

        self.feature_importances['importance'] = self.feature_importances['importance'].astype(float)

        # if params['verbose']:
        self.calc_scores_()

        if plot:
#             print(classification_report(y, self.oof.argmax(1)))
            print(classification_report(y, (self.oof > 0.5) * 1))
            fig, ax = plt.subplots(figsize=(16, 12))
            plt.subplot(2, 2, 1)
            self.plot_feature_importance(top_n=25)
            plt.subplot(2, 2, 2)
            self.plot_metric()
            plt.subplot(2, 2, 3)
            g = sns.heatmap(confusion_matrix(y, (self.oof > 0.5) * 1), annot=True, cmap=plt.cm.Blues,fmt="d")
            g.set(ylim=(-0.5, 4), xlim=(-0.5, 4), title='Confusion matrix')

            plt.subplot(2, 2, 4)
            plt.hist(self.oof)
            plt.xticks(range(self.n_target), range(self.n_target))
            plt.title('Distribution of oof predictions');

    def transform_(self, datasets, cols_to_drop):
        for name, transformer in self.transformers.items():
            transformer.fit(datasets['X_train'], datasets['y_train'])
            datasets['X_train'] = transformer.transform(datasets['X_train'])
            if datasets['X_valid'] is not None:
                datasets['X_valid'] = transformer.transform(datasets['X_valid'])
            if datasets['X_holdout'] is not None:
                datasets['X_holdout'] = transformer.transform(datasets['X_holdout'])
            self.trained_transformers[name].append(transformer)
        if cols_to_drop is not None:
            cols_to_drop = [col for col in cols_to_drop if col in datasets['X_train'].columns]
            self.cols_to_drop = cols_to_drop
            datasets['X_train'] = datasets['X_train'].drop(cols_to_drop, axis=1)
            if datasets['X_valid'] is not None:
                datasets['X_valid'] = datasets['X_valid'].drop(cols_to_drop, axis=1)
            if datasets['X_holdout'] is not None:
                datasets['X_holdout'] = datasets['X_holdout'].drop(cols_to_drop, axis=1)

        return datasets['X_train'], datasets['X_valid'], datasets['X_holdout']

    def calc_scores_(self):
        print()
        datasets = [k for k, v in [v['scores'] for k, v in self.folds_dict.items()][0].items() if len(v) > 0]
        self.scores = {}
        for d in datasets:
            scores = [v['scores'][d][self.eval_metric] for k, v in self.folds_dict.items()]
            print(f"CV mean score on {d}: {np.mean(scores):.4f} +/- {np.std(scores):.4f} std.")
            self.scores[d] = np.mean(scores)

    def predict(self, X_test, averaging: str = 'usual'):
        """
        Make prediction

        :param X_test:
        :param averaging: method of averaging
        :return:
        """
        full_prediction = np.zeros((X_test.shape[0], self.oof.shape[1]))
        if self.preprocesser is not None:
            X_test = self.preprocesser.transform(X_test)
        for i in range(len(self.models)):
            X_t = X_test.copy()
            for name, transformers in self.trained_transformers.items():
                X_t = transformers[i].transform(X_t)
            if self.cols_to_drop:
                cols_to_drop = [col for col in self.cols_to_drop if col in X_t.columns]
                X_t = X_t.drop(cols_to_drop, axis=1)
            y_pred = self.models[i].predict_proba(X_t[self.folds_dict[i]['columns']]).reshape(-1, full_prediction.shape[1])

            # if case transformation changes the number of the rows
            if full_prediction.shape[0] != len(y_pred):
                full_prediction = np.zeros((y_pred.shape[0], self.oof.shape[1]))

            if averaging == 'usual':
                full_prediction += y_pred
            elif averaging == 'rank':
                full_prediction += pd.Series(y_pred).rank().values

        return full_prediction / len(self.models)

    def plot_feature_importance(self, drop_null_importance: bool = True, top_n: int = 10):
        """
        Plot default feature importance.

        :param drop_null_importance: drop columns with null feature importance
        :param top_n: show top n columns
        :return:
        """

        top_feats = self.get_top_features(drop_null_importance, top_n)
        feature_importances = self.feature_importances.loc[self.feature_importances['feature'].isin(top_feats)]
        feature_importances['feature'] = feature_importances['feature'].astype(str)
        top_feats = [str(i) for i in top_feats]
        sns.barplot(data=feature_importances, x='importance', y='feature', orient='h', order=top_feats)
        plt.title('Feature importances')

    def get_top_features(self, drop_null_importance: bool = True, top_n: int = 10):
        """
        Get top features by importance.

        :param drop_null_importance:
        :param top_n:
        :return:
        """
        grouped_feats = self.feature_importances.groupby(['feature'])['importance'].mean()
        if drop_null_importance:
            grouped_feats = grouped_feats[grouped_feats != 0]
        return list(grouped_feats.sort_values(ascending=False).index)[:top_n]

    def plot_metric(self):
        """
        Plot training progress.
        Inspired by `plot_metric` from https://lightgbm.readthedocs.io/en/latest/_modules/lightgbm/plotting.html

        :return:
        """
        full_evals_results = pd.DataFrame()
        for model in self.models:
            evals_result = pd.DataFrame()
            for k in model.model.evals_result_.keys():
                evals_result[k] = model.model.evals_result_[k][self.eval_metric]
            evals_result = evals_result.reset_index().rename(columns={'index': 'iteration'})
            full_evals_results = full_evals_results.append(evals_result)

        full_evals_results = full_evals_results.melt(id_vars=['iteration']).rename(columns={'value': self.eval_metric,
                                                                                            'variable': 'dataset'})
        full_evals_results[self.eval_metric] = np.abs(full_evals_results[self.eval_metric])
        sns.lineplot(data=full_evals_results, x='iteration', y=self.eval_metric, hue='dataset')
        plt.title('Training progress')

## Import Data from Kaggle

In [5]:
# Run this cell and select the kaggle.json file downloaded
# from the Kaggle account settings page

from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"robbiemoore","key":"eb58011570ab788e49cbda0546e027c5"}'}

In [6]:
!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!mkdir NCAA_data
%cd NCAA_data


/content/NCAA_data


In [7]:
!kaggle competitions download -c google-cloud-ncaa-march-madness-2020-division-1-mens-tournament

Downloading MConferenceTourneyGames.csv to /content/NCAA_data
  0% 0.00/133k [00:00<?, ?B/s]
100% 133k/133k [00:00<00:00, 49.6MB/s]
Downloading MNCAATourneyCompactResults.csv to /content/NCAA_data
  0% 0.00/66.1k [00:00<?, ?B/s]
100% 66.1k/66.1k [00:00<00:00, 67.3MB/s]
Downloading MTeamCoaches.csv to /content/NCAA_data
  0% 0.00/331k [00:00<?, ?B/s]
100% 331k/331k [00:00<00:00, 107MB/s]
Downloading MMasseyOrdinals.csv.zip to /content/NCAA_data
 61% 9.00M/14.9M [00:00<00:00, 25.5MB/s]
100% 14.9M/14.9M [00:00<00:00, 37.5MB/s]
Downloading MTeamSpellings.csv to /content/NCAA_data
  0% 0.00/21.6k [00:00<?, ?B/s]
100% 21.6k/21.6k [00:00<00:00, 52.4MB/s]
Downloading MNCAATourneyDetailedResults.csv to /content/NCAA_data
  0% 0.00/109k [00:00<?, ?B/s]
100% 109k/109k [00:00<00:00, 111MB/s]
Downloading MRegularSeasonCompactResults.csv.zip to /content/NCAA_data
  0% 0.00/1.14M [00:00<?, ?B/s]
100% 1.14M/1.14M [00:00<00:00, 34.3MB/s]
Downloading MNCAATourneySeedRoundSlots.csv to /content/NCAA_data


## Loading the data

Let's load all useful data into a single dictionary!

In [0]:
data_dict = {}
for i in glob.glob('/content/NCAA_data/*'):
    name = i.split('/')[-1].split('.')[0]
    if name != 'MTeamSpellings':
        data_dict[name] = pd.read_csv(i)
    else:
        data_dict[name] = pd.read_csv(i, encoding='cp1252')

## Data overview

In [0]:
data_dict.keys()

dict_keys(['MSecondaryTourneyTeams', 'Cities', 'MRegularSeasonDetailedResults', 'MConferenceTourneyGames', 'MNCAATourneySeedRoundSlots', 'MRegularSeasonCompactResults', 'MNCAATourneyDetailedResults', 'MSeasons', 'MMasseyOrdinals', 'MSampleSubmissionStage1_2020', 'Conferences', 'MTeamConferences', 'MTeamCoaches', 'MEvents2018', 'MNCAATourneySeeds', 'MTeamSpellings', 'MNCAATourneyCompactResults', 'MEvents2017', 'MEvents2019', 'MPlayers', 'MEvents2016', 'MGameCities', 'MEvents2015', 'MNCAATourneySlots', 'MSecondaryTourneyCompactResults', 'MTeams'])

In [0]:
rs_df = data_dict['MRegularSeasonDetailedResults'].copy()
rs_df = rs_df[rs_df['WLoc'] != "N"]
rs_df = rs_df.drop(['NumOT'], axis=1)
#rs_df = rs_df.loc[rs_df['Season'] >= 2010]

conf = data_dict['MTeams'].copy()
teams = conf.loc[conf['LastD1Season'] != 2020]
not_d1 = teams['TeamID'].tolist()

for team in not_d1:
    rs_df = rs_df[rs_df['WTeamID'] != team]
    rs_df = rs_df[rs_df['LTeamID'] != team]

# **START SHIT ERIC DID**

In [0]:
# ALL OF OUR FUNCTIONS

import networkx as nx


# Used to swap from Winning/Losing Teams into Home/Away Teams
def home_conditions(s):
    if s['WLoc'] == 'H' or s['WLoc'] == 'N':
        return s['WTeamID'], s['WScore'],s['WFGM'], s['WFGA'], s['WFGM3'], s['WFGA3'], s['WFTM'], s['WFTA'], s['WOR'], s['WDR'], s['WAst'], s['WTO'], s['WStl'], s['WBlk'], s['WPF']
    else:
        return s['LTeamID'], s['LScore'], s['LFGM'], s['LFGA'], s['LFGM3'], s['LFGA3'], s['LFTM'], s['LFTA'], s['LOR'], s['LDR'], s['LAst'], s['LTO'], s['LStl'], s['LBlk'], s['LPF']

# Used to swap from Winning/Losing Teams into Home/Away Teams
def away_conditions(s):
    if s['WLoc'] == 'H' or s['WLoc'] == 'N':
        return s['LTeamID'], s['LScore'], s['LFGM'], s['LFGA'], s['LFGM3'], s['LFGA3'], s['LFTM'], s['LFTA'], s['LOR'], s['LDR'], s['LAst'], s['LTO'], s['LStl'], s['LBlk'], s['LPF']
    else:
        return s['WTeamID'], s['WScore'], s['WFGM'], s['WFGA'], s['WFGM3'], s['WFGA3'], s['WFTM'], s['WFTA'], s['WOR'], s['WDR'], s['WAst'], s['WTO'], s['WStl'], s['WBlk'], s['WPF'] 

# Calculate Effective Field Goal Percentage
def efg(s):
    return ((s['HFGM']+0.5*s['HFGM3'])/s['HFGA']), ((s['AFGM']+0.5*s['AFGM3'])/s['AFGA'])

# Calculate Turnover Percentage
def turnover(s):
    return (s['HTO']/(s['HFGA'] + 0.44*s['HFTA']+s['HTO'])), (s['ATO']/(s['AFGA'] + 0.44*s['AFTA']+s['ATO']))

# Calculate Offensive Rebounding Percentage
def rebound(s):
    hOR = s['HOR']/(s['HOR']+s['ADR'])
    hDR = s['HDR']/(s['HDR']+s['AOR'])
    aOR = s['AOR']/(s["AOR"]+s['HDR'])
    aDR = s['ADR']/(s["ADR"]+s['HOR'])
    return hOR, hDR, aOR, aDR

# Calculate Free Throw Percentage
def ft(s):
    return s['HFTA']/s['HFGA'], s['AFTA']/s['AFGA']

# Takes dataframe of last N games for Team X and swaps from Home/Away into Team X vs Opponent
def clean_last_n(df, teamID):
    cols = ['TeamID', 'OpponentID','Season', 'DayNum', 'Location', 'eFG', 'TOP', 'ORP','DRP', 'FTP', 'O_eFG', 'O_TOP', 'O_ORP', 'O_DRP', 'O_FTP', 'Winner']
    df_out = pd.DataFrame(columns = cols)
    for index, row in df.iterrows():
        new_row = {}
        new_row['TeamID'] = teamID
        new_row['Season'] = row['Season']
        new_row['DayNum'] = row['DayNum']
        new_row['Winner'] = row['Winner']
        if row['HomeTeamID'] == teamID:
            new_row['OpponentID'] = row['AwayTeamID']
            new_row['Location'] = 'H'
            new_row['eFG'] = row['H_eFG']
            new_row['TOP'] = row['H_TOP']
            new_row['ORP'] = row['H_ORP']
            new_row['DRP'] = row['H_DRP']
            new_row['FTP'] = row['H_FTP']
            new_row['O_eFG'] = row['A_eFG']
            new_row['O_TOP'] = row['A_TOP']
            new_row['O_ORP'] = row['A_ORP']
            new_row['O_DRP'] = row['A_DRP']
            new_row['O_FTP'] = row['A_FTP']
        else:
            new_row['OpponentID'] = row['HomeTeamID']
            new_row['Location'] = 'A'
            new_row['eFG'] = row['A_eFG']
            new_row['TOP'] = row['A_TOP']
            new_row['ORP'] = row['A_ORP']
            new_row['DRP'] = row['A_DRP']
            new_row['FTP'] = row['A_FTP']
            new_row['O_eFG'] = row['H_eFG']
            new_row['O_TOP'] = row['H_TOP']
            new_row['O_ORP'] = row['H_ORP']
            new_row['O_DRP'] = row['H_DRP']
            new_row['O_FTP'] = row['H_FTP']
        df_out = df_out.append(new_row, ignore_index=True)
    return df_out

# Takes a team, season and day, finds the last 5 games, cleans and returns df of last 5
def get_n_recent_games(df, teamID, season, day, num_games = 5):
    query = '(HomeTeamID=={} or AwayTeamID=={}) and Season=={}'.format(teamID, teamID, season)
    team_games = df.query(query)
    team_games.reset_index(inplace=True)
    game_index = team_games.loc[team_games['DayNum'] == day].index[0]
    if game_index >= num_games:
        cleaned = clean_last_n(team_games.iloc[game_index-num_games:game_index], teamID)
        return cleaned
    elif game_index == 0:
        return None
    else:
        cleaned = clean_last_n(team_games.head(game_index), teamID)
        return cleaned


# For Page Rank
def sortSecond(val):
    return val[1]

# Given a set of games (Organized into Home/Away) returns a pagerank list of all teams
def page_rank_df(rs_df, year = None, alpha = 0.85):
  if year:
      rs_df = rs_df.loc[rs_df['Season'] == year]
  pagerank_df = np.zeros((1466 - 1100,1466 - 1100))
  for ind in rs_df.index:
    if rs_df['WLoc'][ind] == 'H':
      pagerank_df[rs_df['AwayTeamID'][ind] - 1101, rs_df['HomeTeamID'][ind] - 1101] += 5
    else:
      pagerank_df[rs_df['HomeTeamID'][ind] - 1101, rs_df['AwayTeamID'][ind] - 1101] += 5
    if rs_df['H_eFG'][ind] > rs_df['A_eFG'][ind]:
      pagerank_df[rs_df['AwayTeamID'][ind] - 1101, rs_df['HomeTeamID'][ind] - 1101] += 4
    elif rs_df['H_eFG'][ind] < rs_df['A_eFG'][ind]:
      pagerank_df[rs_df['HomeTeamID'][ind] - 1101, rs_df['AwayTeamID'][ind] - 1101] += 4
    if rs_df['H_TOP'][ind] < rs_df['A_TOP'][ind]:
      pagerank_df[rs_df['AwayTeamID'][ind] - 1101, rs_df['HomeTeamID'][ind] - 1101] += 2.5
    elif rs_df['H_TOP'][ind] > rs_df['A_TOP'][ind]:
      pagerank_df[rs_df['HomeTeamID'][ind] - 1101, rs_df['AwayTeamID'][ind] - 1101] += 2.5
    if rs_df['H_ORP'][ind] > rs_df['A_ORP'][ind] :
      pagerank_df[rs_df['AwayTeamID'][ind] - 1101, rs_df['HomeTeamID'][ind] - 1101] += 2
    elif rs_df['H_ORP'][ind] < rs_df['A_ORP'][ind]:
      pagerank_df[rs_df['HomeTeamID'][ind] - 1101, rs_df['AwayTeamID'][ind] - 1101] += 2
    if rs_df['H_FTP'][ind] > rs_df['A_FTP'][ind]:
      pagerank_df[rs_df['AwayTeamID'][ind] - 1101, rs_df['HomeTeamID'][ind] - 1101] += 1.5
    elif rs_df['H_FTP'][ind] < rs_df['A_FTP'][ind]:
      pagerank_df[rs_df['HomeTeamID'][ind] - 1101, rs_df['AwayTeamID'][ind] - 1101] += 1.5

  G = nx.DiGraph(pagerank_df)
  pr = nx.pagerank(G, alpha = alpha)
  ranking = []
  for i in pr :
      ranking.append((i + 1101, pr[i]))
  ranking.sort(key = sortSecond, reverse=True)
  return ranking


In [0]:
# This snippet sets up dataframe for regular season, converts from Winner/Loser into Home/Away 

rs_df = data_dict['MRegularSeasonDetailedResults'].copy()
rs_df = rs_df[rs_df['WLoc'] != "N"]
rs_df = rs_df.drop(['NumOT'], axis=1)
#rs_df = rs_df.loc[rs_df['Season'] >= 2010]

conf = data_dict['MTeams'].copy()
teams = conf.loc[conf['LastD1Season'] != 2020]
not_d1 = teams['TeamID'].tolist()

for team in not_d1:
    rs_df = rs_df[rs_df['WTeamID'] != team]
    rs_df = rs_df[rs_df['LTeamID'] != team]


rs_df['HomeTeamID'], rs_df['HScore'], rs_df['HFGM'], rs_df['HFGA'], rs_df['HFGM3'], rs_df['HFGA3'], rs_df['HFTM'], rs_df['HFTA'], rs_df['HOR'], rs_df['HDR'], rs_df['HAST'], rs_df['HTO'], rs_df['HStl'], rs_df['HBlk'], rs_df['HPF'] = zip(*rs_df.apply(home_conditions, axis=1))
rs_df['AwayTeamID'], rs_df['AScore'], rs_df['AFGM'], rs_df['AFGA'], rs_df['AFGM3'], rs_df['AFGA3'], rs_df['AFTM'], rs_df['AFTA'], rs_df['AOR'], rs_df['ADR'], rs_df['AAST'], rs_df['ATO'], rs_df['AStl'], rs_df['ABlk'], rs_df['APF'] = zip(*rs_df.apply(away_conditions, axis=1))
rs_df['H_eFG'], rs_df['A_eFG'] = zip(*rs_df.apply(efg, axis=1))
rs_df['H_ORP'], rs_df['H_DRP'], rs_df['A_ORP'], rs_df['A_DRP'] = zip(*rs_df.apply(rebound, axis=1))
rs_df['H_TOP'], rs_df['A_TOP'] = zip(*rs_df.apply(turnover, axis=1))
rs_df['H_FTP'], rs_df['A_FTP'] = zip(*rs_df.apply(ft, axis=1))


rs_df = rs_df.drop(['WTeamID', 'WScore', 'LTeamID', 'LScore', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR','WAst', 'WTO', 'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3','LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF'], axis=1)
rs_df = rs_df.loc[rs_df['WLoc'] != 'N']
rs_df = rs_df.rename(columns={"WLoc": "Winner"})


Unnamed: 0,Season,DayNum,Winner,HomeTeamID,HScore,HFGM,HFGA,HFGM3,HFGA3,HFTM,HFTA,HOR,HDR,HAST,HTO,HStl,HBlk,HPF,AwayTeamID,AScore,AFGM,AFGA,AFGM3,AFGA3,AFTM,AFTA,AOR,ADR,AAST,ATO,AStl,ABlk,APF,H_eFG,A_eFG,H_ORP,H_DRP,A_ORP,A_DRP,H_TOP,A_TOP,H_FTP,A_FTP
5,2003,11,H,1458,81,26,57,6,12,23,27,12,24,12,9,9,3,18,1186,55,20,46,3,11,12,17,6,22,8,19,4,3,25,0.508772,0.467391,0.352941,0.800000,0.200000,0.647059,0.115562,0.262141,0.473684,0.369565
6,2003,12,H,1161,80,23,55,2,8,32,39,13,18,14,17,11,1,25,1236,62,19,41,4,15,20,28,9,21,11,30,10,4,28,0.436364,0.512195,0.382353,0.666667,0.333333,0.617647,0.190668,0.360058,0.709091,0.682927
9,2003,12,H,1458,84,32,67,5,17,15,19,14,22,11,6,12,0,13,1296,56,23,52,3,14,7,12,9,23,10,18,1,3,18,0.514925,0.471154,0.378378,0.709677,0.290323,0.621622,0.073746,0.239107,0.283582,0.230769
10,2003,13,H,1166,106,41,69,15,25,9,13,15,29,21,11,10,6,16,1426,50,17,52,4,11,12,17,8,15,8,17,7,3,15,0.702899,0.365385,0.500000,0.783784,0.216216,0.500000,0.128325,0.222280,0.188406,0.326923
13,2003,13,H,1323,76,25,56,10,23,16,23,8,35,18,13,14,19,13,1125,48,18,64,8,24,4,8,14,26,12,17,10,0,17,0.535714,0.343750,0.235294,0.714286,0.285714,0.764706,0.164307,0.201136,0.410714,0.125000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87445,2019,130,H,1272,79,26,57,8,27,19,29,14,27,16,13,9,2,19,1416,55,19,53,6,20,11,19,8,22,7,16,6,6,21,0.526316,0.415094,0.388889,0.771429,0.228571,0.611111,0.157081,0.206825,0.508772,0.358491
87479,2019,131,A,1272,58,16,68,4,23,22,26,21,26,4,9,5,5,21,1222,61,21,57,5,19,14,27,16,32,8,14,4,9,22,0.264706,0.412281,0.396226,0.619048,0.380952,0.603774,0.101764,0.168919,0.382353,0.473684
87495,2019,131,H,1436,66,21,51,7,22,17,23,11,27,7,11,5,5,16,1420,49,15,44,8,26,11,14,4,23,9,13,6,4,19,0.480392,0.431818,0.323529,0.870968,0.129032,0.676471,0.152524,0.205826,0.450980,0.318182
87497,2019,131,H,1463,83,31,61,5,16,16,16,6,26,16,8,8,2,18,1343,77,28,62,6,24,15,20,9,24,9,10,4,1,17,0.549180,0.500000,0.200000,0.742857,0.257143,0.800000,0.105208,0.123762,0.262295,0.322581


In [40]:
""" This snippet will take the dataframe games organized into Home/Away and convert
each game (row) into a prediction ready dataset with info on last 5 games
"""
column_names = ["Season", "DayNum", "HomeTeamID", "H_eFG", "H_TOP","H_ORP","H_DRP", "H_FTP", "AwayTeamID", "A_eFG", "A_TOP", "A_ORP", "A_DRP", "A_FTP", "Winner"]
cleaned_df = pd.DataFrame(columns = column_names)
num_rows = rs_df.shape[0]
for index, row in rs_df.iterrows():
    if index % 1000 == 0:
        print("Completed {}".format(index/num_rows))
    new_row = {}
    new_row['DayNum'] = row['DayNum']
    new_row['Season'] = row['Season']
    new_row['HomeTeamID'] = row['HomeTeamID']
    new_row['AwayTeamID'] = row['AwayTeamID']
    new_row['Winner'] = row['Winner']
    home = get_n_recent_games(rs_df, new_row['HomeTeamID'], row['Season'], row['DayNum'], 3)
    away = get_n_recent_games(rs_df, new_row['AwayTeamID'], row['Season'], row['DayNum'], 3)
    if type(home) != type(cleaned_df) or type(away) != type(cleaned_df):
        continue
    else:
        new_row['H_eFG'] = home['eFG'].mean()
        new_row['A_eFG'] = away['eFG'].mean()
        new_row['H_TOP'] = home['TOP'].mean()
        new_row['A_TOP'] = away['TOP'].mean()
        new_row['H_ORP'] = home['ORP'].mean()
        new_row['A_ORP'] = away['ORP'].mean()
        new_row['H_DRP'] = home['DRP'].mean()
        new_row['A_DRP'] = away['DRP'].mean()
        new_row['H_FTP'] = home['FTP'].mean()
        new_row['A_FTP'] = away['FTP'].mean()
    cleaned_df = cleaned_df.append(new_row, ignore_index=True)

        #Get eFG, TO, R, FT for home and away and append to dict




Completed 0.012869515977504086
Completed 0.025739031955008172
Completed 0.051478063910016345
Completed 0.06434757988752043
Completed 0.0900866118425286
Completed 0.10295612782003269
Completed 0.12869515977504087
Completed 0.14156467575254494
Completed 0.15443419173004902
Completed 0.16730370770755312
Completed 0.1801732236850572
Completed 0.1930427396625613
Completed 0.20591225564006538
Completed 0.21878177161756945
Completed 0.23165128759507356
Completed 0.24452080357257763
Completed 0.25739031955008174
Completed 0.2702598355275858
Completed 0.2831293515050899
Completed 0.29599886748259396
Completed 0.32173789943760217
Completed 0.33460741541510625
Completed 0.3474769313926103
Completed 0.3603464473701144
Completed 0.3732159633476185
Completed 0.3860854793251226
Completed 0.3989549953026267
Completed 0.41182451128013076
Completed 0.42469402725763483
Completed 0.45043305921264304
Completed 0.4633025751901471
Completed 0.4761720911676512
Completed 0.48904160714515527
Completed 0.5019111

In [0]:
# Create Yearly Page Ranks Dictionary

years = rs_df['Season'].unique()
yearly_rankings = {}
for year in years:
    year_df = rs_df.loc[rs_df['Season'] == year]
    rankings = page_rank_df(year_df)
    yearly_rankings[year] = rankings

In [51]:
# Stitch page rank into cleaned dataframe
cleaned_df['H_PageRank'] = 0
cleaned_df['A_PageRank'] = 0
num_rows = rs_df.shape[0]
for index, row in cleaned_df.iterrows():
    if index%1000==0:
        print("Completed {}".format(index/num_rows))
    if row['Season']-1 > 2002:
        year_rankings = yearly_rankings[row['Season']-1]
        home_index = [i for i, tupl in enumerate(year_rankings) if tupl[0] == row['HomeTeamID']]
        away_index = [i for i, tupl in enumerate(year_rankings) if tupl[0] == row['AwayTeamID']]
        cleaned_df['H_PageRank'].iloc[index] = home_index[0]
        cleaned_df['A_PageRank'].iloc[index] = away_index[0]


Completed 0.0
Completed 0.012869515977504086
Completed 0.025739031955008172
Completed 0.038608547932512255
Completed 0.051478063910016345
Completed 0.06434757988752043
Completed 0.07721709586502451
Completed 0.0900866118425286
Completed 0.10295612782003269
Completed 0.11582564379753678
Completed 0.12869515977504087
Completed 0.14156467575254494
Completed 0.15443419173004902
Completed 0.16730370770755312
Completed 0.1801732236850572
Completed 0.1930427396625613
Completed 0.20591225564006538
Completed 0.21878177161756945
Completed 0.23165128759507356
Completed 0.24452080357257763
Completed 0.25739031955008174
Completed 0.2702598355275858
Completed 0.2831293515050899
Completed 0.29599886748259396
Completed 0.30886838346009804
Completed 0.32173789943760217
Completed 0.33460741541510625
Completed 0.3474769313926103
Completed 0.3603464473701144
Completed 0.3732159633476185
Completed 0.3860854793251226
Completed 0.3989549953026267
Completed 0.41182451128013076
Completed 0.42469402725763483
Co

In [0]:
def pagerank_tournament_results(page_ranks, year):
    correct = 0
    incorrect = 0
    df = data_dict['MNCAATourneyCompactResults'].copy()
    df = df.loc[df['Season'] == year]
    for index, row in df.iterrows():
        win_index = [i for i, tupl in enumerate(page_ranks) if tupl[0] == row['WTeamID']]
        lose_index = [i for i, tupl in enumerate(page_ranks) if tupl[0] == row['LTeamID']]
        if win_index < lose_index:
            correct = correct + 1
        else:
            incorrect = incorrect + 1
    return correct

In [0]:
page_rank_2019 = page_rank_df(rs_df, year=2019)

In [0]:
pagerank_tournament_results(page_rank_2019, 2019)

0.7611940298507462

In [0]:
# Predict Tournaments based solely on their page rank from the given season

years = rs_df['Season'].unique()
alphas = [0.5, 0.6, 0.7,0.8,0.9,1]
for i, season in enumerate(years):
    #print('\n----------------------{}----------------------'.format(season))
    pagerank = page_rank_df(rs_df, season)
    accuracy = pagerank_tournament_results(pagerank, season)
    print("{} - {}".format(season, accuracy))
                         

2003 - 0.65625
2004 - 0.703125


KeyboardInterrupt: ignored

And here we can see the results of regular seasons.

In [0]:
cleaned_df.to_csv('/content/drive/My Drive/cleaned_df_last_3_w_pagerank.csv')

In [0]:
cleaned_df.to_csv('/content/drive/My Drive/cleaned_df_last_3.csv')

## Data processing and feature engineering.

The main idea is to extract features, which could be useful to understand how much one team is better than another one.

In [0]:
# process seed
data_dict['MNCAATourneySeeds']['Seed'] = data_dict['MNCAATourneySeeds']['Seed'].apply(lambda x: int(x[1:3]))
# take only useful columns
data_dict['MNCAATourneySeeds'] = data_dict['MNCAATourneySeeds'][['Season', 'TeamID', 'Seed']]
data_dict['MNCAATourneyCompactResults'] = data_dict['MNCAATourneyCompactResults'][['Season','WTeamID', 'LTeamID']]

# merge the data and rename the columns
df = pd.merge(data_dict['MNCAATourneyCompactResults'], data_dict['MNCAATourneySeeds'],
              how='left', left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID'])
df = pd.merge(df, data_dict['MNCAATourneySeeds'], how='left', left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID'])
df = df.drop(['TeamID_x', 'TeamID_y'], axis=1)
df.columns = ['Season', 'WTeamID', 'LTeamID', 'WSeed', 'LSeed']
df.head()

Unnamed: 0,Season,WTeamID,LTeamID,WSeed,LSeed
0,1985,1116,1234,9,8
1,1985,1120,1345,11,6
2,1985,1207,1250,1,16
3,1985,1229,1425,9,8
4,1985,1242,1325,3,14


In [0]:
df.head()

Unnamed: 0,Season,WTeamID,LTeamID,WSeed,LSeed
0,1985,1116,1234,9,8
1,1985,1120,1345,11,6
2,1985,1207,1250,1,16
3,1985,1229,1425,9,8
4,1985,1242,1325,3,14


In [0]:
team_win_score = data_dict['MRegularSeasonCompactResults'].groupby(['Season', 'WTeamID']).agg({'WScore':['sum', 'count']}).reset_index()
team_win_score.columns = ['Season', 'WTeamID', 'WScore_sum', 'WScore_count']
team_loss_score = data_dict['MRegularSeasonCompactResults'].groupby(['Season', 'LTeamID']).agg({'LScore':['sum', 'count']}).reset_index()
team_loss_score.columns = ['Season', 'LTeamID', 'LScore_sum', 'LScore_count']
df = pd.merge(df, team_win_score, how='left', left_on=['Season', 'WTeamID'], right_on=['Season', 'WTeamID'])
df = pd.merge(df, team_loss_score, how='left', left_on=['Season', 'LTeamID'], right_on=['Season', 'LTeamID'])
df = pd.merge(df, team_loss_score, how='left', left_on=['Season', 'WTeamID'], right_on=['Season', 'LTeamID'])
df = pd.merge(df, team_win_score, how='left', left_on=['Season', 'LTeamID_x'], right_on=['Season', 'WTeamID'])
df.drop(['LTeamID_y', 'WTeamID_y'], axis=1, inplace=True)
df.head()

Unnamed: 0,Season,WTeamID_x,LTeamID_x,WSeed,LSeed,WScore_sum_x,WScore_count_x,LScore_sum_x,LScore_count_x,LScore_sum_y,LScore_count_y,WScore_sum_y,WScore_count_y
0,1985,1116,1234,9,8,1448,21,567.0,10.0,708.0,12.0,1525,20
1,1985,1120,1345,11,6,1308,18,537.0,8.0,732.0,11.0,1191,17
2,1985,1207,1250,1,16,1917,25,1085.0,18.0,128.0,2.0,822,11
3,1985,1229,1425,9,8,1484,20,529.0,9.0,449.0,7.0,1386,19
4,1985,1242,1325,3,14,1809,23,475.0,7.0,472.0,7.0,1349,20


In [0]:
for col in ['Score_1', 'Score_2', 'Count_1', 'Count_2', 'Score_diff', 'Count_diff']:
    print(col)
    data[col] = data[col].fillna(0).astype(int)

Score_1
Score_2
Count_1
Count_2
Score_diff
Count_diff


In [0]:
df['x_score'] = df['WScore_sum_x'] + df['LScore_sum_y']
df['y_score'] = df['WScore_sum_y'] + df['LScore_sum_x']
df['x_count'] = df['WScore_count_x'] + df['LScore_count_y']
df['y_count'] = df['WScore_count_y'] + df['WScore_count_x']

In [0]:
df_win = df.copy()
df_los = df.copy()
df_win = df_win[['WSeed', 'LSeed', 'x_score', 'y_score', 'x_count', 'y_count']]
df_los = df_los[['LSeed', 'WSeed', 'y_score', 'x_score', 'x_count', 'y_count']]
df_win.columns = ['Seed_1', 'Seed_2', 'Score_1', 'Score_2', 'Count_1', 'Count_2']
df_los.columns = ['Seed_1', 'Seed_2', 'Score_1', 'Score_2', 'Count_1', 'Count_2']

In [0]:
df_win['Seed_diff'] = df_win['Seed_1'] - df_win['Seed_2']
df_win['Score_diff'] = df_win['Score_1'] - df_win['Score_2']
df_los['Seed_diff'] = df_los['Seed_1'] - df_los['Seed_2']
df_los['Score_diff'] = df_los['Score_1'] - df_los['Score_2']

df_win['Count_diff'] = df_win['Count_1'] - df_win['Count_2']
df_win['Mean_score1'] = df_win['Score_1'] / df_win['Count_1']
df_win['Mean_score2'] = df_win['Score_2'] / df_win['Count_2']
df_win['Mean_score_diff'] = df_win['Mean_score1'] - df_win['Mean_score2']
df_los['Count_diff'] = df_los['Count_1'] - df_los['Count_2']
df_los['Mean_score1'] = df_los['Score_1'] / df_los['Count_1']
df_los['Mean_score2'] = df_los['Score_2'] / df_los['Count_2']
df_los['Mean_score_diff'] = df_los['Mean_score1'] - df_los['Mean_score2']

In [0]:
df_win['result'] = 1
df_los['result'] = 0
data = pd.concat((df_win, df_los)).reset_index(drop=True)

In [0]:
data.head()

Unnamed: 0,Seed_1,Seed_2,Score_1,Score_2,Count_1,Count_2,Seed_diff,Score_diff,Count_diff,Mean_score1,Mean_score2,Mean_score_diff,result
0,9,8,2156,2092,33,41,1,64,-8,65.333333,51.02439,14.308943,1
1,11,6,2040,1728,29,35,5,312,-6,70.344828,49.371429,20.973399,1
2,1,16,2045,1907,27,36,-15,138,-9,75.740741,52.972222,22.768519,1
3,9,8,1933,1915,27,39,1,18,-12,71.592593,49.102564,22.490028,1
4,3,14,2281,1824,30,43,-11,457,-13,76.033333,42.418605,33.614729,1


In [42]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


## Training model

In [0]:
n_fold = 5
folds = RepeatedStratifiedKFold(n_splits=n_fold)
# folds = StratifiedKFold(n_splits=n_fold)

In [0]:
X = data.drop(['result'], axis=1)
y = data['result']

In [0]:
# some of params are from this kernel: https://www.kaggle.com/ratan123/march-madness-2020-ncaam-simple-lightgbm-on-kfold
param = {'n_estimators':10000,
          'num_leaves': 400,
          'min_child_weight': 0.034,
          'feature_fraction': 0.379,
          'bagging_fraction': 0.418,
          'min_data_in_leaf': 106,
          'objective': 'binary',
          'max_depth': -1,
          'learning_rate': 0.007,
          "boosting_type": "gbdt",
          #"bagging_seed": 11,
          "metric": 'binary_logloss',
          "verbosity": 10,
          'reg_alpha': 0.3899,
          'reg_lambda': 0.648,
          'random_state': 47,
          'task':'train', 'nthread':-1, 
         'verbose': 10000,
         'early_stopping_rounds': 10000,
         'eval_metric': 'binary_logloss'
         }
cat_cols = []
mt = MainTransformer(create_interactions=False)
# ct = CategoricalTransformer(drop_original=True, cat_cols=cat_cols)
ft = FeatureTransformer()
transformers = {'ft': ft}
lgb_model = ClassifierModel(model_wrapper=LGBWrapper())
lgb_model.fit(X=X, y=y, folds=folds, params=param, preprocesser=mt, transformers=transformers,
                    eval_metric='binary_logloss', cols_to_drop=None, plot=True)

Fold 1 started at Mon Mar  2 22:32:38 2020
Training until validation scores don't improve for 10000 rounds.
[10000]	train's binary_logloss: 0.146494	valid's binary_logloss: 0.552982
Did not meet early stopping. Best iteration is:
[10000]	train's binary_logloss: 0.146494	valid's binary_logloss: 0.552982
Fold 2 started at Mon Mar  2 22:32:58 2020
Training until validation scores don't improve for 10000 rounds.
[10000]	train's binary_logloss: 0.141778	valid's binary_logloss: 0.580302
Did not meet early stopping. Best iteration is:
[10000]	train's binary_logloss: 0.141778	valid's binary_logloss: 0.580302
Fold 3 started at Mon Mar  2 22:33:18 2020
Training until validation scores don't improve for 10000 rounds.
[10000]	train's binary_logloss: 0.136991	valid's binary_logloss: 0.632491
Did not meet early stopping. Best iteration is:
[10000]	train's binary_logloss: 0.136991	valid's binary_logloss: 0.632491
Fold 4 started at Mon Mar  2 22:33:38 2020
Training until validation scores don't improv

## Preparing test data

In [0]:
test = pd.read_csv('../input/google-cloud-ncaa-march-madness-2020-division-1-mens-tournament/MSampleSubmissionStage1_2020.csv')
test = test.drop(['Pred'], axis=1)
test['Season'] = test['ID'].apply(lambda x: int(x.split('_')[0]))
test['Team1'] = test['ID'].apply(lambda x: int(x.split('_')[1]))
test['Team2'] = test['ID'].apply(lambda x: int(x.split('_')[2]))
test = pd.merge(test, data_dict['MNCAATourneySeeds'], how='left', left_on=['Season', 'Team1'], right_on=['Season', 'TeamID'])
test = pd.merge(test, data_dict['MNCAATourneySeeds'], how='left', left_on=['Season', 'Team2'], right_on=['Season', 'TeamID'])
test = pd.merge(test, team_win_score, how='left', left_on=['Season', 'Team1'], right_on=['Season', 'WTeamID'])
test = pd.merge(test, team_loss_score, how='left', left_on=['Season', 'Team2'], right_on=['Season', 'LTeamID'])
test = pd.merge(test, team_loss_score, how='left', left_on=['Season', 'Team1'], right_on=['Season', 'LTeamID'])
test = pd.merge(test, team_win_score, how='left', left_on=['Season', 'Team2'], right_on=['Season', 'WTeamID'])
test['seed_diff'] = test['Seed_x'] - test['Seed_y']

FileNotFoundError: ignored

In [0]:
test['x_score'] = test['WScore_sum_x'] + test['LScore_sum_y']
test['y_score'] = test['WScore_sum_y'] + test['LScore_sum_x']
test['x_count'] = test['WScore_count_x'] + test['LScore_count_y']
test['y_count'] = test['WScore_count_y'] + test['WScore_count_x']

NameError: ignored

In [0]:
test.head()

NameError: ignored

In [0]:
test = test[['Seed_x', 'Seed_y', 'x_score', 'y_score', 'x_count', 'y_count']]
test.columns = ['Seed_1', 'Seed_2', 'Score_1', 'Score_2', 'Count_1', 'Count_2']

NameError: ignored

In [0]:
test['Seed_diff'] = test['Seed_1'] - test['Seed_2']
test['Score_diff'] = test['Score_1'] - test['Score_2']
test['Seed_diff'] = test['Seed_1'] - test['Seed_2']
test['Score_diff'] = test['Score_1'] - test['Score_2']

test['Count_diff'] = test['Count_1'] - test['Count_2']
test['Mean_score1'] = test['Score_1'] / test['Count_1']
test['Mean_score2'] = test['Score_2'] / test['Count_2']
test['Mean_score_diff'] = test['Mean_score1'] - test['Mean_score2']
test['Count_diff'] = test['Count_1'] - test['Count_2']
test['Mean_score1'] = test['Score_1'] / test['Count_1']
test['Mean_score2'] = test['Score_2'] / test['Count_2']
test['Mean_score_diff'] = test['Mean_score1'] - test['Mean_score2']

NameError: ignored

In [0]:
test.head()

NameError: ignored

## Making predictions

In [0]:
test_preds = lgb_model.predict(test)

NameError: ignored

In [0]:
plt.hist(test_preds);

In [0]:
submission_df = pd.read_csv('../input/google-cloud-ncaa-march-madness-2020-division-1-mens-tournament/MSampleSubmissionStage1_2020.csv')
submission_df['Pred'] = test_preds
submission_df

In [0]:
submission_df.to_csv('submission.csv', index=False)