### Thanks for visiting this script. It covers the following:
- Training with LightGBM (LGBM)
- Cross-validation (CV) analysis
- Feature importance
- Confusion matrix and the related metrics
- Creating the submission file

## 0. Library
- Import libraries

In [None]:
import os
import gc
import glob
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

import lightgbm as lgb

## 1. Config
- Config class: the base settings are defined here

In [None]:
class Config:
    """Base settings for the notebook: input/output locations and run tags."""

    def __init__(self):
        # Directory the competition CSV files are read from, and the
        # directory intended for outputs.
        self.data_dir = '../input/mens-march-mania-2022/MDataFiles_Stage1/'
        self.output_dir = './'

        # Date stamp and revision number of this run.
        self.today = "20220227"
        self.revision = 1

        # Debug flag (not read anywhere in the visible code).
        self._debug = 0


config = Config()

## 2. PreProcessing
- Read the CSV data
- Ref: https://www.kaggle.com/kazuya99986/eda-of-march-machine-learning-mania-2022-men
- This notebook uses only the detailed results of the regular season and the NCAA tournament

In [None]:
def _read_input(file_name, **read_kwargs):
    """Read one competition CSV located under ``config.data_dir``.

    Extra keyword arguments are forwarded to ``pd.read_csv``.
    """
    return pd.read_csv(config.data_dir + file_name, **read_kwargs)


# Every table is loaded up front, in the same order as before.
cities = _read_input('Cities.csv')
conferences = _read_input('Conferences.csv')
teams = _read_input('MTeams.csv')
regular_compact_result = _read_input('MRegularSeasonCompactResults.csv')
regular_detail_result = _read_input('MRegularSeasonDetailedResults.csv')
conference_tourney = _read_input('MConferenceTourneyGames.csv')
seasons = _read_input('MSeasons.csv')
ncaa_compact_result = _read_input('MNCAATourneyCompactResults.csv')
ncaa_detail_result = _read_input('MNCAATourneyDetailedResults.csv')
ncaa_seed_roundslot = _read_input('MNCAATourneySeedRoundSlots.csv')
ncaa_seed = _read_input('MNCAATourneySeeds.csv')
ncaa_slot = _read_input('MNCAATourneySlots.csv')
secondary_compact_result = _read_input('MSecondaryTourneyCompactResults.csv')
secondary_teams = _read_input('MSecondaryTourneyTeams.csv')
coaches = _read_input('MTeamCoaches.csv')
team_conferences = _read_input('MTeamConferences.csv')
# This file is read with an explicit shift-jis encoding.
team_spellings = _read_input('MTeamSpellings.csv', encoding='shift-jis')
game_cities = _read_input('MGameCities.csv')
massey_ordinals = _read_input('MMasseyOrdinals.csv')
submission = _read_input('MSampleSubmissionStage1.csv')

### Regular Detail Data

In [None]:
# Tag the regular-season games: competition flag 0, outcome label 1
# (the W* columns of each row describe the winning team).
regular_detail_result = regular_detail_result.assign(
    **{"Reg/NCAA": 0, "Win/Lose": 1}
)
regular_detail_result.head(5)

### NCAA Detail Data

In [None]:
# Tag the NCAA tournament games: competition flag 1, outcome label 1
# (the W* columns of each row describe the winning team).
ncaa_detail_result = ncaa_detail_result.assign(
    **{"Reg/NCAA": 1, "Win/Lose": 1}
)
ncaa_detail_result.head(5)

### Merge Data (Regular + NCAA)

In [None]:
# Stack the regular-season rows on top of the NCAA tournament rows.
detail_result = pd.concat((regular_detail_result, ncaa_detail_result))
detail_result.head(5)

### Reshaping the Data
- Rearrange the data layout.
- Each game is stored once, from the winner's side. To create a win/loss (1 or 0) target column, a copy of every game seen from the losing team's side is appended vertically.

In [None]:
# Winner's-perspective rows: the data as loaded, already labelled 1.
detail_result_win = detail_result

# Loser's-perspective rows: the same games with the L* and W* column groups
# swapped. The columns are selected loser-first and then relabelled with the
# original names, so after the rename the "W*" columns hold the losing team.
# An explicit copy is taken so the edits below operate on an independent
# frame instead of a slice of ``detail_result``.
detail_result_lose = detail_result[['Season', 'DayNum', 'LTeamID', 'LScore', 'WTeamID', 'WScore', 'WLoc', 'NumOT',
                                   'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF',
                                   'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF',
                                   'Reg/NCAA', 'Win/Lose']].copy()
detail_result_lose.columns = detail_result_win.columns
detail_result_lose['Win/Lose'] = 0

# WLoc is recorded from the winner's point of view. Once the teams are
# swapped, a home win ('H') is an away game for the team now in the W*
# columns and vice versa; neutral-site games ('N') are unaffected.
detail_result_lose['WLoc'] = detail_result_lose['WLoc'].replace({'H': 'A', 'A': 'H'})

# ignore_index gives the stacked frame unique row labels; the source frames
# carry overlapping labels that would otherwise be duplicated.
detail_result_merge = pd.concat([detail_result_win, detail_result_lose], axis=0, ignore_index=True)
detail_result_merge.tail(5)

- Label Encoding

In [None]:
# Learn the location classes first, then overwrite the column with their
# integer labels. The fitted encoder is kept in ``en_loc``.
en_loc = LabelEncoder().fit(detail_result_merge["WLoc"])
detail_result_merge["WLoc"] = en_loc.transform(detail_result_merge["WLoc"])
detail_result_merge

- Categorical Data

In [None]:
# Columns that are identifiers / discrete codes rather than quantities.
categorical_val = ["Season", "DayNum", "WTeamID", "LTeamID", "WLoc", "NumOT", "Reg/NCAA"]

# Cast them to pandas' category dtype in one pass; other columns are untouched.
detail_result_merge = detail_result_merge.astype(
    {column: "category" for column in categorical_val}
)
detail_result_merge.dtypes

## 3. Train Data
- The score columns are removed because the score directly determines the outcome; the made-shot counts (FGM, FGM3, FTM) are left out as well.

In [None]:
# Feature groups: per-game constants, then the stats of the team held in the
# W* columns and of the team held in the L* columns.
const_columns = ['Season', 'WLoc', 'NumOT', 'Reg/NCAA']
win_columns = ['WTeamID', 'WFGA', 'WFGA3', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF']
lose_columns = ['LTeamID', 'LFGA', 'LFGA3', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF']

# Target: 1 for winner's-perspective rows, 0 for loser's-perspective rows.
y = detail_result_merge["Win/Lose"]

# Features, in the order W* block, L* block, constants. The label and every
# column not listed above are excluded by this selection.
X = detail_result_merge[win_columns + lose_columns + const_columns]

display(X.head(5))

## 4. LGBM Trial

In [None]:
class LGBM:
    """K-fold cross-validated LightGBM binary classifier.

    Attributes:
        n_splits: Number of CV folds.
        kf: Shuffled ``KFold`` splitter with a fixed random state.
        cv_clf: Trained boosters, one per fold, in fold order.
        results: Per-fold dicts holding the confusion matrix and metrics.
        train, target: Feature frame and label series given at construction.
        y_oof: Out-of-fold predicted probabilities, aligned by position
            with ``target``.
        num_boost_round: Upper bound on boosting iterations per fold.
        early_stopping: Rounds without validation improvement before stopping.
        verbose_eval: Period (in iterations) of evaluation logging.
    """

    def __init__(self, train, target):
        """Store the data and set the CV / boosting hyper-parameters.

        Args:
            train: Feature table; rows are selected positionally (``.iloc``).
            target: Binary labels; rows are selected positionally (``.iloc``).
        """
        self.n_splits = 5
        self.kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=2021)
        self.cv_clf = []
        self.results = []
        self.train, self.target = train, target
        self.y_oof = np.zeros(len(self.target))
        self.num_boost_round = 10000
        self.early_stopping = 100
        self.verbose_eval = 100

    def lgbm_train(self, params):
        """Train one booster per fold and record out-of-fold results.

        For every fold this fits on the training split with early stopping
        on the validation split, stores the validation probabilities in
        ``self.y_oof``, appends the booster to ``self.cv_clf`` and the
        fold's metrics to ``self.results``. Progress and scores are printed;
        the confusion matrix is shown with ``display`` (expected to be
        available, as in an IPython/Jupyter session).

        Args:
            params: LightGBM parameter dict passed to ``lgb.train``.
        """
        # NOTE(review): folds are drawn row-wise. If the data holds several
        # rows describing the same game, they can land in different folds —
        # confirm whether a grouped split is wanted.
        for fold, (trn_idx, val_idx) in enumerate(self.kf.split(self.train, self.target)):
            print("="*15 + f' Fold {fold+1} started at {time.ctime()} ' + "="*15)
            X_trn, X_val = self.train.iloc[trn_idx], self.train.iloc[val_idx]
            y_trn, y_val = self.target.iloc[trn_idx], self.target.iloc[val_idx]

            lgb_trn = lgb.Dataset(X_trn, y_trn, weight=None)
            lgb_val = lgb.Dataset(X_val, y_val, weight=None)

            # Early stopping and periodic logging are configured through
            # callbacks: the ``early_stopping_rounds`` / ``verbose_eval``
            # keyword arguments of ``lgb.train`` were removed in LightGBM 4.0.
            clf = lgb.train(
                params=params,
                train_set=lgb_trn,
                valid_sets=[lgb_trn, lgb_val],
                num_boost_round=self.num_boost_round,
                callbacks=[
                    lgb.early_stopping(stopping_rounds=self.early_stopping),
                    lgb.log_evaluation(period=self.verbose_eval),
                ],
            )

            # Probabilities for the held-out rows of this fold.
            y_pred = clf.predict(X_val)
            self.y_oof[val_idx] = y_pred

            # Hard labels at a 0.5 threshold for the classification metrics;
            # log loss is computed on the raw probabilities.
            y_pred2 = np.where(y_pred < 0.5, 0, 1)
            res_dict = {"Matrix": metrics.confusion_matrix(y_val, y_pred2),
                        "Accuracy": metrics.accuracy_score(y_val, y_pred2),
                        "Precision": metrics.precision_score(y_val, y_pred2),
                        "Recall": metrics.recall_score(y_val, y_pred2),
                        "F_value": metrics.f1_score(y_val, y_pred2),
                        "LogLoss": metrics.log_loss(y_val, y_pred)
                       }

            print("Train Results")
            display(res_dict["Matrix"])
            print(f'LogLoss: {res_dict["LogLoss"]}\n')
            self.cv_clf.append(clf)
            self.results.append(res_dict)

        # Overall score from the out-of-fold predictions of all folds.
        print(f'Summary LogLoss: {metrics.log_loss(self.target, self.y_oof)}\n')

# LightGBM settings: gradient-boosted trees, binary objective, log-loss metric.
params = dict(
    boosting_type="gbdt",
    metric="binary_logloss",
    objective="binary",
    learning_rate=0.1,
    random_state=1,
)

# Run the cross-validated training on the prepared features and labels.
lgbm_inst = LGBM(X, y)
lgbm_inst.lgbm_train(params)

### Feature_Importance

In [None]:
# One importance panel per CV model, laid out side by side.
n_models = len(lgbm_inst.cv_clf)
fig, ax = plt.subplots(1, n_models, figsize=(6 * n_models, 12))
for fold_no, (panel, booster) in enumerate(zip(ax, lgbm_inst.cv_clf), start=1):
    lgb.plot_importance(
        booster,
        ax=panel,
        height=0.5,
        title=f'Feature importance CV{fold_no}',
        xlabel='Feature importance',
        ylabel=None,
    )
plt.show()

### Confusion_matrix Results

In [None]:
# Metrics to chart and the fixed y-axis window shared by every panel.
col_lst = ["Accuracy", "Precision", "Recall", "F_value"]
vmin, vmax = 0.90, 0.93

# One row per fold, one column per recorded metric.
train_results = pd.DataFrame(lgbm_inst.results)
fold_labels = [f"CV{fold_no}" for fold_no in range(1, len(train_results) + 1)]

fig, ax = plt.subplots(1, len(col_lst), figsize=(24, 6))
for panel, name in zip(ax, col_lst):
    panel.bar(fold_labels, train_results[name])
    panel.set(title=name, ylim=(vmin, vmax))
plt.show()

## 5. Create Test Data & Submission
- Each team's per-season mean statistics are used as the test features

In [None]:
# Specify the minimum and maximum prediction probabilities.
pred_min, pred_max = 0.2, 0.8

# The test features do not depend on the model, so they are built once for
# the whole submission and reused for every CV booster.

# Box-score features of the W* block (everything except the team id) and the
# positional mapping to their L* counterparts.
stat_columns = [column for column in win_columns if column != "WTeamID"]
win_to_lose = dict(zip(win_columns, lose_columns))

# Mean of every stat per (season, team), over the rows of X in which the team
# sits in the W* columns. The keys are cast to plain integers so that the
# group-by only produces (season, team) pairs that actually occur.
team_means = (
    X[["Season", "WTeamID"] + stat_columns]
    .astype({"Season": int, "WTeamID": int})
    .groupby(["Season", "WTeamID"])[stat_columns]
    .mean()
)

# Submission IDs have the form "<season>_<team1>_<team2>".
matchups = submission["ID"].str.split("_", expand=True).astype(int)
matchups.columns = ["Season", "WTeamID", "LTeamID"]


def _lookup_team_stats(team_column, out_columns):
    """Return the season means of the team in ``team_column`` for every matchup.

    Rows whose (season, team) pair has no entry in ``team_means`` are filled
    with NaN instead of being dropped, so the result always has one row per
    submission row.
    """
    keys = pd.MultiIndex.from_arrays([matchups["Season"], matchups[team_column]])
    found = team_means.reindex(keys)
    return pd.DataFrame(found.to_numpy(), index=submission.index, columns=out_columns)


X_test = pd.concat(
    [
        matchups[["WTeamID"]],
        _lookup_team_stats("WTeamID", stat_columns),
        matchups[["LTeamID"]],
        _lookup_team_stats("LTeamID", [win_to_lose[column] for column in stat_columns]),
    ],
    axis=1,
)

# Temporary data
X_test["Season"] = matchups["Season"]
X_test["WLoc"] = 2  # N
X_test["NumOT"] = 0
X_test["Reg/NCAA"] = 1

categorical_val = ["Season", "WTeamID", "LTeamID", "WLoc", "NumOT", "Reg/NCAA"]
X_test = X_test.astype({column: "category" for column in categorical_val})

# Present the columns in exactly the order the models were trained on.
X_test = X_test[list(X.columns)]

# One clipped prediction column per CV model.
for i, clf in enumerate(tqdm(lgbm_inst.cv_clf)):
    submission[f'Pred{i}'] = np.clip(clf.predict(X_test), pred_min, pred_max)

display(submission.head(10))

- Submission

In [None]:
# Average the per-model predictions directly. This does not rely on the
# placeholder value that the sample submission ships in its 'Pred' column.
pred_columns = [f'Pred{i}' for i in range(len(lgbm_inst.cv_clf))]
submission['Pred'] = submission[pred_columns].mean(axis=1)

# The per-model columns are only intermediates; keep the final column set.
submission = submission.drop(pred_columns, axis=1)

display(submission.head(10))
submission.to_csv('submission.csv', index=False)