In [None]:
import catboost
# Record the CatBoost release in the cell output so results can be tied to a library version.
print(f"Catboost version: {catboost.__version__}")
# IPython shell escape (notebook-only syntax): reports the version of whichever `python`
# the shell resolves, which is not necessarily the interpreter running this kernel.
!python --version

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from catboost import CatBoostClassifier, Pool, cv
from sklearn import model_selection, metrics, preprocessing

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily walk the Kaggle input mount and echo the full path of every file beneath it.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# Any results you write to the current directory are saved as output.

In [None]:
# Stage1
# Single root for the competition dataset so the long path is spelled out only once.
input_dir = "../input/google-cloud-ncaa-march-madness-2020-division-1-mens-tournament"
stage1dir = f"{input_dir}/MDataFiles_Stage1"

# Detailed game results: regular season (training source) and NCAA tournament (validation source).
reg_season = pd.read_csv(f"{stage1dir}/MRegularSeasonDetailedResults.csv")
tourney = pd.read_csv(f"{stage1dir}/MNCAATourneyDetailedResults.csv")
# Sample submission; it sits next to the Stage1 folder, not inside it.
submit = pd.read_csv(f"{input_dir}/MSampleSubmissionStage1_2020.csv")

In [None]:
# Quick look at the regular-season results: dimensions, a blank line, column dtypes, then the first rows.
print(f"Shape: {reg_season.shape}\n", reg_season.dtypes, sep="\n")
reg_season.head()

In [None]:
# Quick look at the tournament results: dimensions, a blank line, column dtypes, then the first rows.
print(f"Shape: {tourney.shape}\n", tourney.dtypes, sep="\n")
tourney.head()

In [None]:
# Quick look at the sample submission: dimensions, a blank line, column dtypes, then the first rows.
print(f"Shape: {submit.shape}\n", submit.dtypes, sep="\n")
submit.head()

## Functions

In [None]:
def get_id_part(ID, part):
    """Return one field of a ``Season_t1_t2`` style game ID.

    ``ID`` is split on "_" and its pieces are labelled, in order, "Season",
    "t1" and "t2"; the piece labelled ``part`` is returned as a string.

    Raises:
        KeyError: if ``part`` is not among the labels produced for this ID.
        IndexError: if ``ID`` has more than three "_"-separated pieces.
    """
    labels = ["Season", "t1", "t2"]
    fields = {}
    for position, value in enumerate(ID.split("_")):
        fields[labels[position]] = value
    return fields[part]

# Get submission-like dataset from `MRegularSeasonDetailedResults.csv` and a test/validation dataset from `MNCAATourneyDetailedResults.csv`.

In [None]:
# Build one submission-style frame per results table and save it as df<label>.csv:
# "0" -> regular season (df0.csv), "test" -> NCAA tournament (dftest.csv).
game_tables = {"0": reg_season, "test": tourney}
for label, idf in game_tables.items():
    team_pair = idf[["WTeamID", "LTeamID"]]
    df = pd.DataFrame()
    df["Season"] = idf["Season"]
    # Regular-season rows keep their real day; every tournament row is stamped with day 134.
    # NOTE(review): presumably so that features only look at pre-tournament games - confirm.
    if label == "0":
        df["DayNum"] = idf["DayNum"]
    else:
        df["DayNum"] = 134
    # t1 is always the lower team ID and t2 the higher one, regardless of who won.
    df["t1"] = team_pair.min(axis=1)
    df["t2"] = team_pair.max(axis=1)
    season_str, t1_str, t2_str = (df[col].astype(str) for col in ("Season", "t1", "t2"))
    df["ID"] = season_str + "_" + t1_str + "_" + t2_str
    # Target: 1 when the lower-ID team (t1) won the game, otherwise 0.
    df["Pred"] = (idf["WTeamID"] == df["t1"]).astype(int)
    print(f"{label} Shape: {df.shape}")
    print(df.head(20))
    df.to_csv(f"df{label}.csv", index=False)

# Get submission dataset from `MSampleSubmissionStage1_2020.csv`.

In [None]:
# Split every "Season_t1_t2" ID once with the vectorised string accessor instead of
# calling a Python lambda per row for each of the three fields. The pieces stay strings.
id_fields = submit["ID"].str.split("_")

df = pd.DataFrame()
df["ID"] = submit["ID"]
df["Season"] = id_fields.str[0]
# Every submission row uses the same fixed day number as the tournament frame (dftest.csv).
df["DayNum"] = 134
df["t1"] = id_fields.str[1]
df["t2"] = id_fields.str[2]
print(f"Shape: {df.shape}")
df.to_csv("submission.csv", index=False)
df.head()

# Feature engineering

## SQL Server query from `MRegularSeasonDetailedResults.csv` to `df.csv` and `MNCAATourneyDetailedResults.csv` to `dftest.csv`

**Why SQL Server?**

Because Pandas is slow. Using SQL Server this query took 14:54 minutes on a normal laptop. The submission query is the same with three exceptions:

1. `Pred` is not included (of course).
2. `t1_loc` is always the constant `'N'` (neutral site), i.e. `'N' AS t1_loc`
3. `ORDER BY` is changed to `ORDER BY Season, t1, t2`

The query follows this pattern:
```
SELECT <query_1>
    INTO #temp_table1
    FROM <original_table>

SELECT <query_2>
    INTO #temp_table2
    FROM #temp_table1

SELECT <query_3>
    FROM #temp_table2
    
DROP TABLE #temp_table1
DROP TABLE #temp_table2
```

**Here is the full query:**

```
/****** Script for SelectTopNRows command from SSMS  ******/
SELECT [Season]
      ,[DayNum]
      ,[t1]
      ,[t2]
      ,[ID]
      ,[Pred]
	  ,(SELECT CASE WHEN a.t1 = b.WTeamID THEN b.WLoc ELSE
					CASE WHEN b.WLoc = 'N' THEN 'N' ELSE
						CASE WHEN b.WLoc = 'H' THEN 'A' ELSE 'H' END
					END
			   END
		FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
		WHERE b.Season = a.Season AND b.DayNum = a.DayNum AND a.t1 IN (b.WTeamID, b.LTeamID)) AS t1_loc

	  -- RANKINGS
	  ,(SELECT TOP 1 RankingDayNum FROM [kaggle_ncaam_2020].[dbo].[MMasseyOrdinals] b
			WHERE b.Season = a.Season AND b.TeamID = a.t1 ORDER BY ABS(a.DayNum - b.RankingDayNum)) AS t1_rank_day
	  ,(SELECT TOP 1 OrdinalRank FROM [kaggle_ncaam_2020].[dbo].[MMasseyOrdinals] b
			WHERE b.Season = a.Season AND b.TeamID = a.t1 ORDER BY ABS(a.DayNum - b.RankingDayNum)) AS t1_rank
	  ,(SELECT TOP 1 RankingDayNum FROM [kaggle_ncaam_2020].[dbo].[MMasseyOrdinals] b
			WHERE b.Season = a.Season AND b.TeamID = a.t2 ORDER BY ABS(a.DayNum - b.RankingDayNum)) AS t2_rank_day
	  ,(SELECT TOP 1 OrdinalRank FROM [kaggle_ncaam_2020].[dbo].[MMasseyOrdinals] b
			WHERE b.Season = a.Season AND b.TeamID = a.t2 ORDER BY ABS(a.DayNum - b.RankingDayNum)) AS t2_rank

	  -- T1

	  --- winning percentage
	  , (SELECT COUNT(b.DayNum) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 IN (b.WTeamID, b.LTeamID)) AS t1_games
	  , (SELECT COUNT(b.DayNum) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.WTeamID) AS t1_wins
	  , COALESCE(CAST((SELECT COUNT(b.DayNum) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.WTeamID) AS FLOAT) /
			CAST(NULLIF((SELECT COUNT(b.DayNum) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 IN (b.WTeamID, b.LTeamID)), 0) AS FLOAT), 0.5) AS t1_win_perc

	  --- home/away/neutral winning percentage
	  , (SELECT COUNT(b.DayNum) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.WTeamID AND b.WLoc = 'H') AS t1_H_wins
	  , (SELECT COUNT(b.DayNum) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.LTeamID AND b.WLoc = 'A') AS t1_H_losses
	  , (SELECT COUNT(b.DayNum) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.WTeamID AND b.WLoc = 'A') AS t1_A_wins
	  , (SELECT COUNT(b.DayNum) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.LTeamID AND b.WLoc = 'H') AS t1_A_losses
	  , (SELECT COUNT(b.DayNum) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.WTeamID AND b.WLoc = 'N') AS t1_N_wins
	  , (SELECT COUNT(b.DayNum) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.LTeamID AND b.WLoc = 'N') AS t1_N_losses

	  --- outscore
	  , (SELECT SUM(b.WScore)
			FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season
			AND b.DayNum < a.DayNum AND a.t1 = b.WTeamID) AS t1_win_points
	  , (SELECT SUM(b.LScore)
			FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season
			AND b.DayNum < a.DayNum AND a.t1 = b.WTeamID) AS t1_win_opp_points
	  , COALESCE((SELECT CAST(SUM(b.WScore) - SUM(b.LScore) AS FLOAT) / CAST(NULLIF(COUNT(b.DayNum), 0) AS FLOAT)
			FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season
			AND b.DayNum < a.DayNum AND a.t1 = b.WTeamID), 0) AS t1_outscore

	  --- outscored
	  , (SELECT SUM(b.LScore)
			FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season
			AND b.DayNum < a.DayNum AND a.t1 = b.LTeamID) AS t1_l_points
	  , (SELECT SUM(b.WScore)
			FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season
			AND b.DayNum < a.DayNum AND a.t1 = b.LTeamID) AS t1_l_opp_points
	  , COALESCE((SELECT CAST(SUM(b.WScore) - SUM(b.LScore) AS FLOAT) / CAST(NULLIF(COUNT(b.DayNum), 0) AS FLOAT)
			FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season
			AND b.DayNum < a.DayNum AND a.t1 = b.LTeamID), 0) AS t1_outscored

	  --- field goals
	  , (SELECT SUM(b.WFGA) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.WTeamID) AS t1_wfga
	  , (SELECT SUM(b.WFGM) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.WTeamID) AS t1_wfgm
	  , (SELECT CAST(SUM(b.WFGM) AS FLOAT)/NULLIF(CAST(SUM(b.WFGA) AS FLOAT), 0)
			FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.WTeamID) AS t1_wfg_perc

	  , (SELECT SUM(b.LFGA) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.LTeamID) AS t1_lfga
	  , (SELECT SUM(b.LFGM) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.LTeamID) AS t1_lfgm
	  , (SELECT CAST(SUM(b.LFGM) AS FLOAT)/NULLIF(CAST(SUM(b.LFGA) AS FLOAT), 0)
			FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.LTeamID) AS t1_lfg_perc
	  
	  --- 3 pointers
	  , (SELECT SUM(b.WFGA3) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.WTeamID) AS t1_wfga3
	  , (SELECT SUM(b.WFGM3) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.WTeamID) AS t1_wfgm3
	  , (SELECT CAST(SUM(b.WFGM3) AS FLOAT)/NULLIF(CAST(SUM(b.WFGA3) AS FLOAT), 0)
			FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.WTeamID) AS t1_wfg3_perc

	  , (SELECT SUM(b.LFGA3) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.LTeamID) AS t1_lfga3
	  , (SELECT SUM(b.LFGM3) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.LTeamID) AS t1_lfgm3
	  , (SELECT CAST(SUM(b.LFGM3) AS FLOAT)/NULLIF(CAST(SUM(b.LFGA3) AS FLOAT), 0)
			FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.LTeamID) AS t1_lfg3_perc

	  --- free throws
	  , (SELECT SUM(b.WFTA) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.WTeamID) AS t1_wfta
	  , (SELECT SUM(b.WFTM) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.WTeamID) AS t1_wftm
	  , (SELECT CAST(SUM(b.WFTM) AS FLOAT)/NULLIF(CAST(SUM(b.WFTA) AS FLOAT), 0)
			FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.WTeamID) AS t1_wft_perc

	  , (SELECT SUM(b.LFTA) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.LTeamID) AS t1_lfta
	  , (SELECT SUM(b.LFTM) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.LTeamID) AS t1_lftm
	  , (SELECT CAST(SUM(b.LFTM) AS FLOAT)/NULLIF(CAST(SUM(b.LFTA) AS FLOAT), 0)
			FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.LTeamID) AS t1_lft_perc

	  --- rebounds

	  ---- offensive
	  , (SELECT SUM(b.WOR) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.WTeamID) AS t1_wor
	  , (SELECT CAST(SUM(b.WOR) AS FLOAT) / CAST(COUNT(b.DayNum) AS FLOAT) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.WTeamID) AS t1_wor_avg

	  , (SELECT SUM(b.LOR) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.LTeamID) AS t1_lor
	  , (SELECT CAST(SUM(b.LOR) AS FLOAT) / CAST(COUNT(b.DayNum) AS FLOAT) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.LTeamID) AS t1_lor_avg

	  ---- defensive
	  , (SELECT SUM(b.WDR) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.WTeamID) AS t1_wdr
	  , (SELECT CAST(SUM(b.WDR) AS FLOAT) / CAST(COUNT(b.DayNum) AS FLOAT) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.WTeamID) AS t1_wdr_avg

	  , (SELECT SUM(b.LDR) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.LTeamID) AS t1_ldr
	  , (SELECT CAST(SUM(b.LDR) AS FLOAT) / CAST(COUNT(b.DayNum) AS FLOAT) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.LTeamID) AS t1_ldr_avg

	  --- assists
	  , (SELECT SUM(b.WAst) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.WTeamID) AS t1_wast
	  , (SELECT CAST(SUM(b.WAst) AS FLOAT) / CAST(COUNT(b.DayNum) AS FLOAT) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.WTeamID) AS t1_wast_avg
	  , (SELECT SUM(b.LAst) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.LTeamID) AS t1_last
	  , (SELECT CAST(SUM(b.LAst) AS FLOAT) / CAST(COUNT(b.DayNum) AS FLOAT) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.LTeamID) AS t1_last_avg

	  --- turnovers
	  , (SELECT SUM(b.WTO) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.WTeamID) AS t1_wto
	  , (SELECT CAST(SUM(b.WTO) AS FLOAT) / CAST(COUNT(b.DayNum) AS FLOAT) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.WTeamID) AS t1_wto_avg
	  , (SELECT SUM(b.LTO) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.LTeamID) AS t1_lto
	  , (SELECT CAST(SUM(b.LTO) AS FLOAT) / CAST(COUNT(b.DayNum) AS FLOAT) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.LTeamID) AS t1_lto_avg

	  --- steals
	  , (SELECT SUM(b.WStl) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.WTeamID) AS t1_wstl
	  , (SELECT CAST(SUM(b.WStl) AS FLOAT) / CAST(COUNT(b.DayNum) AS FLOAT) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.WTeamID) AS t1_wstl_avg
	  , (SELECT SUM(b.LStl) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.LTeamID) AS t1_lstl
	  , (SELECT CAST(SUM(b.LStl) AS FLOAT) / CAST(COUNT(b.DayNum) AS FLOAT) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.LTeamID) AS t1_lstl_avg

	  --- blocks
	  , (SELECT SUM(b.WBlk) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.WTeamID) AS t1_wblk
	  , (SELECT CAST(SUM(b.WBlk) AS FLOAT) / CAST(COUNT(b.DayNum) AS FLOAT) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.WTeamID) AS t1_wblk_avg
	  , (SELECT SUM(b.LBlk) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.LTeamID) AS t1_lblk
	  , (SELECT CAST(SUM(b.LBlk) AS FLOAT) / CAST(COUNT(b.DayNum) AS FLOAT) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.LTeamID) AS t1_lblk_avg

	  --- personal fouls
	  , (SELECT SUM(b.WPF) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.WTeamID) AS t1_wpf
	  , (SELECT CAST(SUM(b.WPF) AS FLOAT) / CAST(COUNT(b.DayNum) AS FLOAT) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.WTeamID) AS t1_wpf_avg
	  , (SELECT SUM(b.LPF) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.LTeamID) AS t1_lpf
	  , (SELECT CAST(SUM(b.LPF) AS FLOAT) / CAST(COUNT(b.DayNum) AS FLOAT) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t1 = b.LTeamID) AS t1_lpf_avg

	  -- T2

	  --- winning percentage
	  , (SELECT COUNT(b.DayNum) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 IN (b.WTeamID, b.LTeamID)) AS t2_games
	  , (SELECT COUNT(b.DayNum) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.WTeamID) AS t2_wins
	  , COALESCE(CAST((SELECT COUNT(b.DayNum) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.WTeamID) AS FLOAT) /
			CAST(NULLIF((SELECT COUNT(b.DayNum) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 IN (b.WTeamID, b.LTeamID)), 0) AS FLOAT), 0.5) AS t2_win_perc

	  --- home/away/neutral winning percentage
	  , (SELECT COUNT(b.DayNum) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.WTeamID AND b.WLoc = 'H') AS t2_H_wins
	  , (SELECT COUNT(b.DayNum) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.LTeamID AND b.WLoc = 'A') AS t2_H_losses
	  , (SELECT COUNT(b.DayNum) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.WTeamID AND b.WLoc = 'A') AS t2_A_wins
	  , (SELECT COUNT(b.DayNum) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.LTeamID AND b.WLoc = 'H') AS t2_A_losses
	  , (SELECT COUNT(b.DayNum) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.WTeamID AND b.WLoc = 'N') AS t2_N_wins
	  , (SELECT COUNT(b.DayNum) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.LTeamID AND b.WLoc = 'N') AS t2_N_losses

	  --- outscore
	  , (SELECT SUM(b.WScore)
			FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season
			AND b.DayNum < a.DayNum AND a.t2 = b.WTeamID) AS t2_win_points
	  , (SELECT SUM(b.LScore)
			FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season
			AND b.DayNum < a.DayNum AND a.t2 = b.WTeamID) AS t2_win_opp_points
	  , COALESCE((SELECT CAST(SUM(b.WScore) - SUM(b.LScore) AS FLOAT) / CAST(NULLIF(COUNT(b.DayNum), 0) AS FLOAT)
			FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season
			AND b.DayNum < a.DayNum AND a.t2 = b.WTeamID), 0) AS t2_outscore

	  --- outscored
	  , (SELECT SUM(b.LScore)
			FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season
			AND b.DayNum < a.DayNum AND a.t2 = b.LTeamID) AS t2_l_points
	  , (SELECT SUM(b.WScore)
			FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season
			AND b.DayNum < a.DayNum AND a.t2 = b.LTeamID) AS t2_l_opp_points
	  , COALESCE((SELECT CAST(SUM(b.WScore) - SUM(b.LScore) AS FLOAT) / CAST(NULLIF(COUNT(b.DayNum), 0) AS FLOAT)
			FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season
			AND b.DayNum < a.DayNum AND a.t2 = b.LTeamID), 0) AS t2_outscored

	  --- field goals
	  , (SELECT SUM(b.WFGA) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.WTeamID) AS t2_wfga
	  , (SELECT SUM(b.WFGM) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.WTeamID) AS t2_wfgm
	  , (SELECT CAST(SUM(b.WFGM) AS FLOAT)/NULLIF(CAST(SUM(b.WFGA) AS FLOAT), 0)
			FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.WTeamID) AS t2_wfg_perc

	  , (SELECT SUM(b.LFGA) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.LTeamID) AS t2_lfga
	  , (SELECT SUM(b.LFGM) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.LTeamID) AS t2_lfgm
	  , (SELECT CAST(SUM(b.LFGM) AS FLOAT)/NULLIF(CAST(SUM(b.LFGA) AS FLOAT), 0)
			FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.LTeamID) AS t2_lfg_perc

	  --- 3 pointers
	  , (SELECT SUM(b.WFGA3) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.WTeamID) AS t2_wfga3
	  , (SELECT SUM(b.WFGM3) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.WTeamID) AS t2_wfgm3
	  , (SELECT CAST(SUM(b.WFGM3) AS FLOAT)/NULLIF(CAST(SUM(b.WFGA3) AS FLOAT), 0)
			FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.WTeamID) AS t2_wfg3_perc

	  , (SELECT SUM(b.LFGA3) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.LTeamID) AS t2_lfga3
	  , (SELECT SUM(b.LFGM3) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.LTeamID) AS t2_lfgm3
	  , (SELECT CAST(SUM(b.LFGM3) AS FLOAT)/NULLIF(CAST(SUM(b.LFGA3) AS FLOAT), 0)
			FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.LTeamID) AS t2_lfg3_perc

	  --- free throws
	  , (SELECT SUM(b.WFTA) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.WTeamID) AS t2_wfta
	  , (SELECT SUM(b.WFTM) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.WTeamID) AS t2_wftm
	  , (SELECT CAST(SUM(b.WFTM) AS FLOAT)/NULLIF(CAST(SUM(b.WFTA) AS FLOAT), 0)
			FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.WTeamID) AS t2_wft_perc

	  , (SELECT SUM(b.LFTA) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.LTeamID) AS t2_lfta
	  , (SELECT SUM(b.LFTM) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.LTeamID) AS t2_lftm
	  , (SELECT CAST(SUM(b.LFTM) AS FLOAT)/NULLIF(CAST(SUM(b.LFTA) AS FLOAT), 0)
			FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.LTeamID) AS t2_lft_perc

	  --- rebounds

	  ---- offensive
	  , (SELECT SUM(b.WOR) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.WTeamID) AS t2_wor
	  , (SELECT CAST(SUM(b.WOR) AS FLOAT) / CAST(COUNT(b.DayNum) AS FLOAT) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.WTeamID) AS t2_wor_avg

	  , (SELECT SUM(b.LOR) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.LTeamID) AS t2_lor
	  , (SELECT CAST(SUM(b.LOR) AS FLOAT) / CAST(COUNT(b.DayNum) AS FLOAT) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.LTeamID) AS t2_lor_avg

	  ---- defensive
	  , (SELECT SUM(b.WDR) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.WTeamID) AS t2_wdr
	  , (SELECT CAST(SUM(b.WDR) AS FLOAT) / CAST(COUNT(b.DayNum) AS FLOAT) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.WTeamID) AS t2_wdr_avg

	  , (SELECT SUM(b.LDR) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.LTeamID) AS t2_ldr
	  , (SELECT CAST(SUM(b.LDR) AS FLOAT) / CAST(COUNT(b.DayNum) AS FLOAT) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.LTeamID) AS t2_ldr_avg

	  --- assists
	  , (SELECT SUM(b.WAst) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.WTeamID) AS t2_wast
	  , (SELECT CAST(SUM(b.WAst) AS FLOAT) / CAST(COUNT(b.DayNum) AS FLOAT) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.WTeamID) AS t2_wast_avg
	  , (SELECT SUM(b.LAst) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.LTeamID) AS t2_last
	  , (SELECT CAST(SUM(b.LAst) AS FLOAT) / CAST(COUNT(b.DayNum) AS FLOAT) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.LTeamID) AS t2_last_avg

	  --- turnovers
	  , (SELECT SUM(b.WTO) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.WTeamID) AS t2_wto
	  , (SELECT CAST(SUM(b.WTO) AS FLOAT) / CAST(COUNT(b.DayNum) AS FLOAT) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.WTeamID) AS t2_wto_avg
	  , (SELECT SUM(b.LTO) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.LTeamID) AS t2_lto
	  , (SELECT CAST(SUM(b.LTO) AS FLOAT) / CAST(COUNT(b.DayNum) AS FLOAT) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.LTeamID) AS t2_lto_avg

	  --- steals
	  , (SELECT SUM(b.WStl) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.WTeamID) AS t2_wstl
	  , (SELECT CAST(SUM(b.WStl) AS FLOAT) / CAST(COUNT(b.DayNum) AS FLOAT) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.WTeamID) AS t2_wstl_avg
	  , (SELECT SUM(b.LStl) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.LTeamID) AS t2_lstl
	  , (SELECT CAST(SUM(b.LStl) AS FLOAT) / CAST(COUNT(b.DayNum) AS FLOAT) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.LTeamID) AS t2_lstl_avg

	  --- blocks
	  , (SELECT SUM(b.WBlk) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.WTeamID) AS t2_wblk
	  , (SELECT CAST(SUM(b.WBlk) AS FLOAT) / CAST(COUNT(b.DayNum) AS FLOAT) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.WTeamID) AS t2_wblk_avg
	  , (SELECT SUM(b.LBlk) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.LTeamID) AS t2_lblk
	  , (SELECT CAST(SUM(b.LBlk) AS FLOAT) / CAST(COUNT(b.DayNum) AS FLOAT) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.LTeamID) AS t2_lblk_avg

	  --- personal fouls
	  , (SELECT SUM(b.WPF) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.WTeamID) AS t2_wpf
	  , (SELECT CAST(SUM(b.WPF) AS FLOAT) / CAST(COUNT(b.DayNum) AS FLOAT) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.WTeamID) AS t2_wpf_avg
	  , (SELECT SUM(b.LPF) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.LTeamID) AS t2_lpf
	  , (SELECT CAST(SUM(b.LPF) AS FLOAT) / CAST(COUNT(b.DayNum) AS FLOAT) FROM [kaggle_ncaam_2020].[dbo].[MRegularSeasonDetailedResults] b
			WHERE b.Season = a.Season AND b.DayNum < a.DayNum AND a.t2 = b.LTeamID) AS t2_lpf_avg
  
  -- Insert into temp table 1.
  INTO #df1

  FROM [kaggle_ncaam_2020].[dbo].[df0] a
  ORDER BY Season, t1, DayNum

-- Query from temp table 1 for next processing steps.
SELECT [Season]
      ,[DayNum]
      ,[t1]
      ,[t2]
      ,[ID]
      ,[Pred]
	  ,t1_loc

	  -- RANKINGS
	  ,t1_rank_day
	  ,t1_rank
	  ,t2_rank_day
	  ,t2_rank

	  -- T1

	  --- wins
	  ,t1_games
	  ,t1_wins
	  ,t1_win_perc

	  --- home/away/neutral winning percentages
	  ,t1_H_wins
	  ,t1_H_losses
	  ,t1_H_wins + t1_H_losses AS t1_H_games
	  ,t1_A_wins
	  ,t1_A_losses
	  ,t1_A_wins + t1_A_losses AS t1_A_games
	  ,t1_N_wins
	  ,t1_N_losses
	  ,t1_N_wins + t1_N_losses AS t1_N_games

	  --- outscore
	  ,t1_win_points
	  ,t1_win_opp_points
	  ,t1_outscore

	  --- outscored
	  ,t1_l_points
	  ,t1_l_opp_points
	  ,t1_outscored

	  --- points
	  ,COALESCE(t1_win_points, 0) + COALESCE(t1_l_points, 0) AS t1_points
	  ,(CAST(COALESCE(t1_win_points, 0) AS FLOAT) + CAST(COALESCE(t1_l_points, 0) AS FLOAT)) / NULLIF(t1_games, 0) AS t1_points_avg

	  --- field goals
	  ,t1_wfga
	  ,t1_wfgm
	  ,t1_wfg_perc
	  ,t1_lfga
	  ,t1_lfgm
	  ,t1_lfg_perc
	  , CASE WHEN t1_wfg_perc IS NULL OR t1_lfg_perc IS NULL
		THEN (COALESCE(t1_wfg_perc, 0) + COALESCE(t1_lfg_perc, 0))
		ELSE (COALESCE(t1_wfg_perc, 0) + COALESCE(t1_lfg_perc, 0)) / 2
		END AS t1_fg_perc

	  --- 3 pointers
	  ,t1_wfga3
	  ,t1_wfgm3
	  ,t1_wfg3_perc
	  ,t1_lfga3
	  ,t1_lfgm3
	  ,t1_lfg3_perc
	  , CASE WHEN t1_wfg3_perc IS NULL OR t1_lfg3_perc IS NULL
		THEN (COALESCE(t1_wfg3_perc, 0) + COALESCE(t1_lfg3_perc, 0))
		ELSE (COALESCE(t1_wfg3_perc, 0) + COALESCE(t1_lfg3_perc, 0)) / 2
		END AS t1_fg3_perc

	  --- free throws
	  ,t1_wfta
	  ,t1_wftm
	  ,t1_wft_perc
	  ,t1_lfta
	  ,t1_lftm
	  ,t1_lft_perc
	  , CASE WHEN t1_wft_perc IS NULL OR t1_lft_perc IS NULL
		THEN (COALESCE(t1_wft_perc, 0) + COALESCE(t1_lft_perc, 0))
		ELSE (COALESCE(t1_wft_perc, 0) + COALESCE(t1_lft_perc, 0)) / 2
		END AS t1_ft_perc

	  --- rebounds

	  ---- offensive
	  ,t1_wor
	  ,t1_wor_avg
	  ,t1_lor
	  ,t1_lor_avg

	  ----- total offensive
	  , CASE WHEN t1_wor_avg IS NULL OR t1_lor_avg IS NULL
		THEN (COALESCE(t1_wor_avg, 0) + COALESCE(t1_lor_avg, 0))
		ELSE (COALESCE(t1_wor_avg, 0) + COALESCE(t1_lor_avg, 0)) / 2
		END AS t1_or_avg

	  ---- defensive
	  ,t1_wdr
	  ,t1_wdr_avg
	  ,t1_ldr
	  ,t1_ldr_avg

	  ----- total defensive
	  , CASE WHEN t1_wdr_avg IS NULL OR t1_ldr_avg IS NULL
		THEN (COALESCE(t1_wdr_avg, 0) + COALESCE(t1_ldr_avg, 0))
		ELSE (COALESCE(t1_wdr_avg, 0) + COALESCE(t1_ldr_avg, 0)) / 2
		END AS t1_dr_avg

	  --- assists
	  ,t1_wast
	  ,t1_wast_avg
	  ,t1_last
	  ,t1_last_avg
	  , CASE WHEN t1_wast_avg IS NULL OR t1_last_avg IS NULL
		THEN (COALESCE(t1_wast_avg, 0) + COALESCE(t1_last_avg, 0))
		ELSE (COALESCE(t1_wast_avg, 0) + COALESCE(t1_last_avg, 0)) / 2
		END AS t1_ast_avg

	  --- turnovers
	  ,t1_wto
	  ,t1_wto_avg
	  ,t1_lto
	  ,t1_lto_avg
	  , CASE WHEN t1_wto_avg IS NULL OR t1_lto_avg IS NULL
		THEN (COALESCE(t1_wto_avg, 0) + COALESCE(t1_lto_avg, 0))
		ELSE (COALESCE(t1_wto_avg, 0) + COALESCE(t1_lto_avg, 0)) / 2
		END AS t1_to_avg

	  --- steals
	  ,t1_wstl
	  ,t1_wstl_avg
	  ,t1_lstl
	  ,t1_lstl_avg
	  , CASE WHEN t1_wstl_avg IS NULL OR t1_lstl_avg IS NULL
		THEN (COALESCE(t1_wstl_avg, 0) + COALESCE(t1_lstl_avg, 0))
		ELSE (COALESCE(t1_wstl_avg, 0) + COALESCE(t1_lstl_avg, 0)) / 2
		END AS t1_stl_avg

	  --- blocks
	  ,t1_wblk
	  ,t1_wblk_avg
	  ,t1_lblk
	  ,t1_lblk_avg
	  , CASE WHEN t1_wblk_avg IS NULL OR t1_lblk_avg IS NULL
		THEN (COALESCE(t1_wblk_avg, 0) + COALESCE(t1_lblk_avg, 0))
		ELSE (COALESCE(t1_wblk_avg, 0) + COALESCE(t1_lblk_avg, 0)) / 2
		END AS t1_blk_avg

	  --- personal fouls
	  ,t1_wpf
	  ,t1_wpf_avg
	  ,t1_lpf
	  ,t1_lpf_avg
	  , CASE WHEN t1_wpf_avg IS NULL OR t1_lpf_avg IS NULL
		THEN (COALESCE(t1_wpf_avg, 0) + COALESCE(t1_lpf_avg, 0))
		ELSE (COALESCE(t1_wpf_avg, 0) + COALESCE(t1_lpf_avg, 0)) / 2
		END AS t1_pf_avg

	  -- T2

	  --- wins
	  ,t2_games
	  ,t2_wins
	  ,t2_win_perc

	  --- home/away/neutral winning percentages
	  ,t2_H_wins
	  ,t2_H_losses
	  ,t2_H_wins + t2_H_losses AS t2_H_games
	  ,t2_A_wins
	  ,t2_A_losses
	  ,t2_A_wins + t2_A_losses AS t2_A_games
	  ,t2_N_wins
	  ,t2_N_losses
	  ,t2_N_wins + t2_N_losses AS t2_N_games

	  --- outscore
	  ,t2_win_points
	  ,t2_win_opp_points
	  ,t2_outscore

	  --- outscored
	  ,t2_l_points
	  ,t2_l_opp_points
	  ,t2_outscored

	  --- points
	  ,COALESCE(t2_win_points, 0) + COALESCE(t2_l_points, 0) AS t2_points
	  ,(CAST(COALESCE(t2_win_points, 0) AS FLOAT) + CAST(COALESCE(t2_l_points, 0) AS FLOAT)) / NULLIF(t2_games, 0) AS t2_points_avg

	  --- field goals
	  ,t2_wfga
	  ,t2_wfgm
	  ,t2_wfg_perc
	  ,t2_lfga
	  ,t2_lfgm
	  ,t2_lfg_perc
	  , CASE WHEN t2_wfg_perc IS NULL OR t2_lfg_perc IS NULL
		THEN (COALESCE(t2_wfg_perc, 0) + COALESCE(t2_lfg_perc, 0))
		ELSE (COALESCE(t2_wfg_perc, 0) + COALESCE(t2_lfg_perc, 0)) / 2
		END AS t2_fg_perc

	  --- 3 pointers
	  ,t2_wfga3
	  ,t2_wfgm3
	  ,t2_wfg3_perc
	  ,t2_lfga3
	  ,t2_lfgm3
	  ,t2_lfg3_perc
	  , CASE WHEN t2_wfg3_perc IS NULL OR t2_lfg3_perc IS NULL
		THEN (COALESCE(t2_wfg3_perc, 0) + COALESCE(t2_lfg3_perc, 0))
		ELSE (COALESCE(t2_wfg3_perc, 0) + COALESCE(t2_lfg3_perc, 0)) / 2
		END AS t2_fg3_perc

	  --- free throws
	  ,t2_wfta
	  ,t2_wftm
	  ,t2_wft_perc
	  ,t2_lfta
	  ,t2_lftm
	  ,t2_lft_perc
	  , CASE WHEN t2_wft_perc IS NULL OR t2_lft_perc IS NULL
		THEN (COALESCE(t2_wft_perc, 0) + COALESCE(t2_lft_perc, 0))
		ELSE (COALESCE(t2_wft_perc, 0) + COALESCE(t2_lft_perc, 0)) / 2
		END AS t2_ft_perc

	  --- rebounds

	  ---- offensive
	  ,t2_wor
	  ,t2_wor_avg
	  ,t2_lor
	  ,t2_lor_avg

	  ----- total offensive
	  , CASE WHEN t2_wor_avg IS NULL OR t2_lor_avg IS NULL
		THEN (COALESCE(t2_wor_avg, 0) + COALESCE(t2_lor_avg, 0))
		ELSE (COALESCE(t2_wor_avg, 0) + COALESCE(t2_lor_avg, 0)) / 2
		END AS t2_or_avg

	  ---- defensive
	  ,t2_wdr
	  ,t2_wdr_avg
	  ,t2_ldr
	  ,t2_ldr_avg

	  ----- total defensive
	  , CASE WHEN t2_wdr_avg IS NULL OR t2_ldr_avg IS NULL
		THEN (COALESCE(t2_wdr_avg, 0) + COALESCE(t2_ldr_avg, 0))
		ELSE (COALESCE(t2_wdr_avg, 0) + COALESCE(t2_ldr_avg, 0)) / 2
		END AS t2_dr_avg

	  --- assists
	  ,t2_wast
	  ,t2_wast_avg
	  ,t2_last
	  ,t2_last_avg
	  , CASE WHEN t2_wast_avg IS NULL OR t2_last_avg IS NULL
		THEN (COALESCE(t2_wast_avg, 0) + COALESCE(t2_last_avg, 0))
		ELSE (COALESCE(t2_wast_avg, 0) + COALESCE(t2_last_avg, 0)) / 2
		END AS t2_ast_avg

	  --- turnovers
	  ,t2_wto
	  ,t2_wto_avg
	  ,t2_lto
	  ,t2_lto_avg
	  , CASE WHEN t2_wto_avg IS NULL OR t2_lto_avg IS NULL
		THEN (COALESCE(t2_wto_avg, 0) + COALESCE(t2_lto_avg, 0))
		ELSE (COALESCE(t2_wto_avg, 0) + COALESCE(t2_lto_avg, 0)) / 2
		END AS t2_to_avg

	  --- steals
	  ,t2_wstl
	  ,t2_wstl_avg
	  ,t2_lstl
	  ,t2_lstl_avg
	  , CASE WHEN t2_wstl_avg IS NULL OR t2_lstl_avg IS NULL
		THEN (COALESCE(t2_wstl_avg, 0) + COALESCE(t2_lstl_avg, 0))
		ELSE (COALESCE(t2_wstl_avg, 0) + COALESCE(t2_lstl_avg, 0)) / 2
		END AS t2_stl_avg

	  --- blocks
	  ,t2_wblk
	  ,t2_wblk_avg
	  ,t2_lblk
	  ,t2_lblk_avg
	  , CASE WHEN t2_wblk_avg IS NULL OR t2_lblk_avg IS NULL
		THEN (COALESCE(t2_wblk_avg, 0) + COALESCE(t2_lblk_avg, 0))
		ELSE (COALESCE(t2_wblk_avg, 0) + COALESCE(t2_lblk_avg, 0)) / 2
		END AS t2_blk_avg

	  --- personal fouls
	  ,t2_wpf
	  ,t2_wpf_avg
	  ,t2_lpf
	  ,t2_lpf_avg
	  , CASE WHEN t2_wpf_avg IS NULL OR t2_lpf_avg IS NULL
		THEN (COALESCE(t2_wpf_avg, 0) + COALESCE(t2_lpf_avg, 0))
		ELSE (COALESCE(t2_wpf_avg, 0) + COALESCE(t2_lpf_avg, 0)) / 2
		END AS t2_pf_avg

  -- Insert into temp table 2.
  INTO #df2

  FROM #df1
  ORDER BY Season, t1, DayNum

-- Query from temp table 2 for next processing steps.
SELECT [Season]
      ,[DayNum]
      ,[t1]
      ,[t2]
      ,[ID]
      ,[Pred]
	  ,t1_loc
      ,CHARINDEX(t1_loc, 'HAN') AS t1_loc_enc

	  -- RANKINGS
	  ,t1_rank_day
	  ,t1_rank
	  ,t2_rank_day
	  ,t2_rank

	  -- T1

	  --- wins
	  ,t1_games
	  ,t1_wins
	  ,t1_win_perc

	  --- home/away/neutral winning percentages
	  --- (a location with no games yet has no defined ratio: NULLIF turns the zero
	  ---  denominator into NULL and COALESCE substitutes a neutral .5)
	  ,t1_H_wins
	  ,t1_H_losses
	  ,t1_H_games
	  ,COALESCE(CAST(t1_H_wins AS FLOAT) / NULLIF(CAST(t1_H_games AS FLOAT), 0), .5) AS t1_H_win_perc
	  ,t1_A_wins
	  ,t1_A_losses
	  ,t1_A_games
	  ,COALESCE(CAST(t1_A_wins AS FLOAT) / NULLIF(CAST(t1_A_games AS FLOAT), 0), .5) AS t1_A_win_perc
	  ,t1_N_wins
	  ,t1_N_losses
	  ,t1_N_games
	  ,COALESCE(CAST(t1_N_wins AS FLOAT) / NULLIF(CAST(t1_N_games AS FLOAT), 0), .5) AS t1_N_win_perc

	  --- outscore
	  ,t1_win_points
	  ,t1_win_opp_points
	  ,t1_outscore

	  --- outscored
	  ,t1_l_points
	  ,t1_l_opp_points
	  ,t1_outscored

	  --- points
	  ,t1_points
	  ,t1_points_avg

	  --- field goals
	  ,t1_wfga
	  ,t1_wfgm
	  ,t1_wfg_perc
	  ,t1_lfga
	  ,t1_lfgm
	  ,t1_lfg_perc
	  ,t1_fg_perc

	  --- 3 pointers
	  ,t1_wfga3
	  ,t1_wfgm3
	  ,t1_wfg3_perc
	  ,t1_lfga3
	  ,t1_lfgm3
	  ,t1_lfg3_perc
	  ,t1_fg3_perc

	  --- free throws
	  ,t1_wfta
	  ,t1_wftm
	  ,t1_wft_perc
	  ,t1_lfta
	  ,t1_lftm
	  ,t1_lft_perc
	  ,t1_ft_perc

	  --- rebounds

	  ---- offensive
	  ,t1_wor
	  ,t1_wor_avg
	  ,t1_lor
	  ,t1_lor_avg

	  ----- total offensive
	  ,t1_or_avg

	  ---- defensive
	  ,t1_wdr
	  ,t1_wdr_avg
	  ,t1_ldr
	  ,t1_ldr_avg

	  ----- total defensive
	  ,t1_dr_avg

	  ---- total rebound average
	  , (t1_or_avg + t1_dr_avg) / 2 AS t1_r_avg

	  --- assists
	  ,t1_wast
	  ,t1_wast_avg
	  ,t1_last
	  ,t1_last_avg
	  ,t1_ast_avg

	  --- turnovers
	  ,t1_wto
	  ,t1_wto_avg
	  ,t1_lto
	  ,t1_lto_avg
	  ,t1_to_avg

	  --- steals
	  ,t1_wstl
	  ,t1_wstl_avg
	  ,t1_lstl
	  ,t1_lstl_avg
	  ,t1_stl_avg

	  --- blocks
	  ,t1_wblk
	  ,t1_wblk_avg
	  ,t1_lblk
	  ,t1_lblk_avg
	  ,t1_blk_avg

	  --- personal fouls
	  ,t1_wpf
	  ,t1_wpf_avg
	  ,t1_lpf
	  ,t1_lpf_avg
	  ,t1_pf_avg

	  -- T2

	  --- wins
	  ,t2_games
	  ,t2_wins
	  ,t2_win_perc

	  --- home/away/neutral winning percentages
	  --- (a location with no games yet has no defined ratio: NULLIF turns the zero
	  ---  denominator into NULL and COALESCE substitutes a neutral .5)
	  ,t2_H_wins
	  ,t2_H_losses
	  ,t2_H_games
	  ,COALESCE(CAST(t2_H_wins AS FLOAT) / NULLIF(CAST(t2_H_games AS FLOAT), 0), .5) AS t2_H_win_perc
	  ,t2_A_wins
	  ,t2_A_losses
	  ,t2_A_games
	  ,COALESCE(CAST(t2_A_wins AS FLOAT) / NULLIF(CAST(t2_A_games AS FLOAT), 0), .5) AS t2_A_win_perc
	  ,t2_N_wins
	  ,t2_N_losses
	  ,t2_N_games
	  ,COALESCE(CAST(t2_N_wins AS FLOAT) / NULLIF(CAST(t2_N_games AS FLOAT), 0), .5) AS t2_N_win_perc

	  --- outscore
	  ,t2_win_points
	  ,t2_win_opp_points
	  ,t2_outscore

	  --- outscored
	  ,t2_l_points
	  ,t2_l_opp_points
	  ,t2_outscored

	  --- points
	  ,t2_points
	  ,t2_points_avg

	  --- field goals
	  ,t2_wfga
	  ,t2_wfgm
	  ,t2_wfg_perc
	  ,t2_lfga
	  ,t2_lfgm
	  ,t2_lfg_perc
	  ,t2_fg_perc

	  --- 3 pointers
	  ,t2_wfga3
	  ,t2_wfgm3
	  ,t2_wfg3_perc
	  ,t2_lfga3
	  ,t2_lfgm3
	  ,t2_lfg3_perc
	  ,t2_fg3_perc

	  --- free throws
	  ,t2_wfta
	  ,t2_wftm
	  ,t2_wft_perc
	  ,t2_lfta
	  ,t2_lftm
	  ,t2_lft_perc
	  ,t2_ft_perc

	  --- rebounds

	  ---- offensive
	  ,t2_wor
	  ,t2_wor_avg
	  ,t2_lor
	  ,t2_lor_avg

	  ----- total offensive
	  ,t2_or_avg

	  ---- defensive
	  ,t2_wdr
	  ,t2_wdr_avg
	  ,t2_ldr
	  ,t2_ldr_avg

	  ----- total defensive
	  ,t2_dr_avg

	  ---- total rebound average
	  , (t2_or_avg + t2_dr_avg) / 2 AS t2_r_avg

	  --- assists
	  ,t2_wast
	  ,t2_wast_avg
	  ,t2_last
	  ,t2_last_avg
	  ,t2_ast_avg

	  --- turnovers
	  ,t2_wto
	  ,t2_wto_avg
	  ,t2_lto
	  ,t2_lto_avg
	  ,t2_to_avg

	  --- steals
	  ,t2_wstl
	  ,t2_wstl_avg
	  ,t2_lstl
	  ,t2_lstl_avg
	  ,t2_stl_avg

	  --- blocks
	  ,t2_wblk
	  ,t2_wblk_avg
	  ,t2_lblk
	  ,t2_lblk_avg
	  ,t2_blk_avg

	  --- personal fouls
	  ,t2_wpf
	  ,t2_wpf_avg
	  ,t2_lpf
	  ,t2_lpf_avg
	  ,t2_pf_avg

  FROM #df2
  ORDER BY Season, t1, DayNum

-- Drop the temp tables.
DROP TABLE #df1
DROP TABLE #df2
```

# Get data

In [None]:
# Read the three pre-built CSV tables in one pass; the names are bound in the
# same order as the paths are listed.
df, dftest, dfsubmit = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/ncaam2020/df.csv",
        "../input/ncaam2020/dftest.csv",
        "../input/ncaam2020/submission.csv",
    )
)

In [None]:
# Overview of `df`: dimensions, one column name per line, then the first rows.
print(f"df Shape: {df.shape}\n")
print("Columns:\n\n" + "\n".join(df.columns.tolist()))
df.head()

In [None]:
# Overview of `dftest`: dimensions, one column name per line, then the first rows.
print(f"dftest Shape: {dftest.shape}\n")
print("Columns:\n\n" + "\n".join(dftest.columns.tolist()))
dftest.head()

In [None]:
# Overview of `dfsubmit`: dimensions, one column name per line, then the first rows.
print(f"dfsubmit Shape: {dfsubmit.shape}\n")
print("Columns:\n\n" + "\n".join(dfsubmit.columns.tolist()))
dfsubmit.head()

# Select features

In [None]:
# Target column and the (empty) list of categorical feature names.
label = "Pred"
cat_cols = []

# Model inputs. The order below is the column order handed to the model, so it
# is kept exactly as originally written (note the rebound entries list both t1
# columns before both t2 columns).
features = [
    "t1_rank", "t2_rank",
    "t1_win_perc", "t2_win_perc",
    "t1_outscore", "t2_outscore",
    "t1_outscored", "t2_outscored",
    "t1_points_avg", "t2_points_avg",
    "t1_fg_perc", "t2_fg_perc",
    "t1_fg3_perc", "t2_fg3_perc",
    "t1_ft_perc", "t2_ft_perc",
    "t1_or_avg", "t1_dr_avg",
    "t2_or_avg", "t2_dr_avg",
    "t1_ast_avg", "t2_ast_avg",
    "t1_stl_avg", "t2_stl_avg",
    "t1_blk_avg", "t2_blk_avg",
]

df[features].head(10)

# Filter out first games

In [None]:
# Keep only rows where BOTH teams' game counts exceed `filternum`; rows with a
# missing count compare as False and are dropped as well.
filternum = 0
df = df.loc[df[["t1_games", "t2_games"]].gt(filternum).all(axis=1)]
print(f"Shape: {df.shape}")

# Learn, test, evaluate

## Split

In [None]:
# Feature matrix / target pairs: training rows come from `df`, evaluation rows
# from `dftest`.
X_train, y_train = df[features], df[label]
X_test, y_test = dftest[features], dftest[label]

# CatBoost pools: labelled for training and evaluation, unlabelled for the
# rows that will be predicted for the submission.
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_cols)
test_pool = Pool(data=X_test, label=y_test, cat_features=cat_cols)
pred_pool = Pool(data=dfsubmit[features], cat_features=cat_cols)

# Train, test, evaluate, predict

In [None]:
# Train a CatBoost classifier, score it on the hold-out pool and write the
# submission files.
#
# Two ways of choosing the model configuration:
#   waste_time = True  -> grid search over a small hyper-parameter grid, then
#                         cross-validation to pick the number of iterations.
#   waste_time = False -> default hyper-parameters; the number of iterations is
#                         taken from one probe fit evaluated on `test_pool`.
# Both paths then share the same train / test / evaluate / predict / submit tail.
task_type = "CPU"
waste_time = False
ver = 3  # version tag embedded in the submission file names

if waste_time:

    # Grid search
    model = CatBoostClassifier(loss_function="Logloss", random_seed=1, task_type=task_type)

    grid = {"learning_rate": [0.03, 0.1],
            "depth": [4, 6, 10],
            "l2_leaf_reg": [1, 3, 5, 7, 9]}

    grid_search_result = model.grid_search(grid,
                                           train_pool,
                                           plot=True)

    tuned_params = grid_search_result["params"]
    params_df = pd.DataFrame(tuned_params, index=[0])

    # Cross validate with the tuned parameters to pick the iteration count.
    cv_params = {"iterations": 999,
                 "depth": tuned_params["depth"],
                 "l2_leaf_reg": tuned_params["l2_leaf_reg"],
                 "learning_rate": tuned_params["learning_rate"],
                 "loss_function": "Logloss",
                 "random_seed": 1,
                 "verbose": True,
                 "roc_file": "roc-file.txt"}

    scores = cv(train_pool, cv_params)
    scores.to_csv("cv_scores.csv")

    # Select the single cell rather than the whole row: a row lookup upcasts
    # the iteration number to float, which is not a valid `iterations` value.
    # The +1 converts the iteration index of the best row into a tree count.
    best_row = scores["test-Logloss-mean"].idxmin()
    iterations = int(scores.loc[best_row, "iterations"]) + 1
    params_df["cv_best_iter"] = [iterations]

    model_kwargs = {name: tuned_params[name]
                    for name in ("depth", "l2_leaf_reg", "learning_rate")}
    results_df, results_path, write_index = params_df, "params.csv", True

else:
    # Get best iteration from a probe fit with default hyper-parameters.
    # NOTE(review): the iteration count is tuned on `test_pool`, the same data
    # the logloss below is computed on, so that score is optimistic.
    model = CatBoostClassifier(
            iterations=999,
            loss_function="Logloss",
            random_seed=1,
            task_type=task_type)

    model.fit(train_pool, eval_set=test_pool)

    # Previously this value was overwritten from `scores`, which only exists
    # in the grid-search branch (NameError on a fresh kernel). The best
    # iteration is an index, so +1 gives the number of trees to train; fall
    # back to the full budget if no best iteration is reported.
    best_iteration = model.get_best_iteration()
    iterations = 999 if best_iteration is None else int(best_iteration) + 1

    # Column name kept as "cv_best_iter" so results.csv keeps its layout.
    results_df = pd.DataFrame({"cv_best_iter": iterations}, index=[0])

    model_kwargs = {}
    results_path, write_index = "results.csv", False

# Train the final model with the chosen iteration count.
model = CatBoostClassifier(
    iterations=iterations,
    loss_function="Logloss",
    random_seed=1,
    task_type=task_type,
    **model_kwargs)

model.fit(train_pool)

# Test
preds_class = model.predict(test_pool)
preds_proba = model.predict_proba(test_pool)
print(f"class = {preds_class[:20]}")
print(f"proba = {preds_proba[:20]}")

## Evaluate
logloss = metrics.log_loss(y_test, preds_proba)
results_df["test_logloss"] = [logloss]
# params.csv keeps its index column, results.csv is written without one.
results_df.to_csv(results_path, index=write_index)
print(f"logloss: {logloss}")

# Predict
preds_class = model.predict(pred_pool)
preds_proba = model.predict_proba(pred_pool)
print(f"class = {preds_class[:20]}")
print(f"proba = {preds_proba[:20]}")

# NOTE: this rebinds `df` (until now the training table) to the submission
# frame; the name is kept because a later cell displays `df`.
# Column 1 of the probability matrix is the probability of the positive class.
df = pd.DataFrame({"ID": list(dfsubmit["ID"]), "Pred": preds_proba[:, 1], "Class": preds_class})

# Submit
df.to_csv(f"sumbit{ver}_full.csv", index=False)
df[["ID","Pred"]].to_csv(f"sumbit{ver}.csv", index=False)

In [None]:
# Number of boosting iterations used for the final model (set in the training cell).
print(f"iterations: {iterations}")

In [None]:
# Log loss of the final model on the test pool (computed in the training cell).
print(f"logloss: {logloss}")

In [None]:
# At this point `df` is the submission frame (ID, Pred, Class) built by the
# training cell, not the training table loaded earlier.
df

## Feature importances

In [None]:
# Feature importances of the final model, saved to CSV and displayed.
fi = model.get_feature_importance(prettified=True)
fi.to_csv("feature_importances.csv")
fi