# NCAA Tournament 2021 - Sean Norris, Mikayla Pugel, Ren Tu

In [1]:
%matplotlib inline 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import copy
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import f1_score

# Project Background

## NCAA tournament prediction using NCAA basketball game data

## Trying different model types to get predictions in before tournament officially starts on Thursday, 3/18

# Initial EDA

In [2]:
# Load data
cities_df = pd.read_csv("2021_Data/Cities.csv")
teams_df = pd.read_csv("2021_Data/MTeams.csv")
reg_df = pd.read_csv("MRegularSeasonDetailedResults2021.csv")

print(reg_df.head(5))
print(reg_df.columns)

   Season  DayNum  WTeamID  WScore  LTeamID  LScore WLoc  NumOT  WFGM  WFGA  \
0    2003      10     1104      68     1328      62    N      0    27    58   
1    2003      10     1272      70     1393      63    N      0    26    62   
2    2003      11     1266      73     1437      61    N      0    24    58   
3    2003      11     1296      56     1457      50    N      0    18    38   
4    2003      11     1400      77     1208      71    N      0    30    61   

   ...  LFGA3  LFTM  LFTA  LOR  LDR  LAst  LTO  LStl  LBlk  LPF  
0  ...     10    16    22   10   22     8   18     9     2   20  
1  ...     24     9    20   20   25     7   12     8     6   16  
2  ...     26    14    23   31   22     9   12     2     5   23  
3  ...     22     8    15   17   20     9   19     4     3   23  
4  ...     16    17    27   21   15    12   10     7     1   14  

[5 rows x 34 columns]
Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc',
       'NumOT', 'WFGM', 'WFGA

In [3]:
reg_df_test1 = copy.deepcopy(reg_df[["Season","WTeamID","LTeamID","WLoc"]])
print("Number of Games Per Season: ")
print(reg_df_test1[["WTeamID","Season"]].groupby("Season").agg('count').reset_index())
print(" ")

Number of Games Per Season: 
    Season  WTeamID
0     2003     4616
1     2004     4571
2     2005     4675
3     2006     4757
4     2007     5043
5     2008     5163
6     2009     5249
7     2010     5263
8     2011     5246
9     2012     5253
10    2013     5320
11    2014     5362
12    2015     5354
13    2016     5369
14    2017     5395
15    2018     5405
16    2019     5463
17    2020     5328
18    2021     3855
 


In [4]:
reg_df2 = copy.deepcopy(reg_df[reg_df["Season"]>=2010])
reg_df3 = copy.deepcopy(reg_df2[["Season","WTeamID","LTeamID","WLoc","WScore","WFGA","WFTA","WOR","WTO","WBlk","WPF","LScore","LFGA","LFTA","LOR","LTO","LBlk","LPF"]])
print("Size of dataset for seasons since 2010:")
print(len(reg_df3))

Size of dataset for seasons since 2010:
62613


# Create new features using basketball stats

In [5]:
# Create new stats for possessions and bad plays 
reg_df3["WPoss"] = reg_df3["WFGA"] + reg_df3["WFTA"]/2.0 + reg_df3["WTO"] - reg_df3["WOR"]
reg_df3["LPoss"] = reg_df3["LFGA"] + reg_df3["LFTA"]/2.0 + reg_df3["LTO"] - reg_df3["LOR"]
reg_df3["WBad"] = reg_df3["WTO"]*3 + reg_df3["WPF"]/2.0 + reg_df3["LOR"]*3 + reg_df3["LBlk"]*5
reg_df3["LBad"] = reg_df3["LTO"]*3 + reg_df3["LPF"]/2.0 + reg_df3["WOR"]*3 + reg_df3["WBlk"]*5
reg_df3["W_Game_Pts_Per_Poss"] = reg_df3["WScore"] / reg_df3["WPoss"]
reg_df3["L_Game_Pts_Per_Poss"] = reg_df3["LScore"] / reg_df3["LPoss"]

reg_df4 = copy.deepcopy(reg_df3[["Season","WTeamID","LTeamID","WLoc","WScore","WPoss","LScore","LPoss","WBad","LBad","W_Game_Pts_Per_Poss","L_Game_Pts_Per_Poss"]])
print(reg_df4.head(5))

       Season  WTeamID  LTeamID WLoc  WScore  WPoss  LScore  LPoss   WBad  \
34074    2010     1143     1293    H      75   67.0      70   65.5   91.0   
34075    2010     1314     1198    H      88   84.5      72   83.5  136.5   
34076    2010     1326     1108    H     100   72.0      60   73.0   85.5   
34077    2010     1393     1107    H      75   82.5      43   82.0  127.0   
34078    2010     1143     1178    H      95   73.5      61   71.5   92.0   

        LBad  W_Game_Pts_Per_Poss  L_Game_Pts_Per_Poss  
34074  107.0             1.119403             1.068702  
34075  145.0             1.041420             0.862275  
34076  116.0             1.388889             0.821918  
34077  181.5             0.909091             0.524390  
34078  115.5             1.292517             0.853147  


In [6]:
# Winners aggregation by season
win_agg1 = reg_df4.groupby(["Season","WTeamID"])[["WScore","WPoss","WBad"]].sum().reset_index()
win_agg2 = reg_df4[["Season","WTeamID","WScore"]].groupby(["Season","WTeamID"]).agg('count').reset_index()
win_agg2.columns = ["Season","WTeamID","games_played"]
win_agg3 = win_agg1.merge(win_agg2, how="left", left_on=["Season","WTeamID"], right_on=["Season","WTeamID"])

# Losers aggregation by season
lose_agg1 = reg_df4.groupby(["Season","LTeamID"])[["LScore","LPoss","LBad"]].sum().reset_index()
lose_agg2 = reg_df4[["Season","LTeamID","LScore"]].groupby(["Season","LTeamID"]).agg('count').reset_index()
lose_agg2.columns = ["Season","LTeamID","games_played"]
lose_agg3 = lose_agg1.merge(lose_agg2, how="left", left_on=["Season","LTeamID"], right_on=["Season","LTeamID"])

# Rename columns for groupings to have consistent names, then combine
win_agg3.columns = ["Season","TeamID","TeamScore","TeamPoss","TeamBad","TeamGames"]
lose_agg3.columns = ["Season","TeamID","TeamScore","TeamPoss","TeamBad","TeamGames"]
team_agg1 = win_agg3.append(lose_agg3)

# Get aggregate sum of points, possessions, bad plays and games played for each team
team_agg2 = team_agg1.groupby(["Season","TeamID"])[["TeamScore","TeamPoss","TeamBad","TeamGames"]].sum().reset_index()

# Get standard deviation of points per possession for each team
win_agg4 = reg_df4[["Season","WTeamID","W_Game_Pts_Per_Poss"]]
win_agg4.columns = ["Season","TeamID","Game_Pts_Per_Poss"]
lose_agg4 = reg_df4[["Season","LTeamID","L_Game_Pts_Per_Poss"]]
lose_agg4.columns = ["Season","TeamID","Game_Pts_Per_Poss"]
team_agg3 = copy.deepcopy(win_agg4.append(lose_agg4))
team_agg4 = team_agg3.groupby(["Season","TeamID"]).agg(np.std).reset_index()
team_agg5 = team_agg2.merge(team_agg4, how="left", left_on=["Season","TeamID"], right_on=["Season","TeamID"])
team_agg5.columns = ["Season","TeamID","TeamScore","TeamPoss","TeamBad","TeamGames","Team_PPP_SD"]

# Calculate points per possession, possessions per game, and bad plays per possession stats for each team
team_agg5["Pts_Per_Poss"] = team_agg5["TeamScore"] / team_agg5["TeamPoss"]
team_agg5["Poss_Per_Game"] = team_agg5["TeamPoss"] / team_agg5["TeamGames"]
team_agg5["Bad_Per_Poss"] = team_agg5["TeamBad"] / (team_agg5["TeamPoss"] * 2)

print("Team Aggregated Stats: ")
print(team_agg5.head(5))
print(" ")

# Using original full schedule, add opponent points and possessions data
reg_df4a = reg_df4.merge(team_agg2, how='left', left_on=["Season","LTeamID"], right_on=["Season","TeamID"])
reg_df4b = copy.deepcopy(reg_df4a[["Season","WTeamID","LTeamID","WLoc","TeamScore","TeamPoss"]])
reg_df4b.columns = ["Season","WTeamID","LTeamID","WLoc","W_Opp_Score","W_Opp_Poss"]
reg_df4c = reg_df4b.merge(team_agg2, how='left', left_on=["Season","WTeamID"], right_on=["Season","TeamID"])
reg_df4d = copy.deepcopy(reg_df4c[["Season","WTeamID","LTeamID","WLoc","W_Opp_Score","W_Opp_Poss","TeamScore","TeamPoss"]])
reg_df4d.columns = ["Season","WTeamID","LTeamID","WLoc","W_Opp_Score","W_Opp_Poss","L_Opp_Score","L_Opp_Poss"]

# Split into winners and losers aggregation by season for opponent points and possession
win_opp_agg1 = reg_df4d.groupby(["Season","WTeamID"])[["W_Opp_Score","W_Opp_Poss"]].sum().reset_index()
lose_opp_agg1 = reg_df4d.groupby(["Season","LTeamID"])[["L_Opp_Score","L_Opp_Poss"]].sum().reset_index()

# Rename columns for groupings to have consistent names, then combine
win_opp_agg1.columns = ["Season","TeamID","Opp_Score","Opp_Poss"]
lose_opp_agg1.columns = ["Season","TeamID","Opp_Score","Opp_Poss"]
opp_agg1 = win_opp_agg1.append(lose_opp_agg1)

# Get aggregate sum of opponent points and possessions for each team
opp_agg2 = opp_agg1.groupby(["Season","TeamID"])[["Opp_Score","Opp_Poss"]].sum().reset_index()

# Calculate opponent points per possession for each team
opp_agg2["Opp_Pts_Per_Poss"] = opp_agg2["Opp_Score"] / opp_agg2["Opp_Poss"]

print("Team Opponent Aggregated Stats:")
print(opp_agg2.head(5))

Team Aggregated Stats: 
   Season  TeamID  TeamScore  TeamPoss  TeamBad  TeamGames  Team_PPP_SD  \
0    2010    1102       1613    1724.5   2563.5         29     0.156011   
1    2010    1103       2344    2262.0   3242.0         33     0.114776   
2    2010    1104       2192    2125.5   3328.5         32     0.104933   
3    2010    1105       1468    1676.5   2998.5         23     0.159620   
4    2010    1106       1793    1917.5   3014.5         28     0.175666   

   Pts_Per_Poss  Poss_Per_Game  Bad_Per_Poss  
0      0.935344      59.465517      0.743259  
1      1.036251      68.545455      0.716622  
2      1.031287      66.421875      0.782992  
3      0.875634      72.891304      0.894274  
4      0.935072      68.482143      0.786050  
 
Team Opponent Aggregated Stats:
   Season  TeamID  Opp_Score  Opp_Poss  Opp_Pts_Per_Poss
0    2010    1102      61853   60692.0          1.019129
1    2010    1103      68894   68996.0          0.998522
2    2010    1104      71512   69291.0

# Prepare Data for Modeling

In [7]:
# Using original full schedule, add winning team points per possession, possessions per game, bad plays per possession, PPP_SD
reg_df5 = reg_df4.merge(team_agg5, how='left', left_on=["Season","WTeamID"], right_on=["Season","TeamID"])
reg_df6 = copy.deepcopy(reg_df5[["Season","WTeamID","LTeamID","WLoc","Pts_Per_Poss","Poss_Per_Game","Bad_Per_Poss","Team_PPP_SD"]])
reg_df6.columns = ["Season","WTeamID","LTeamID","WLoc","W_Pts_Per_Poss","W_Poss_Per_Game","W_Bad_Per_Poss","W_PPP_SD"]

# Using original full schedule, add losing team points per possession, possessions per game, bad plays per possession, PPP_SD
reg_df7 = reg_df6.merge(team_agg5, how='left', left_on=["Season","LTeamID"], right_on=["Season","TeamID"])
reg_df8 = copy.deepcopy(reg_df7[["Season","WTeamID","LTeamID","WLoc","W_Pts_Per_Poss","W_Poss_Per_Game","W_Bad_Per_Poss","W_PPP_SD",
                                 "Pts_Per_Poss","Poss_Per_Game","Bad_Per_Poss","Team_PPP_SD"]])
reg_df8.columns = ["Season","WTeamID","LTeamID","WLoc","W_Pts_Per_Poss","W_Poss_Per_Game","W_Bad_Per_Poss","W_PPP_SD",
                   "L_Pts_Per_Poss","L_Poss_Per_Game","L_Bad_Per_Poss","L_PPP_SD"]

# Using original full schedule, add winning and losing team opponent points per possession
reg_df8a = reg_df8.merge(opp_agg2, how='left', left_on=["Season","WTeamID"], right_on=["Season","TeamID"])
reg_df8b = copy.deepcopy(reg_df8a[["Season","WTeamID","LTeamID","WLoc","W_Pts_Per_Poss","W_Poss_Per_Game","W_Bad_Per_Poss","W_PPP_SD",
                                   "L_Pts_Per_Poss","L_Poss_Per_Game","L_Bad_Per_Poss","L_PPP_SD","Opp_Pts_Per_Poss"]])
reg_df8b.columns = ["Season","WTeamID","LTeamID","WLoc","W_Pts_Per_Poss","W_Poss_Per_Game","W_Bad_Per_Poss","W_PPP_SD",
                    "L_Pts_Per_Poss","L_Poss_Per_Game","L_Bad_Per_Poss","L_PPP_SD","W_Opp_Pts_Per_Poss"]
reg_df8c = reg_df8b.merge(opp_agg2, how='left', left_on=["Season","LTeamID"], right_on=["Season","TeamID"])
reg_df8d = copy.deepcopy(reg_df8c[["Season","WTeamID","LTeamID","WLoc","W_Pts_Per_Poss","W_Poss_Per_Game","W_Bad_Per_Poss","W_PPP_SD",
                                   "L_Pts_Per_Poss","L_Poss_Per_Game","L_Bad_Per_Poss","L_PPP_SD","W_Opp_Pts_Per_Poss","Opp_Pts_Per_Poss"]])
reg_df8d.columns = ["Season","WTeamID","LTeamID","WLoc","W_Pts_Per_Poss","W_Poss_Per_Game","W_Bad_Per_Poss","W_PPP_SD",
                    "L_Pts_Per_Poss","L_Poss_Per_Game","L_Bad_Per_Poss","L_PPP_SD","W_Opp_Pts_Per_Poss","L_Opp_Pts_Per_Poss"]

# Map Winner Location variable to 1 for H, 0 for N, -1 for A
location_map = {"H":1, "N":0, "A":-1}
reg_df8d['WLoc_Mapped'] = reg_df8d["WLoc"].map(location_map)

# Split full schedule into DFs of perspective of team with lower Team ID number
wteam_lower = copy.deepcopy(reg_df8d[reg_df8d["WTeamID"]<reg_df8d["LTeamID"]])
lteam_lower = copy.deepcopy(reg_df8d[reg_df8d["WTeamID"]>reg_df8d["LTeamID"]])

# Add game_result column: 1 for win, 0 for loss
wteam_lower["Game_Result"] = 1
lteam_lower["Game_Result"] = 0

# Calculate points per possession, possessions per game, bad plays per possession, opponent points per possession difference from winner/loser perspective
wteam_lower["Pts_Per_Poss_Diff"] = wteam_lower["W_Pts_Per_Poss"] - wteam_lower["L_Pts_Per_Poss"]
lteam_lower["Pts_Per_Poss_Diff"] = lteam_lower["L_Pts_Per_Poss"] - lteam_lower["W_Pts_Per_Poss"]
wteam_lower["Poss_Per_Game_Diff"] = wteam_lower["W_Poss_Per_Game"] - wteam_lower["L_Poss_Per_Game"]
lteam_lower["Poss_Per_Game_Diff"] = lteam_lower["L_Poss_Per_Game"] - lteam_lower["W_Poss_Per_Game"]
wteam_lower["Bad_Per_Poss_Diff"] = wteam_lower["W_Bad_Per_Poss"] - wteam_lower["L_Bad_Per_Poss"]
lteam_lower["Bad_Per_Poss_Diff"] = lteam_lower["L_Bad_Per_Poss"] - lteam_lower["W_Bad_Per_Poss"]
wteam_lower["PPP_SD_Diff"] = wteam_lower["W_PPP_SD"] - wteam_lower["L_PPP_SD"]
lteam_lower["PPP_SD_Diff"] = lteam_lower["L_PPP_SD"] - lteam_lower["W_PPP_SD"]
wteam_lower["Opp_Pts_Per_Poss_Diff"] = wteam_lower["W_Opp_Pts_Per_Poss"] - wteam_lower["L_Opp_Pts_Per_Poss"]
lteam_lower["Opp_Pts_Per_Poss_Diff"] = lteam_lower["L_Opp_Pts_Per_Poss"] - lteam_lower["W_Opp_Pts_Per_Poss"]

# Convert location to winner/loser perspective
wteam_lower["Location"] = wteam_lower["WLoc_Mapped"]
lteam_lower["Location"] = lteam_lower["WLoc_Mapped"] * -1

# Recombine DFs to create full dataset of features and outcome variable
reg_df9 = copy.deepcopy(wteam_lower.append(lteam_lower))

# Shuffle schedule into random order, then transform feature values to standardized normal scale
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
reg_final1 = copy.deepcopy(reg_df9.sample(frac=1, random_state=7))
reg_final1["Pts_Per_Poss_Diff"] = scaler.fit_transform(np.array(reg_final1["Pts_Per_Poss_Diff"]).reshape(-1,1))
reg_final1["Poss_Per_Game_Diff"] = scaler.fit_transform(np.array(reg_final1["Poss_Per_Game_Diff"]).reshape(-1,1))
reg_final1["Bad_Per_Poss_Diff"] = scaler.fit_transform(np.array(reg_final1["Bad_Per_Poss_Diff"]).reshape(-1,1))
reg_final1["PPP_SD_Diff"] = scaler.fit_transform(np.array(reg_final1["PPP_SD_Diff"]).reshape(-1,1))
reg_final1["Opp_Pts_Per_Poss_Diff"] = scaler.fit_transform(np.array(reg_final1["Opp_Pts_Per_Poss_Diff"]).reshape(-1,1))

# Get X predictor variables and split into train and test sets
X_features = ["Pts_Per_Poss_Diff","Opp_Pts_Per_Poss_Diff","Poss_Per_Game_Diff","Location","PPP_SD_Diff","Bad_Per_Poss_Diff"]
reg_X = np.array(reg_final1[X_features])
reg_X_train = reg_X[0:62000]
reg_X_test = reg_X[62000:]

# Get Y outcome variable and split into train and test sets
reg_Y = np.array(reg_final1[["Game_Result"]]).reshape(-1)
reg_Y_train = reg_Y[0:62000]
reg_Y_test = reg_Y[62000:]

# Display samples of X and Y data
print("Features used for modeling:")
print(X_features)
print(" ")
print("Sample of feature values in training set:")
print(reg_X_train[0:10])
print(" ")
print("Sample of outcome values in training set: ")
print(reg_Y_train[0:10])

print(len(reg_X_train))
print(len(reg_X_test))
print(len(reg_Y_train))
print(len(reg_Y_test))

Features used for modeling:
['Pts_Per_Poss_Diff', 'Opp_Pts_Per_Poss_Diff', 'Poss_Per_Game_Diff', 'Location', 'PPP_SD_Diff', 'Bad_Per_Poss_Diff']
 
Sample of feature values in training set:
[[-0.71087079 -0.29569093 -0.70852576  0.          1.03357011 -0.36999847]
 [ 0.5059096  -1.49354652 -1.109481    1.          0.30963304 -0.90119719]
 [-0.40591225  1.31730696 -1.29476403  1.          0.85128228  0.48649496]
 [ 2.0518106   0.76849523  1.0233721   1.         -0.34333088 -1.95037123]
 [-0.17534843  0.39451785 -1.66381665 -1.          0.01047746  0.90368145]
 [ 0.37944904  0.39396855  0.35943065 -1.          1.05512011 -0.53070865]
 [ 1.6506352   0.93215518  0.97084961  1.         -0.49707964 -1.29501233]
 [ 0.55124822 -0.15862069  0.28136063 -1.         -0.73561906  0.84118193]
 [-1.53763054 -0.13404918  0.19693757 -1.          0.24388712  3.09229878]
 [-0.86132949  0.36793551  0.75963709 -1.          0.77455344 -1.05686522]]
 
Sample of outcome values in training set: 
[0 0 1 1 0 1 1 

# Run Preliminary Models to Get Sense of Modeling Options and Potential Accuracy

In [8]:
# Set up logistic regression model
lr1 = LogisticRegression(C=0.1, random_state=7)
lr1.fit(reg_X_train, reg_Y_train)
lr1_pred = lr1.predict(reg_X_test)
lr1_coef = pd.DataFrame(data={"Variable":["Offensive Efficiency: Pts Per Poss Diff","Defensive Efficiency: Pts Per Poss Allowed Diff",
                                          "Tempo: Poss Per Game Diff","Home Court Advantage","Offensive Volatility: Pts Per Poss SD Diff",
                                          "Mistake-Prone: Bad Plays Per Poss Diff"],
                              "Coefficient":lr1.coef_.reshape(-1)})
print("Logistic Regression Coefficient Values: ")
print(lr1_coef)
print(" ")
print("F1 Score - Logistic Regression:", f1_score(reg_Y_test, lr1_pred, average = 'weighted'))

# Set up random forest model
rf1 = RandomForestClassifier(n_estimators=1000, criterion="entropy", max_features=None, max_samples=0.01, random_state=7)
rf1.fit(reg_X_train, reg_Y_train)
rf1_pred = rf1.predict(reg_X_test)
print("F1 Score - Random Forest:", f1_score(reg_Y_test, rf1_pred, average = 'weighted'))

# Set up gradient boosting model
gb1 = GradientBoostingClassifier(n_estimators=100, max_features=None, learning_rate=0.1, subsample=0.3, random_state=7)
gb1.fit(reg_X_train, reg_Y_train)
gb1_pred = gb1.predict(reg_X_test)
print("F1 Score - Gradient Boosting:", f1_score(reg_Y_test, gb1_pred, average = 'weighted'))

# Set up naive bayes model
nb1 = BernoulliNB()
nb1.fit(reg_X_train, reg_Y_train)
nb1_pred = nb1.predict(reg_X_test)
print("F1 Score - Naive Bayes:", f1_score(reg_Y_test, nb1_pred, average = 'weighted'))

# Set up KNN model
knn1 = KNeighborsClassifier(n_neighbors=9)
knn1.fit(reg_X_train, reg_Y_train)
knn1_pred = knn1.predict(reg_X_test)
print("F1 Score - KNN:", f1_score(reg_Y_test, knn1_pred, average = 'weighted'))

# Set up ensemble voting classifier
voter = VotingClassifier(estimators=[("LR",lr1),("RF",rf1),("GB",gb1),("NB",nb1),("KNN",knn1)], voting="hard")
voter.fit(reg_X_train, reg_Y_train)
voter_pred = voter.predict(reg_X_test)
print("F1 Score - Voting Ensemble of Above 5 Models:", f1_score(reg_Y_test, voter_pred, average = 'weighted'))

Logistic Regression Coefficient Values: 
                                          Variable  Coefficient
0          Offensive Efficiency: Pts Per Poss Diff     1.221873
1  Defensive Efficiency: Pts Per Poss Allowed Diff     0.332447
2                        Tempo: Poss Per Game Diff    -0.138246
3                             Home Court Advantage     0.629619
4       Offensive Volatility: Pts Per Poss SD Diff    -0.071926
5           Mistake-Prone: Bad Plays Per Poss Diff    -0.011393
 
F1 Score - Logistic Regression: 0.7699099180946478
F1 Score - Random Forest: 0.7699099180946478
F1 Score - Gradient Boosting: 0.7763248386128256
F1 Score - Naive Bayes: 0.748666372714145
F1 Score - KNN: 0.7307456488341624
F1 Score - Voting Ensemble of Above 5 Models: 0.7731736071996883


### Next Steps: Predict the Tournament?

In [9]:
# Read in and clean up tourney data
tourney_seeds = pd.read_csv("tourney_seeds.csv")
tourney_seeds = tourney_seeds[["seed_line", "Seed", "overall_seed", "TeamID", "TeamName"]]

round_1 = pd.read_csv("Round_1_2021.csv")
round_1 = round_1[["tm1_seed_num", "tm1_Seed", "tm1_ID", "tm2_Seed", "tm2_seed_num", "tm2_ID"]]
round_1 = round_1[0:36][:]

print(tourney_seeds)
print(round_1)

    seed_line Seed  overall_seed  TeamID      TeamName
0           1  W01             4    1276      Michigan
1           2  W02             5    1104       Alabama
2           3  W03            11    1400         Texas
3           4  W04            13    1199    Florida St
4           5  W05            20    1160      Colorado
..        ...  ...           ...     ...           ...
63         12  Z12            49    1457      Winthrop
64         13  Z13            52    1317   North Texas
65         14  Z14            57    1159       Colgate
66         15  Z15            61    1331  Oral Roberts
67         16  Z16            64    1216      Hartford

[68 rows x 5 columns]
    tm1_seed_num tm1_Seed  tm1_ID tm2_Seed  tm2_seed_num  tm2_ID
0             16     W16b    1411     W16a            16  1291.0
1             11     X11a    1179     X11b            11  1455.0
2             16     X16a    1111     X16b            16  1313.0
3             11     W11b    1417     W11a            11 

In [10]:
# Get 2021 regular season data only
team_agg2021 = copy.deepcopy(team_agg5[team_agg5["Season"]==2021])
opp_agg2021 = copy.deepcopy(opp_agg2[opp_agg2["Season"]==2021])

# Create dataframe of possible 2021 tournament matchups
tourney_ids = list(copy.deepcopy(tourney_seeds["TeamID"].sort_values()))

team1_list = []
for i in tourney_ids:
    for j in range(68):
        team1_list.append(i)

team2_list = []
for x in range(68):
    for y in tourney_ids:
        team2_list.append(y)
        
tourney_matchups1 = pd.DataFrame(data={"Team1":team1_list, "Team2":team2_list})
tourney_matchups2 = copy.deepcopy(tourney_matchups1[tourney_matchups1["Team1"] < tourney_matchups1["Team2"]])
print(tourney_matchups2.head(5))
print(len(tourney_matchups2))

   Team1  Team2
1   1101   1104
2   1101   1111
3   1101   1116
4   1101   1124
5   1101   1140
2278


In [11]:
# Using full tourney matchups, add Team 1 points per possession, possessions per game, bad plays per possession, PPP_SD
tourney_matchups3 = tourney_matchups2.merge(team_agg2021, how='left', left_on="Team1", right_on="TeamID")
tourney_matchups4 = copy.deepcopy(tourney_matchups3[["Team1","Team2","Pts_Per_Poss","Poss_Per_Game","Bad_Per_Poss","Team_PPP_SD"]])
tourney_matchups4.columns = ["Team1","Team2","T1_Pts_Per_Poss","T1_Poss_Per_Game","T1_Bad_Per_Poss","T1_PPP_SD"]

# Using full tourney matchups, add Team 2 points per possession, possessions per game, bad plays per possession, PPP_SD
tourney_matchups5 = tourney_matchups4.merge(team_agg2021, how='left', left_on="Team2", right_on="TeamID")
tourney_matchups6 = copy.deepcopy(tourney_matchups5[["Team1","Team2","T1_Pts_Per_Poss","T1_Poss_Per_Game","T1_Bad_Per_Poss","T1_PPP_SD",
                                                     "Pts_Per_Poss","Poss_Per_Game","Bad_Per_Poss","Team_PPP_SD"]])
tourney_matchups6.columns = ["Team1","Team2","T1_Pts_Per_Poss","T1_Poss_Per_Game","T1_Bad_Per_Poss","T1_PPP_SD",
                             "T2_Pts_Per_Poss","T2_Poss_Per_Game","T2_Bad_Per_Poss","T2_PPP_SD"]

# Using full tourney matchups, add Team 1 and Team 2 opponent points per possession
tourney_matchups7 = tourney_matchups6.merge(opp_agg2021, how='left', left_on="Team1", right_on="TeamID")
tourney_matchups7a = copy.deepcopy(tourney_matchups7[["Team1","Team2","T1_Pts_Per_Poss","T1_Poss_Per_Game","T1_Bad_Per_Poss","T1_PPP_SD",
                                                      "T2_Pts_Per_Poss","T2_Poss_Per_Game","T2_Bad_Per_Poss","T2_PPP_SD","Opp_Pts_Per_Poss"]])
tourney_matchups7a.columns = ["Team1","Team2","T1_Pts_Per_Poss","T1_Poss_Per_Game","T1_Bad_Per_Poss","T1_PPP_SD",
                              "T2_Pts_Per_Poss","T2_Poss_Per_Game","T2_Bad_Per_Poss","T2_PPP_SD","T1_Opp_Pts_Per_Poss"]
tourney_matchups7b = tourney_matchups7a.merge(opp_agg2021, how='left', left_on="Team2", right_on="TeamID")
tourney_matchups7c = copy.deepcopy(tourney_matchups7b[["Team1","Team2","T1_Pts_Per_Poss","T1_Poss_Per_Game","T1_Bad_Per_Poss","T1_PPP_SD","T2_Pts_Per_Poss",
                                                       "T2_Poss_Per_Game","T2_Bad_Per_Poss","T2_PPP_SD","T1_Opp_Pts_Per_Poss","Opp_Pts_Per_Poss"]])
tourney_matchups7c.columns = ["Team1","Team2","T1_Pts_Per_Poss","T1_Poss_Per_Game","T1_Bad_Per_Poss","T1_PPP_SD","T2_Pts_Per_Poss",
                              "T2_Poss_Per_Game","T2_Bad_Per_Poss","T2_PPP_SD","T1_Opp_Pts_Per_Poss","T2_Opp_Pts_Per_Poss"]

# Give neutral location for each matchup
tourney_matchups7c["Location"] = 0

# Calculate points per possession, possessions per game, bad plays per possession, opponent points per possession difference from Team 1 perspective
tourney_matchups7c["Pts_Per_Poss_Diff"] = tourney_matchups7c["T1_Pts_Per_Poss"] - tourney_matchups7c["T2_Pts_Per_Poss"]
tourney_matchups7c["Poss_Per_Game_Diff"] = tourney_matchups7c["T1_Poss_Per_Game"] - tourney_matchups7c["T2_Poss_Per_Game"]
tourney_matchups7c["Bad_Per_Poss_Diff"] = tourney_matchups7c["T1_Bad_Per_Poss"] - tourney_matchups7c["T2_Bad_Per_Poss"]
tourney_matchups7c["PPP_SD_Diff"] = tourney_matchups7c["T1_PPP_SD"] - tourney_matchups7c["T2_PPP_SD"]
tourney_matchups7c["Opp_Pts_Per_Poss_Diff"] = tourney_matchups7c["T1_Opp_Pts_Per_Poss"] - tourney_matchups7c["T2_Opp_Pts_Per_Poss"]

# Transform feature values to standardized normal scale
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
tourney_final = copy.deepcopy(tourney_matchups7c)
tourney_final["Pts_Per_Poss_Diff"] = scaler.fit_transform(np.array(tourney_final["Pts_Per_Poss_Diff"]).reshape(-1,1))
tourney_final["Poss_Per_Game_Diff"] = scaler.fit_transform(np.array(tourney_final["Poss_Per_Game_Diff"]).reshape(-1,1))
tourney_final["Bad_Per_Poss_Diff"] = scaler.fit_transform(np.array(tourney_final["Bad_Per_Poss_Diff"]).reshape(-1,1))
tourney_final["PPP_SD_Diff"] = scaler.fit_transform(np.array(tourney_final["PPP_SD_Diff"]).reshape(-1,1))
tourney_final["Opp_Pts_Per_Poss_Diff"] = scaler.fit_transform(np.array(tourney_final["Opp_Pts_Per_Poss_Diff"]).reshape(-1,1))

# Get X predictor variables for 2021 Tourney
X_features = ["Pts_Per_Poss_Diff","Opp_Pts_Per_Poss_Diff","Poss_Per_Game_Diff","Location","PPP_SD_Diff","Bad_Per_Poss_Diff"]
tourney_X_test = np.array(tourney_final[X_features])

# Display samples of X and Y data
print("Features used for modeling:")
print(X_features)
print(" ")
print("Sample of feature values in 2021 tourney test set:")
print(tourney_X_test[0:10])
print(" ")
print(len(tourney_X_test))

# Predict probabilities using Logistic Regression
tourney_lr_proba = lr1.predict_proba(tourney_X_test)[:,1]
tourney_lr_pred = lr1.predict(tourney_X_test)

print(tourney_lr_proba)
print(tourney_lr_proba.shape)
print(tourney_lr_pred)
print(min(tourney_lr_proba))
print(max(tourney_lr_proba))

Features used for modeling:
['Pts_Per_Poss_Diff', 'Opp_Pts_Per_Poss_Diff', 'Poss_Per_Game_Diff', 'Location', 'PPP_SD_Diff', 'Bad_Per_Poss_Diff']
 
Sample of feature values in 2021 tourney test set:
[[ 0.10601457 -1.70677407 -1.0732056   0.         -0.8274211  -0.88807454]
 [ 0.70840791 -0.35802369  0.72759943  0.         -0.6846905  -0.75822741]
 [-0.33856701 -1.30906444 -1.15887142  0.         -1.45625667  0.50236517]
 [-1.8206764  -0.74401012 -0.01514151  0.         -1.04309444 -0.30125366]
 [-0.48308094 -2.06254953 -0.01167842  0.         -1.21494109  1.04434167]
 [ 0.66791766 -1.659582    1.40278245  0.          0.25266328  0.17523639]
 [ 0.45928775 -2.37274908  0.2733322   0.         -1.41000278 -0.14862455]
 [-1.83775645 -0.73873628 -0.36600686  0.          0.18433279  1.28371774]
 [-0.63896347 -1.35931836  1.06297203  0.         -0.05254865  0.91017314]
 [-0.24010362 -1.14950131  0.77233787  0.         -0.30504474  0.46290022]]
 
2278
[0.42345421 0.64910928 0.33662234 ... 0.2121

In [12]:
# Print all possible tourney matchups' predicted results

tourney_matchups8a = pd.DataFrame(data={"Team1":tourney_final["Team1"], "Team2":tourney_final["Team2"],
                                        "Team1_Win_Pred":tourney_lr_pred, "Team1_Win_Proba":tourney_lr_proba})
tourney_matchups8b = tourney_matchups8a.merge(tourney_seeds, left_on="Team1", right_on="TeamID")
tourney_matchups8c = copy.deepcopy(tourney_matchups8b[["Team1","Team2","Team1_Win_Pred","Team1_Win_Proba","TeamName","seed_line"]])
tourney_matchups8c.columns = ["Team1","Team2","Team1_Win_Pred","Team1_Win_Proba","Team1_Name","Team1_Seed"]

tourney_matchups8d = tourney_matchups8c.merge(tourney_seeds, left_on="Team2", right_on="TeamID")
tourney_matchups8e = copy.deepcopy(tourney_matchups8d[["Team1","Team2","Team1_Name","TeamName","Team1_Seed","seed_line","Team1_Win_Pred","Team1_Win_Proba"]])
tourney_matchups8e.columns = ["Team1","Team2","Team1_Name","Team2_Name","Team1_Seed","Team2_Seed","Team1_Win_Pred","Team1_Win_Proba"]
tourney_matchups8 = copy.deepcopy(tourney_matchups8e.sort_values("Team1_Win_Proba"))

print("Matchups with lowest predicted probabilities to win for Team 1: ")
print(tourney_matchups8[["Team1_Name","Team2_Name","Team1_Seed","Team2_Seed","Team1_Win_Proba"]].head(30))
print(" ")
print("Matchups with highest predicted probabilities to win for Team 1: ")
print(tourney_matchups8[["Team1_Name","Team2_Name","Team1_Seed","Team2_Seed","Team1_Win_Proba"]].tail(30))
print(" ")

Matchups with lowest predicted probabilities to win for Team 1: 
          Team1_Name Team2_Name  Team1_Seed  Team2_Seed  Team1_Win_Proba
173   Appalachian St    Gonzaga          16           1         0.019991
1946     TX Southern   Virginia          16           4         0.020646
321         Hartford       Iowa          16           2         0.024184
1912        Hartford   Virginia          16           4         0.028075
188       Georgetown    Gonzaga          12           1         0.028201
1927      Norfolk St   Virginia          16           4         0.031496
302   Appalachian St       Iowa          16           2         0.033782
30    Appalachian St    Colgate          16          14         0.034235
324             Iona       Iowa          15           2         0.036592
1885     TX Southern  Villanova          16           5         0.036800
8     Appalachian St     Baylor          16           1         0.036872
177          Clemson    Gonzaga           7           1    

In [13]:
# Convert to Kaggle submission format
tourney_matchups8x = copy.deepcopy(tourney_matchups8.sort_values(["Team1","Team2"]))
tourney_matchups8x["ID"] = "2021_" + tourney_matchups8x["Team1"].astype(str) + "_" + tourney_matchups8x["Team2"].astype(str)
tourney_matchups9 = tourney_matchups8x[["ID","Team1_Win_Proba","Team1_Name","Team2_Name","Team1_Seed","Team2_Seed"]]
tourney_matchups9.columns = ["ID","Team1_Win_Probability","Team1_Name","Team2_Name","Team1_Seed","Team2_Seed"]
print(tourney_matchups9.head(10))

# Write submission to CSV
# tourney_matchups9.to_csv("RT_bracket_predictions_v1.csv")

                ID  Team1_Win_Probability   Team1_Name      Team2_Name  \
0   2021_1101_1104               0.423454  Abilene Chr         Alabama   
1   2021_1101_1111               0.649109  Abilene Chr  Appalachian St   
3   2021_1101_1116               0.336622  Abilene Chr        Arkansas   
6   2021_1101_1124               0.077261  Abilene Chr          Baylor   
10  2021_1101_1140               0.216263  Abilene Chr             BYU   
15  2021_1101_1155               0.490390  Abilene Chr         Clemson   
21  2021_1101_1156               0.437570  Abilene Chr    Cleveland St   
28  2021_1101_1159               0.071950  Abilene Chr         Colgate   
36  2021_1101_1160               0.186198  Abilene Chr        Colorado   
45  2021_1101_1163               0.298519  Abilene Chr     Connecticut   

    Team1_Seed  Team2_Seed  
0           14           2  
1           14          16  
3           14           3  
6           14           1  
10          14           6  
15         

In [14]:
# Kaggle scoring function
def logloss(prob_pred, results):
    score1 = 0
    for i in range(len(prob_pred)):
        game_score = results[i]*np.log(prob_pred[i]) + (1-results[i])*np.log(1-prob_pred[i])
        score1 += game_score
    score2 = score1 * -1 / len(prob_pred)
    return score2

# Replicate Kaggle scoring results
results1a = pd.read_csv("ncaa_tourney_2021_results1a.csv")
results1b = results1a.merge(tourney_matchups9, left_on="ID", right_on="ID")
results1c = copy.deepcopy(results1b[["ID","Team1_Win_Probability","Result"]])

prob_pred_test2 = list(results1c["Team1_Win_Probability"])
results_test2 = list(results1c["Result"])
print(" ")
print("Kaggle score is: ")
print(logloss(prob_pred_test2, results_test2))

                ID  Result
0   2021_1211_1313       1
1   2021_1281_1328       0
2   2021_1166_1364       1
3   2021_1325_1438       1
4   2021_1179_1425       0
..             ...     ...
58  2021_1116_1124       0
59  2021_1222_1333       1
60  2021_1211_1417       1
61  2021_1124_1222       1
62  2021_1124_1211       1

[63 rows x 2 columns]
                ID  Team1_Win_Probability  Result
0   2021_1211_1313               0.977752       1
1   2021_1281_1328               0.326750       0
2   2021_1166_1364               0.429712       1
3   2021_1325_1438               0.355202       1
4   2021_1179_1425               0.617178       0
..             ...                    ...     ...
58  2021_1116_1124               0.122473       0
59  2021_1222_1333               0.743353       1
60  2021_1211_1417               0.851561       1
61  2021_1124_1222               0.620808       1
62  2021_1124_1211               0.310674       1

[63 rows x 3 columns]
 
Kaggle score is: 
0.59474894

In [15]:
# Get 2019 tourney matchups data
tourney_hist_df = pd.read_csv("2021_Data/MNCAATourneyCompactResults.csv")
tourney_hist_df2 = copy.deepcopy(tourney_hist_df[tourney_hist_df["Season"]==2019])

wteam_2019 = list(tourney_hist_df2["WTeamID"])
lteam_2019 = list(tourney_hist_df2["LTeamID"])

result_2019 = []
team1_2019 = []
team2_2019 = []
for i in range(len(wteam_2019)):
    if wteam_2019[i] < lteam_2019[i]:
        result_2019.append(1)
        team1_2019.append(wteam_2019[i])
        team2_2019.append(lteam_2019[i])
    else:
        result_2019.append(0)
        team1_2019.append(lteam_2019[i])
        team2_2019.append(wteam_2019[i])
    

# Get 2019 regular season data only
team_agg2019 = copy.deepcopy(team_agg5[team_agg5["Season"]==2019])
opp_agg2019 = copy.deepcopy(opp_agg2[opp_agg2["Season"]==2019])

tourney_2019 = pd.DataFrame(data={"Team1":team1_2019, "Team2":team2_2019, "Team1_Win":result_2019})

# Using 2019 tourney matchups, add Team 1 points per possession, possessions per game, bad plays per possession, PPP_SD
tourney_2019a = tourney_2019.merge(team_agg2019, how='left', left_on="Team1", right_on="TeamID")
tourney_2019b = copy.deepcopy(tourney_2019a[["Team1","Team2","Team1_Win","Pts_Per_Poss","Poss_Per_Game","Bad_Per_Poss","Team_PPP_SD"]])
tourney_2019b.columns = ["Team1","Team2","Team1_Win","T1_Pts_Per_Poss","T1_Poss_Per_Game","T1_Bad_Per_Poss","T1_PPP_SD"]

# Using 2019 tourney matchups, add Team 2 points per possession, possessions per game, bad plays per possession, PPP_SD
tourney_2019c = tourney_2019b.merge(team_agg2019, how='left', left_on="Team2", right_on="TeamID")
tourney_2019d = copy.deepcopy(tourney_2019c[["Team1","Team2","Team1_Win","T1_Pts_Per_Poss","T1_Poss_Per_Game","T1_Bad_Per_Poss","T1_PPP_SD",
                                                 "Pts_Per_Poss","Poss_Per_Game","Bad_Per_Poss","Team_PPP_SD"]])
tourney_2019d.columns = ["Team1","Team2","Team1_Win","T1_Pts_Per_Poss","T1_Poss_Per_Game","T1_Bad_Per_Poss","T1_PPP_SD",
                         "T2_Pts_Per_Poss","T2_Poss_Per_Game","T2_Bad_Per_Poss","T2_PPP_SD"]

# Using 2019 tourney matchups, add Team 1 and Team 2 opponent points per possession
tourney_2019e = tourney_2019d.merge(opp_agg2021, how='left', left_on="Team1", right_on="TeamID")
tourney_2019f = copy.deepcopy(tourney_2019e[["Team1","Team2","Team1_Win","T1_Pts_Per_Poss","T1_Poss_Per_Game","T1_Bad_Per_Poss","T1_PPP_SD",
                                             "T2_Pts_Per_Poss","T2_Poss_Per_Game","T2_Bad_Per_Poss","T2_PPP_SD","Opp_Pts_Per_Poss"]])
tourney_2019f.columns = ["Team1","Team2","Team1_Win","T1_Pts_Per_Poss","T1_Poss_Per_Game","T1_Bad_Per_Poss","T1_PPP_SD",
                             "T2_Pts_Per_Poss","T2_Poss_Per_Game","T2_Bad_Per_Poss","T2_PPP_SD","T1_Opp_Pts_Per_Poss"]
tourney_2019g = tourney_2019f.merge(opp_agg2021, how='left', left_on="Team2", right_on="TeamID")
tourney_2019h = copy.deepcopy(tourney_2019g[["Team1","Team2","Team1_Win","T1_Pts_Per_Poss","T1_Poss_Per_Game","T1_Bad_Per_Poss","T1_PPP_SD","T2_Pts_Per_Poss",
                                             "T2_Poss_Per_Game","T2_Bad_Per_Poss","T2_PPP_SD","T1_Opp_Pts_Per_Poss","Opp_Pts_Per_Poss"]])
tourney_2019h.columns = ["Team1","Team2","Team1_Win","T1_Pts_Per_Poss","T1_Poss_Per_Game","T1_Bad_Per_Poss","T1_PPP_SD","T2_Pts_Per_Poss",
                         "T2_Poss_Per_Game","T2_Bad_Per_Poss","T2_PPP_SD","T1_Opp_Pts_Per_Poss","T2_Opp_Pts_Per_Poss"]

# Give neutral location for each matchup
tourney_2019h["Location"] = 0

# Calculate points per possession, possessions per game, bad plays per possession, opponent points per possession difference from Team 1 perspective
tourney_2019h["Pts_Per_Poss_Diff"] = tourney_2019h["T1_Pts_Per_Poss"] - tourney_2019h["T2_Pts_Per_Poss"]
tourney_2019h["Poss_Per_Game_Diff"] = tourney_2019h["T1_Poss_Per_Game"] - tourney_2019h["T2_Poss_Per_Game"]
tourney_2019h["Bad_Per_Poss_Diff"] = tourney_2019h["T1_Bad_Per_Poss"] - tourney_2019h["T2_Bad_Per_Poss"]
tourney_2019h["PPP_SD_Diff"] = tourney_2019h["T1_PPP_SD"] - tourney_2019h["T2_PPP_SD"]
tourney_2019h["Opp_Pts_Per_Poss_Diff"] = tourney_2019h["T1_Opp_Pts_Per_Poss"] - tourney_2019h["T2_Opp_Pts_Per_Poss"]
tourney_2019h.fillna(0, inplace=True)

# Transform feature values to standardized normal scale
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
tourney_2019_final = copy.deepcopy(tourney_2019h)
tourney_2019_final["Pts_Per_Poss_Diff"] = scaler.fit_transform(np.array(tourney_2019_final["Pts_Per_Poss_Diff"]).reshape(-1,1))
tourney_2019_final["Poss_Per_Game_Diff"] = scaler.fit_transform(np.array(tourney_2019_final["Poss_Per_Game_Diff"]).reshape(-1,1))
tourney_2019_final["Bad_Per_Poss_Diff"] = scaler.fit_transform(np.array(tourney_2019_final["Bad_Per_Poss_Diff"]).reshape(-1,1))
tourney_2019_final["PPP_SD_Diff"] = scaler.fit_transform(np.array(tourney_2019_final["PPP_SD_Diff"]).reshape(-1,1))
tourney_2019_final["Opp_Pts_Per_Poss_Diff"] = scaler.fit_transform(np.array(tourney_2019_final["Opp_Pts_Per_Poss_Diff"]).reshape(-1,1))

# Get X predictor variables for 2021 Tourney
X_features_2019 = ["Pts_Per_Poss_Diff","Opp_Pts_Per_Poss_Diff","Poss_Per_Game_Diff","Location","PPP_SD_Diff","Bad_Per_Poss_Diff"]
tourney_2019_X_test = np.array(tourney_2019_final[X_features_2019])

# Display samples of X and Y data
print("Features used for modeling:")
print(X_features_2019)
print(" ")
print("Sample of feature values in 2019 tourney test set:")
print(tourney_2019_X_test[0:10])

# Predict probabilities using Logistic Regression
tourney_2019_lr_proba = lr1.predict_proba(tourney_2019_X_test)[:,1]
tourney_2019_lr_pred = lr1.predict(tourney_2019_X_test)

# Kaggle scoring
prob_pred_2019 = list(tourney_2019_lr_proba)
print(" ")
print("Kaggle score is: ")
print(logloss(prob_pred_2019, result_2019))

print("F1 Score - 2019 Tourney Results:", f1_score(result_2019, tourney_2019_lr_pred , average = 'weighted'))

Features used for modeling:
['Pts_Per_Poss_Diff', 'Opp_Pts_Per_Poss_Diff', 'Poss_Per_Game_Diff', 'Location', 'PPP_SD_Diff', 'Bad_Per_Poss_Diff']
 
Sample of feature values in 2019 tourney test set:
[[ 1.61877055 -0.01718817  0.66770211  0.          1.98376456 -0.34222466]
 [ 0.90573556  2.74694867 -1.17539659  0.          0.77529405 -0.18763382]
 [ 0.14195541  0.6939942  -0.13024103  0.          0.31086776  1.11447223]
 [ 1.20038186  1.88277715 -0.39049271  0.          0.13603582 -2.94751593]
 [ 0.11579035  1.33969717  0.09038169  0.          0.43911932  1.82236762]
 [ 0.77867563 -0.97010454 -0.51068472  0.          0.39670841  0.20038406]
 [-0.9390798   0.53696756 -1.75750722  0.         -1.08268294  1.84622078]
 [-0.19336099  2.25047584  0.7690598   0.         -0.38974563  1.05071093]
 [-1.81956813 -0.38088574 -0.9967474   0.          1.07195577  2.16549824]
 [-0.75977768  0.50074258  0.70242212  0.         -0.79442864  1.70166331]]
 
Kaggle score is: 
0.5964961995839146
F1 Score - 2