In [1]:
#Importing all the necessary modules
import pandas as pd
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn import metrics

# Importing the Data

In [2]:
#Reading in the Data from Previous Tournaments from 2013-2019
#Also read in the tournament data frame and created separate data frames for each region

full = pd.read_csv('NCAATourneyFullSeasonStats_13-19 - NCAATourneyFullSeasonStats_13-19.csv')
tourney = pd.read_csv("2023 KenPom_Official - Sheet1.csv")
South = tourney[tourney.Region == "South"]
East = tourney[tourney.Region == "East"]
West = tourney[tourney.Region == "West"]
Midwest = tourney[tourney.Region == "Midwest"]

In [3]:
#Giving the seeds a numerical value the higher seed gets 1 and the lower seed gets 0
full2 = full.replace("Higher", 1).replace("Lower", 0)
full2

Unnamed: 0,winner,SEED_higher,SEED_lower,TEAM_higher,TEAM_lower,ADJOE_higher,ADJDE_higher,ORB_higher,DRB_higher,FTR_higher,...,Turnover_diff_higher,ADJOE_lower,ADJDE_lower,ORB_lower,DRB_lower,FTR_lower,FTRD_lower,3P_O_lower,3P_D_lower,Turnover_diff_lower
0,1,1,16,North Carolina,Florida Gulf Coast,123.3,94.9,40.7,30.0,32.3,...,2.8,102.9,102.4,31.7,27.5,37.3,35.3,34.2,31.2,-1.3
1,1,1,9,North Carolina,Providence,123.3,94.9,40.7,30.0,32.3,...,2.8,108.7,95.4,30.4,30.3,37.3,28.0,32.1,32.3,4.5
2,1,1,5,North Carolina,Indiana,123.3,94.9,40.7,30.0,32.3,...,2.8,121.3,98.4,37.2,29.2,33.9,30.2,41.6,34.3,-0.7
3,1,1,6,North Carolina,Notre Dame,123.3,94.9,40.7,30.0,32.3,...,2.8,118.3,103.3,32.7,32.1,32.9,26.0,37.4,36.9,-0.5
4,1,1,10,North Carolina,Syracuse,123.3,94.9,40.7,30.0,32.3,...,2.8,111.9,93.6,33.5,35.3,35.4,28.0,36.0,30.7,2.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
933,0,12,13,Mississippi,La Salle,110.0,95.6,34.1,31.8,38.8,...,5.6,112.0,96.2,29.0,34.2,31.3,28.5,37.7,30.2,4.2
934,1,9,13,Wichita St.,La Salle,110.6,91.0,38.0,26.5,36.5,...,0.6,112.0,96.2,29.0,34.2,31.3,28.5,37.7,30.2,4.2
935,0,2,15,Georgetown,Florida Gulf Coast,107.6,85.0,30.4,31.0,36.8,...,2.3,103.4,96.3,32.5,32.8,35.2,32.7,33.4,31.3,1.1
936,0,7,15,San Diego St.,Florida Gulf Coast,106.0,89.5,31.8,27.9,35.3,...,0.1,103.4,96.3,32.5,32.8,35.2,32.7,33.4,31.3,1.1


# Training and Creating Linear Regression Model

In [4]:
#Training and testing the data using a logistical regression to see what columns need to be dropped
#Some columns are being dropped off of the bat because they are unneeded for training and testing the data
full3 = full2.drop(columns=["TEAM_higher","TEAM_lower", "SEED_higher","SEED_lower"])
split = train_test_split(full3.astype(float),train_size = 0.75, test_size = 0.25)
train = split[0]
train_labels = train['winner']
train_vectors = split[0].drop(columns = ['winner'])


train_labels = train_labels.sort_index()
train_vectors = train_vectors.sort_index()
train_labels
train_vectors
pred = sm.add_constant(train_vectors)

model = sm.Logit(train_labels, pred)
result = model.fit()
print(result.summary() )

Optimization terminated successfully.
         Current function value: 0.465448
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                 winner   No. Observations:                  703
Model:                          Logit   Df Residuals:                      684
Method:                           MLE   Df Model:                           18
Date:                Tue, 18 Apr 2023   Pseudo R-squ.:                  0.2885
Time:                        16:22:12   Log-Likelihood:                -327.21
converged:                       True   LL-Null:                       -459.88
Covariance Type:            nonrobust   LLR p-value:                 6.104e-46
                           coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                    0.4238      4.539      0.093      0.926      -8.473       9.321

In [5]:
#Dropping the columns whose coefficients are not greater than or equal to 1 or less than or equal to -1
full4 = full2.drop(columns = ["FTR_higher","FTR_lower",
                     "FTRD_higher", "FTRD_lower","Turnover_diff_higher","Turnover_diff_lower","ORB_higher","ORB_lower",
                              "3P_O_higher","3P_O_lower","3P_D_higher","3P_D_lower"])
full4.head()

Unnamed: 0,winner,SEED_higher,SEED_lower,TEAM_higher,TEAM_lower,ADJOE_higher,ADJDE_higher,DRB_higher,ADJOE_lower,ADJDE_lower,DRB_lower
0,1,1,16,North Carolina,Florida Gulf Coast,123.3,94.9,30.0,102.9,102.4,27.5
1,1,1,9,North Carolina,Providence,123.3,94.9,30.0,108.7,95.4,30.3
2,1,1,5,North Carolina,Indiana,123.3,94.9,30.0,121.3,98.4,29.2
3,1,1,6,North Carolina,Notre Dame,123.3,94.9,30.0,118.3,103.3,32.1
4,1,1,10,North Carolina,Syracuse,123.3,94.9,30.0,111.9,93.6,35.3


## Creating Model With Important Features

In [6]:
#Training and testing the data with our new data frame and dropping the
# same columns as before that have no affect on the data
full7 = full4.drop(columns=["TEAM_higher","TEAM_lower", "SEED_higher","SEED_lower"])


split = train_test_split(full7.astype(float),train_size = 0.75, test_size = 0.25, random_state = 10)
train = split[0]
train_labels = train['winner']
train_vectors = split[0].drop(columns = ['winner'])


train_labels = train_labels.sort_index()
train_vectors = train_vectors.sort_index()
pred = sm.add_constant(train_vectors)

model = sm.Logit(train_labels, pred)
result = model.fit()
print(result.summary() )

Optimization terminated successfully.
         Current function value: 0.469363
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                 winner   No. Observations:                  703
Model:                          Logit   Df Residuals:                      696
Method:                           MLE   Df Model:                            6
Date:                Tue, 18 Apr 2023   Pseudo R-squ.:                  0.2699
Time:                        16:22:12   Log-Likelihood:                -329.96
converged:                       True   LL-Null:                       -451.95
Covariance Type:            nonrobust   LLR p-value:                 7.957e-50
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
const           -5.7974      4.134     -1.402      0.161     -13.901       2.306
ADJOE_higher     0.2278

In [7]:
#Assigning the coeficient values from the logistic regression model
#to what they represent to be used in the calculations

coef = result.params.values[0]
ADJOE_higher = result.params.values[1]
ADJOE_lower = result.params.values[4]
ADJDE_higher = result.params.values[2]
ADJDE_lower = result.params.values[5]
DRB_higher = result.params.values[3]
DRB_lower = result.params.values[6]

# Functions to Advance Tournament

In [8]:
#Creating the function that will be used to determine the winner of each matchup
def winner(higher_seed, lower_seed):
    win = (coef+(ADJOE_higher* higher_seed[3])+(ADJDE_higher * higher_seed[4])+(DRB_higher* higher_seed[5])+
    (ADJOE_lower * lower_seed[3])+(ADJDE_lower * lower_seed[4])+(DRB_lower * lower_seed[5]))
    print(higher_seed[1], higher_seed[0],"vs",lower_seed[1], lower_seed[0],":")
    print(win)
    if win >= 0:
        value = 1
    else:
        value = 0
    if value == 1 :
        team_dropped = lower_seed[0]
        print(higher_seed[1], "Advances")
    elif value == 0:
        team_dropped = higher_seed[0]
        print(lower_seed[1], "Advances")

    return(team_dropped)

In [9]:
#Creating the Function to Simulate and Drop the teams as the round robin progresses
def Round_Robin (Region,k):
    Region_Dropped = []
    j = 1
    h = 0

    for i in range(k):
        Region_Dropped.append(winner(Region.iloc[h,:],Region.iloc[j,:]))
        h+=2
        j+=2
    return(Region_Dropped)


## Round of 64

In [10]:
print(South)
print(East)
print(Midwest)
print(West)

    Seed                    Team Region   AdjO   AdjD  Reb/Game
0      1                 Alabama  South  115.5   88.2      29.6
1     16  Texas A&M Corpus Chris  South  107.1  107.6      23.0
2      8                Maryland  South  113.5   95.6      22.3
3      9           West Virginia  South  116.9   98.1      21.5
4      5           San Diego St.  South  110.8   90.4      23.8
5     12              Charleston  South  110.6   98.7      25.4
6      4                Virginia  South  110.8   94.5      22.7
7     13                  Furman  South  113.5  105.5      23.5
8      6               Creighton  South  115.0   93.2      27.2
9     11              N.C. State  South  113.6  100.0      24.1
10     3                  Baylor  South  121.3  101.6      20.3
11    14        UC Santa Barbara  South  110.3  104.6      22.0
12     7                Missouri  South  118.3  105.3      19.9
13    10                Utah St.  South  116.3   98.9      25.9
14     2                 Arizona  South 

In [11]:
#Advancing through the round of 64
print("East")
East2 = East.set_index("Seed").drop(index=Round_Robin(East,8)).reset_index()
print("South")
South2 = South.set_index("Seed").drop(index=Round_Robin(South,8)).reset_index()
print("West")
West2 = West.set_index("Seed").drop(index=Round_Robin(West,8)).reset_index()
print("Midwest")
Midwest2 = Midwest.set_index("Seed").drop(index=Round_Robin(Midwest,8)).reset_index()


East
Purdue 1 vs Fairleigh Dickinson 16 :
5.485502327814162
Purdue Advances
Memphis 8 vs Florida Atlantic 9 :
-0.9944012412684948
Florida Atlantic Advances
Duke 5 vs Oral Roberts 12 :
0.7555165791199894
Duke Advances
Tennessee 4 vs Louisiana 13 :
2.5300173382587205
Tennessee Advances
Kentucky 6 vs Providence 11 :
-0.2454293708268005
Providence Advances
Kansas St. 3 vs Montana St. 14 :
1.4363165372878839
Kansas St. Advances
Michigan St. 7 vs USC 10 :
-0.2183848548783205
USC Advances
Marquette 2 vs Vermont 15 :
2.2766447634825826
Marquette Advances
South
Alabama 1 vs Texas A&M Corpus Chris 16 :
5.253931212656472
Alabama Advances
Maryland 8 vs West Virginia 9 :
-0.5979695598927941
West Virginia Advances
San Diego St. 5 vs Charleston 12 :
0.7938637113915146
San Diego St. Advances
Virginia 4 vs Furman 13 :
0.7532008347221026
Virginia Advances
Creighton 6 vs N.C. State 11 :
1.3143283932176435
Creighton Advances
Baylor 3 vs UC Santa Barbara 14 :
1.5786044859958592
Baylor Advances
Missouri 7 v

## Round of 32

In [12]:
#Printing the Updated Bracket For the Round of 32
print(South2)
print(East2)
print(Midwest2)
print(West2)

   Seed           Team Region   AdjO   AdjD  Reb/Game
0     1        Alabama  South  115.5   88.2      29.6
1     9  West Virginia  South  116.9   98.1      21.5
2     5  San Diego St.  South  110.8   90.4      23.8
3     4       Virginia  South  110.8   94.5      22.7
4     6      Creighton  South  115.0   93.2      27.2
5     3         Baylor  South  121.3  101.6      20.3
6    10       Utah St.  South  116.3   98.9      25.9
7     2        Arizona  South  118.2   96.3      27.3
   Seed              Team Region   AdjO   AdjD  Reb/Game
0     1            Purdue   East  117.7   94.5      25.2
1     9  Florida Atlantic   East  115.1   95.7      26.1
2     5              Duke   East  113.2   93.9      25.0
3     4         Tennessee   East  111.3   87.5      24.0
4    11        Providence   East  116.2  100.8      24.5
5     3        Kansas St.   East  113.5   95.0      23.0
6    10               USC   East  112.3   97.5      24.3
7     2         Marquette   East  119.1   96.7      21.2
 

In [13]:
#Advancing Through the Round of 32
print("East")
East3 = East2.set_index("Seed").drop(index=Round_Robin(East2,4)).reset_index()
print("South")
South3 = South2.set_index("Seed").drop(index=Round_Robin(South2,4)).reset_index()
print("West")
West3 = West2.set_index("Seed").drop(index=Round_Robin(West2,4)).reset_index()
print("Midwst")
Midwest3 = Midwest2.set_index("Seed").drop(index=Round_Robin(Midwest2,4)).reset_index()

East
Purdue 1 vs Florida Atlantic 9 :
0.2515390216295348
Purdue Advances
Duke 5 vs Tennessee 4 :
-1.4912063489195568
Tennessee Advances
Providence 11 vs Kansas St. 3 :
-1.344481461597196
Kansas St. Advances
USC 10 vs Marquette 2 :
-1.696466045098606
Marquette Advances
South
Alabama 1 vs West Virginia 9 :
2.3060932227288085
Alabama Advances
San Diego St. 5 vs Virginia 4 :
0.22105595441340364
San Diego St. Advances
Creighton 6 vs Baylor 3 :
0.9369319029763121
Creighton Advances
Utah St. 10 vs Arizona 2 :
-1.4983969335750849
Arizona Advances
West
Kansas 1 vs Illinois 9 :
0.6023630250166914
Kansas Advances
Saint Mary's 5 vs Connecticut 4 :
-1.8653078362619424
Connecticut Advances
Arizona St. 11 vs Gonzaga 3 :
-2.3481726264072313
Gonzaga Advances
Boise St. 10 vs UCLA 2 :
-2.546115529539136
UCLA Advances
Midwst
Houston 1 vs Auburn 9 :
1.908526951784216
Houston Advances
Drake 12 vs Indiana 4 :
-1.8098662277005673
Indiana Advances
Pittsburgh 11 vs Xavier 3 :
-2.1703689046415553
Xavier Advances

## Sweet Sixteen

In [14]:
#Printing the Sweet Sixteen Bracket
print(South3)
print(East3)
print(Midwest3)
print(West3)

   Seed           Team Region   AdjO  AdjD  Reb/Game
0     1        Alabama  South  115.5  88.2      29.6
1     5  San Diego St.  South  110.8  90.4      23.8
2     6      Creighton  South  115.0  93.2      27.2
3     2        Arizona  South  118.2  96.3      27.3
   Seed        Team Region   AdjO  AdjD  Reb/Game
0     1      Purdue   East  117.7  94.5      25.2
1     4   Tennessee   East  111.3  87.5      24.0
2     3  Kansas St.   East  113.5  95.0      23.0
3     2   Marquette   East  119.1  96.7      21.2
   Seed     Team   Region   AdjO  AdjD  Reb/Game
0     1  Houston  Midwest  118.1  90.4      24.4
1     4  Indiana  Midwest  114.2  97.2      25.2
2     3   Xavier  Midwest  118.5  98.6      26.0
3     2    Texas  Midwest  116.5  92.1      23.1
   Seed         Team Region   AdjO  AdjD  Reb/Game
0     1       Kansas   West  114.6  91.8      24.6
1     4  Connecticut   West  120.8  90.9      24.9
2     3      Gonzaga   West  122.3  99.3      24.9
3     2         UCLA   West  115.2  

In [15]:
#Advancing Through the Sweet Sixteen Bracket
print("East")
East4 = East3.set_index("Seed").drop(index=Round_Robin(East3,2)).reset_index()
print("South")
South4 = South3.set_index("Seed").drop(index=Round_Robin(South3,2)).reset_index()
print("West")
West4 = West3.set_index("Seed").drop(index=Round_Robin(West3,2)).reset_index()
print("Midwest")
Midwest4 = Midwest3.set_index("Seed").drop(index=Round_Robin(Midwest3,2)).reset_index()

East
Purdue 1 vs Tennessee 4 :
-0.5970353336014829
Tennessee Advances
Kansas St. 3 vs Marquette 2 :
-0.9176189061095628
Marquette Advances
South
Alabama 1 vs San Diego St. 5 :
1.4603913391002008
Alabama Advances
Creighton 6 vs Arizona 2 :
-0.27777839607950083
Arizona Advances
West
Kansas 1 vs Connecticut 4 :
-1.3887920095361266
Connecticut Advances
Gonzaga 3 vs UCLA 2 :
-1.0192539592818863
UCLA Advances
Midwest
Houston 1 vs Indiana 4 :
1.7673902344385226
Houston Advances
Xavier 3 vs Texas 2 :
-1.1010364965192023
Texas Advances


## Elite Eight

In [16]:
#Printing the Elite 8 Bracket
print(South4)
print(East4)
print(Midwest4)
print(West4)

   Seed     Team Region   AdjO  AdjD  Reb/Game
0     1  Alabama  South  115.5  88.2      29.6
1     2  Arizona  South  118.2  96.3      27.3
   Seed       Team Region   AdjO  AdjD  Reb/Game
0     4  Tennessee   East  111.3  87.5      24.0
1     2  Marquette   East  119.1  96.7      21.2
   Seed     Team   Region   AdjO  AdjD  Reb/Game
0     1  Houston  Midwest  118.1  90.4      24.4
1     2    Texas  Midwest  116.5  92.1      23.1
   Seed         Team Region   AdjO  AdjD  Reb/Game
0     4  Connecticut   West  120.8  90.9      24.9
1     2         UCLA   West  115.2  88.0      22.2


In [17]:
#Advancing through the Elite Eight
print("East")
East5 = East4.set_index("Seed").drop(index=Round_Robin(East4,1)).reset_index()
print("South")
South5 = South4.set_index("Seed").drop(index=Round_Robin(South4,1)).reset_index()
print("West")
West5 = West4.set_index("Seed").drop(index=Round_Robin(West4,1)).reset_index()
print("Midwest")
Midwest5 = Midwest4.set_index("Seed").drop(index=Round_Robin(Midwest4,1)).reset_index()

East
Tennessee 4 vs Marquette 2 :
0.5162710896932043
Tennessee Advances
South
Alabama 1 vs Arizona 2 :
1.2741888189730806
Alabama Advances
West
Connecticut 4 vs UCLA 2 :
0.7106280910758049
Connecticut Advances
Midwest
Houston 1 vs Texas 2 :
0.693428086738554
Houston Advances


## Final Four

In [18]:
#Printing the final four bracket
print(South5)
print(East5)
print(Midwest5)
print(West5)

   Seed     Team Region   AdjO  AdjD  Reb/Game
0     1  Alabama  South  115.5  88.2      29.6
   Seed       Team Region   AdjO  AdjD  Reb/Game
0     4  Tennessee   East  111.3  87.5      24.0
   Seed     Team   Region   AdjO  AdjD  Reb/Game
0     1  Houston  Midwest  118.1  90.4      24.4
   Seed         Team Region   AdjO  AdjD  Reb/Game
0     4  Connecticut   West  120.8  90.9      24.9


In [19]:
#Creating a new data frame for the final four
final_four_df = pd.concat([South5,East5,Midwest5,West5])
final_four_df = final_four_df.reset_index()
final_four_df = final_four_df.drop(columns="index")
final_four_df

Unnamed: 0,Seed,Team,Region,AdjO,AdjD,Reb/Game
0,1,Alabama,South,115.5,88.2,29.6
1,4,Tennessee,East,111.3,87.5,24.0
2,1,Houston,Midwest,118.1,90.4,24.4
3,4,Connecticut,West,120.8,90.9,24.9


## Create Final Four Function
### Pulling from a new data frame 
### New function is needed because the Round Robin function above uses seeds as the values to be dropped in the data frames and there are duplicate seeds in the Final Four

In [20]:
#Creating the function to calculate the winner in the final four
def final_4_winner(higher_seed, lower_seed, Message):
    win = (coef+(ADJOE_higher* higher_seed[3])+(ADJDE_higher * higher_seed[4])+(DRB_higher* higher_seed[5])+
    (ADJOE_lower * lower_seed[3])+(ADJDE_lower * lower_seed[4])+(DRB_lower * lower_seed[5]))
    print(higher_seed[1], higher_seed[0],"vs",lower_seed[1], lower_seed[0],":")
    print(win)
    if win >= 0:
        value = 1
    else:
        value = 0
    if value == 1 :
        team_dropped = lower_seed[2]
        print(higher_seed[1], Message)
    elif value == 0:
        team_dropped = higher_seed[2]
        print(lower_seed[1], Message)
    return(team_dropped)
#Creating the function to Drop the losers of Each matchup
def final_four(Region, Message):
    Region_Dropped = []
    j = 1
    h = 0
    for i in range(2):
        Region_Dropped.append(final_4_winner(Region.iloc[h,:],Region.iloc[j,:],Message))
    
        h+=2
        j+=2
    
    return(Region_Dropped)
final_4_Dropped = final_four(final_four_df, "Advances")
final_four_df3 = final_four_df.set_index("Seed")


final_two = final_four_df[final_four_df.Region.isin(final_4_Dropped) == False].reset_index().drop(columns="index")
final_two

Alabama 1 vs Tennessee 4 :
0.8313273037237585
Alabama Advances
Houston 1 vs Connecticut 4 :
-0.2633470384619563
Connecticut Advances


Unnamed: 0,Seed,Team,Region,AdjO,AdjD,Reb/Game
0,1,Alabama,South,115.5,88.2,29.6
1,4,Connecticut,West,120.8,90.9,24.9


# Championship

In [21]:
#Creating the function for the Championship game to declare the Winner
def Championship(Region, Message):
    Region_Dropped = []
    j = 1
    h = 0
    for i in range(1):
        Region_Dropped.append(final_4_winner(Region.iloc[h,:],Region.iloc[j,:],Message))
    
        h+=2
        j+=2
    
    return(Region_Dropped)
Champion_Dropped = Championship(final_two, "has won the 2023 National Championship")
Winner = final_two[final_two.Region.isin(Champion_Dropped) == False]
Winner

Alabama 1 vs Connecticut 4 :
0.13110902087117582
Alabama has won the 2023 National Championship


Unnamed: 0,Seed,Team,Region,AdjO,AdjD,Reb/Game
0,1,Alabama,South,115.5,88.2,29.6


# References

2023 Data:

https://kenpom.com/

2013-2019 Data:

https://www.kaggle.com/datasets/jedbell/fullseasonstats-1319

2023 Rebounding Data:

https://www.teamrankings.com/ncaa-basketball/stat/total-rebounds-per-game

Idea: 

https://www.analytics8.com/blog/how-to-use-machine-learning-to-predict-ncaa-march-madness/
