In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV

In [2]:
# Test on 2021 data
def make_game(team1_name, team2_name, team_data):
    
    team1 = team_data[team_data.TEAM == team1_name]
    team2 = team_data[team_data.TEAM == team2_name]
    game = team1.drop(['TEAM'], axis=1)
    game['Opp_Rank'] = team2['Rank'].tolist()

#     cols_1 = {}
#     for col in team1.columns:
#         cols_1[col] = '1-' + col

#     team1 = team1.rename(columns=cols_1)

    
#     team2 = team2.drop(['Team'], axis=1)

#     cols_2 = {}
#     for col in team2.columns:
#         cols_2[col] = '2-' + col

#     team2 = team2.rename(columns=cols_2)

#     return pd.concat([team1,team2], axis=1)

    return game

def predict_game(team1_name, team2_name, model, team_data):
    
    g = make_game(team1_name, team2_name, team_data)
    
    scaler = StandardScaler()
    X = scaler.fit_transform(g)
    prob = model.predict_proba(X)
    print('P(%s wins) = %.4f' %(team1_name, prob[0,1]))

In [3]:
df = pd.read_csv('./march_madness.csv')
df = df.drop(['Unnamed: 0'], axis=1)
print(df.shape)
df.head(5)

(600, 44)


Unnamed: 0,Team,G,W,ADJOE,ADJDE,BARTHAG,EFG_O,EFG_D,TOR,TORD,...,Opp_2P_O,Opp_2P_D,Opp_3P_O,Opp_3P_D,Opp_ADJ_T,Opp_WAB,Opp_SEED,Rank,Opp_Rank,Win
0,Duke,37,29,122.3,92.9,0.9592,55.9,46.4,17.5,17.3,...,46.6,45.1,31.7,31.8,64.0,-0.4,11.0,3,54,1
1,UCLA,36,31,122.8,99.6,0.9175,59.8,48.7,15.1,15.8,...,52.1,41.4,34.5,34.4,65.4,5.6,6.0,18,22,1
2,Purdue,37,30,123.2,95.7,0.9484,57.7,47.0,15.8,17.7,...,51.1,45.1,35.9,32.3,67.6,4.7,3.0,5,12,0
3,Dayton,37,26,113.0,99.1,0.8188,52.4,48.8,18.1,18.8,...,47.7,45.5,33.1,34.5,61.5,7.9,3.0,51,14,1
4,Harvard,29,19,104.9,99.9,0.6362,54.3,49.2,20.8,20.9,...,46.1,42.0,35.1,34.6,65.4,6.9,3.0,108,17,1


In [4]:
columns = ['Team', 'ADJOE', 'ADJDE', 'FTRD', 'ADJ_T', 'Rank', 'Opp_Rank','Win']
features = ['TEAM','ADJOE', 'ADJDE', 'FTRD', 'ADJ_T', 'Rank']
df = df[columns]
df.head(5)

Unnamed: 0,Team,ADJOE,ADJDE,FTRD,ADJ_T,Rank,Opp_Rank,Win
0,Duke,122.3,92.9,22.3,70.9,3,54,1
1,UCLA,122.8,99.6,25.4,74.6,18,22,1
2,Purdue,123.2,95.7,24.2,68.7,5,12,0
3,Dayton,113.0,99.1,43.0,65.4,51,14,1
4,Harvard,104.9,99.9,31.2,64.7,108,17,1


## Logistic Regression

In [5]:
scaler = StandardScaler()
X = scaler.fit_transform(df.drop(['Team','Win'], axis=1))
y = df.Win

In [6]:
cv = 5                     # 5-fold cross validation
Cs = 10**np.linspace(-5,5) # constants for cost function
n_CPU = 8                  # number of CPUs to use for CV

lgr = LogisticRegressionCV(Cs=Cs, cv=cv, n_jobs=n_CPU).fit(X,y)

In [20]:
round_of_64 = [
    ('Gonzaga',        'Norfolk St'),
    ('Oklahoma',       'Missouri'),
    ('Creighton',      'UC Santa Barbara'),
    ('Virginia',       'Ohio'),
    ('USC',            'Drake'),
    ('Kansas',         'Eastern Washington'),
    ('Oregon',         'VCU'),
    ('Iowa',           'Grand Canyon'),
    ('Michigan',       'Texas Southern'),
    ('LSU',            'St Bonaventure'),
    ('Colorado',       'Georgetown'),
    ('Florida St',     'UNC Greensboro'),
    ('BYU',            'UCLA'),
    ('Texas',          'Abilene Christian'),
    ('Connecticut',    'Maryland'),
    ('Alabama',        'Iona'),
    ('Baylor',         'Hartford'),
    ('North Carolina', 'Wisconsin'),
    ('Villanova',      'Winthrop'),
    ('Purdue',         'North Texas'),
    ('Texas Tech',     'Utah St'),
    ('Arkansas',       'Colgate'),
    ('Florida',        'Virginia Tech'),
    ('Ohio St',        'Oral Roberts'),
    ('Illinois',       'Drexel'),
    ('Loyola Chicago', 'Georgia Tech'),
    ('Tennessee',      'Oregon St'),
    ('Oklahoma St',    'Liberty'),
    ('San Diego St',   'Syracuse'),
    ('West Virginia',  'Morehead St'),
    ('Clemson',        'Rutgers'),
    ('Houston',        'Cleveland St')
]

In [21]:
ranks = pd.read_csv('google_cloud_ranks_21.csv')
teams = pd.read_csv('google_cloud_teams_21.csv')
team_data = pd.read_csv('./kaggle_cbb_2021.csv')
team_data.TEAM = team_data.TEAM.str.replace(r'[^\w\s]+', '')

POM = ranks[ranks.SystemName == 'POM']
POM = POM[POM.RankingDayNum == 133]
POM_21 = POM[POM.Season == 2021]
POM_21 = POM_21.drop(['RankingDayNum','SystemName'], axis=1)

POM_21 = \
    POM_21 \
    .merge(teams[['TeamName', 'TeamID']],
           left_on='TeamID',
           right_on='TeamID',
           validate='many_to_one') \
    .drop('TeamID', axis=1) \
    .drop('Season', axis=1)

team_data = team_data \
            .merge(POM_21,left_on='TEAM', right_on='TeamName', how='inner') \
            .drop(['TeamName'], axis=1) \
            .rename(columns={'OrdinalRank' : 'Rank'})
team_data = team_data[features]

i = 0
regions = ['West','East','South','Midwest']
for g,teams in enumerate(round_of_64):
    if g % 8 == 0:
        print('\n%s Region:' %(regions[i]))
        i += 1

    predict_game(teams[0],teams[1],lgr,team_data)


West Region:
P(Gonzaga wins) = 0.5364
P(Oklahoma wins) = 0.5364
P(Creighton wins) = 0.5364
P(Virginia wins) = 0.5364
P(USC wins) = 0.5364


  team_data.TEAM = team_data.TEAM.str.replace(r'[^\w\s]+', '')


ValueError: Length of values (0) does not match length of index (1)

In [22]:
for t in team_data.TEAM:
    if t.startswith('Kan'):
        print(t)
    if t.startswith('E'):
        print(t)

Kansas
East Carolina
Evansville
Kansas St
Elon


In [23]:
for t in POM_21.TeamName:
    if t.startswith('Kan'):
        print(t)
    if t.startswith('E'):
        print(t)

E Illinois
E Kentucky
E Michigan
E Washington
East Carolina
Elon
ETSU
Evansville
Kansas
Kansas St
