In [None]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import Imputer
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import KFold

resultsWin = pd.read_csv('data/RegularSeasonDetailedResults.csv')
teams = pd.read_csv('data/Teams.csv')

# Get only winning teams game stats to predict their score
resultsWin = resultsWin.drop(['WTeamID', 'LTeamID', 'WLoc'], axis=1)

def neuralNetwork(results) :
    train_features, test_features, train_outcome, test_outcome = train_test_split(
        results.drop("WScore", axis=1),
        results.WScore,
        test_size=0.30, 
        random_state=11
    )
    scaler = MinMaxScaler()
    mlp_reg = MLPClassifier()

    imputer = Imputer()
    selector = SelectPercentile()
    threshold = VarianceThreshold(.1)
    pipe = make_pipeline(imputer, threshold, selector, scaler, mlp_reg)

    param_grid = {
        'selectpercentile__percentile':range(10, 30, 5)
        }

    crossVal = KFold()
    grid = GridSearchCV(pipe, param_grid, cv = crossVal, scoring="neg_mean_absolute_error")
    grid.fit(train_features, train_outcome)
    grid.score(test_features, test_outcome)

    score = grid.score(test_features, test_outcome)

    predictedValues = grid.predict(test_features)

    return [score, predictedValues, grid, test_outcome]

neural = neuralNetwork(resultsWin)

# How many points off were we from predicting the winning score?
print(neural[0])

# This can be exactly replicated for the other team using LScore for losing score
# Then need to randomize which teams get which grid and we run the grid for each playoff game invididually
# in their own csv. Take the two scores, see who won, move them manually to the next round.



In [None]:
resultsWin.head()

I'm not quite done withe the following but it is a way to predict the outcomes of the tournament games as they contunue. The games each round can be based off of the predicted outcome of the last round.

In [96]:
def next_round(teams_df):
    '''Takes a dataframe with two columns: School id, and ranking, none of 
    these teams should have been eliminated. It returns the next matchups for these 
    teams based on their ranking'''
    arar = np.char.array(['01','16','08','09','05','12','04','13','06','11','03','14','07','10','02','15'])
    arr = np.append(arar, arar)
    first_round_bracket = np.char.array(['W', 'X', 'Y', 'Z']).repeat(16) + np.append(arr, arr)
    if(len(teams_df) < 64):
        won_ranks = teams_df['Seed'].values
        first_round_bracket = np.array([x for x in first_round_bracket if x in won_ranks])
    #print(first_round_bracket)
    picks1 = first_round_bracket[np.arange(1, len(teams_df), 2)]
    picks2 = first_round_bracket[np.arange(0, len(teams_df), 2)]
    #print(picks1)
    #print(picks2)
    teams_df = teams_df.set_index('Seed')
    teams_next = pd.DataFrame()
    teams_next['team1'] = teams_df.loc[picks1, 'TeamID'].values
    teams_next['team2'] = teams_df.loc[picks2, 'TeamID'].values
    teams_next['rank1'] = picks1
    teams_next['rank2'] = picks2
    return teams_next

def find_winners(nx):
    '''Takes a dataframe with teamsids and ranks from matches and keeps only the 
    winners'''
    nx['score'] = (nx['team1'] - nx['team2'])
    nx['TeamID'] = nx.loc[:, 'team2']
    nx['Seed'] = nx.loc[:, 'rank2']
    
    condition = nx['score'] > 0
    #this can be replaced with an ouput from a neural net to predict winners
    underdogs = nx.loc[condition,['rank1', 'team1']]
    underdogs.columns = ['Seed', 'TeamID']
    nx.update(underdogs)
    return(nx)#.loc[:, ['Seed', 'TeamID']])

def first_four(teams_df, games_record):
    pregames = teams_df.loc[teams_df['Seed'].str.contains('a|b'),:]
    teams_df = teams_df.loc[~teams_df['Seed'].str.contains('a|b'),:]
    #features = pd.merge(games, team_summary_stats, how='left', left_on=['team1'], right_on=['TeamID'])
    #features = pd.merge(games, team_summary_stats, how='left', left_on=['team2'], right_on=['TeamID'], suffixes=('', '_t2'))
    teams_next = pd.DataFrame()
    teams_next['team1'] = pregames.iloc[np.arange(1, len(pregames), 2), 1].values
    teams_next['team2'] = pregames.iloc[np.arange(0, len(pregames), 2), 1].values
    teams_next['rank1'] = pregames.iloc[np.arange(1, len(pregames), 2), 0].values
    teams_next['rank2'] = pregames.iloc[np.arange(0, len(pregames), 2), 0].values
    to_begin = find_winners(teams_next.copy())
    to_begin['round'] = 0
    if len(games_record) < 1:
        games_record = to_begin.copy()#.loc[:,:]
    else:
        games_record = games_record.append(to_begin.copy(), ignore_index=True)
    to_begin['Seed'] = [x[0:-1] for x in to_begin['Seed'].values]
    teams_df = teams_df.append(to_begin.loc[:, ['Seed', 'TeamID']])
    #print(to_begin.loc[:, ['Seed', 'TeamID']])
    return([teams_df, games_record])

games_record = pd.DataFrame() # records all matches
teams_df = pd.read_csv('data/NCAATourneySeeds.csv')
teams_df = teams_df.loc[teams_df.Season == 2003, ['Seed', 'TeamID']]
teams_df, games_record = first_four(teams_df, games_record)
#print(teams_df)
round = 1 

#runs until only 1 team remains
while len(teams_df) > 1:
    games = next_round(teams_df)
    features = pd.merge(games, team_summary_stats, how='left', left_on=['team1'], right_on=['TeamID'])
    features = pd.merge(features, team_summary_stats, how='left', left_on=['team2'], right_on=['TeamID'], suffixes=('', '_t2'))
    print(features.head(2))
    games['round'] = round
    teams_df = find_winners(games)
    if len(games_record) < 1:
        games_record = games#.loc[:,:]
    else:
        games_record = games_record.append(games, ignore_index=True)
    round = round + 1
#print(games_record)

   team1  team2 rank1 rank2  Season  TeamID   FGM        FGA      FGM3  \
0   1354   1328   W16   W01    2003    1354  24.8  54.366667  5.166667   
1   1354   1328   W16   W01    2003    1354  24.8  54.366667  5.166667   

        FGA3    ...        FGA3_t2     FTM_t2     FTA_t2      OR_t2  \
0  15.466667    ...      18.966667  13.166667  18.600000  12.133333   
1  15.466667    ...      19.965517  13.655172  20.551724  13.275862   

       DR_t2     Ast_t2      TO_t2    Stl_t2    Blk_t2      PF_t2  
0  24.966667  14.166667  11.800000  6.933333  3.766667  18.600000  
1  20.896552  12.379310  12.965517  8.689655  5.034483  20.103448  

[2 rows x 34 columns]
    team1   team2 rank1 rank2  Season  TeamID        FGM        FGA      FGM3  \
0  1301.0  1354.0   W09   W16    2003    1301  24.333333  53.333333  7.966667   
1  1301.0  1354.0   W09   W16    2003    1301  24.333333  53.333333  7.966667   

   FGA3    ...        FGA3_t2     FTM_t2     FTA_t2      OR_t2      DR_t2  \
0  22.5    ... 

In [100]:
regular_season_results = pd.read_csv('data/RegularSeasonDetailedResults.csv')
post_season_outcomes = pd.read_csv('data/NCAATourneyDetailedResults.csv')

Creating a set of regular season features

In [101]:
winners = regular_season_results.loc[:,['Season', 'WTeamID', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 
                                         'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF']]
losers = regular_season_results.loc[:,['Season', 'LTeamID', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3',
                                      'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF']]
winners.columns = ['Season', 'TeamID', 'FGM', 'FGA', 'FGM3', 'FGA3',
                                      'FTM', 'FTA', 'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF']
losers.columns = ['Season', 'TeamID', 'FGM', 'FGA', 'FGM3', 'FGA3',
                                      'FTM', 'FTA', 'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF']
all_teams = winners.copy()
all_teams = all_teams.append(losers.copy(), ignore_index=True)
team_summary_stats = all_teams.groupby(['Season', 'TeamID'], as_index=False).mean()
print(team_summary_stats.head())
team_summary_stats.to_csv('data/team_summary_stats.csv')

   Season  TeamID        FGM        FGA      FGM3       FGA3        FTM  \
0    2003    1102  19.142857  39.785714  7.821429  20.821429  11.142857   
1    2003    1103  27.148148  55.851852  5.444444  16.074074  19.037037   
2    2003    1104  24.035714  57.178571  6.357143  19.857143  14.857143   
3    2003    1105  24.384615  61.615385  7.576923  20.769231  15.423077   
4    2003    1106  23.428571  55.285714  6.107143  17.642857  10.642857   

         FTA         OR         DR        Ast         TO       Stl       Blk  \
0  17.107143   4.178571  16.821429  13.000000  11.428571  5.964286  1.785714   
1  25.851852   9.777778  19.925926  15.222222  12.629630  7.259259  2.333333   
2  20.928571  13.571429  23.928571  12.107143  13.285714  6.607143  3.785714   
3  21.846154  13.500000  23.115385  14.538462  18.653846  9.307692  2.076923   
4  16.464286  12.285714  23.857143  11.678571  17.035714  8.357143  3.142857   

          PF  
0  18.750000  
1  19.851852  
2  18.035714  
3  20.23

Creating a set of post season outcomes to test with

In [98]:
post_season_outcomes.head(2)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,134,1421,92,1411,84,N,1,32,69,...,31,14,31,17,28,16,15,5,0,22
1,2003,136,1112,80,1436,51,N,0,31,66,...,16,7,7,8,26,12,17,10,3,15


In [103]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler

post_season_outcomes['score_dif'] = post_season_outcomes.WScore - post_season_outcomes.LScore
#regular_season_results['score_dif'] = regular_season_results.WScore - regular_season_results.LScore
outcome = post_season_outcomes.loc[:,['Season', 'WTeamID', 'LTeamID', 'score_dif']]
#reg_outcome = regular_season_results.loc[:,['Season', 'WTeamID', 'LTeamID', 'score_dif']]
#all_season_outcome = outcome.append(reg_outcome)
#outcome = all_season_outcome
mixing_matrix = np.random.choice([True, False], len(outcome))
mixed_outcome = outcome.copy()
print(mixed_outcome.head())
mixed_outcome.loc[mixing_matrix, ['WTeamID', 'LTeamID']] = mixed_outcome.loc[mixing_matrix, ['LTeamID', 'WTeamID']].values 
mixed_outcome.loc[mixing_matrix, ['score_dif']] = mixed_outcome.loc[mixing_matrix, ['score_dif']].mul(-1)
print(mixed_outcome.head())
print(len(mixed_outcome))
features = pd.merge(mixed_outcome, team_summary_stats, left_on=['WTeamID', 'Season'], right_on=['TeamID', 'Season'])
print(len(features))
features = pd.merge(features, team_summary_stats, left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID'], suffixes=('', '_t2'))
print(len(features))
features.loc[:, 'FGM':'PF'] = features.loc[:, 'FGM':'PF'].values - features.loc[:, 'FGM_t2':'PF_t2'].values
features = features.loc[:, 'Season':'PF']
features.to_csv('data/mixed_prepared_features.csv')

   Season  WTeamID  LTeamID  score_dif
0    2003     1421     1411          8
1    2003     1112     1436         29
2    2003     1113     1272         13
3    2003     1141     1166          6
4    2003     1143     1301          2
   Season  WTeamID  LTeamID  score_dif
0    2003     1421     1411          8
1    2003     1112     1436         29
2    2003     1113     1272         13
3    2003     1141     1166          6
4    2003     1301     1143         -2
1048
1048
1048


### A possible model for predicting games based on regular season summary statistics

In [123]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import Imputer
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from ModelFunctions import DecisionTreeFunc, KnnFunc, BayesianRidgeFunc, NeuralNetworkFunc
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import BayesianRidge

import warnings
#warnings.filterwarnings('ignore') # disables warnings
warnings.filterwarnings('ignore')
features.score_dif = 1*(features.score_dif > 0)
test_year = features.loc[(features.Season == 2018), :]
test_features = features.loc[(features.Season != 2018), :]
test_features

Unnamed: 0,Season,WTeamID,LTeamID,score_dif,TeamID,FGM,FGA,FGM3,FGA3,FTM,FTA,OR,DR,Ast,TO,Stl,Blk,PF
0,2003,1421,1411,1,1421,-0.354023,1.526437,0.549425,-0.500000,-1.434483,-7.135632,-0.890805,-1.627586,-1.165517,0.973563,0.635632,0.766667,0.803448
1,2003,1421,1400,0,1421,-3.620690,-5.635468,0.625616,1.214286,-1.034483,-2.854680,-3.902709,-2.970443,-1.465517,2.778325,0.676108,-0.857143,-1.253695
2,2003,1163,1400,0,1163,1.533333,-0.228571,0.209524,-1.085714,-2.100000,-1.685714,-1.411905,1.757143,1.133333,2.371429,-0.459524,3.876190,-1.957143
3,2003,1393,1400,1,1393,1.241379,-0.221675,-0.615764,-0.923645,-0.620690,-0.165025,-1.868227,0.753695,0.465517,0.192118,1.917488,3.418719,-3.770936
4,2003,1112,1436,1,1112,5.493842,9.852217,1.759852,4.588670,4.673645,5.448276,2.213054,1.918719,3.435961,0.716749,1.602217,1.248768,1.853448
5,2003,1112,1211,1,1112,4.256912,10.262673,-0.125576,1.006912,-0.238479,0.354839,3.243088,2.320276,1.900922,0.237327,1.657834,0.698157,-0.895161
6,2003,1112,1323,1,1112,3.095622,5.036866,-1.254608,-1.702765,0.180876,2.161290,3.791475,0.771889,0.739631,2.011521,1.012673,-1.430876,1.524194
7,2003,1228,1323,0,1228,-0.125806,-4.877419,-0.856989,-1.607527,-3.554839,-3.738710,-1.653763,-1.470968,1.363441,0.392473,-0.451613,-2.545161,2.907527
8,2003,1112,1242,0,1112,0.088095,3.414286,2.235714,5.938095,1.469048,0.866667,0.878571,0.742857,0.909524,-0.114286,-1.669048,-0.685714,1.050000
9,2003,1113,1242,0,1113,-3.026437,-5.403448,-0.800000,-1.547126,1.485057,2.073563,-0.610345,-3.589655,-1.181609,-0.900000,-4.926437,-0.658621,2.713793


In [124]:
model = BayesianRidgeFunc(test_features.drop(columns=['score_dif', 'Season']), test_features.score_dif)

In [115]:
model.predict(test_year.drop(columns=['score_dif', 'Season']))

array([  0.46230747,  -4.15415824, -20.15933737,  -1.7814492 ,
       -11.98883159, -18.31810685,  -7.14713595,   8.53325036,
        -7.28875165,   1.70196684,  -3.6468232 , -10.68628214,
        -3.75976698,   4.62572459,  -3.58649442,   3.31961293,
         2.19539422,  -1.12889643,   1.53738177,  -3.95057716,
       -12.23240121,  10.14924885,  18.50124458,   8.19234944,
         8.95804411,   4.56767397,   5.09321574,  11.40875034,
        16.78939639,  12.51431826,  -2.24686853,  -4.91733152,
         5.22675096,  -5.17668751,   1.93311192,  -5.1980951 ,
        -9.74352296,   9.3773659 ,  -3.70600138,  -3.42174162,
         2.41671417,  -2.66504154, -12.53856219,  -6.84810137,
       -11.9612298 ,  -7.77988683,   5.03755395, -12.74030613,
        -3.39103684,   2.01585434,  -7.63300451,  10.1159821 ,
        10.31830484,   3.21946515,  -1.8466208 ,  -8.34787908,
        -5.34695196,  -8.50001097,   4.12074938,  -4.50439625,
        -1.89333034, -13.36781897,   0.73218647,  -5.97

In [128]:
scaler = StandardScaler()
mlp_reg = MLPClassifier()
print(features.head())
param_grid = {}#'mlpregressor__activation' : ['identity', 'logistic', 'tanh', 'relu']}
#print(test_year.score_dif)
pipe = make_pipeline(scaler, mlp_reg)
grid = GridSearchCV(pipe, param_grid)
grid.fit(test_features.drop(columns=['score_dif', 'Season']), test_features.score_dif)
#print(grid.score(test_year.drop(columns=['score_dif', 'Season']), test_year.score_dif))

   Season  WTeamID  LTeamID  score_dif  TeamID       FGM       FGA      FGM3  \
0    2003     1421     1411          1    1421 -0.354023  1.526437  0.549425   
1    2003     1421     1400          0    1421 -3.620690 -5.635468  0.625616   
2    2003     1163     1400          0    1163  1.533333 -0.228571  0.209524   
3    2003     1393     1400          1    1393  1.241379 -0.221675 -0.615764   
4    2003     1112     1436          1    1112  5.493842  9.852217  1.759852   

       FGA3       FTM       FTA        OR        DR       Ast        TO  \
0 -0.500000 -1.434483 -7.135632 -0.890805 -1.627586 -1.165517  0.973563   
1  1.214286 -1.034483 -2.854680 -3.902709 -2.970443 -1.465517  2.778325   
2 -1.085714 -2.100000 -1.685714 -1.411905  1.757143  1.133333  2.371429   
3 -0.923645 -0.620690 -0.165025 -1.868227  0.753695  0.465517  0.192118   
4  4.588670  4.673645  5.448276  2.213054  1.918719  3.435961  0.716749   

        Stl       Blk        PF  
0  0.635632  0.766667  0.803448  


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('mlpclassifier', MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
 ...=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [130]:
results = pd.DataFrame()
results['real'] = test_year.score_dif
results['predicted'] = grid.predict(test_year.drop(columns=['score_dif', 'Season']))
results['FGAdif'] = test_year.FGA

results[results['real'] * results['predicted'] > 0]
results['correct'] = results['real'] == results['predicted']
display(results)
len(results[results['real'] == results['predicted']]) / len(results)

Unnamed: 0,real,predicted,FGAdif,correct
914,0,1,5.437500,False
915,1,0,1.088235,False
916,0,0,1.058824,True
917,1,0,3.970588,False
918,1,0,-1.779167,False
919,0,0,-5.031250,True
920,0,0,-2.468750,True
921,1,1,5.409091,True
922,1,0,4.644385,False
923,0,0,4.683284,True


0.6119402985074627

In [46]:
print(features)

      Season  WTeamID  LTeamID  score_dif  TeamID       FGM        FGA  \
0       2003     1421     1411          8    1421 -0.354023   1.526437   
1       2003     1421     1400        -21    1421 -3.620690  -5.635468   
2       2003     1277     1400         -9    1277 -4.967742 -11.331797   
3       2003     1345     1400        -10    1345 -4.250000  -8.035714   
4       2003     1163     1400         -4    1163  1.533333  -0.228571   
5       2003     1436     1112        -29    1436 -5.493842  -9.852217   
6       2003     1242     1112          3    1242 -0.088095  -3.414286   
7       2003     1323     1112        -17    1323 -3.095622  -5.036866   
8       2003     1272     1113        -13    1272 -0.931034   3.103448   
9       2003     1242     1113         32    1242  3.026437   5.403448   
10      2003     1141     1166          6    1141 -2.076280  -4.764890   
11      2003     1143     1301          2    1143  3.011494   5.390805   
12      2003     1143     1328        