In [None]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import Imputer
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import KFold

resultsWin = pd.read_csv('data/RegularSeasonDetailedResults.csv')
teams = pd.read_csv('data/Teams.csv')

# Get only winning teams game stats to predict their score
resultsWin = resultsWin.drop(['WTeamID', 'LTeamID', 'WLoc'], axis=1)

def neuralNetwork(results) :
    train_features, test_features, train_outcome, test_outcome = train_test_split(
        results.drop("WScore", axis=1),
        results.WScore,
        test_size=0.30, 
        random_state=11
    )
    scaler = MinMaxScaler()
    mlp_reg = MLPClassifier()

    imputer = Imputer()
    selector = SelectPercentile()
    threshold = VarianceThreshold(.1)
    pipe = make_pipeline(imputer, threshold, selector, scaler, mlp_reg)

    param_grid = {
        'selectpercentile__percentile':range(10, 30, 5)
        }

    crossVal = KFold()
    grid = GridSearchCV(pipe, param_grid, cv = crossVal, scoring="neg_mean_absolute_error")
    grid.fit(train_features, train_outcome)
    grid.score(test_features, test_outcome)

    score = grid.score(test_features, test_outcome)

    predictedValues = grid.predict(test_features)

    return [score, predictedValues, grid, test_outcome]

neural = neuralNetwork(resultsWin)

# How many points off were we from predicting the winning score?
print(neural[0])

# This can be exactly replicated for the other team using LScore for losing score
# Then need to randomize which teams get which grid and we run the grid for each playoff game invididually
# in their own csv. Take the two scores, see who won, move them manually to the next round.



In [None]:
resultsWin.head()

I'm not quite done withe the following but it is a way to predict the outcomes of the tournament games as they contunue. The games each round can be based off of the predicted outcome of the last round.

In [40]:
def next_round(teams_df):
    '''Takes a dataframe with two columns: School id, and ranking, none of 
    these teams should have been eliminated. It returns the next matchups for these 
    teams based on their ranking'''
    arar = np.char.array(['01','16','08','09','05','12','04','13','06','11','03','14','07','10','02','15'])
    arr = np.append(arar, arar)
    first_round_bracket = np.char.array(['W', 'X', 'Y', 'Z']).repeat(16) + np.append(arr, arr)
    if(len(teams_df) < 64):
        won_ranks = teams_df['Seed'].values
        first_round_bracket = np.array([x for x in first_round_bracket if x in won_ranks])
    #print(first_round_bracket)
    picks1 = first_round_bracket[np.arange(1, len(teams_df), 2)]
    picks2 = first_round_bracket[np.arange(0, len(teams_df), 2)]
    #print(picks1)
    #print(picks2)
    teams_df = teams_df.set_index('Seed')
    teams_next = pd.DataFrame()
    teams_next['team1'] = teams_df.loc[picks1, 'TeamID'].values
    teams_next['team2'] = teams_df.loc[picks2, 'TeamID'].values
    teams_next['rank1'] = picks1
    teams_next['rank2'] = picks2
    return teams_next

def find_winners(nx):
    '''Takes a dataframe with teamsids and ranks from matches and keeps only the 
    winners'''
    nx['score'] = (nx['team1'] - nx['team2'])
    nx['TeamID'] = nx.loc[:, 'team2']
    nx['Seed'] = nx.loc[:, 'rank2']
    
    condition = nx['score'] > 0
    #this can be replaced with an ouput from a neural net to predict winners
    underdogs = nx.loc[condition,['rank1', 'team1']]
    underdogs.columns = ['Seed', 'TeamID']
    nx.update(underdogs)
    return(nx)#.loc[:, ['Seed', 'TeamID']])

def first_four(teams_df, games_record):
    pregames = teams_df.loc[teams_df['Seed'].str.contains('a|b'),:]
    teams_df = teams_df.loc[~teams_df['Seed'].str.contains('a|b'),:]
    #features = pd.merge(games, team_summary_stats, how='left', left_on=['team1'], right_on=['TeamID'])
    #features = pd.merge(games, team_summary_stats, how='left', left_on=['team2'], right_on=['TeamID'], suffixes=('', '_t2'))
    teams_next = pd.DataFrame()
    teams_next['team1'] = pregames.iloc[np.arange(1, len(pregames), 2), 1].values
    teams_next['team2'] = pregames.iloc[np.arange(0, len(pregames), 2), 1].values
    teams_next['rank1'] = pregames.iloc[np.arange(1, len(pregames), 2), 0].values
    teams_next['rank2'] = pregames.iloc[np.arange(0, len(pregames), 2), 0].values
    to_begin = find_winners(teams_next.copy())
    to_begin['round'] = 0
    if len(games_record) < 1:
        games_record = to_begin.copy()#.loc[:,:]
    else:
        games_record = games_record.append(to_begin.copy(), ignore_index=True)
    to_begin['Seed'] = [x[0:-1] for x in to_begin['Seed'].values]
    teams_df = teams_df.append(to_begin.loc[:, ['Seed', 'TeamID']])
    #print(to_begin.loc[:, ['Seed', 'TeamID']])
    return([teams_df, games_record])

games_record = pd.DataFrame() # records all matches
teams_df = pd.read_csv('data/NCAATourneySeeds.csv')
teams_df = teams_df.loc[teams_df.Season == 2003, ['Seed', 'TeamID']]
teams_df, games_record = first_four(teams_df, games_record)
#print(teams_df)
round = 1 

#runs until only 1 team remains
while len(teams_df) > 1:
    games = next_round(teams_df)
    features = pd.merge(games, team_summary_stats, how='left', left_on=['team1'], right_on=['TeamID'])
    features = pd.merge(features, team_summary_stats, how='left', left_on=['team2'], right_on=['TeamID'], suffixes=('', '_t2'))
    print(features.head(2))
    games['round'] = round
    teams_df = find_winners(games)
    if len(games_record) < 1:
        games_record = games#.loc[:,:]
    else:
        games_record = games_record.append(games, ignore_index=True)
    round = round + 1
#print(games_record)

   team1  team2 rank1 rank2  Season  TeamID   FGM        FGA      FGM3  \
0   1354   1328   W16   W01    2003    1354  24.8  54.366667  5.166667   
1   1354   1328   W16   W01    2003    1354  24.8  54.366667  5.166667   

        FGA3    ...        FGA3_t2     FTM_t2     FTA_t2      OR_t2  \
0  15.466667    ...      18.966667  13.166667  18.600000  12.133333   
1  15.466667    ...      19.965517  13.655172  20.551724  13.275862   

       DR_t2     Ast_t2      TO_t2    Stl_t2    Blk_t2      PF_t2  
0  24.966667  14.166667  11.800000  6.933333  3.766667  18.600000  
1  20.896552  12.379310  12.965517  8.689655  5.034483  20.103448  

[2 rows x 34 columns]
    team1   team2 rank1 rank2  Season  TeamID        FGM        FGA      FGM3  \
0  1301.0  1354.0   W09   W16    2003    1301  24.333333  53.333333  7.966667   
1  1301.0  1354.0   W09   W16    2003    1301  24.333333  53.333333  7.966667   

   FGA3    ...        FGA3_t2     FTM_t2     FTA_t2      OR_t2      DR_t2  \
0  22.5    ... 

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


    team1   team2 rank1 rank2  Season  TeamID        FGM        FGA      FGM3  \
0  1458.0  1462.0   Y05   X03    2003    1458  25.034483  53.896552  6.241379   
1  1458.0  1462.0   Y05   X03    2003    1458  25.034483  53.896552  6.241379   

       FGA3    ...        FGA3_t2     FTM_t2     FTA_t2      OR_t2     DR_t2  \
0  17.37931    ...      17.766667  18.200000  24.366667  13.933333  25.50000   
1  17.37931    ...      20.575758  13.727273  20.212121  11.787879  23.69697   

      Ast_t2      TO_t2    Stl_t2    Blk_t2      PF_t2  
0  16.400000  13.033333  5.466667  3.033333  15.766667  
1  13.484848  12.545455  7.030303  2.242424  16.636364  

[2 rows x 34 columns]


In [6]:
regular_season_results = pd.read_csv('data/RegularSeasonDetailedResults.csv')
post_season_outcomes = pd.read_csv('data/NCAATourneyDetailedResults.csv')

Creating a set of regular season features

In [39]:
winners = regular_season_results.loc[:,['Season', 'WTeamID', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 
                                         'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF']]
losers = regular_season_results.loc[:,['Season', 'LTeamID', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3',
                                      'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF']]
winners.columns = ['Season', 'TeamID', 'FGM', 'FGA', 'FGM3', 'FGA3',
                                      'FTM', 'FTA', 'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF']
losers.columns = ['Season', 'TeamID', 'FGM', 'FGA', 'FGM3', 'FGA3',
                                      'FTM', 'FTA', 'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF']
all_teams = winners.copy()
all_teams = all_teams.append(losers.copy(), ignore_index=True)
team_summary_stats = all_teams.groupby(['Season', 'TeamID'], as_index=False).mean()
print(team_summary_stats.head())
team_summary_stats.to_csv('data/team_summary_stats.csv')

   Season  TeamID        FGM        FGA      FGM3       FGA3        FTM  \
0    2003    1102  19.142857  39.785714  7.821429  20.821429  11.142857   
1    2003    1103  27.148148  55.851852  5.444444  16.074074  19.037037   
2    2003    1104  24.035714  57.178571  6.357143  19.857143  14.857143   
3    2003    1105  24.384615  61.615385  7.576923  20.769231  15.423077   
4    2003    1106  23.428571  55.285714  6.107143  17.642857  10.642857   

         FTA         OR         DR        Ast         TO       Stl       Blk  \
0  17.107143   4.178571  16.821429  13.000000  11.428571  5.964286  1.785714   
1  25.851852   9.777778  19.925926  15.222222  12.629630  7.259259  2.333333   
2  20.928571  13.571429  23.928571  12.107143  13.285714  6.607143  3.785714   
3  21.846154  13.500000  23.115385  14.538462  18.653846  9.307692  2.076923   
4  16.464286  12.285714  23.857143  11.678571  17.035714  8.357143  3.142857   

          PF  
0  18.750000  
1  19.851852  
2  18.035714  
3  20.23

Creating a set of post season outcomes to test with

In [8]:
post_season_outcomes.head(2)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,134,1421,92,1411,84,N,1,32,69,...,31,14,31,17,28,16,15,5,0,22
1,2003,136,1112,80,1436,51,N,0,31,66,...,16,7,7,8,26,12,17,10,3,15


In [55]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
post_season_outcomes['score_dif'] = post_season_outcomes.WScore - post_season_outcomes.LScore
outcome = post_season_outcomes.loc[:,['Season', 'WTeamID', 'LTeamID', 'score_dif']]
mixing_matrix = np.random.choice([True, False], len(outcome))
mixed_outcome = outcome.copy()
print(mixed_outcome.head())
mixed_outcome.loc[mixing_matrix, ['WTeamID', 'LTeamID']] = mixed_outcome.loc[mixing_matrix, ['LTeamID', 'WTeamID']].values 
mixed_outcome.loc[mixing_matrix, ['score_dif']] = mixed_outcome.loc[mixing_matrix, ['score_dif']].mul(-1)
print(mixed_outcome.head())
features = pd.merge(mixed_outcome, team_summary_stats, how='left', left_on=['WTeamID', 'Season'], right_on=['TeamID', 'Season'])
#features = pd.merge(features, team_summary_stats, how='left', left_on=['LTeamID'], right_on=['TeamID'], suffixes=('', '_t2'))
features.to_csv('data/mixed_prepared_features.csv')
print(features.head(2))

   Season  WTeamID  LTeamID  score_dif
0    2003     1421     1411          8
1    2003     1112     1436         29
2    2003     1113     1272         13
3    2003     1141     1166          6
4    2003     1143     1301          2
   Season  WTeamID  LTeamID  score_dif
0    2003     1421     1411          8
1    2003     1436     1112        -29
2    2003     1272     1113        -13
3    2003     1166     1141         -6
4    2003     1301     1143         -2
   Season  WTeamID  LTeamID  score_dif  TeamID        FGM        FGA  \
0    2003     1421     1411          8    1421  24.379310  56.793103   
1    2003     1436     1112        -29    1436  24.827586  55.862069   

       FGM3       FGA3        FTM        FTA         OR         DR        Ast  \
0  6.482759  18.000000  15.965517  20.931034  12.275862  23.172414  13.034483   
1  5.275862  15.482759  12.862069  19.551724  12.965517  25.724138  14.206897   

          TO       Stl       Blk         PF  
0  16.206897  7.068966  3

### A possible model for predicting games based on regular season summary statistics

In [60]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import Imputer
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
import warnings
#warnings.filterwarnings('ignore') # disables warnings
warnings.filterwarnings('default')
#features.score_dif = 1*(features.score_dif > 0)
test_year = features.loc[features.Season == 2018, :]
test_features = features.loc[features.Season != 2018, :]
scaler = MinMaxScaler()
mlp_reg = MLPRegressor()

param_grid = {'mlpregressor__activation' : ['identity', 'logistic', 'tanh', 'relu']}
print(test_year.score_dif)
pipe = make_pipeline(scaler, mlp_reg)
grid = GridSearchCV(pipe, param_grid)
grid.fit(test_features.drop(columns=['score_dif', 'Season']), test_features.score_dif)
print(grid.score(test_year.drop(columns=['score_dif', 'Season']), test_year.score_dif))

results = pd.DataFrame()
results['real'] = test_year.score_dif
results['predicted'] = grid.predict(test_year.drop(columns=['score_dif', 'Season']))

results[results['real'] * results['predicted'] > 0]

981     10
982      7
983     -4
984    -18
985      3
986    -21
987     22
988    -15
989     -4
990     -2
991     16
992      5
993     -2
994     14
995     -8
996     -5
997    -11
998     26
999     10
1000    26
1001    -4
1002    17
1003   -15
1004    11
1005    13
1006    10
1007    -6
1008    -4
1009     4
1010   -18
        ..
1018     6
1019    -4
1020    20
1021     1
1022     1
1023    -3
1024    23
1025    31
1026    -5
1027    -7
1028    -2
1029    -3
1030     2
1031    21
1032    23
1033    15
1034    -3
1035    -1
1036   -27
1037     4
1038    -4
1039    13
1040    12
1041   -16
1042    -4
1043    -4
1044   -12
1045    12
1046   -16
1047    17
Name: score_dif, Length: 67, dtype: int64


  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)


0.12314648500759062




Unnamed: 0,real,predicted
983,-4,-2.042864
984,-18,-8.608527
986,-21,-0.895164
987,22,9.389066
988,-15,-2.793003
989,-4,-1.816401
990,-2,-0.671544
991,16,4.924961
994,14,1.363233
995,-8,-2.006171


In [63]:
results['correct'] = results['real'] * results['predicted'] > 0
display(results)
len(results[results['real'] * results['predicted'] > 0]) / len(results)

Unnamed: 0,real,predicted,correct
981,10,-3.883313,False
982,7,-3.101839,False
983,-4,-2.042864,True
984,-18,-8.608527,True
985,3,-6.256976,False
986,-21,-0.895164,True
987,22,9.389066,True
988,-15,-2.793003,True
989,-4,-1.816401,True
990,-2,-0.671544,True


0.5970149253731343

In [2]:
print(features)

NameError: name 'features' is not defined