In [None]:
import numpy as np
import pandas as pd

from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import f_regression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import MinMaxScaler

# Load the per-game box scores and strip the team identifiers and the game
# location; every remaining column is used by the model that predicts the
# winning team's score.
resultsWin = (
    pd.read_csv('data/RegularSeasonDetailedResults.csv')
    .drop(['WTeamID', 'LTeamID', 'WLoc'], axis=1)
)
teams = pd.read_csv('data/Teams.csv')

def neuralNetwork(results, random_state=11):
    """Fit a grid-searched neural-network regressor that predicts ``WScore``.

    The frame is split 70/30 into train and test sets. A pipeline of
    imputation -> variance filter -> univariate feature selection ->
    min-max scaling -> MLP is tuned over the percentage of features kept,
    using K-fold cross-validation scored by negated mean absolute error.

    Parameters
    ----------
    results : pandas.DataFrame
        Must contain a ``WScore`` column (the target); every other column is
        used as a feature.
    random_state : int, default 11
        Seed for both the train/test split and the network's weight
        initialisation, so repeated runs give the same result.

    Returns
    -------
    list
        ``[score, predictedValues, grid, test_outcome]`` where ``score`` is
        the fitted grid's score on the held-out set (negated mean absolute
        error, so closer to zero is better), ``predictedValues`` are the
        predictions for the held-out features, ``grid`` is the fitted
        ``GridSearchCV`` and ``test_outcome`` holds the true held-out scores.
    """
    train_features, test_features, train_outcome, test_outcome = train_test_split(
        results.drop("WScore", axis=1),
        results.WScore,
        test_size=0.30,
        random_state=random_state
    )

    # The target is a numeric score judged by absolute error, so both the
    # feature scorer and the estimator are the regression variants; a
    # classifier would treat every distinct score as an unrelated class.
    pipe = make_pipeline(
        Imputer(),
        VarianceThreshold(.1),
        SelectPercentile(score_func=f_regression),
        MinMaxScaler(),
        MLPRegressor(random_state=random_state),
    )

    # Step names are generated by make_pipeline from the lower-cased class name.
    param_grid = {
        'selectpercentile__percentile': range(10, 30, 5)
    }

    grid = GridSearchCV(pipe, param_grid, cv=KFold(), scoring="neg_mean_absolute_error")
    grid.fit(train_features, train_outcome)

    # Score the held-out set once; the result is reused in the return value.
    score = grid.score(test_features, test_outcome)
    predictedValues = grid.predict(test_features)

    return [score, predictedValues, grid, test_outcome]

# Fit the grid-searched pipeline on the regular-season frame; the result is the
# list [score, predictedValues, grid, test_outcome].
neural = neuralNetwork(resultsWin)

# How many points off were we from predicting the winning score?
# (Element 0 is the held-out score under the grid's "neg_mean_absolute_error" metric.)
print(neural[0])

# This can be replicated exactly for the other team, using LScore (the losing score) as the target.
# We then need to randomize which team gets which grid, and run the grids for each playoff game individually,
# each in its own csv. Take the two predicted scores, see who won, and move the winner manually to the next round.



In [187]:
# Preview the first rows of the frame that is fed to neuralNetwork.
resultsWin.head()

Unnamed: 0,Season,DayNum,WScore,LScore,NumOT,WFGM,WFGA,WFGM3,WFGA3,WFTM,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,2003,10,68,62,0,27,58,3,14,11,...,10,16,22,10,22,8,18,9,2,20
1,2003,10,70,63,0,26,62,8,20,10,...,24,9,20,20,25,7,12,8,6,16
2,2003,11,73,61,0,24,58,8,18,17,...,26,14,23,31,22,9,12,2,5,23
3,2003,11,56,50,0,18,38,3,9,17,...,22,8,15,17,20,9,19,4,3,23
4,2003,11,77,71,0,30,61,6,14,11,...,16,17,27,21,15,12,10,7,1,14


I'm not quite done with the following, but it is a way to predict the outcomes of the tournament games as they continue. The games in each round can be based on the predicted outcomes of the previous round.

In [285]:
def next_round(teams_df):
    '''Pair the surviving teams into the next round's matchups.

    Parameters
    ----------
    teams_df : pandas.DataFrame
        Must contain a 'Seed' column (region letter W/X/Y/Z followed by a
        two-digit seed, e.g. 'W01') and a 'TeamID' column. Only teams that
        are still alive should be present; extra columns are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per game with columns 'team1', 'team2' (team ids) and
        'rank1', 'rank2' (their seeds). Walking the bracket in order, the
        first survivor of each pair is team2/rank2 and the second is
        team1/rank1.

    Raises
    ------
    ValueError
        If the seeds are duplicated, not part of the 64-slot bracket, or
        odd in number, since the field cannot then be paired off.
    '''
    # Seed numbers in bracket order: neighbours meet in the first round and
    # the winners of neighbouring games meet in the following one.
    seed_order = ['01', '16', '08', '09', '05', '12', '04', '13',
                  '06', '11', '03', '14', '07', '10', '02', '15']
    first_round_bracket = [region + seed
                           for region in ('W', 'X', 'Y', 'Z')
                           for seed in seed_order]

    # Keep only the slots whose team is still alive, preserving bracket
    # order. With all 64 teams present this is the full bracket.
    alive = set(teams_df['Seed'].values)
    remaining = [slot for slot in first_round_bracket if slot in alive]
    if len(remaining) != len(teams_df) or len(remaining) % 2 != 0:
        raise ValueError(
            'teams_df must hold an even number of distinct seeds taken from '
            'the 64-slot bracket; got %d rows, %d usable seeds'
            % (len(teams_df), len(remaining))
        )

    picks1 = remaining[1::2]
    picks2 = remaining[0::2]

    by_seed = teams_df.set_index('Seed')
    teams_next = pd.DataFrame(
        {
            'team1': by_seed.loc[picks1, 'TeamID'].values,
            'team2': by_seed.loc[picks2, 'TeamID'].values,
            'rank1': picks1,
            'rank2': picks2,
        },
        columns=['team1', 'team2', 'rank1', 'rank2'],
    )
    return teams_next

def find_winners(nx):
    '''Decide each matchup and record the winner on the same frame.

    Parameters
    ----------
    nx : pandas.DataFrame
        One row per game with columns 'team1', 'team2' (team ids) and
        'rank1', 'rank2' (their seeds). The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object as ``nx`` with three added columns: 'score'
        (team1 minus team2), and 'TeamID' / 'Seed' holding the winner's id
        and seed. team1 wins when 'score' is positive; otherwise team2 does.
    '''
    # Placeholder decision rule: this can be replaced with an output from a
    # neural net to predict winners.
    nx['score'] = nx['team1'] - nx['team2']
    team1_wins = nx['score'] > 0

    # Series.where keeps the team1 value where team1 won and falls back to
    # team2 elsewhere. Unlike DataFrame.update with a partial frame, it never
    # introduces NaN, so integer team ids are not upcast to float.
    nx['TeamID'] = nx['team1'].where(team1_wins, nx['team2'])
    nx['Seed'] = nx['rank1'].where(team1_wins, nx['rank2'])
    return nx

def first_four(teams_df, games_record):
    '''Play the play-in games and fold their winners into the main field.

    Parameters
    ----------
    teams_df : pandas.DataFrame
        Tournament field with 'Seed' and 'TeamID' columns. Play-in teams are
        the rows whose seed contains 'a' or 'b' (e.g. 'X16a' / 'X16b').
    games_record : pandas.DataFrame
        Record of games played so far; may be empty.

    Returns
    -------
    list
        ``[teams_df, games_record]``: the field with each play-in pair
        replaced by its winner (seed suffix stripped), and the games record
        with the play-in games added as round 0.
    '''
    is_play_in = teams_df['Seed'].str.contains('a|b')
    # Sort so the two teams sharing a slot ('...a' then '...b') are adjacent
    # regardless of the order of the incoming rows.
    pregames = teams_df.loc[is_play_in, :].sort_values('Seed')
    teams_df = teams_df.loc[~is_play_in, :]

    # Within each pair the 'b' row becomes team1 and the 'a' row team2.
    second_of_pair = np.arange(1, len(pregames), 2)
    first_of_pair = np.arange(0, len(pregames), 2)
    teams_next = pd.DataFrame()
    teams_next['team1'] = pregames['TeamID'].iloc[second_of_pair].values
    teams_next['team2'] = pregames['TeamID'].iloc[first_of_pair].values
    teams_next['rank1'] = pregames['Seed'].iloc[second_of_pair].values
    teams_next['rank2'] = pregames['Seed'].iloc[first_of_pair].values

    to_begin = find_winners(teams_next.copy())
    to_begin['round'] = 0

    # Record the games before the seed suffix is stripped, so the record
    # keeps the original play-in seeds.
    if len(games_record) < 1:
        games_record = to_begin.copy()
    else:
        games_record = pd.concat([games_record, to_begin.copy()], ignore_index=True)

    # 'X16a' -> 'X16': the winner takes the regular bracket slot.
    to_begin['Seed'] = [x[0:-1] for x in to_begin['Seed'].values]
    teams_df = pd.concat([teams_df, to_begin.loc[:, ['Seed', 'TeamID']]])
    print(to_begin.loc[:, ['Seed', 'TeamID']])
    return([teams_df, games_record])

# Simulate the 2003 tournament round by round, keeping a record of every game.
games_record = pd.DataFrame() # records all matches
teams_df = pd.read_csv('data/NCAATourneySeeds.csv')
teams_df = teams_df.loc[teams_df.Season == 2003, ['Seed', 'TeamID']]
teams_df, games_record = first_four(teams_df, games_record)
print(teams_df)
# NOTE(review): this name shadows the builtin round() for the rest of the
# notebook; kept as-is in case other cells read it.
round = 1

#runs until only 1 team remains
while len(teams_df) > 1:
    games = next_round(teams_df)
    # Attach both teams' season averages to each matchup. The second merge
    # builds on the first, so team1's stats keep their plain names and
    # team2's get the '_t2' suffix.
    # NOTE(review): team_summary_stats is created in a later cell of this
    # notebook and is keyed by (Season, TeamID); joining on TeamID alone
    # matches every season - confirm and restrict to the season simulated.
    features = pd.merge(games, team_summary_stats, how='left', left_on=['team1'], right_on=['TeamID'])
    features = pd.merge(features, team_summary_stats, how='left', left_on=['team2'], right_on=['TeamID'], suffixes=('', '_t2'))
    games['round'] = round
    # find_winners adds the winner columns to `games` in place and returns it,
    # so the winners frame doubles as the field for the next round.
    teams_df = find_winners(games)
    if len(games_record) < 1:
        games_record = games
    else:
        games_record = pd.concat([games_record, games], ignore_index=True)
    round = round + 1
print(games_record)

  Seed  TeamID
0  X16    1421
     Seed  TeamID
1154  W01    1328
1155  W02    1448
1156  W03    1393
1157  W04    1257
1158  W05    1280
1159  W06    1329
1160  W07    1386
1161  W08    1143
1162  W09    1301
1163  W10    1120
1164  W11    1335
1165  W12    1139
1166  W13    1122
1167  W14    1264
1168  W15    1190
1169  W16    1354
1170  X01    1400
1171  X02    1196
1172  X03    1462
1173  X04    1390
1174  X05    1163
1175  X06    1268
1176  X07    1277
1177  X08    1261
1178  X09    1345
1179  X10    1160
1180  X11    1423
1181  X12    1140
1182  X13    1360
1183  X14    1407
...   ...     ...
1190  Y04    1173
1191  Y05    1458
1192  Y06    1281
1193  Y07    1231
1194  Y08    1332
1195  Y09    1428
1196  Y10    1104
1197  Y11    1356
1198  Y12    1451
1199  Y13    1409
1200  Y14    1221
1201  Y15    1447
1202  Y16    1237
1203  Z01    1112
1204  Z02    1242
1205  Z03    1181
1206  Z04    1228
1207  Z05    1323
1208  Z06    1166
1209  Z07    1272
1210  Z08    1153
1211  Z09    121

In [193]:
# Reload the full per-game results, including the team id columns needed to
# aggregate statistics per team.
regular_season_results = pd.read_csv('data/RegularSeasonDetailedResults.csv')

In [251]:
# Box-score statistics recorded for each side of a game, without the W/L prefix.
box_score_stats = ['FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA',
                   'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF']
team_columns = ['Season', 'TeamID'] + box_score_stats

# Take the winner's and the loser's side of every game and give both the
# same prefix-free column names so they can be stacked.
winners = regular_season_results.loc[:, ['Season', 'WTeamID'] + ['W' + stat for stat in box_score_stats]]
losers = regular_season_results.loc[:, ['Season', 'LTeamID'] + ['L' + stat for stat in box_score_stats]]
winners.columns = team_columns
losers.columns = team_columns

# One row per team per game, then the per-season mean of every statistic.
all_teams = pd.concat([winners, losers], ignore_index=True)
team_summary_stats = all_teams.groupby(['Season', 'TeamID'], as_index=False).mean()
print(team_summary_stats.head())
team_summary_stats.to_csv('data/team_summary_stats.csv')

   Season  TeamID        FGM        FGA      FGM3       FGA3        FTM  \
0    2003    1102  19.142857  39.785714  7.821429  20.821429  11.142857   
1    2003    1103  27.148148  55.851852  5.444444  16.074074  19.037037   
2    2003    1104  24.035714  57.178571  6.357143  19.857143  14.857143   
3    2003    1105  24.384615  61.615385  7.576923  20.769231  15.423077   
4    2003    1106  23.428571  55.285714  6.107143  17.642857  10.642857   

         FTA         OR         DR        Ast         TO       Stl       Blk  \
0  17.107143   4.178571  16.821429  13.000000  11.428571  5.964286  1.785714   
1  25.851852   9.777778  19.925926  15.222222  12.629630  7.259259  2.333333   
2  20.928571  13.571429  23.928571  12.107143  13.285714  6.607143  3.785714   
3  21.846154  13.500000  23.115385  14.538462  18.653846  9.307692  2.076923   
4  16.464286  12.285714  23.857143  11.678571  17.035714  8.357143  3.142857   

          PF  
0  18.750000  
1  19.851852  
2  18.035714  
3  20.23