# IMPORTS

In [54]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import Imputer
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import KFold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import Imputer
from sklearn.neural_network import MLPRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.neighbors import RadiusNeighborsRegressor
from sklearn.linear_model import BayesianRidge

# DATA PREP

In [55]:
GameResults = pd.read_csv('data/NCAATourneyCompactResults.csv')
teamAvgStats = pd.read_csv('data/team_summary_stats.csv')


GameResults = GameResults.drop(['NumOT', 'WLoc', 'DayNum'], axis = 1)
teamAvgStats = teamAvgStats.drop(teamAvgStats.columns[0], axis=1)

old_names = ['TeamID', 'FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR', 'Ast', 'TO', 'Stl' , 'Blk', 'PF']
new_names = ['WTeamID', '1FGM', '1FGA', '1FGM3', '1FGA3', '1FTM', '1FTA', '1OR', '1DR', '1Ast', '1TO', '1Stl', '1Blk', '1PF']
WteamAvgStats = teamAvgStats.rename(columns=dict(zip(old_names, new_names)))

new_names = ['LTeamID', '2FGM', '2FGA', '2FGM3', '2FGA3', '2FTM', '2FTA', '2OR', '2DR', '2Ast', '2TO', '2Stl', '2Blk', '2PF']
LteamAvgStats = teamAvgStats.rename(columns=dict(zip(old_names, new_names)))

test1 = pd.merge(GameResults, WteamAvgStats, on=['Season', 'WTeamID'])

MainDf = pd.merge(test1 , LteamAvgStats, on=['Season', 'LTeamID'])

##test = test.merge(GameResults, LteamAvgStats, on=['Season', 'LTeamID'])
                                    
MainDf
##GameResults = GameResults[GameResults['Season'] != 2018]

Train = MainDf[MainDf['Season'] != 2018]

Test = MainDf[MainDf['Season'] == 2018]


Train.head()

Unnamed: 0,Season,WTeamID,WScore,LTeamID,LScore,1FGM,1FGA,1FGM3,1FGA3,1FTM,...,2FGA3,2FTM,2FTA,2OR,2DR,2Ast,2TO,2Stl,2Blk,2PF
0,2003,1421,92,1411,84,27.5,65.5,8.0,25.0,13.5,...,31.0,14.0,31.0,17.0,28.0,16.0,15.0,5.0,0.0,22.0
1,2003,1112,80,1436,51,31.0,67.75,7.75,20.75,15.0,...,16.0,7.0,7.0,8.0,26.0,12.0,17.0,10.0,3.0,15.0
2,2003,1112,96,1211,95,31.0,67.75,7.75,20.75,15.0,...,22.0,22.0,29.0,12.0,27.5,16.0,11.5,3.5,3.5,20.0
3,2003,1112,88,1323,71,31.0,67.75,7.75,20.75,15.0,...,20.0,12.333333,16.0,8.333333,30.666667,11.333333,16.666667,5.666667,4.666667,16.0
4,2003,1113,84,1272,71,29.5,64.0,5.0,14.5,16.0,...,28.0,14.0,21.0,20.0,22.0,11.0,12.0,2.0,5.0,18.0


In [59]:
WTeamTrainFeatures = Train.drop(columns=['WScore','LScore']).values
WTeamTrainOutcome = Train.WScore.values
LTeamTrainFeatures = Train.drop(columns=['WScore','LScore']).values
LTeamTrainOutcome = Train.LScore.values
WTeamTestFeatures = Test.drop(columns=['WScore','LScore']).values
WTeamTestOutcome = Test.WScore.values
LTeamTestFeatures = Test.drop(columns=['WScore','LScore']).values
LTeamTestOutcome = Test.LScore.values

# REGRESSION FUNCTIONS

In [1]:
# KNeighbors Regression
def KnnFunc(train_features, train_outcome):
    """This function takes in a set of train features and outcomes and runs a Kneighbors Regression through a pipelines
    and grid search through multiple parameters of kneighbors such as number of neighors (1-40), weights, and algorithms. 
    Pipeline run with Imputer to fill in missing values and SelectKBest as the feature selection method. The function returns
    the fitted most optimal prediction model from the grid search conducted."""
    scaler = MinMaxScaler()
    imputer = Imputer()
    knn = KNeighborsRegressor()
    param_grid = {'kneighborsregressor__n_neighbors': range(1,40),
                 'kneighborsregressor__weights': ['uniform', 'distance'],
                 'kneighborsregressor__algorithm' :['kd_tree', 'ball_tree', 'brute']}
    pipe = make_pipeline(imputer, scaler, SelectKBest(), knn)
    grid_search = GridSearchCV(pipe, param_grid, scoring="neg_mean_absolute_error")
    grid_search.fit(train_features, train_outcome)

    return grid_search


# Decision Tree Regression
def DecisionTreeFunc(train_features, train_outcome):
    """This function takes in a set of train features and outcomes and runs a Decision Tree Regression through a pipelines
    and grid search through max features parameter from 1-10. Pipeline run with Imputer to fill in missing values and
    SelectKBest as the feature selection method. The function returns the fitted most optimal prediction model from the
    grid search conducted."""
    tree = DecisionTreeRegressor()
    imputer = Imputer()
    param_grid = {'decisiontreeregressor__max_features': range(1,28)}
    pipe = make_pipeline(imputer, SelectKBest(), tree)
    grid = GridSearchCV(pipe, param_grid, scoring="neg_mean_absolute_error")
    grid.fit(train_features, train_outcome)
    return grid

# Neural Network Regression
def NeuralNetworkFunc(train_features, train_outcome):
    """This function takes in a set of train features and outcomes and runs a MLP Neural Network Regression through a pipelines
    and grid search of multiple variations. MLP Regression is done with a MinMaxScaler to scale data. Pipeline run with Imputer
    to fill in missing values and SelectKBest as the feature selection method. The function returns the fitted most optimal 
    prediction model from the grid search conducted."""
    clf = MLPRegressor()
    imputer = Imputer()
    param_grid = {}
    pipe = make_pipeline(imputer, MinMaxScaler(), SelectKBest(), clf)
    grid = GridSearchCV(pipe, param_grid, scoring="neg_mean_absolute_error")
    grid.fit(train_features, train_outcome)
    return grid

# Bayesian Ridge Regression
def BayesianRidgeFunc(train_features, train_outcome):
    """This function takes in a set of train features and outcomes and runs a Bayesian Ridge Regression through a pipelines
    and grid search of multiple variations. Pipeline run with Imputer to fill in missing values and SelectKBest as the feature 
    selection method. The function returns the fitted most optimal prediction model from the grid search conducted."""
    clf = BayesianRidge()
    imputer = Imputer()
    param_grid = {}
    pipe = make_pipeline(imputer, MinMaxScaler(),SelectKBest(), clf)
    grid = GridSearchCV(pipe, param_grid, scoring="neg_mean_absolute_error")
    grid.fit(train_features, train_outcome)
    return grid

# RUNNING MODELS

In [60]:
games = len(Test)

WModelPred = KnnFunc(WTeamTrainFeatures, WTeamTrainOutcome).predict(WTeamTestFeatures)
LModelPred = KnnFunc(LTeamTrainFeatures, LTeamTrainOutcome).predict(LTeamTestFeatures)

data = {'WinScorePred' : WModelPred, 'LoseScorePred': LModelPred}

results = pd.DataFrame(data=data)



NameError: name 'WinScorePred' is not defined

In [61]:
gamesRight = len(results[results['WinScorePred'] > results['LoseScorePred']])
gamesRight/games


1.0

# MISC

In [15]:
WTeamResults = GameResults.drop(columns=['LTeamID', 'LScore', 'LFGM', 'LFGA', 'LFGM3' , 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF', 'WLoc' ])

WTeamResults = WTeamResults.rename(columns=dict(zip(old_names, new_names))
LTeamResults = GameResults.drop(columns=['WTeamID', 'WScore', 'WFGM', 'WFGA', 'WFGM3' , 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF', 'WLoc' ])

GameResults

SyntaxError: invalid syntax (<ipython-input-15-bafe656dabf8>, line 3)

In [None]:
resultsWin = pd.read_csv('data/RegularSeasonDetailedResults.csv')
teams = pd.read_csv('data/Teams.csv')

# Get only winning teams game stats to predict their score
resultsWin = resultsWin.drop(['WTeamID', 'LTeamID', 'WLoc'], axis=1)

def neuralNetwork(results) :
    train_features, test_features, train_outcome, test_outcome = train_test_split(
        results.drop("WScore", axis=1),
        results.WScore,
        test_size=0.30, 
        random_state=11
    )
    scaler = MinMaxScaler()
    mlp_reg = MLPClassifier()

    imputer = Imputer()
    selector = SelectPercentile()
    threshold = VarianceThreshold(.1)
    pipe = make_pipeline(imputer, threshold, selector, scaler, mlp_reg)

    param_grid = {
        'selectpercentile__percentile':range(10, 30, 5)
        }

    crossVal = KFold()
    grid = GridSearchCV(pipe, param_grid, cv = crossVal, scoring="neg_mean_absolute_error")
    grid.fit(train_features, train_outcome)
    grid.score(test_features, test_outcome)

    score = grid.score(test_features, test_outcome)

    predictedValues = grid.predict(test_features)

    return [score, predictedValues, grid, test_outcome]