In [1]:
# import packages
import os
import tarfile
import urllib
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import learning_curve
from util.util import BetUtil


# Relative path of the CSV file that load_data() reads by default.
TEAM_PATH = './resources/scores.csv'

In [2]:
def load_data(path=TEAM_PATH):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file; defaults to ``TEAM_PATH``.

    Returns
    -------
    pandas.DataFrame
        The parsed file, with its first row used as the column names.
    """
    frame = pd.read_csv(path, header=0)
    return frame

In [3]:
def scatter_plot(df, label_column):
    """Draw one scatter plot per numeric feature against the label column.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding both the features and ``label_column``.
    label_column : str
        Column plotted on the y-axis of every panel; it is excluded from the
        x-axis features.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.  When ``df`` has no numeric
        column other than ``label_column`` nothing is drawn.
    """
    feature_cols = [
        col for col in df.select_dtypes(include=np.number).columns
        if col != label_column
    ]
    # Nothing to plot: plt.subplots() rejects a grid with zero rows, so return
    # before touching any plotting state.
    if not feature_cols:
        return

    sns.set(style="ticks")

    ncols = 3
    nrows = -(-len(feature_cols) // ncols)  # ceiling division
    # squeeze=False always yields a 2-D array of axes, so flatten() is safe
    # whatever the grid shape.
    _, axes = plt.subplots(
        nrows=nrows, ncols=ncols, figsize=(20, nrows * 7), squeeze=False
    )
    axes = axes.flatten()

    for axis, col in zip(axes, feature_cols):
        sns.scatterplot(x=df[col], y=df[label_column], ax=axis)
        axis.set_xlabel(col)
        axis.set_ylabel(label_column)

    # Hide the unused panels of the last row instead of showing empty frames.
    for axis in axes[len(feature_cols):]:
        axis.set_visible(False)

    plt.tight_layout()
    plt.show()

In [4]:
def visualze(scatter=True, corr=True):
    """Load the team data, optionally plot it and compute its correlations.

    Parameters
    ----------
    scatter : bool, optional
        When true, draw scatter plots of every numeric column against
        ``"TmScore"``.
    corr : bool, optional
        When true, compute the pairwise correlation matrix of the numeric
        columns.

    Returns
    -------
    pandas.DataFrame or None
        The correlation matrix when ``corr`` is true, otherwise ``None``.
    """
    teamData = load_data()

    if scatter:
        scatter_plot(teamData, "TmScore")
    if corr:
        # A bare expression inside a function is discarded, so the matrix is
        # returned to the caller.  Only numeric columns are correlated: the raw
        # frame also carries text columns (e.g. 'Court'), which
        # DataFrame.corr() refuses on pandas >= 2.0.
        return teamData.select_dtypes(include=np.number).corr()
    return None

In [5]:
# Load the game log and split it into the target column and the model inputs.
teamData = load_data(TEAM_PATH)
labels = teamData["TmScore"].copy()

# Drop the target plus the columns that are not used as model inputs.
features = (
    teamData
    .drop("TmScore", axis=1)
    .drop(columns='TeamName,W/L,Opp,G,Date,TEAM_FG,TEAM_FGA,TEAM_3P,TEAM_3PA,TEAM_FT,TEAM_FTA,OPP_FG,OPP_FGA,OPP_3P,OPP_3PA,OPP_FT,OPP_FTA'.split(','))
)

# Replace 'Court' by one indicator column per distinct value.
onehot = pd.get_dummies(features['Court'])
features = features.drop('Court', axis=1).join(onehot)

features.head()


Unnamed: 0,OppScore,TEAM_FG%,TEAM_3P%,TEAM_FT%,TEAM_ORB,TEAM_TRB,TEAM_AST,TEAM_STL,TEAM_BLK,TEAM_TOV,...,OPP_ORB,OPP_TRB,OPP_AST,OPP_STL,OPP_BLK,OPP_TOV,OPP_PF,@,H,N
0,56,0.404,0.286,0.55,4,32,12,9,5,15,...,7,40,9,6,1,21,21,0,1,0
1,77,0.385,0.444,0.833,9,24,12,11,0,20,...,10,33,7,11,3,19,17,1,0,0
2,46,0.603,0.294,0.773,15,44,22,15,5,18,...,4,17,7,9,2,27,18,0,1,0
3,77,0.431,0.318,0.4,6,26,15,11,3,23,...,1,24,18,12,5,18,14,0,0,1
4,77,0.491,0.529,0.545,2,22,20,7,3,15,...,8,29,10,4,0,18,13,0,0,1


In [6]:
# Numeric preprocessing: fill missing values with the column median, then
# standardise every column.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler())
])

# Split BEFORE fitting the pipeline so the imputer medians and the scaler
# statistics are learned from the training rows only (no test-set leakage).
train_features_raw, test_features_raw, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.2, random_state=42
)
train_features = num_pipeline.fit_transform(train_features_raw)
test_features = num_pipeline.transform(test_features_raw)

# Full scaled matrix, kept under its original name for any cell that still
# reads it; it is produced by the pipeline fitted on the training rows.
featuresScaled = num_pipeline.transform(features)

In [17]:
def plotLearningCurves(model, train_features, test_features, train_labels, test_labels):
    """Plot RMSE learning curves for ``model`` and hand the model back.

    The curves come from ``learning_curve`` run with 5-fold cross-validation
    on ``train_features`` / ``train_labels``.  ``test_features`` and
    ``test_labels`` are accepted but not used by this function.

    Returns
    -------
    The ``model`` object that was passed in.
    """
    sizes, fold_train_scores, fold_test_scores = learning_curve(
        model,
        train_features,
        train_labels,
        cv=5,
        scoring="neg_mean_squared_error",
    )

    # Scores are negated MSE: flip the sign, average over the folds, then take
    # the square root to obtain RMSE per training-set size.
    train_rmse = np.sqrt(np.mean(-fold_train_scores, axis=1))
    test_rmse = np.sqrt(np.mean(-fold_test_scores, axis=1))

    plt.plot(sizes, train_rmse, "r-+", linewidth=2, label="train")
    plt.plot(sizes, test_rmse, "b-", linewidth=3, label="test")
    plt.legend(loc="upper right", fontsize=14)
    plt.xlabel("Training set size", fontsize=14)
    plt.ylabel("RMSE", fontsize=14)
    plt.title("Learning Curves", fontsize=16)
    plt.show()
    return model

In [19]:
# Fit a 100-tree random forest on the training split (fixed seed).
rf_reg = RandomForestRegressor(random_state=42, n_estimators=100)
rf_reg.fit(X=train_features, y=train_labels)



In [9]:
# Predict the held-out split and report the spread of the prediction errors.
predictions = rf_reg.predict(test_features)
differences = np.subtract(np.array(predictions), np.array(test_labels))
std_dev = differences.std()

print("Standard deviation:", std_dev)

Standard deviation: 5.3745108995905175


In [16]:
# One hand-entered feature row (assumed to follow the column order of
# `features`).  Parse the text into floats up front instead of relying on the
# imputer to coerce strings.
data = [float(value) for value in '70	.389	.333	1.000	2	28	8	4	2	7	12	.500	.423	.625	3	28	13	2	3	6	1 1 0 0'.split()]
# Wrapping the row in a DataFrame with the training column names makes a wrong
# number of values fail here with a clear error, and lets the pipeline match
# the row against the column names it was fitted with.
dataScaled = num_pipeline.transform(pd.DataFrame([data], columns=features.columns))
rf_reg.predict(dataScaled)



array([58.63])

In [11]:
def predictScore(team1, team2):
    """Print the model's prediction for each of two feature rows.

    Each argument is one feature row.  It is wrapped in a list to form a
    one-row batch, passed through ``num_pipeline.transform`` and then given to
    ``rf_reg.predict``.  Nothing is returned; the predictions are printed.
    """
    scaled_first = num_pipeline.transform([team1])
    scaled_second = num_pipeline.transform([team2])

    first_prediction = rf_reg.predict(scaled_first)
    print(f'Team1: {first_prediction}')
    second_prediction = rf_reg.predict(scaled_second)
    print(f'Team2: {second_prediction}')

In [None]:
import warnings

# Hide UserWarning messages issued from modules matching "sklearn.base".
warnings.filterwarnings(
    action="ignore",
    category=UserWarning,
    module="sklearn.base",
)


In [None]:

# Raw stat lines for each side ("team") and for what it allows ("opponent").
msu = dict(
    team=[25.3, 57.3, 6.9, 18.5, 10.7, 14.5, 9.0, 36.6, 14.7, 4.9, 3.0, 11.1, 16.8, 68.1],
    opponent=[23.6, 57.6, 6.3, 21.4, 12.1, 17.0, 8.9, 33.1, 12.7, 5.9, 3.8, 9.7, 15.9, 65.6],
)

osu = dict(
    team=[28.0, 60.4, 6.7, 18.2, 12.5, 17.3, 11.8, 38.5, 11.9, 5.1, 3.7, 11.0, 16.2, 75.0],
    opponent=[24.7, 59.3, 6.8, 22.0, 11.9, 16.5, 10.1, 33.3, 12.0, 5.0, 2.8, 11.3, 16.5, 68.0],
)

# Hand-entered blends of the two stat lines above: after the leading value,
# each "team" list is the element-wise mean of one side's "team" numbers and
# the other side's "opponent" numbers (last entry of the raw lists excluded).
averagesMsu = {
    'team': [70.3, 25.0, 58.3, 6.85, 20.25, 11.3, 15.5, 9.55, 34.95, 13.35, 4.95, 2.9, 11.2, 16.65],
    'opponent': [25.8, 59.0, 6.5, 19.799999999999997, 12.3, 17.15, 10.350000000000001, 35.8, 12.3, 5.5, 3.75, 10.35, 16.05],
}

averagesOsu = {
    'team': [68.05, 25.8, 59.0, 6.5, 19.799999999999997, 12.3, 17.15, 10.350000000000001, 35.8, 12.3, 5.5, 3.75, 10.35, 16.05],
    'opponent': [25.0, 58.3, 6.85, 20.25, 11.3, 15.5, 9.55, 34.95, 13.35, 4.95, 2.9, 11.2, 16.65],
}

# Model input rows: the "team" values followed by the "opponent" values.
# The names msu / osu are rebound from the raw dicts to these flat lists.
# NOTE(review): each row holds 27 values, while the hand-entered row used
# earlier in this notebook holds 24 - confirm the layout matches what
# num_pipeline was fitted on.
msu = averagesMsu['team'] + averagesMsu['opponent']
osu = averagesOsu['team'] + averagesOsu['opponent']

predictScore(msu, osu)



In [None]:
# Raw stat lines for each side ("team") and for what it allows ("opponent").
northwestern = {
    "team": np.array([23.6,57.8,7.7,23.9,13.5,18.1,10.4,35.3,13.5,7.9,4.4,10.3,16.0,68.4]),
    "opponent": np.array([21.7,53.6,7.9,22.8,10.9,16.0,9.5,34.9,13.5,4.8,3.6,14.4,17.1,62.1])
}

purdue = {
    "team": np.array([26.2,56.0,7.4,21.7,14.8,19.7,12.4,39.5,15.7,4.7,4.0,11.2,13.2,74.6]),
    "opponent": np.array([24.0,58.6,6.3,20.6,7.2,9.8,8.2,27.5,12.2,5.9,2.8,10.1,18.3,61.6])
}

# Blend each side's own numbers with what the other side allows.
pAve = {
    "team": (purdue['team'] + northwestern['opponent'])/2,
    "opponent": (purdue['opponent'] + northwestern['team'])/2
}

nwAve = {
    "team": (purdue['opponent'] + northwestern['team'])/2,
    "opponent": (purdue['team'] + northwestern['opponent'])/2
}

print(pAve)
print(nwAve)


def _matchup_row(averages):
    """Lay out one model input row from an averages dict.

    The row is: the last "opponent" value, then every "team" value except the
    last, then every "opponent" value except the last.  This is the layout the
    hand-written nwPredict row and the msu / osu rows in this notebook follow.
    """
    team = np.asarray(averages["team"], dtype=float)
    opponent = np.asarray(averages["opponent"], dtype=float)
    return [float(opponent[-1]), *team[:-1].tolist(), *opponent[:-1].tolist()]


# Build both rows from the computed averages rather than re-typing them.  The
# previous hand-typed pPredict repeated Purdue's own "team" averages in the
# opponent half and led with Purdue's own last "team" value, unlike nwPredict.
# NOTE(review): each row holds 27 values, while the hand-entered row used
# earlier in this notebook holds 24 - confirm the layout matches what
# num_pipeline was fitted on.
pPredict = _matchup_row(pAve)
nwPredict = _matchup_row(nwAve)

predictScore(pPredict,nwPredict)

In [None]:
from util.util import BetUtil

In [None]:
# NOTE(review): going by the method name, this presumably turns the '+181'
# betting line into a probability - confirm against util.util.BetUtil.
BetUtil.Line('+181').toProb()