In [1]:
import config
import sqlalchemy
import functions
import statsapi as mlb
import sql_alch_schema
from datetime import datetime

from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String,DateTime,Date,Time,Boolean,func
from sqlalchemy import ForeignKey,and_
from sqlalchemy.orm import relationship
from sqlalchemy.schema import Table
from sqlalchemy import distinct

_sql_alchemy_connection = (
                                f'mysql+mysqlconnector://'
                                f'{config.user}:{config.password}'
                                f'@{config.host}:{config.port}'
                                f'/{config.schema}'
                           )
## Create the engine 
db = sqlalchemy.create_engine(_sql_alchemy_connection,
                              echo = False,
                              connect_args = {'ssl_disabled' : True,})

Base = sql_alch_schema.Base

Game = sql_alch_schema.Game
Play = sql_alch_schema.Play
Team = sql_alch_schema.Team
GameTeamLink = sql_alch_schema.GameTeamLink
Person = sql_alch_schema.Person


from sqlalchemy.orm import sessionmaker
Session = sessionmaker(bind=db)
session = Session()

#Base.metadata.create_all(db)


import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
sns.set_style('whitegrid')
%config InlineBackend.figure_format = 'retina'

sns.set(color_codes=True)
sns.set(rc={'figure.figsize':(12,10)})

In [3]:
cat_df=functions.getGame_df()
num_df=functions.load_dataset(session)

grand_df=pd.merge(left=num_df,
                  right=cat_df,
                  left_on='ID',
                  right_on='id')
grand_df['homeTeam_win']=grand_df.apply(lambda x: 1 if x['home_score']>x['away_score'] else 0,axis=1)



## Dummy Classifier
If I didn't know anything, I would pick the team with the higher win percentage every time. 

In [4]:
def trailing_winPct(row):
    home_wins=(grand_df[
        (grand_df['homeTeam_id']==row['homeTeam_id'])
        &(grand_df['ID']<row['ID'])]
               ['homeTeam_win'].sum())
    
    away_wins=(grand_df[
        (grand_df['awayTeam_id']==row['homeTeam_id'])
        &(grand_df['ID']<row['ID'])
        &(grand_df['homeTeam_win']==0)].shape[0])
    
    total_games=(grand_df[
        ((grand_df['homeTeam_id']==row['homeTeam_id'])|(grand_df['awayTeam_id']==row['homeTeam_id']))
        &(grand_df['ID']<row['ID'])].shape[0])
    return (home_wins+away_wins)/total_games

In [5]:
def away_trailing_winPct(row):
    home_wins=(grand_df[
        (grand_df['homeTeam_id']==row['awayTeam_id'])
        &(grand_df['ID']<row['ID'])]
               ['homeTeam_win'].sum())
    
    away_wins=(grand_df[
        (grand_df['awayTeam_id']==row['awayTeam_id'])
        &(grand_df['ID']<row['ID'])
        &(grand_df['homeTeam_win']==0)].shape[0])
    
    total_games=(grand_df[
        ((grand_df['homeTeam_id']==row['awayTeam_id'])|(grand_df['awayTeam_id']==row['awayTeam_id']))
        &(grand_df['ID']<row['ID'])].shape[0])
    return (home_wins+away_wins)/total_games

In [6]:
grand_df['homeWin_pct']=grand_df.apply(lambda x: trailing_winPct(x),axis=1)

  from ipykernel import kernelapp as app


In [7]:
grand_df['awayWin_pct']=grand_df.apply(lambda x: away_trailing_winPct(x),axis=1)

  from ipykernel import kernelapp as app


In [8]:
from sklearn.base import BaseEstimator

class WinPercentageClassifier(BaseEstimator):
    def fit(self,X,y=None):
        pass
    def predict(self,X):
        preds=X.apply(lambda row: 1 if row['homeWin_pct']>row['awayWin_pct'] else 0,axis=1)
        return preds

In [117]:
import sklearn
from sklearn.model_selection import train_test_split

X=grand_df.drop(columns=['homeTeam_win','ID'])
y=grand_df['homeTeam_win']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.2,random_state=12)

In [118]:
dummy = WinPercentageClassifier()

In [119]:
dummy.fit(X_train)
dummy_preds = dummy.predict(X_test)

In [120]:
dummy_preds.value_counts()

0    455
1    452
dtype: int64

In [132]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score,roc_curve

print(f1_score(y_test,dummy_preds))
cross_val_score(dummy,X_train,y_train,cv=5,scoring='f1')

0.5804347826086956


array([0.60104987, 0.60846561, 0.584     , 0.5948856 , 0.578019  ])

## Create a model to beat the dummy

In [122]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [128]:
cat_attribs = list(cat_df.drop(columns=['id','temp','wind_speed','away_probablePitcher','home_probablePitcher']))
num_attribs=list(num_df.drop(columns=['ID','home_score','away_score']))
[num_attribs.append(x) for x in ['temp','wind_speed']]

[None, None]

In [129]:
num_transform = Pipeline([
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())])

full_transform = ColumnTransformer([
    ('num',num_transform,num_attribs),
    ('cat',OneHotEncoder(handle_unknown='ignore'),cat_attribs),
])

In [133]:
pipe=Pipeline([
    ('transformer',full_transform),
    ('classifier',RandomForestClassifier())
])

In [134]:
from sklearn.model_selection import cross_val_score

cross_val_score(pipe,X_train,y_train,cv=5,scoring='f1')

array([0.5836478 , 0.5851198 , 0.59803922, 0.59028643, 0.61214374])

In [135]:
pipe.fit(X_train,y_train)
y_pred=pipe.predict(X_test)
f1_score(y_test,y_pred)


0.5831663326653307

## Voting Classifier 

In [136]:
voting_clf = VotingClassifier(
    estimators=[('lr',LogisticRegression(solver='liblinear')),
                ('rf',RandomForestClassifier()),
                ('svc',SVC())],
    voting='hard')

In [137]:
from sklearn.model_selection import GridSearchCV

In [138]:
grid_search = GridSearchCV(pipe,{'transformer__''classifier':[RandomForestClassifier()]})

In [139]:
cross_val_score(grid_search,X,y,cv=5,scoring='f1')

ValueError: Invalid parameter classifier for estimator ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('num',
                                 Pipeline(memory=None,
                                          steps=[('imputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='median',
                                                                verbose=0)),
                                                 ('scaler',
                                                  StandardScaler(copy=True,
                                                     

array([nan, nan, nan, nan, nan])

In [140]:
cross_val_score(grid_search,X,y,cv=5,scoring='accuracy')

ValueError: Invalid parameter classifier for estimator ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('num',
                                 Pipeline(memory=None,
                                          steps=[('imputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='median',
                                                                verbose=0)),
                                                 ('scaler',
                                                  StandardScaler(copy=True,
                                                     

array([nan, nan, nan, nan, nan])