In [1]:
import config
import sqlalchemy
import functions
import statsapi as mlb
import sql_alch_schema
from datetime import datetime

from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String,DateTime,Date,Time,Boolean,func
from sqlalchemy import ForeignKey,and_
from sqlalchemy.orm import relationship
from sqlalchemy.schema import Table
from sqlalchemy import distinct

_sql_alchemy_connection = (
                                f'mysql+mysqlconnector://'
                                f'{config.user}:{config.password}'
                                f'@{config.host}:{config.port}'
                                f'/{config.schema}'
                           )
## Create the engine 
db = sqlalchemy.create_engine(_sql_alchemy_connection,
                              echo = False,
                              connect_args = {'ssl_disabled' : True,})

Base = sql_alch_schema.Base

Game = sql_alch_schema.Game
Play = sql_alch_schema.Play
Team = sql_alch_schema.Team
GameTeamLink = sql_alch_schema.GameTeamLink
Person = sql_alch_schema.Person


from sqlalchemy.orm import sessionmaker
Session = sessionmaker(bind=db)
session = Session()

#Base.metadata.create_all(db)


import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
sns.set_style('whitegrid')
%config InlineBackend.figure_format = 'retina'

sns.set(color_codes=True)
sns.set(rc={'figure.figsize':(12,10)})

In [125]:
cat_df=functions.getGame_df()
num_df=functions.load_dataset(session)

grand_df=pd.merge(left=num_df,
                  right=cat_df,
                  left_on='ID',
                  right_on='id')
grand_df['homeTeam_win']=grand_df.apply(lambda x: 1 if x['home_score']>x['away_score'] else 0,axis=1)

## Dummy Classifier
If I didn't know anything, I would pick the team with the higher win percentage every time. 

In [126]:
def trailing_winPct(row):
    home_wins=(grand_df[
        (grand_df['homeTeam_id']==row['homeTeam_id'])
        &(grand_df['ID']<row['ID'])]
               ['homeTeam_win'].sum())
    
    away_wins=(grand_df[
        (grand_df['awayTeam_id']==row['homeTeam_id'])
        &(grand_df['ID']<row['ID'])
        &(grand_df['homeTeam_win']==0)].shape[0])
    
    total_games=(grand_df[
        ((grand_df['homeTeam_id']==row['homeTeam_id'])|(grand_df['awayTeam_id']==row['homeTeam_id']))
        &(grand_df['ID']<row['ID'])].shape[0])
    return (home_wins+away_wins)/total_games

In [127]:
def away_trailing_winPct(row):
    home_wins=(grand_df[
        (grand_df['homeTeam_id']==row['awayTeam_id'])
        &(grand_df['ID']<row['ID'])]
               ['homeTeam_win'].sum())
    
    away_wins=(grand_df[
        (grand_df['awayTeam_id']==row['awayTeam_id'])
        &(grand_df['ID']<row['ID'])
        &(grand_df['homeTeam_win']==0)].shape[0])
    
    total_games=(grand_df[
        ((grand_df['homeTeam_id']==row['awayTeam_id'])|(grand_df['awayTeam_id']==row['awayTeam_id']))
        &(grand_df['ID']<row['ID'])].shape[0])
    return (home_wins+away_wins)/total_games

In [128]:
grand_df['homeWin_pct']=grand_df.apply(lambda x: trailing_winPct(x),axis=1)

  from ipykernel import kernelapp as app


In [129]:
grand_df['awayWin_pct']=grand_df.apply(lambda x: away_trailing_winPct(x),axis=1)

  from ipykernel import kernelapp as app


In [130]:
from sklearn.base import BaseEstimator

class WinPercentageClassifier(BaseEstimator):
    def fit(self,X,y=None):
        pass
    def predict(self,X):
        preds=X.apply(lambda row: 1 if row['homeWin_pct']>row['awayWin_pct'] else 0,axis=1)
        return preds

In [140]:
import sklearn
from sklearn.model_selection import train_test_split

X=grand_df.drop(columns=['homeTeam_win','ID'])
y=grand_df['homeTeam_win']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.2,random_state=12)

In [141]:
dummy = WinPercentageClassifier()

In [142]:
dummy.fit(X_train)
dummy_preds = dummy.predict(X_test)

In [143]:
dummy_preds.value_counts()

1    352
0    330
dtype: int64

In [144]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score,roc_curve

print(f1_score(y_test,dummy_preds))
cross_val_score(dummy,X_train,y_train,cv=5,scoring='accuracy')

0.5914285714285714


array([0.54395604, 0.60622711, 0.56513761, 0.57798165, 0.56330275])

## Create a model to beat the dummy

In [145]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [150]:
cat_attribs = list(cat_df.drop(columns=['id','temp']))

In [137]:
num_transform = Pipeline([
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',MinMaxScaler())])

cat_transform = Pipeline([])

In [10]:
from sklearn.model_selection import cross_val_score

cross_val_score(pipe,X_train,y_train,cv=5,scoring='accuracy')

array([0.52010724, 0.50938338, 0.49865952, 0.47989276, 0.53763441])

In [11]:
pipe.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('imputer',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=nan, strategy='median',
                               verbose=0)),
                ('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
                ('classifier',
                 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features=None, max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        presort='deprecated', random_state=None,
                                        splitter='best'))]

In [12]:
y_pred=pipe.predict(X_test)

In [13]:


f1_score(y_test,y_pred)


0.4917355371900826

### 