In [1]:
import config
import sqlalchemy
import functions
import statsapi as mlb
import sql_alch_schema
from datetime import datetime

from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String,DateTime,Date,Time,Boolean,func
from sqlalchemy import ForeignKey,and_
from sqlalchemy.orm import relationship
from sqlalchemy.schema import Table
from sqlalchemy import distinct

_sql_alchemy_connection = (
                                f'mysql+mysqlconnector://'
                                f'{config.user}:{config.password}'
                                f'@{config.host}:{config.port}'
                                f'/{config.schema}'
                           )
## Create the engine 
db = sqlalchemy.create_engine(_sql_alchemy_connection,
                              echo = False,
                              connect_args = {'ssl_disabled' : True,})

Base = sql_alch_schema.Base

Game = sql_alch_schema.Game
Play = sql_alch_schema.Play
Team = sql_alch_schema.Team
GameTeamLink = sql_alch_schema.GameTeamLink
Person = sql_alch_schema.Person


from sqlalchemy.orm import sessionmaker
Session = sessionmaker(bind=db)
session = Session()

#Base.metadata.create_all(db)


import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
sns.set_style('whitegrid')
%config InlineBackend.figure_format = 'retina'

sns.set(color_codes=True)
sns.set(rc={'figure.figsize':(12,10)})

In [2]:
df = pd.read_csv('dataset.csv')

def merge_scores(df,session):
    """
    Takes in a dataframe and a sql alch session, returns df with game scores added. merges with 'ID' column.
    """
    games = [session.query(Game).filter(Game.id==x).one() for x in df['ID']]

    score_dict=([{'ID':g.id,
                  'home_score':max([x.homeScore for x in g.plays],default=-1),
                  'away_score':max([x.awayScore for x in g.plays],default=-1)} 
                  for g in games])

    df=pd.merge(right=pd.DataFrame.from_dict(score_dict),
            left=df,
            right_on='ID',
            left_on='ID')
    return df

df = merge_scores(df,session)

df['home_win']=0
df['home_win']=df.apply(lambda x: x['home_win']+1 if x['home_score']>x['away_score'] else x['home_win'], axis=1)

df=df.drop(columns=['home_score','away_score','ID'])

In [66]:
df.dropna(inplace=True)

In [67]:
X=df.drop(columns=['home_win'])
y=df['home_win']

In [68]:
import sklearn
from sklearn.model_selection import train_test_split

In [69]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.2,random_state=12)

In [70]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.ensemble import RandomForestClassifier

In [81]:
pipe = Pipeline([
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',MinMaxScaler()),
    ('classifier',RandomForestClassifier())])

In [88]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipe,X_train,y_train,cv=5,scoring='accuracy')

array([0.55234657, 0.58483755, 0.53429603, 0.53068592, 0.50902527])

In [83]:
pipe.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('imputer',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=nan, strategy='median',
                               verbose=0)),
                ('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
                ('classifier',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_job

In [84]:
y_pred=pipe.predict(X_test)

In [89]:
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score,roc_curve

f1_score(y_test,y_pred)


0.5766233766233766