In [None]:
import pandas as pd
import os
from tqdm import tqdm
from glob import glob
import numpy as np

In [None]:
import cufflinks as cf
import plotly.offline
# Put cufflinks into offline mode (presumably so the `.iplot()` calls below render inside the notebook).
cf.go_offline()
# NOTE(review): this writes offline=False into the cufflinks config right after go_offline();
# the two settings look contradictory — confirm which mode is intended.
cf.set_config_file(offline=False,world_readable=True)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

import tensorflow as tf
from sklearn import preprocessing

# KBO League(Baseball league) Match Prediction using statistical methods
We want to predict the results of 2020 KBO matches using 2015~2019 KBO data.

### Import Data

In [None]:
# Make the dataset folder the working directory so the CSV files below can be read by bare file name.
# NOTE(review): the path is relative, so running this cell a second time in the same kernel
# will most likely fail — restart the kernel before re-running.
os.chdir("../input/korea-baseball-datasetkbo-20152020")

In [None]:
# One CSV per season, 2015 through 2019 (range end is exclusive).
fileNames = [f'baseball_{season}.csv' for season in range(2015, 2020)]

In [None]:
#import data
# Read every season file and stack them into one long table.
# The frames are gathered in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0 and re-copied the accumulated table on every iteration.
# As with the old append loop, each file keeps its own row index.
taza = pd.concat([pd.read_csv(fileName) for fileName in tqdm(fileNames)])

### Data description
The data contains various per-match statistics.<br>
We want to predict the match result (win or lose) from match statistics (PA, AB, RUN, ...).<br>
But we can't use this data directly **because a match's statistics (PA, AB, RUN, ...) are not available until the match has finished.**<br>
So I decided to use each team's averages over its most recent 50 games instead.<br>
Before explaining the preprocessing, I'll explain the **feature selection process.**

In [None]:
# Preview the first rows of the combined table.
taza.head()

### Feature Selection using Correlation matrix of variables
Below is the correlation matrix of the variables. The target variable (y) is 'win', so I selected as explanatory variables (X) those whose correlation coefficient with 'win' is greater than 0.3.

In [None]:
# Correlation heatmap over the numeric columns only. The table also holds text
# columns (G_ID is sliced with .str further down), and pandas >= 2.0 raises when
# DataFrame.corr() meets them; older versions dropped them silently, so the
# picture should be unchanged there.
taza.select_dtypes(include=['number', 'bool']).corr().iplot(kind='heatmap',colorscale="Blues")

In [None]:
# Correlation of every numeric column with 'win', computed once (the matrix used
# to be built twice). Text columns are excluded up front because pandas >= 2.0
# raises on them inside corr().
winCorr = taza.select_dtypes(include=['number', 'bool']).corr()['win']

# Keep the five identifier columns plus every column whose correlation with
# 'win' is above 0.3 (positive correlation only).
colnames = ['G_ID','GDAY_DS','T_ID','VS_T_ID','TB_SC'] + winCorr[winCorr>0.3].index.tolist()

# .copy() makes the selection an independent frame, so adding YEAR below does
# not trigger SettingWithCopyWarning.
taza = taza[colnames].copy()
# The season is the first four characters of the game id.
taza['YEAR']=taza['G_ID'].str[:4]
years = [str(x) for x in range(2015,2020)]

In [None]:
# Show which columns survived the selection.
print(colnames)

The selected variables are 'T_ID' ~ 'OOO' (G_ID and GDAY_DS carry the match date and team information).

### Preprocessing

In [None]:
def weightedMean(arr):
    """Linearly weighted average that favours later entries.

    Element ``i`` of ``arr`` receives weight ``i`` (0, 1, ..., len(arr) - 1), so
    the first element contributes nothing and the last one counts the most.
    """
    linearWeights = np.arange(len(arr))
    return np.average(arr, weights=linearWeights)

In [None]:
class GetX():
    """Assemble the pre-game feature row for a single match.

    Parameters
    ----------
    gi : str
        Game id. Characters 0-7 are read as the game date, 8-9 as the first
        (away) team code and 10-11 as the second (home) team code.
    N : int
        Number of previous games averaged for each team.
    taza : pandas.DataFrame
        Game log containing at least 'G_ID', 'GDAY_DS', 'T_ID', 'VS_T_ID' and
        'win'. Columns from position 5 up to (but not including) the last one
        are averaged as features; the last column is read as the match label.
    """
    def __init__(self,gi,N, taza):
        self.gi = gi
        self.N = N
        self.taza =taza
        self.date = gi[:8]
        self.team1 = gi[8:10]
        self.team2 = gi[10:12]

    def makeR(self,team):
        '''
        Weighted averages of ``team`` over its N games before this match.

        df1 holds the team's own rows (offense / batting data) and df2 the rows
        of its opponents (defense / runs-allowed data).

        Returns a pandas Series with the team's feature averages, the same
        averages of its opponents suffixed with '_VS', and 'WIN_RATIO'.
        Returns an empty list when the team has fewer than N earlier games or
        fewer than two feature columns are available.
        '''
        df1 = self.taza[self.taza['T_ID']==team].reset_index(drop=True)
        df2 = self.taza[self.taza['VS_T_ID']==team].reset_index(drop=True)

        # Position of this match inside the team's own list of games.
        dateIdx = df1[df1['GDAY_DS']==int(self.date)].index[0]
        first = dateIdx-self.N
        if first<0:
            # Not enough history yet.
            return []

        df1 = df1[first:dateIdx]
        sr1 = df1.iloc[:,5:-1].apply(weightedMean,axis=0)

        # NOTE(review): df2 is sliced with positions found in df1, which assumes
        # both frames list the team's games in the same order — confirm.
        df2 = df2[first:dateIdx]
        sr2 = df2.iloc[:,5:-1].apply(weightedMean,axis=0)
        sr2.index = [x+'_VS' for x in sr2.index.values]

        if(len(sr1)<2):
            return []

        temp = pd.Series({"WIN_RATIO":weightedMean(df1['win'])})
        # pd.concat replaces Series.append, which was removed in pandas 2.0.
        return pd.concat([sr1, sr2, temp])


    def makeDf(self):
        """Build the complete feature dictionary for this match.

        Combines the away team's averages, the home team's averages (suffixed
        with '_ENE'), the away team's mean result over its last (at most 10)
        earlier games against this opponent ('RELATIVE_WIN', 0.5 when they have
        not met yet), the game id and the label 'WIN'.

        Returns a dict, or an empty list when either team lacks history.
        """
        result1 = self.makeR(self.team1) # away team's information
        result2 = self.makeR(self.team2) # home (opponent) team's information

        if len(result1)<=1 or len(result2)<=1:
            return []

        result2.index = [x+'_ENE' for x in result2.index.values]
        result = pd.concat([result1, result2])

        # Earlier meetings of the same pairing, seen from the away team's rows.
        tempDf = self.taza[self.taza['GDAY_DS']<int(self.date)]
        tempDf = tempDf[(tempDf['T_ID']==self.team1)&(tempDf['VS_T_ID']==self.team2)]
        if(len(tempDf)==0):
            result['RELATIVE_WIN'] = 0.5
        else:
            result['RELATIVE_WIN'] = tempDf['win'][-10:].mean()
        result['G_ID']=self.gi
        # Look the label up through self.gi. This used to read a module-level
        # `gi`, which only worked while a loop variable of that name happened
        # to hold the same game id.
        result['WIN'] = self.taza[self.taza['G_ID']==self.gi].iloc[0,-1]

        return result.to_dict()
    

In [None]:
def gapLeftRight(df):
    """Turn paired team features into difference features.

    All columns of ``df`` except the last three are split into a first and a
    second half of equal width; the second half is subtracted from the first,
    column by column. The differences are named after the first-half columns
    plus '_GAP', and the last three columns of ``df`` are appended unchanged.
    The input frame is not modified.
    """
    features = df.iloc[:, :-3].copy()
    half = features.shape[1] // 2

    firstHalf = features.iloc[:, :half]
    secondHalf = features.iloc[:, half:]
    # Give both halves identical labels so the subtraction pairs them by position.
    secondHalf.columns = firstHalf.columns

    gaps = (firstHalf - secondHalf).add_suffix('_GAP')
    return pd.concat([gaps, df.iloc[:, -3:]], axis=1)

In [None]:
def gapLeftRightRandom(df, random_state=None):
    """Difference features with the team order flipped for a random half of the rows.

    The differences are built as first-half minus second-half of all columns
    except the last three, named '<first-half column>_GAP', followed by the
    last three columns of ``df``. The rows are then shuffled and the first half
    of the shuffled rows is mirrored: every gap is negated and the third-last
    and last columns (RELATIVE_WIN and WIN in this notebook) become
    ``1 - value``. The result is sorted by 'G_ID'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose last three columns are kept as-is and must include 'G_ID'.
    random_state : int, numpy RandomState or None, default None
        Forwarded to ``DataFrame.sample``. None keeps the previous unseeded
        behaviour; pass a value to make the choice of flipped rows reproducible.
    """
    dfTemp = df.iloc[:,:-3].copy()

    n = dfTemp.shape[1]//2

    dfLeft = dfTemp.iloc[:,:n]
    dfRight = dfTemp.iloc[:,n:]
    # Same labels on both halves so the subtraction pairs columns by position.
    dfRight.columns = dfLeft.columns

    dfResult = (dfLeft-dfRight).add_suffix('_GAP')
    dfResult = pd.concat([dfResult, df.iloc[:,-3:]], axis=1)

    # Randomize which side is treated as the reference team.
    dfResult = dfResult.sample(len(dfResult), random_state=random_state)

    nTemp = len(dfResult)//2
    nCols = dfResult.shape[1]
    # Explicit non-negative positions of the third-last and last columns.
    flipCols = [nCols-3, nCols-1]

    dfResult.iloc[:nTemp,:-3] = -1 * dfResult.iloc[:nTemp,:-3]
    dfResult.iloc[:nTemp,flipCols] = 1-dfResult.iloc[:nTemp,flipCols]

    return dfResult.sort_values("G_ID")

In [None]:
# Worked example on the 2015 season only: one feature row per game.
taza_ = taza.loc[taza['YEAR']=='2015']
# YEAR is the last column; GetX receives the table without it.
seasonInput = taza_.iloc[:,:-1]

X = []
# Every other G_ID is visited (presumably because each game has one row per team).
for gi in taza_['G_ID'][::2]:
    temp = GetX(gi,50, seasonInput).makeDf()
    # makeDf() answers with an empty list when a team lacks 50 earlier games.
    if not isinstance(temp, list):
        X.append(temp)

df = pd.DataFrame(X)

Look at this table.
The first 17 columns are the away team's information, and the '~_VS' columns describe its defense. For example, in the first row, 'RUN' is the SS (Samsung) team's weighted mean of runs scored over its most recent 50 games (=5.57), and 'RUN_VS' is the SS team's weighted mean of runs allowed over the same 50 games. <br>

Columns 18 ~ 34 ('~_ENE') are the home team's information. The 'WIN' column is the match result from the **away team's** point of view.<br>
'RELATIVE_WIN' is the away team's win ratio against the home team (SS vs LG in the first row) over their most recent head-to-head games (at most 10) earlier in the same season.



In [None]:
# Preview the per-game feature rows built for 2015.
df.head()

This table is columns 1 ~ 17 (the away team's information) minus columns 18 ~ 34 (the home team's information).<br>
I'll use this data for modeling.

In [None]:
# Preview the away-minus-home difference features.
gapLeftRight(df).head()

### Make train, test data set

In [None]:
#2015 ~ 2019(train set)
# Feature rows are built one season at a time, so GetX only ever sees games of
# the season being processed.
seasonFrames = []
for year in years:
    taza_ = taza[taza['YEAR']==year]
    # YEAR is the last column; GetX receives the table without it.
    seasonInput = taza_.iloc[:,:-1]

    X = []
    for gi in taza_['G_ID'][::2]:
        temp = GetX(gi,50, seasonInput).makeDf()
        # makeDf() answers with an empty list when a team lacks 50 earlier games.
        if not isinstance(temp, list):
            X.append(temp)

    df = pd.DataFrame(X)
    seasonFrames.append(df)

# Single concat at the end: DataFrame.append was removed in pandas 2.0 and
# re-copied the growing table on every iteration. As before, each season keeps
# its own row index.
dfX = pd.concat(seasonFrames)

In [None]:
# Training table: away-minus-home difference features for 2015-2019.
train = gapLeftRight(dfX)

In [None]:
#2020(test set)
taza_ = pd.read_csv("baseball_2020.csv")
# .copy() gives an independent frame, so the YEAR assignment below does not
# trigger SettingWithCopyWarning.
taza_ = taza_[colnames].copy()
taza_['YEAR']=taza_['G_ID'].str[:4]
# YEAR is the last column; GetX receives the table without it.
seasonInput = taza_.iloc[:,:-1]

X = []
for gi in taza_['G_ID'][::2]:
    temp = GetX(gi,50, seasonInput).makeDf()
    # makeDf() answers with an empty list when a team lacks 50 earlier games.
    if not isinstance(temp, list):
        X.append(temp)

df = pd.DataFrame(X)

In [None]:
# Test table: away-minus-home difference features for 2020.
test = gapLeftRight(df)

### Modeling
I'll use logistic regression, decision tree, random forest, and deep learning models for prediction.

In [None]:
# Drop drawn games (WIN equal to 0.5) from both tables and renumber the rows.
train = train.loc[train['WIN'].ne(0.5)].reset_index(drop=True)
test = test.loc[test['WIN'].ne(0.5)].reset_index(drop=True)

In [None]:
def logisticReg(XTrain, yTrain, XValid, yValid):
    """Fit a logistic regression on the training fold and predict the validation fold.

    ``yValid`` is accepted but not used. Returns the predicted class labels for
    ``XValid``.
    """
    model = LogisticRegression(random_state=0)
    model.fit(XTrain, yTrain)
    return model.predict(XValid)

In [None]:
def decisionT(XTrain, yTrain, XValid, yValid, mD, cT):
    """Fit a decision tree and predict the validation fold.

    ``mD`` is the tree's ``max_depth`` and ``cT`` its split ``criterion``.
    ``yValid`` is accepted but not used. Returns the predicted class labels for
    ``XValid``.
    """
    classifier = tree.DecisionTreeClassifier(criterion=cT, max_depth=mD)
    return classifier.fit(XTrain, yTrain).predict(XValid)

In [None]:
def randomF(XTrain,yTrain, XValid, yValid, mD):
    """Fit a random forest and predict the validation fold.

    Despite its name, ``mD`` is passed as ``n_estimators`` (the number of trees),
    not as a depth limit. ``yValid`` is accepted but not used. Returns the
    predicted class labels for ``XValid``.
    """
    forest = RandomForestClassifier(n_estimators=mD)
    return forest.fit(XTrain, yTrain).predict(XValid)

In [None]:
def DNN(XTrain,yTrain, XValid, yValid, lr, bs):
    """Train a small fully connected network and score the validation fold.

    The network has six hidden ReLU layers (256-200-128-64-32-16 units) with
    dropout of 0.2 after the second and fifth, and one sigmoid output unit. It
    is trained for 20 epochs with Adagrad and binary cross-entropy.

    ``lr`` is the learning rate and ``bs`` the batch size. ``yValid`` is
    accepted but not used. Returns a flat array with one sigmoid output per row
    of ``XValid``; callers threshold it themselves.
    """
    model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=(XValid.shape[1], )),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(200, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    # `learning_rate` is the supported keyword; the old `lr` alias is deprecated
    # and rejected by newer Keras releases.
    optimizer = tf.keras.optimizers.Adagrad(learning_rate=lr)

    model.compile(optimizer=optimizer, loss='binary_crossentropy',
                 metrics=['accuracy'])

    model.fit(XTrain,yTrain, epochs=20,batch_size=bs, verbose=0)

    return model.predict(XValid).flatten()

We validate the models by cross-validation.<br>
The evaluation metric is **accuracy**.

In [None]:
# 10-fold splitter shared by every cross-validation loop below.
kf = KFold(n_splits=10)

In [None]:
# Per-fold accuracies of the logistic-regression run in the next cell.
accuracyList=[]

In [None]:
 # Start from an empty score list inside this cell: appending to a list created
 # elsewhere made a re-run of this cell mix old and new fold scores.
 accuracyList = []
 for trainIdx, validIdx in kf.split(train):
     #shuffling
     train_ = train.iloc[trainIdx].sample(len(trainIdx))
     valid_ = train.iloc[validIdx].sample(len(validIdx))

     # Features are every column except the last two; the label is the last column.
     XTrain = train_.iloc[:,:-2]
     yTrain = train_.iloc[:,-1]

     XTest = valid_.iloc[:,:-2]
     yTest = valid_.iloc[:,-1]

     lg = logisticReg(XTrain,yTrain,XTest,yTest)
     # Fraction of validation rows predicted correctly.
     accuracyList.append(np.mean(yTest==lg))

The mean accuracy of logistic regression is about 55%.

In [None]:
# Mean accuracy over the folds currently stored in accuracyList.
np.mean(accuracyList)

Decision tree, random forest, and DNN need hyperparameter tuning, so I'll tune them roughly.

In [None]:
# Collects one entry per (max depth, criterion) combination of the tree sweep.
dtResult = {"maxDepth":[],"accuracy":[],"criterion":[]}

In [None]:
# Grid over tree depth (5, 10, ..., 100) and split criterion, each scored by 10-fold CV.
for mD in tqdm(range(5,101,5)):
    for cT in ('gini','entropy'):
        accuracyList = []
        for trainIdx, validIdx in kf.split(train):
            # Shuffle the rows inside each fold.
            train_ = train.iloc[trainIdx].sample(len(trainIdx))
            valid_ = train.iloc[validIdx].sample(len(validIdx))

            # Features are every column except the last two; the label is the last column.
            XTrain, yTrain = train_.iloc[:,:-2], train_.iloc[:,-1]
            XTest, yTest = valid_.iloc[:,:-2], valid_.iloc[:,-1]

            dt = decisionT(XTrain,yTrain,XTest,yTest, mD, cT)
            accuracyList.append(np.mean(yTest==dt))

        # Record the setting together with its mean fold accuracy.
        dtResult['maxDepth'].append(mD)
        dtResult['criterion'].append(cT)
        dtResult['accuracy'].append(np.mean(accuracyList))

In [None]:
# Turn the collected sweep results into a table (the name dtResult is reused for it).
dtResult=pd.DataFrame(dtResult)

# Accuracy against depth, one line per split criterion.
dtResult.iplot(mode='lines',x='maxDepth', y='accuracy', categories='criterion',
              xTitle='depth', yTitle='accuracy')

max_depth = 5 with criterion = entropy is the best, but its accuracy is lower than logistic regression's.

In [None]:
# Collects one entry per setting of the random-forest sweep.
rfResult = {"maxDepth":[],"accuracy":[]}

In [None]:
# Sweep mD over 5, 10, ..., 100 with 10-fold CV. randomF() uses mD as the
# number of trees (n_estimators), even though it is stored under 'maxDepth'.
for mD in tqdm(range(5,101,5)):
    accuracyList = []
    for trainIdx, validIdx in kf.split(train):
        # Shuffle the rows inside each fold.
        train_ = train.iloc[trainIdx].sample(len(trainIdx))
        valid_ = train.iloc[validIdx].sample(len(validIdx))

        # Features are every column except the last two; the label is the last column.
        XTrain, yTrain = train_.iloc[:,:-2], train_.iloc[:,-1]
        XTest, yTest = valid_.iloc[:,:-2], valid_.iloc[:,-1]

        rf = randomF(XTrain,yTrain,XTest,yTest, mD)
        accuracyList.append(np.mean(yTest==rf))

    # Record the setting together with its mean fold accuracy.
    rfResult['maxDepth'].append(mD)
    rfResult['accuracy'].append(np.mean(accuracyList))

In [None]:
# Turn the collected sweep results into a table (the name rfResult is reused for it).
rfResult=pd.DataFrame(rfResult)

# The swept value is the number of trees: randomF() passes it to n_estimators.
# The column is still called 'maxDepth', but the axis label now says what is plotted.
rfResult.iplot(mode='lines',x='maxDepth', y='accuracy',
              xTitle='n_estimators', yTitle='accuracy')

Random forest also gives a poor result.

In [None]:
# Collects one entry per (learning rate, batch size) combination of the DNN sweep.
dnResult = {"learningRate":[],'batchSize':[],"accuracy":[]}

In [None]:
# Grid over learning rate and batch size, each combination scored by 10-fold CV.
for lr in tqdm([0.05, 0.01, 0.005, 0.001]):
    for bS in (10,50,100,200):
        accuracyList = []
        for trainIdx, validIdx in kf.split(train):
            # Shuffle the rows inside each fold.
            train_ = train.iloc[trainIdx].sample(len(trainIdx))
            valid_ = train.iloc[validIdx].sample(len(validIdx))

            # Features are every column except the last two; the label is the last column.
            XTrain, yTrain = train_.iloc[:,:-2], train_.iloc[:,-1]
            XTest, yTest = valid_.iloc[:,:-2], valid_.iloc[:,-1]

            dn = DNN(XTrain,yTrain,XTest,yTest, lr, bS)
            # DNN() returns sigmoid outputs; anything above 0.5 counts as class 1.
            accuracyList.append(np.mean(yTest ==(dn>0.5)*1))

        # Record the setting together with its mean fold accuracy.
        dnResult['learningRate'].append(lr)
        dnResult['batchSize'].append(bS)
        dnResult['accuracy'].append(np.mean(accuracyList))

In [None]:
# Turn the collected sweep results into a table (the name dnResult is reused for it).
dnResult=pd.DataFrame(dnResult)

# Accuracy against batch size, one line per learning rate.
dnResult.iplot(mode='lines',x='batchSize', y='accuracy', categories = 'learningRate',
              xTitle='batchSize', yTitle='accuracy')

Deep learning gives the best result of the 4 methods, but I'm afraid of overfitting.<br>
Learning rate = 0.01 with batch size = 100 is the best case.

### Predict using 2020 kbo data

In [None]:
# Full training table: features are every column except the last two, the label is the last column.
XTrain = train.iloc[:,:-2]
yTrain = train.iloc[:,-1]

In [None]:
# 2020 test table, split the same way as the training table.
XTest = test.iloc[:,:-2]
yTest = test.iloc[:,-1]

In [None]:
# Fit each classical model on all training rows and score its accuracy on the 2020 rows.
lR = LogisticRegression(random_state=0).fit(XTrain, yTrain)
logisticResult = np.mean(lR.predict(XTest) == yTest)

# Tree settings reported as best by the sweep above.
dtR = tree.DecisionTreeClassifier(max_depth = 5, criterion='entropy').fit(XTrain, yTrain)
decisionResult = np.mean(dtR.predict(XTest)==yTest)

# NOTE(review): no random_state is set for the tree or the forest, so these two
# scores can change between runs — confirm whether that is intended.
rfR = RandomForestClassifier(n_estimators=45)
rfR.fit(XTrain, yTrain)
randomResult = np.mean(rfR.predict(XTest)==yTest)

In [None]:
# Same architecture as DNN() above, trained on all training rows with the
# learning rate (0.01) and batch size (100) chosen in the sweep.
model = tf.keras.models.Sequential([
tf.keras.layers.Flatten(input_shape=(XTrain.shape[1], )),
tf.keras.layers.Dense(256, activation='relu'),
tf.keras.layers.Dense(200, activation='relu'),
tf.keras.layers.Dropout(0.2),
tf.keras.layers.Dense(128, activation='relu'),
tf.keras.layers.Dense(64, activation='relu'),
tf.keras.layers.Dense(32, activation='relu'),
tf.keras.layers.Dropout(0.2),
tf.keras.layers.Dense(16, activation='relu'),
tf.keras.layers.Dense(1, activation='sigmoid')
])

# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.01)

model.compile(optimizer=optimizer, loss='binary_crossentropy',
             metrics=['accuracy'])

model.fit(XTrain,yTrain, epochs=20,batch_size=100, verbose=0)
# Accuracy on the 2020 rows; sigmoid outputs above 0.5 count as class 1.
# Note that dnResult, which held the sweep table, now holds this single score.
dnResult = np.mean((model.predict(XTest)>0.5).flatten()*1 == yTest)

In [None]:
# Test-set accuracy of the four models, one per line.
print("logistic regression : ",  logisticResult,"\n",
     "decision tree : ", decisionResult, "\n",
     "random forest : ", randomResult, "\n",
      "deep learning : ", dnResult)

Deep learning is the best of all, but I think it is still a terribly bad result... <br>
It is almost the same as predicting 0 for every game.<br>
More adequate preprocessing (such as PCA) or parameter tuning is required.<br>