In [2]:
import pandas as pd
import numpy as np
import sqlite3
import warnings
warnings.filterwarnings('ignore')
from sklearn import neighbors
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from random import sample
from time import time
import sklearn.ensemble as sk
import sklearn.metrics as skm
import pylab as pl
from sklearn.naive_bayes import MultinomialNB
import matplotlib.pyplot as plt

In [3]:
# Data Loading
# Read the whole Match table into memory, then release the SQLite handle:
# nothing after this cell queries the database again.
database ='database.sqlite'
connection = sqlite3.connect(database)
try:
    match = pd.read_sql("SELECT * FROM Match;", connection)
finally:
    # Close even if the query fails so the database file is not left locked/open
    connection.close()

In [4]:
# Discard every match row that is missing one of the core identifying fields
selectColumns = [
    'home_team_api_id', 'away_team_api_id',
    'home_team_goal', 'away_team_goal',
    'date', 'country_id', 'league_id', 'season', 'stage',
]
match = match.dropna(subset=selectColumns)

In [5]:
# Data Selection
selectColumns=[
               'home_team_api_id','away_team_api_id','home_team_goal','away_team_goal','date',
               'country_id','league_id','season','stage','B365H','BWH','IWH','LBH','PSH',
               'B365D','BWD','IWD','LBD','PSD','B365A','BWA','IWA','LBA', 'PSA','home_player_7',
               'away_player_7','home_player_10','home_player_11','away_player_10','away_player_11'
              ]

# Bookmaker odds columns (home / draw / away for each of the five bookmakers selected above)
oddsColumns=[
             'B365H','BWH','IWH','LBH','PSH',
             'B365D','BWD','IWD','LBD','PSD',
             'B365A','BWA','IWA','LBA','PSA'
            ]

#Selecting important columns only
# .copy() gives an independent frame, so the assignments below are not
# chained writes into a slice of `match`
mBet=match[selectColumns].copy()

#Converting data values to required format
# 'date' is reduced to the calendar year, the month goes into a new 'month' column,
# and 'season' keeps only its first four characters
matchDate = pd.to_datetime(mBet['date'])
mBet = mBet.assign(
    month=matchDate.dt.month,
    date=matchDate.dt.year,
    season=mBet['season'].str[:4],
)

# Replacing null values with mean values
# Each odds column is filled with ITS OWN mean. (Previously the IW*, LB* and PS*
# columns were filled with the BW* means, a copy-paste slip.)
mBet[oddsColumns] = mBet[oddsColumns].fillna(mBet[oddsColumns].mean())

# Everything still missing (e.g. player ids, or an odds column with no values at all) becomes 0
mBet = mBet.fillna(0)

matchData=mBet

In [6]:
# Creating train, test and validate data from a single data
def train_test_validate_split(dataframe, trainPercent=.7, validatePercent=.15, seed=None):
    """Randomly partition ``dataframe`` row-wise into three disjoint subsets.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose rows are shuffled and split.
    trainPercent : float
        Fraction of rows placed in the training subset.
    validatePercent : float
        Fraction of rows placed in the validation subset; the remaining
        rows form the test subset.
    seed : int or None
        Seed for the shuffle. With an integer the split is reproducible and
        NumPy's global random state is left untouched. With ``None`` the
        shuffle draws from NumPy's global random state, so a preceding
        ``np.random.seed(...)`` by the caller is honoured.

    Returns
    -------
    (train, test, validate) : tuple of pandas.DataFrame
        Note the order: test comes before validate.
    """
    # Referred from StackOverflow
    rng = np.random if seed is None else np.random.RandomState(seed)

    length = len(dataframe)
    # Shuffle row POSITIONS rather than index labels, so a frame with
    # repeated index labels is still split into exactly `length` rows
    perm = rng.permutation(length)

    trainEnd = int(trainPercent * length)
    validateEnd = int(validatePercent * length) + trainEnd

    # .iloc replaces the removed .ix indexer
    train = dataframe.iloc[perm[:trainEnd]]
    test = dataframe.iloc[perm[validateEnd:]]
    validate = dataframe.iloc[perm[trainEnd:validateEnd]]
    return train, test, validate

# Seed for the train/test/validate shuffle. It is passed to the split function
# explicitly: seeding NumPy beforehand is not enough when the function is called
# with its default seed=None, because it then re-seeds from fresh entropy.
SPLIT_SEED = 243
np.random.seed(SPLIT_SEED)
train, test, validate = train_test_validate_split(matchData, seed=SPLIT_SEED)

# Length of all different data
print("Length of Train data %d" %len(train))
print("Length of Test data %d" %len(test))
print("Length of Validate data %d" %len(validate))

Length of Train data 18185
Length of Test data 3898
Length of Validate data 3896


In [7]:
def fullTime(h,a):
    """Return the full-time result as seen by the home side.

    h -- goals scored by the home team
    a -- goals scored by the away team

    Gives "Win" when h exceeds a, "Loss" when a exceeds h, and "Draw"
    in every other case.
    """
    if h > a:
        return "Win"
    if h < a:
        return "Loss"
    return "Draw"
# Derive the full-time label (Win / Loss / Draw) for every row of the
# Test, Train and Validate sets, in that order
testResult, trainResult, validateResult = (
    frame.apply(lambda row: fullTime(row['home_team_goal'], row['away_team_goal']), axis=1)
    for frame in (test, train, validate)
)

# The goal columns determine the label directly, so they are removed from all
# three feature sets; leaving them in would hand the classifiers the answer.
for frame in (train, test, validate):
    for goalColumn in ('home_team_goal', 'away_team_goal'):
        del frame[goalColumn]

In [8]:
# Count how many training rows carry each full-time label
trainResult.value_counts()

Win     8405
Loss    5188
Draw    4592
dtype: int64

In [None]:
# Applying the different classifiers and predicting the results

clfKNN = neighbors.KNeighborsClassifier(30, weights = 'uniform')
# Results were validation set = 0.463715 & test set = 0.459407 # Dropped it

# clf=LogisticRegression(random_state=0)
# Results were validation set = 0.520115 & test set = 0.514813

# The solver is named explicitly: an L1 penalty is rejected by the lbfgs solver
# that scikit-learn >= 0.22 picks by default, while liblinear supports it.
# NOTE(review): newer scikit-learn releases may deprecate liblinear for
# multiclass targets - confirm against the installed version.
clfLR=LogisticRegression(penalty='l1', C=10, solver='liblinear')
# Results were validation set = 0.522618 & test set = 0.520970 # Not getting desired results, Dropped it

clfDT= DecisionTreeClassifier(random_state=100)
# Results were validation set = 0.397690 & test set = 0.402462 # Worst Result

clfSVC=SVC(kernel='rbf', random_state=0, gamma=.01, C=1)
# Results were validation set = 0.461983 & test set = 0.455175 # Not getting desired results, Dropped it

clfMNB=MultinomialNB()

# Going forward with Random Forest Classifier and more tuning

# When adding different parameters or levers, accuracy kept increasing
# With n_estimators only, accuracy was 0.501542 & 0.493981
# When adding oob_score nothing much changed: accuracy was 0.502 & 0.495, but processing time was somewhat faster than before
# When adding random_state, accuracy was 0.510491 & 0.495575
# When adding max_features, accuracy didn't change: 0.510491 & 0.495575
# When adding min_samples_leaf, accuracy made a good jump (at least for me) from before: 0.520693 & 0.514044
# By increasing n_estimators from 100->1000 (10 times), processing time increased and accuracy improved: 0.524350 & 0.530589
# By increasing n_estimators from 1000->5000 (5 times), not much difference in accuracy: 0.524158 & 0.531743
# but processing time is unacceptably long.

# max_features="sqrt" is what "auto" meant for classifiers; "auto" is no longer
# accepted by recent scikit-learn releases.
clfRF = sk.RandomForestClassifier(n_estimators=1000, oob_score = True,random_state =42,max_features = "sqrt", min_samples_leaf = 50)

# (display name, estimator) pairs, in the order they appear on the bar chart
classifiers = [
    ('RF', clfRF),
    ('KNN', clfKNN),
    ('LR', clfLR),
    ('DT', clfDT),
    ('SVC', clfSVC),
    ('MNB', clfMNB),
]

# Fit each classifier on the training set and record its accuracy on the test set
perf = []  # Values of different classifiers
for _, clf in classifiers:
    model = clf.fit(train, trainResult)
    perf.append(model.score(test, testResult))

# Individual scores by name. The SVC score gets its own name (svcScore) so the
# imported SVC class is not overwritten - otherwise re-running this cell fails.
RF, KNN, LR, DT, svcScore, MNB = perf

objects = tuple(name for name, _ in classifiers) # Names of the different classifiers
y_pos = np.arange(len(objects))
plt.ylim(0,0.60) # Increased y-axis limit for better visualization
plt.bar(y_pos, perf, align='center', alpha=0.5)
plt.xticks(y_pos, objects)
plt.ylabel('Accuracy Scores')
plt.title('Different Classifier Used')
plt.show()

#print(RF)

In [39]:
# Final model: random forest with the tuned settings.
# max_features="sqrt" is what "auto" meant for classifiers; "auto" is no longer
# accepted by recent scikit-learn releases.
clfRF = sk.RandomForestClassifier(n_estimators=1000, oob_score = True,random_state =42,max_features = "sqrt", min_samples_leaf = 50)
model = clfRF.fit(train, trainResult)

# Predict once per data set and compute accuracy from those predictions;
# calling clfRF.score() would run the whole 1000-tree prediction a second time.
validatePredictions = clfRF.predict(validate)
print("Calculated mean accuracy score of the Model for validation set = %f" %(skm.accuracy_score(validateResult, validatePredictions)))

testPredictions = clfRF.predict(test)
print("Calculated mean accuracy score of the Model for test set = %f" %(skm.accuracy_score(testResult, testPredictions)))

In [None]:
# Confusion matrix and F1 score of the model on the validation set.
# The class order is pinned so matrix rows/columns can be labelled on the plot.
resultLabels = ['Draw', 'Loss', 'Win']
theResultofValidate = skm.confusion_matrix(validateResult, validatePredictions, labels=resultLabels)

# f1_score needs the true and predicted labels, plus an explicit averaging mode
# because there are three classes; 'weighted' weights each class by its support.
f1=skm.f1_score(validateResult, validatePredictions, average='weighted')

pl.matshow(theResultofValidate)
pl.title('Confusion matrix for Validate data\n\n')
pl.colorbar()
# Rows are actual results, columns are predicted results
pl.xticks(range(len(resultLabels)), resultLabels)
pl.yticks(range(len(resultLabels)), resultLabels)
pl.xlabel('Predicted Results')
pl.ylabel('Actual Results')
pl.show()

print('Weighted F1 score for Validate data = %f' % f1)
print('Prediction of Validate data')
pd.crosstab(validateResult, validatePredictions, rownames=['Actual Results'], colnames=['Predicted Results'])

In [None]:
# Confusion matrix of the model on the test set.
# The class order is pinned so matrix rows/columns can be labelled on the plot;
# without tick labels the axes only show the indices 0-2.
resultLabels = ['Draw', 'Loss', 'Win']
theResultofTest = skm.confusion_matrix(testResult, testPredictions, labels=resultLabels)

pl.matshow(theResultofTest)
pl.title('Confusion matrix for Test data\n\n')
pl.colorbar()
# Rows are actual results, columns are predicted results
pl.xticks(range(len(resultLabels)), resultLabels)
pl.yticks(range(len(resultLabels)), resultLabels)
pl.xlabel('Predicted Results')
pl.ylabel('Actual Results')
pl.show()

print('Prediction of Test data')
pd.crosstab(testResult, testPredictions, rownames=['Actual Results'], colnames=['Predicted Results'])

In [36]:
# Stand-alone matplotlib bar-chart example with hard-coded values.
# NOTE: this cell rebinds `objects` and `y_pos`, names that the
# classifier-comparison cell also assigns.
import numpy as np
import matplotlib.pyplot as plt

# Reset matplotlib's rc parameters to their defaults before drawing
plt.rcdefaults()

objects = ('Python', 'C++', 'Java', 'Perl', 'Scala', 'Lisp')
performance = [10,8,6,4,2,1]
y_pos = np.arange(len(objects))

# One bar per entry of `objects`, with the names as x tick labels
plt.bar(y_pos, performance, align='center', alpha=.2)
plt.xticks(y_pos, objects)
plt.title('Programming language usage')
plt.ylabel('Usage')
plt.ylim(0,12)
plt.show()