## English Premier League (EPL) | Season 2017/18 | Which teams will finish in the top 3?

In [102]:
import os

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
# Use matplotlib's "fivethirtyeight" style sheet for plots in this notebook.
plt.style.use("fivethirtyeight")


from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.datasets import make_classification

from sklearn.feature_selection import RFE

In [103]:
# Read the dataset used throughout this notebook from the sibling "dataset" directory.
dataset_path = os.path.join('..', 'dataset', 'epl-predict-dataset-01')
df = pd.read_csv(dataset_path)

In [104]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,id,team_fifa_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,buildUpPlayPositioningClass,...,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass,team_name,qualify
0,70,71,1,2010-02-22,66,2,1,30,1,2,...,2,30,1,40,1,50,2,1,Arsenal,0
1,71,72,1,2011-02-22,75,1,1,40,2,2,...,2,50,2,40,1,45,2,1,Arsenal,1
2,72,73,1,2012-02-22,25,3,1,32,1,1,...,2,57,2,57,1,52,2,1,Arsenal,0
3,73,74,1,2013-09-20,30,3,1,29,1,1,...,2,64,2,54,1,52,2,1,Arsenal,0
4,74,75,1,2014-09-19,59,2,2,26,1,1,...,2,51,2,44,1,52,2,1,Arsenal,1


In [105]:
# List the column names of the loaded frame.
df.columns

Index([u'Unnamed: 0', u'id', u'team_fifa_api_id', u'date', u'buildUpPlaySpeed',
       u'buildUpPlaySpeedClass', u'buildUpPlayDribblingClass',
       u'buildUpPlayPassing', u'buildUpPlayPassingClass',
       u'buildUpPlayPositioningClass', u'chanceCreationPassing',
       u'chanceCreationPassingClass', u'chanceCreationCrossing',
       u'chanceCreationCrossingClass', u'chanceCreationShooting',
       u'chanceCreationShootingClass', u'chanceCreationPositioningClass',
       u'defencePressure', u'defencePressureClass', u'defenceAggression',
       u'defenceAggressionClass', u'defenceTeamWidth',
       u'defenceTeamWidthClass', u'defenceDefenderLineClass', u'team_name',
       u'qualify'],
      dtype='object')

In [106]:
# Remove the 'Unnamed: 0' column (presumably a row index that was written into
# the CSV when it was saved — TODO confirm); it is not a model feature.
# Reassigning instead of inplace=True, together with errors='ignore', keeps the
# cell idempotent: re-running it once the column is gone no longer raises KeyError.
df = df.drop('Unnamed: 0', axis=1, errors='ignore')

Accuracy Score 

In [185]:
# Features: every column except the target (`qualify`) and the
# team_name / date / id / team_fifa_api_id columns.
non_feature_cols = ["qualify", 'team_name', 'date', 'id', 'team_fifa_api_id']
X = df.drop(non_feature_cols, axis=1)
# Target: the `qualify` column.
y = df["qualify"]

# Initialize and fit a default logistic regression on all rows, then score it
# on those same rows, so this is a training accuracy, not a hold-out estimate.
lr = LogisticRegression().fit(X, y)
score = lr.score(X, y)

print ("The model produces an accuracy score of {:.2f} percent".format(score*100))

The model produces an accuracy score of 89.17 percent


Null Accuracy?

In [186]:
# Null accuracy baseline: relative frequency of each class in the target.
# The largest share is the accuracy obtained by always predicting the most
# frequent class, which any useful model has to beat.
y.value_counts(normalize=True)

0    0.85
1    0.15
Name: qualify, dtype: float64

## Training and Test Set

In [187]:
# Hold out 40% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified although the target is imbalanced —
# consider whether the test set should preserve the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42
)

# Fit a fresh default logistic regression on the training portion only.
lr = LogisticRegression().fit(X_train, y_train)

# Predict the held-out rows and measure the share that was classified correctly.
preds = lr.predict(X_test)
testing_score = accuracy_score(y_test, preds)

print ("The model accurately classified {:.2f} percent of the testing data".format(testing_score*100))

The model accurately classified 87.50 percent of the testing data


In [188]:
# Mean accuracy of a default logistic regression over 5 cross-validation folds.
mean_cv_score = cross_val_score(LogisticRegression(), X, y, cv=5, scoring="accuracy").mean()

# .format() has to be applied to the string inside the print call. Written after
# the closing parenthesis it only works with the Python 2 print statement; on
# Python 3 print() returns None, so the raw template is printed and the
# trailing .format() raises AttributeError.
print ("The cross validated accuracy score is {:.2f} percent".format(mean_cv_score*100))

The cross validated accuracy score is 86.58 percent


## Feature Selection | Coefficients

In [196]:
# Rebuild the full feature matrix and target, then refit on all rows so the
# fitted coefficients can be inspected.
excluded_cols = ["qualify", 'team_name', 'date', 'id', 'team_fifa_api_id']
X = df.drop(excluded_cols, axis=1)
y = df["qualify"]
lr = LogisticRegression().fit(X, y)

In [197]:
# Table with one row per feature: its name ("coef") and fitted coefficient ("value").
# lr.coef_[0] is already a 1-D vector, so the former np.transpose() around it
# was a no-op and has been dropped.
# list(...) materialises the pairs: a bare zip() is a one-shot iterator on
# Python 3 and is rejected by older pandas DataFrame constructors.
coef = pd.DataFrame(list(zip(X.columns, lr.coef_[0])), columns=["coef", "value"])
coef

Unnamed: 0,coef,value
0,buildUpPlaySpeed,0.002941
1,buildUpPlaySpeedClass,-0.917882
2,buildUpPlayDribblingClass,-0.198782
3,buildUpPlayPassing,-0.027742
4,buildUpPlayPassingClass,-0.283588
5,buildUpPlayPositioningClass,-0.055747
6,chanceCreationPassing,-0.032999
7,chanceCreationPassingClass,-0.721296
8,chanceCreationCrossing,-0.00713
9,chanceCreationCrossingClass,0.309687


In [198]:
# Make sure the coefficient column has a numeric dtype. pd.to_numeric is
# vectorised, so it is applied to the whole column at once rather than once
# per element through apply(lambda ...).
coef["value"] = pd.to_numeric(coef["value"])

In [199]:
# Round the coefficients to two decimal places.
coef["value"] = coef["value"].round(decimals=2)

In [200]:
# Display the coefficient table.
coef

Unnamed: 0,coef,value
0,buildUpPlaySpeed,0.0
1,buildUpPlaySpeedClass,-0.92
2,buildUpPlayDribblingClass,-0.2
3,buildUpPlayPassing,-0.03
4,buildUpPlayPassingClass,-0.28
5,buildUpPlayPositioningClass,-0.06
6,chanceCreationPassing,-0.03
7,chanceCreationPassingClass,-0.72
8,chanceCreationCrossing,-0.01
9,chanceCreationCrossingClass,0.31


In [204]:
# Names of the features whose coefficient is at least 0.05.
# NOTE(review): the filter is one-sided, so features with large negative
# coefficients are excluded — confirm this is intended.
coef.loc[coef["value"] >= 0.05, "coef"].values

array(['chanceCreationCrossingClass', 'chanceCreationShooting',
       'chanceCreationShootingClass', 'chanceCreationPositioningClass',
       'defencePressure'], dtype=object)

In [225]:
# Keep only the features whose coefficient in the `coef` table is non-zero.
nonzero_features = coef.loc[coef["value"] != 0.00, "coef"].values
X = df[nonzero_features]
y = df["qualify"]

# Initialize and fit a default logistic regression on the reduced feature set,
# then report its accuracy on the same rows it was trained on.
lr = LogisticRegression().fit(X, y)
score = lr.score(X, y)

print ("The model produces an accuracy score of {:.2f} percent".format(score*100))

The model produces an accuracy score of 89.17 percent


### Recursive Feature Elimination

source: https://machinelearningmastery.com/feature-selection-in-python-with-scikit-learn/

In [219]:


# Start again from the full feature matrix (all columns except the target and
# the team_name / date / id / team_fifa_api_id columns).
X = df.drop(["qualify",'team_name','date','id','team_fifa_api_id'], axis=1)
y = df.qualify

# Base classifier that RFE refits repeatedly to rank the attributes.
model = LogisticRegression()
# Keep 15 attributes. The count is passed by keyword because current
# scikit-learn releases no longer accept n_features_to_select positionally.
rfe = RFE(model, n_features_to_select=15)
rfe = rfe.fit(X, y)



In [220]:
# Summarize the selection: pair every candidate attribute with RFE's boolean
# support_ flag (True for the attributes that were kept).
# rfe.support_ is 1-D, so the former np.transpose() around it was a no-op.
# list(...) materialises the pairs: a bare zip() is a one-shot iterator on
# Python 3 and is rejected by older pandas DataFrame constructors.
rfe_final = pd.DataFrame(list(zip(X.columns, rfe.support_)), columns=["attributes", "sel"])

In [221]:
# Display the attribute / selected-flag table.
rfe_final

Unnamed: 0,attributes,sel
0,buildUpPlaySpeed,False
1,buildUpPlaySpeedClass,True
2,buildUpPlayDribblingClass,True
3,buildUpPlayPassing,False
4,buildUpPlayPassingClass,True
5,buildUpPlayPositioningClass,True
6,chanceCreationPassing,True
7,chanceCreationPassingClass,True
8,chanceCreationCrossing,False
9,chanceCreationCrossingClass,True


In [222]:
# Names of the attributes whose `sel` flag is set, as a NumPy array.
features_final = rfe_final.loc[rfe_final["sel"] == 1, "attributes"].values

In [223]:
# Display the selected attribute names.
features_final

array(['buildUpPlaySpeedClass', 'buildUpPlayDribblingClass',
       'buildUpPlayPassingClass', 'buildUpPlayPositioningClass',
       'chanceCreationPassing', 'chanceCreationPassingClass',
       'chanceCreationCrossingClass', 'chanceCreationShooting',
       'chanceCreationShootingClass', 'chanceCreationPositioningClass',
       'defencePressure', 'defencePressureClass', 'defenceAggressionClass',
       'defenceTeamWidthClass', 'defenceDefenderLineClass'], dtype=object)

In [224]:
# Restrict the feature matrix to the attributes listed in `features_final`.
X = df[features_final]
y = df["qualify"]

# Initialize and fit a default logistic regression on all rows, then report its
# accuracy on those same rows.
lr = LogisticRegression().fit(X, y)
score = lr.score(X, y)

print ("The model produces an accuracy score of {:.2f} percent".format(score*100))

The model produces an accuracy score of 88.33 percent
