## English Premiere League (EPL) | Season 2017/18 | Which teams will finish top 3 ?

In [1]:
import os

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 20)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn import linear_model

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

import seaborn as sns

import sqlite3;

  from pandas.core import datetools


Data Source: https://www.kaggle.com/hugomathien/soccer

In [114]:
data_source_01 = "database.sqlite"
data_source_02 = "soccer2017.db"

In [115]:
db01 = sqlite3.connect(os.path.join('..', 'dataset', data_source_01))
db02 = sqlite3.connect(os.path.join('..', 'dataset', data_source_02))

In [116]:
# Teams in EPL
df_epl_teams = pd.io.sql.read_sql(
'''
    select distinct league_id, team_id as team_fifa_api_id, name from teams  where league_id = 13
    
;
''', con = db02)

In [117]:
df_epl_teams= df_epl_teams.set_index('team_fifa_api_id')

In [118]:
df_epl_teams.head()

Unnamed: 0_level_0,league_id,name
team_fifa_api_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,13,Arsenal
5,13,Chelsea
7,13,Everton
9,13,Liverpool
10,13,Manchester City


In [119]:
df_teams_attr = pd.io.sql.read_sql(
'''
    select * from team_attributes
;
''', con = db01)

In [193]:
df_teams_attr.shape

(1458, 25)

## Take only EPL teams

In [178]:
# filter EPL teams 
df_epl_teams_attr = df_teams_attr[df_teams_attr.team_fifa_api_id.isin(df_epl_teams.index)]

In [179]:
df_epl_teams_attr.shape

(120, 25)

In [180]:
df_epl_teams_attr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 120 entries, 70 to 1389
Data columns (total 25 columns):
id                                120 non-null int64
team_fifa_api_id                  120 non-null int64
team_api_id                       120 non-null int64
date                              120 non-null object
buildUpPlaySpeed                  120 non-null int64
buildUpPlaySpeedClass             120 non-null object
buildUpPlayDribbling              40 non-null float64
buildUpPlayDribblingClass         120 non-null object
buildUpPlayPassing                120 non-null int64
buildUpPlayPassingClass           120 non-null object
buildUpPlayPositioningClass       120 non-null object
chanceCreationPassing             120 non-null int64
chanceCreationPassingClass        120 non-null object
chanceCreationCrossing            120 non-null int64
chanceCreationCrossingClass       120 non-null object
chanceCreationShooting            120 non-null int64
chanceCreationShootingClass       120

## Remove un-necessary columns

In [181]:
# get rid of "buildUpPlayDribbling" and "team_api_id" column
df_epl_teams_attr.drop(['buildUpPlayDribbling','team_api_id'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [182]:
df_epl_teams_attr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 120 entries, 70 to 1389
Data columns (total 23 columns):
id                                120 non-null int64
team_fifa_api_id                  120 non-null int64
date                              120 non-null object
buildUpPlaySpeed                  120 non-null int64
buildUpPlaySpeedClass             120 non-null object
buildUpPlayDribblingClass         120 non-null object
buildUpPlayPassing                120 non-null int64
buildUpPlayPassingClass           120 non-null object
buildUpPlayPositioningClass       120 non-null object
chanceCreationPassing             120 non-null int64
chanceCreationPassingClass        120 non-null object
chanceCreationCrossing            120 non-null int64
chanceCreationCrossingClass       120 non-null object
chanceCreationShooting            120 non-null int64
chanceCreationShootingClass       120 non-null object
chanceCreationPositioningClass    120 non-null object
defencePressure                   12

## Convert categorical value to number

In [194]:
df_teams_attr.head()

Unnamed: 0,id,team_fifa_api_id,team_api_id,date,buildUpPlaySpeed,...,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
0,1,434,9930,2010-02-22 00:00:00,60,...,55,Press,45,Normal,Cover
1,2,434,9930,2014-09-19 00:00:00,52,...,44,Press,54,Normal,Cover
2,3,434,9930,2015-09-10 00:00:00,47,...,44,Press,54,Normal,Cover
3,4,77,8485,2010-02-22 00:00:00,70,...,70,Double,70,Wide,Cover
4,5,77,8485,2011-02-22 00:00:00,47,...,47,Press,52,Normal,Cover


In [183]:
# Find and replace class value into number

#buildUpPlaySpeedClass             
#buildUpPlayDribblingClass
#buildUpPlayPassingClass
#buildUpPlayPositioningClass
#chanceCreationPassingClass
#chanceCreationCrossingClass
#chanceCreationShootingClass  
#chanceCreationPositioningClass 
#defencePressureClass
#defenceAggressionClass
#defenceTeamWidthClass 
#defenceDefenderLineClass

In [184]:
df_epl_teams_attr.buildUpPlayDribblingClass.unique()

array([u'Little', u'Normal'], dtype=object)

In [185]:
category_num = {
    'buildUpPlaySpeedClass':
        {
            'Fast':1,
            'Balanced': 2,
            'Slow':3
        },
    'buildUpPlayDribblingClass':
        {
            'Little':1,
            'Normal': 2,
            'Lots':3
        },
    'buildUpPlayPassingClass':
        {
            'Short':1,
            'Mixed': 2,
            'Long':3
        },
    'buildUpPlayPositioningClass':
        {
            'Organised':1,
            'Free Form': 2
        },
    'chanceCreationPassingClass':
        {
            'Safe':1,
            'Normal': 2,
            'Risky':3
        },
    'chanceCreationCrossingClass':
        {
            'Little':1,
            'Normal': 2,
            'Lots':3
        },
    'chanceCreationShootingClass':
        {
            'Little':1,
            'Normal': 2,
            'Lots':3
        },
    'chanceCreationPositioningClass':
        {
            'Organised':1,
            'Free Form': 2
        },
     'defencePressureClass':
        {
            'Deep':1,
            'Medium': 2,
            'High':3
        },
    'defenceAggressionClass':
        {
            'Press':1,
            'Double': 2,
            'Contain':3
        },
    'defenceTeamWidthClass':
        {
            'Narrow':1,
            'Normal': 2,
            'Wide':3
        },
    'defenceDefenderLineClass':
        {
            'Cover':1
        }
    }
    

In [186]:
df_epl_teams_attr.replace(category_num, inplace=True)

## Add Team Name

In [187]:
#Add team name column in dataframe
df_epl_teams_attr['team_name'] = df_epl_teams_attr.team_fifa_api_id.apply(lambda row: df_epl_teams.loc[row,'name'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


## Convert Date to date time column

In [191]:
df_epl_teams_attr.date = pd.to_datetime(df_epl_teams_attr.date)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [192]:
df_epl_teams_attr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 120 entries, 70 to 1389
Data columns (total 24 columns):
id                                120 non-null int64
team_fifa_api_id                  120 non-null int64
date                              120 non-null datetime64[ns]
buildUpPlaySpeed                  120 non-null int64
buildUpPlaySpeedClass             120 non-null int64
buildUpPlayDribblingClass         120 non-null int64
buildUpPlayPassing                120 non-null int64
buildUpPlayPassingClass           120 non-null int64
buildUpPlayPositioningClass       120 non-null int64
chanceCreationPassing             120 non-null int64
chanceCreationPassingClass        120 non-null int64
chanceCreationCrossing            120 non-null int64
chanceCreationCrossingClass       120 non-null int64
chanceCreationShooting            120 non-null int64
chanceCreationShootingClass       120 non-null int64
chanceCreationPositioningClass    120 non-null int64
defencePressure                   12

In [197]:
df_epl_teams_attr.date.unique()

array(['2010-02-22T00:00:00.000000000', '2011-02-22T00:00:00.000000000',
       '2012-02-22T00:00:00.000000000', '2013-09-20T00:00:00.000000000',
       '2014-09-19T00:00:00.000000000', '2015-09-10T00:00:00.000000000'], dtype='datetime64[ns]')

## Add qualify flag for top 3 teams

Top 4 teams in the league will automatic qualify to Champions League

In [243]:
def is_qualify(row):    
    if pd.to_datetime(row.date).year == 2010:
        if row.team_name in ['Manchester United','Chelsea','Manchester City']:
            return 1
        else:
            return 0
    elif pd.to_datetime(row.date).year == 2011:
        if row.team_name in ['Manchester United','Arsenal','Manchester City']:
            return 1
        else:
            return 0
    elif pd.to_datetime(row.date).year == 2012:
        if row.team_name in ['Manchester United','Chelsea','Manchester City']:
            return 1
        else:
            return 0
    elif pd.to_datetime(row.date).year == 2013:
        if row.team_name in ['Liverpool','Chelsea','Manchester City']:
            return 1
        else:
            return 0
    elif pd.to_datetime(row.date).year == 2014:
        if row.team_name in ['Arsenal','Chelsea','Manchester City']:
            return 1
        else:
            return 0
    elif pd.to_datetime(row.date).year == 2015:
        if row.team_name in ['Arsenal','Leicester City','Tottenham Hotspur']:
            return 1
        else:
            return 0    
    
    else:
        return -1
    

In [246]:
df_epl_teams_attr['qualify'] = df_epl_teams_attr[['date','team_name']].apply(lambda row: is_qualify(row), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [248]:
df_epl_teams_attr[df_epl_teams_attr.qualify==1]

Unnamed: 0,id,team_fifa_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,...,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass,team_name,qualify
71,72,1,2011-02-22,75,1,...,45,2,1,Arsenal,1
74,75,1,2014-09-19,59,2,...,52,2,1,Arsenal,1
75,76,1,2015-09-10,59,2,...,52,2,1,Arsenal,1
306,307,5,2010-02-22,70,1,...,35,2,1,Chelsea,1
308,309,5,2012-02-22,60,2,...,46,2,1,Chelsea,1
309,310,5,2013-09-20,46,2,...,46,2,1,Chelsea,1
310,311,5,2014-09-19,67,1,...,46,2,1,Chelsea,1
717,718,95,2015-09-10,63,2,...,55,2,1,Leicester City,1
745,746,9,2013-09-20,39,2,...,61,2,1,Liverpool,1
800,801,10,2010-02-22,70,1,...,45,2,1,Manchester City,1


In [254]:
df_epl_teams_attr.to_csv(os.path.join('..', 'dataset', 'epl-predict-dataset-01'))