# Regression with scikit-learn using Soccer Dataset

We will use the open dataset from Kaggle. This European Soccer Database has more than 25,000 matches and more than 10,000 players for European professional soccer seasons from 2008 to 2016.

## Import Libraries

In [1]:
import sqlite3
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

## Read Data from the Database into pandas

In [2]:
# create your connection
cnx=sqlite3.connect('Data/database.sqlite')
df=pd.read_sql_query("SELECT * FROM Player_Attributes", cnx)

In [3]:
df.head()

Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,1,218353,505942,2016-02-18 00:00:00,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
1,2,218353,505942,2015-11-19 00:00:00,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
2,3,218353,505942,2015-09-21 00:00:00,62.0,66.0,right,medium,medium,49.0,...,54.0,48.0,65.0,66.0,69.0,6.0,11.0,10.0,8.0,8.0
3,4,218353,505942,2015-03-20 00:00:00,61.0,65.0,right,medium,medium,48.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0
4,5,218353,505942,2007-02-22 00:00:00,61.0,65.0,right,medium,medium,48.0,...,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0


In [4]:
df.shape

(183978, 42)

In [5]:
df.columns

Index([u'id', u'player_fifa_api_id', u'player_api_id', u'date',
       u'overall_rating', u'potential', u'preferred_foot',
       u'attacking_work_rate', u'defensive_work_rate', u'crossing',
       u'finishing', u'heading_accuracy', u'short_passing', u'volleys',
       u'dribbling', u'curve', u'free_kick_accuracy', u'long_passing',
       u'ball_control', u'acceleration', u'sprint_speed', u'agility',
       u'reactions', u'balance', u'shot_power', u'jumping', u'stamina',
       u'strength', u'long_shots', u'aggression', u'interceptions',
       u'positioning', u'vision', u'penalties', u'marking', u'standing_tackle',
       u'sliding_tackle', u'gk_diving', u'gk_handling', u'gk_kicking',
       u'gk_positioning', u'gk_reflexes'],
      dtype='object')

## Declare the columns you want to use as features

In [6]:
features = [
       'potential', 'crossing', 'finishing', 'heading_accuracy',
       'short_passing', 'volleys', 'dribbling', 'curve', 'free_kick_accuracy',
       'long_passing', 'ball_control', 'acceleration', 'sprint_speed',
       'agility', 'reactions', 'balance', 'shot_power', 'jumping', 'stamina',
       'strength', 'long_shots', 'aggression', 'interceptions', 'positioning',
       'vision', 'penalties', 'marking', 'standing_tackle', 'sliding_tackle',
       'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning',
       'gk_reflexes']

## Specify the Prediction Target

In [7]:
target=['overall_rating']

## Clean the Data

In [8]:
df=df.dropna()

## Extract Features and Target ('overall_rating') Values into Separate Dataframe

In [9]:
X=df[features]
y=df[target]

In [12]:
X.iloc[2]

potential             66.0
crossing              49.0
finishing             44.0
heading_accuracy      71.0
short_passing         61.0
volleys               44.0
dribbling             51.0
curve                 45.0
free_kick_accuracy    39.0
long_passing          64.0
ball_control          49.0
acceleration          60.0
sprint_speed          64.0
agility               59.0
reactions             47.0
balance               65.0
shot_power            55.0
jumping               58.0
stamina               54.0
strength              76.0
long_shots            35.0
aggression            63.0
interceptions         41.0
positioning           45.0
vision                54.0
penalties             48.0
marking               65.0
standing_tackle       66.0
sliding_tackle        69.0
gk_diving              6.0
gk_handling           11.0
gk_kicking            10.0
gk_positioning         8.0
gk_reflexes            8.0
Name: 2, dtype: float64

In [14]:
y.head()

Unnamed: 0,overall_rating
0,67.0
1,67.0
2,62.0
3,61.0
4,61.0


## Split the Dataset into Training and Test Datasets

In [15]:
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.33, random_state=324)

## (1) Linear regression: Fit a model to the training set

In [17]:
regressor=LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

## Perform Prediction using Linear Regression Model

In [18]:
y_prediction=regressor.predict(X_test)
y_prediction

array([[66.51284879],
       [79.77234615],
       [66.57371825],
       ...,
       [69.23780133],
       [64.58351696],
       [73.6881185 ]])

## What is the mean of the expected target value in the test set?

In [19]:
y_test.describe()

Unnamed: 0,overall_rating
count,59517.0
mean,68.635818
std,7.041297
min,33.0
25%,64.0
50%,69.0
75%,73.0
max,94.0


## Evaluate Linear Regression Accuracy using Root Mean Square Error

In [20]:
RMSE=sqrt(mean_squared_error(y_true=y_test, y_pred=y_prediction))

In [21]:
print(RMSE)

2.80530304686


## (2) Decision Tree Regressor: Fit a new regression model to the training set

In [22]:
regressor=DecisionTreeRegressor(max_depth=20)
regressor.fit(X_train,y_train)

DecisionTreeRegressor(criterion='mse', max_depth=20, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

## Perform Prediction using Decision Tree Regressor

In [23]:
y_prediciton=regressor.predict(X_test)
y_prediction

array([[66.51284879],
       [79.77234615],
       [66.57371825],
       ...,
       [69.23780133],
       [64.58351696],
       [73.6881185 ]])

## For comparison: What is the mean of the expected target value in the test set?

In [24]:
y_test.describe()

Unnamed: 0,overall_rating
count,59517.0
mean,68.635818
std,7.041297
min,33.0
25%,64.0
50%,69.0
75%,73.0
max,94.0


## Evaluate Decision Tree Regression Accuracy using Root Mean Square Error

In [25]:
RMSE=sqrt(mean_squared_error(y_true=y_test, y_pred=y_prediction))

In [26]:
print(RMSE)

2.80530304686
