In [1]:
#In this project you are going to predict the overall rating of soccer player based on their attributes
#such as 'crossing', 'finishing etc.

In [2]:
import sqlite3
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Imputer

In [3]:
cnx = sqlite3.connect('database.sqlite')
data = pd.read_sql_query("SELECT * FROM Player_Attributes", cnx)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183978 entries, 0 to 183977
Data columns (total 42 columns):
id                     183978 non-null int64
player_fifa_api_id     183978 non-null int64
player_api_id          183978 non-null int64
date                   183978 non-null object
overall_rating         183142 non-null float64
potential              183142 non-null float64
preferred_foot         183142 non-null object
attacking_work_rate    180748 non-null object
defensive_work_rate    183142 non-null object
crossing               183142 non-null float64
finishing              183142 non-null float64
heading_accuracy       183142 non-null float64
short_passing          183142 non-null float64
volleys                181265 non-null float64
dribbling              183142 non-null float64
curve                  181265 non-null float64
free_kick_accuracy     183142 non-null float64
long_passing           183142 non-null float64
ball_control           183142 non-null float64
accele

In [4]:
#As seen above all the features of the dataframe are not of data type float we use Label Encoder to convert the categorical
#values with categorical codes which are numeric
data_obj = data[data.columns[data.dtypes == object]]
data_obj.drop('date',axis=1, inplace=True)
data_obj = data_obj[data_obj.columns].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [5]:
#Instantiating the Label Encoder
catenc = LabelEncoder()

In [6]:
#Applying the Label Encoder to all the features with data type as object
data_obj = data_obj.apply(catenc.fit_transform)

In [7]:
#Replacing all object values with encoded values in dataframe
data[data_obj.columns] = data_obj

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183978 entries, 0 to 183977
Data columns (total 42 columns):
id                     183978 non-null int64
player_fifa_api_id     183978 non-null int64
player_api_id          183978 non-null int64
date                   183978 non-null object
overall_rating         183142 non-null float64
potential              183142 non-null float64
preferred_foot         183978 non-null int32
attacking_work_rate    183978 non-null int32
defensive_work_rate    183978 non-null int32
crossing               183142 non-null float64
finishing              183142 non-null float64
heading_accuracy       183142 non-null float64
short_passing          183142 non-null float64
volleys                181265 non-null float64
dribbling              183142 non-null float64
curve                  181265 non-null float64
free_kick_accuracy     183142 non-null float64
long_passing           183142 non-null float64
ball_control           183142 non-null float64
accelerat

In [9]:
#Creating a new dataframe with features and target for imputation process, Imputer helps in filling the missing values
#it uses statistical values or a constant value
df = data[data.columns[4:]]

In [10]:
#Instantiate the Imputer 
imputer = Imputer(strategy='mean')



In [11]:
#the output of imputed values of dataframe is an array
df_encoded_array = imputer.fit_transform(df)

In [12]:
features = df.columns[1:]

In [13]:
target = df.columns[0]

In [14]:
#Creating a dataframe with imputed and encoded values
df_encoded = pd.DataFrame(df_encoded_array, columns = df.columns)

In [15]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183978 entries, 0 to 183977
Data columns (total 38 columns):
overall_rating         183978 non-null float64
potential              183978 non-null float64
preferred_foot         183978 non-null float64
attacking_work_rate    183978 non-null float64
defensive_work_rate    183978 non-null float64
crossing               183978 non-null float64
finishing              183978 non-null float64
heading_accuracy       183978 non-null float64
short_passing          183978 non-null float64
volleys                183978 non-null float64
dribbling              183978 non-null float64
curve                  183978 non-null float64
free_kick_accuracy     183978 non-null float64
long_passing           183978 non-null float64
ball_control           183978 non-null float64
acceleration           183978 non-null float64
sprint_speed           183978 non-null float64
agility                183978 non-null float64
reactions              183978 non-null floa

In [16]:
X = df_encoded[features]

In [17]:
y = df_encoded[target]

In [18]:
y = np.ravel(y)

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 5)

In [20]:
reg = LinearRegression()

In [21]:
reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [22]:
reg.score(X_train, y_train)

0.8420106763835232

In [23]:
reg.score(X_test, y_test)

0.8404894148232488

In [24]:
y_reg_pred = reg.predict(X_test)

In [25]:
tree = DecisionTreeRegressor()

In [26]:
tree.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [27]:
tree.score(X_train, y_train)

0.9994700595132561

In [28]:
tree.score(X_test, y_test)

0.9540237106736155

In [29]:
y_tree_pred = tree.predict(X_test)

In [30]:
mse_reg = mean_squared_error(y_test, y_reg_pred)
print('The mean squared error using linear regression is',mse_reg)

The mean squared error using linear regression is 7.856259827423725


In [31]:
rmse_reg = sqrt(mse_reg)
print('The root mean squared error using linear regression is',rmse_reg)

The root mean squared error using linear regression is 2.802902036715469


In [32]:
mse_tree = mean_squared_error(y_test, y_tree_pred)
print('The mean squared error using Decision Tree is',mse_tree)

The mean squared error using Decision Tree is 2.2644370243432013


In [33]:
rmse_tree = sqrt(mse_tree)
print('The root mean squared error using Decision Tree is',rmse_tree)

The root mean squared error using Decision Tree is 1.5048046465715081


In [34]:
#To test the algorithm let us predict the overall rating of an unseen player with both algorithms
a = np.array([[71, 1, 3, 15, 60,30, 80, 55, 50, 90, 85, 90, 60, 85, 80, 80, 75, 80, 80, 50, 60, 70, 75, 60, 40, 70, 45, 70, 70, 65, 80, 70, 5, 4, 30, 10, 10]])

In [35]:
reg.predict(a)

array([72.74658219])

In [36]:
tree.predict(a)

array([70.])

In [None]:
#Based on the accuracy, mean squared error and root mean squared error calculated for both the algorithms
#it can be concluded that Decision Tree performs better than linear regression for the above dataset