## ML Project

### Predicting players rating

Predict the overall rating of soccer player based on their attributes such as 'crossing', 'finishing etc.

The dataset is from European Soccer Database.
(https://www.kaggle.com/hugomathien/soccer) 
It has more than 25,000 matches and more than 10,000 players for European professional soccer seasons from 2008 to 2016.

In [38]:
import sqlite3
import numpy as np
import pandas as pd
%matplotlib notebook
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from xgboost import plot_importance

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit, RandomizedSearchCV
from sklearn.pipeline import make_pipeline

import pickle

In [63]:
# Creating connection.
cnx = sqlite3.connect('database.sqlite')
df = pd.read_sql_query("SELECT * FROM Player_Attributes", cnx)

In [64]:
df.head(2)

Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,...,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,1,218353,505942,2016-02-18 00:00:00,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
1,2,218353,505942,2015-11-19 00:00:00,67.0,71.0,right,medium,medium,49.0,...,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0


In [65]:
y=df.pop('overall_rating')

In [66]:
y.isnull().sum()

836

In [67]:
y = y.fillna(y.mean())

In [68]:
y.isnull().values.any()

False

In [69]:
df.dtypes == object

id                     False
player_fifa_api_id     False
player_api_id          False
date                    True
potential              False
preferred_foot          True
attacking_work_rate     True
defensive_work_rate     True
crossing               False
finishing              False
heading_accuracy       False
short_passing          False
volleys                False
dribbling              False
curve                  False
free_kick_accuracy     False
long_passing           False
ball_control           False
acceleration           False
sprint_speed           False
agility                False
reactions              False
balance                False
shot_power             False
jumping                False
stamina                False
strength               False
long_shots             False
aggression             False
interceptions          False
positioning            False
vision                 False
penalties              False
marking                False
standing_tackl

There are 4 columns which are objects.
Let us convert them into numeric by one hot encoding and dropping ID, Date column.

In [70]:
X=df.drop(['id', 'date'], axis=1)

In [71]:
X= pd.get_dummies(X, columns=['preferred_foot', 'attacking_work_rate', 'defensive_work_rate'])
X.head()

Unnamed: 0,player_fifa_api_id,player_api_id,potential,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,...,defensive_work_rate_9,defensive_work_rate__0,defensive_work_rate_ean,defensive_work_rate_es,defensive_work_rate_high,defensive_work_rate_low,defensive_work_rate_medium,defensive_work_rate_o,defensive_work_rate_ormal,defensive_work_rate_tocky
0,218353,505942,71.0,49.0,44.0,71.0,61.0,44.0,51.0,45.0,...,0,0,0,0,0,0,1,0,0,0
1,218353,505942,71.0,49.0,44.0,71.0,61.0,44.0,51.0,45.0,...,0,0,0,0,0,0,1,0,0,0
2,218353,505942,66.0,49.0,44.0,71.0,61.0,44.0,51.0,45.0,...,0,0,0,0,0,0,1,0,0,0
3,218353,505942,65.0,48.0,43.0,70.0,60.0,43.0,50.0,44.0,...,0,0,0,0,0,0,1,0,0,0
4,218353,505942,65.0,48.0,43.0,70.0,60.0,43.0,50.0,44.0,...,0,0,0,0,0,0,1,0,0,0


In [72]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [73]:
from sklearn.preprocessing import Imputer,StandardScaler
imput = Imputer()
X_train = imput.fit_transform(X_train)
X_test = imput.fit_transform(X_test)



In [74]:
X_train.shape

(137983, 65)

In [83]:
#selector = SelectFromModel(XGBRegressor())
#selector.fit(X_train, y_train)                         
#selector.estimator_.coef_
#s_X_train=selector.transform(X_train)
#s_X_test=selector.transform(X_test)

In [85]:
from sklearn.feature_selection import VarianceThreshold

sel = VarianceThreshold(threshold=(.5 * (1 - .5)))
s_X_train=sel.fit_transform(X_train)
s_X_test=sel.fit_transform(X_test)

In [86]:
s_X_train.shape

(137983, 36)

In [14]:
#from sklearn.feature_selection import VarianceThreshold

#sel = VarianceThreshold(threshold=(.5 * (1 - .5)))
#s_X_train=sel.fit_transform(X_train)
#s_X_test=sel.fit_transform(X_test)

In [87]:
s_X_train.shape, s_X_test.shape

((137983, 36), (45995, 36))

In [103]:
# Put models in a dictionary
models = {"LinReg": LinearRegression(),
          "DecTree": DecisionTreeRegressor(), 
          "Random Forest": RandomForestRegressor()}

# Create function to fit and score models
def fit_and_score(models, X_train, X_test, y_train, y_test):
    np.random.seed(42)    
    model_scores = {}
    # Loop through modelsfor name, model in models.items():
    for name, model in models.items():
        # Fit the model to the data
        model.fit(s_X_train, y_train)
        # Evaluate the model and append its score to model_scores
        model_scores[name] = model.score(s_X_test, y_test)
    return model_scores

In [104]:
model_scores = fit_and_score(models=models,
                             X_train=X_train,
                             X_test=X_test,
                             y_train=y_train,
                             y_test=y_test)
model_scores



{'LinReg': 0.8565819342122898,
 'DecTree': 0.9606292832395238,
 'Random Forest': 0.9786332036813247}