# Initialization

In [14]:
import pandas as pd
import numpy as np
import re
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import ARDRegression
from sklearn.linear_model import BayesianRidge

In [15]:
train=pd.read_csv('../data/data.csv.zip', compression='zip', index_col=1)
train=train.drop(columns=['Unnamed: 0'])

# Data Cleaning

In [16]:
train.columns = train.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
train=train.drop(columns=['photo', 'flag', 'club_logo'])

train['value_unit'] = train['value'].str[-1]
train['nvalue'] = np.where(train['value_unit'] == '0', 0,
                          train['value'].str[1:-1].replace(r'[a-zA-Z]',''))
train['nvalue'] = train['nvalue'].astype(float)
train['nvalue'] = np.where(train['value_unit'] == 'M', train['nvalue'], train['nvalue'] / 1000)

train['wage_unit'] = train['wage'].str[-1]
train['nwage'] = np.where(train['wage_unit'] == '0', 0,
                          train['wage'].str[1:-1].replace(r'[a-zA-Z]',''))
train['nwage'] = train['nwage'].astype(float)
train['nwage'] = np.where(train['wage_unit'] == 'K', train['nwage'], train['nwage'] * 1000)


In [17]:
train.dropna(subset=['nvalue', 'aggression', 'overall', 'age', 'potential', 'international_reputation',
                    'weak_foot', 'skill_moves', 'crossing', 'finishing', 'headingaccuracy', 
                    'shortpassing', 'volleys', 'dribbling', 'curve', 'fkaccuracy', 'longpassing',
                    'ballcontrol', 'acceleration', 'sprintspeed',
                    'agility', 'reactions', 'balance', 'shotpower', 'jumping', 'stamina', 'strength',
                    'longshots', 'aggression', 'interceptions', 'positioning', 'vision', 'penalties',
                    'composure', 'marking', 'standingtackle', 'slidingtackle', 'gkdiving', 
                    'gkhandling', 'gkkicking', 'gkpositioning', 'gkreflexes', 'nwage'], inplace=True)


# Modelling

## Predictingt the overall rating of a player

In [None]:
X = train[['nvalue', 'aggression', 'age', 'potential', 'international_reputation',
                    'weak_foot', 'skill_moves', 'crossing', 'finishing', 'headingaccuracy', 
                    'shortpassing', 'volleys', 'dribbling', 'curve', 'fkaccuracy', 'longpassing',
                    'ballcontrol', 'acceleration', 'sprintspeed',
                    'agility', 'reactions', 'balance', 'shotpower', 'jumping', 'stamina', 'strength',
                    'longshots', 'aggression', 'interceptions', 'positioning', 'vision', 'penalties',
                    'composure', 'marking', 'standingtackle', 'slidingtackle', 'gkdiving', 
                    'gkhandling', 'gkkicking', 'gkpositioning', 'gkreflexes', 'nwage']]
y = train.overall

In [None]:
models = []
models.append(('LR', LinearRegression()))
models.append(('SGD', SGDRegressor(max_iter=1000, tol=1e-3)))
models.append(('ARD', ARDRegression()))
models.append(('BR', BayesianRidge()))


In [None]:
nfolds=10
i=-10
results = []
for name, model in models:
    kf = KFold(n_splits=nfolds, random_state=17)
    model.fit(X, y)
    y_pred = model.predict(X)
    score = cross_val_score(model, X, y, cv=kf)
    print('Model score and std dev', name, score.mean(), score.std())
    # The coefficients
    # print('Coefficients: \n', model.coef_)
    # The mean squared error
    print("Mean squared error: %.2f"
          % mean_squared_error(y, y_pred))
    # Explained variance score: 1 is perfect prediction
    print('Variance score: %.2f' % r2_score(y, y_pred))
    # Regression score 
    print('Regression score: %.2f' % model.score(X, y))
    results.append((name, score))
    if i is None or score.mean() > i :
        imodel=model
        i=score.mean()

print("Done")


Model score and std dev LR -5.608716605178189 2.980139959918183
Mean squared error: 3.42
Variance score: 0.93
Regression score: 0.93
Model score and std dev SGD -8.597546069827622e+25 1.1071139184990446e+26
Mean squared error: 84251049957601848951046144.00
Variance score: -1762233722755156533051392.00
Regression score: -1762233722755156533051392.00


# Results

In [None]:
results=np.array(results)

In [None]:
fig, ax = plt.subplots()
ax.boxplot(results[:,1])
ax.set_xticklabels(results[:,0])
ax.set_title('Algorithm comparision')
plt.show()

In [None]:
rf = np.array(models)[5:,1]
fi = imodel.fit(X, y).coef_
pos = np.arange(len(X.columns))
print("Best performing model", imodel)
plt.figure(figsize=(13, 8))
plt.barh(pos, fi)
plt.title("Feature Importance")
plt.xlabel("Model Accuracy")
plt.ylabel("Features")
plt.yticks(pos, (list(X)))
plt.show()