In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import warnings
import seaborn as sns
import statsmodels.api as sm

from scipy import stats
from scipy.stats import norm, skew, pearsonr
from sklearn.preprocessing import OneHotEncoder, scale, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, LassoCV, ElasticNetCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

import plotly.express as px

%matplotlib inline
# warnings.filterwarnings("ignore") # warnings were turned off at the end
sns.set_style("white")
pylab.rcParams['figure.figsize'] = 10, 8
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

  import pandas.util.testing as tm


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import sys
sys.path.append('/content/drive/My Drive/Capstone/Helpers/ML/')
from ml_helpers import *

# Data Preperation

In [0]:
# gk = pd.read_pickle("../ml_data/gk_final.pkl")
gk = pd.read_pickle("/content/drive/My Drive/Capstone/Data/Clean/model_data/gk_final.pkl").drop_duplicates(keep = False,ignore_index = True)
gk["type"] = gk["type"].astype(object)
gk["type"] = gk["type"].replace({"mid_seson":"mid_season"})

In [0]:
gk["mv_log"] = np.log1p(gk.mv)
gk["cum_mv_log"] = np.log1p(gk.cum_mv)

In [0]:
gk_drop_cols = ["tm_id", "sf", "from", "to", "transfer_season",
               "loan", "main_field_position", "field_position", 
               "dob", "stats_season", "fee", "mv", "year", "date", "cum_mv"]

gk_cat_cols = ["type","continent"]

X,y = prepare_data(gk, gk_drop_cols, gk_cat_cols)
encoded_cat_cols = X.select_dtypes(include='int64').columns
X_non_cat = X.drop(encoded_cat_cols, axis = 1)
X_non_cat = remove_corr(X_non_cat)
X = pd.concat([X[encoded_cat_cols],X_non_cat],axis = 1)

# Feature Selection

## FS by p_value

In [35]:
gk_initial_ols = sm.OLS(y,X_non_cat).fit()
gk_initial_ols.summary()

0,1,2,3
Dep. Variable:,fee_log,R-squared (uncentered):,0.996
Model:,OLS,Adj. R-squared (uncentered):,0.996
Method:,Least Squares,F-statistic:,7384.0
Date:,"Tue, 09 Jun 2020",Prob (F-statistic):,0.0
Time:,17:36:45,Log-Likelihood:,-403.16
No. Observations:,309,AIC:,826.3
Df Residuals:,299,BIC:,863.7
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
runner_up,0.3040,0.185,1.648,0.100,-0.059,0.667
winner,0.1682,0.208,0.808,0.420,-0.241,0.578
height,0.0210,0.003,6.155,0.000,0.014,0.028
age,-0.0620,0.015,-4.236,0.000,-0.091,-0.033
mp,6.724e-05,4.6e-05,1.461,0.145,-2.33e-05,0.000
ppg,-0.1728,0.096,-1.808,0.072,-0.361,0.015
gcpcs,-0.0223,0.021,-1.086,0.278,-0.063,0.018
cum_mpcs,0.0004,0.001,0.550,0.582,-0.001,0.002
mv_log,0.8203,0.048,17.032,0.000,0.725,0.915

0,1,2,3
Omnibus:,0.901,Durbin-Watson:,1.495
Prob(Omnibus):,0.637,Jarque-Bera (JB):,0.967
Skew:,-0.127,Prob(JB):,0.617
Kurtosis:,2.895,Cond. No.,12000.0


In [0]:
gk_ols,gk_cols = significant_inputs(X_non_cat, y)
gk_cols = gk_cols + list(encoded_cat_cols) + ['ppg']
X = X[gk_cols]

In [37]:
X

Unnamed: 0,height,age,mv_log,tourn_year,x0_mid_season,x0_summer,x0_winter,x1_AF,x1_AS,x1_EU,x1_NA,x1_OC,ppg
0,187.0,23.0,14.731802,1,0,1,0,0,0,1,0,0,1.392500
1,196.0,20.0,12.429220,1,0,1,0,0,0,1,0,0,0.556667
2,196.0,33.0,12.899222,0,0,1,0,0,0,1,0,0,0.765000
3,192.0,25.0,14.038655,0,0,1,0,0,0,1,0,0,1.223333
4,192.0,25.0,14.038655,0,0,1,0,0,0,1,0,0,1.223333
...,...,...,...,...,...,...,...,...,...,...,...,...,...
304,195.0,29.0,14.731802,1,0,1,0,0,0,1,0,0,2.050000
305,195.0,28.0,14.731802,0,0,1,0,0,0,1,0,0,1.290000
306,195.0,25.0,14.914123,1,0,1,0,0,0,1,0,0,2.187500
307,195.0,24.0,13.122365,1,0,1,0,0,0,1,0,0,0.565000


In [38]:
gk_ols.summary()

0,1,2,3
Dep. Variable:,fee_log,R-squared:,0.569
Model:,OLS,Adj. R-squared:,0.565
Method:,Least Squares,F-statistic:,134.1
Date:,"Tue, 09 Jun 2020",Prob (F-statistic):,2.03e-55
Time:,17:36:45,Log-Likelihood:,-407.96
No. Observations:,309,AIC:,823.9
Df Residuals:,305,BIC:,838.9
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.4780,2.577,-0.574,0.567,-6.549,3.593
height,0.0267,0.013,2.102,0.036,0.002,0.052
age,-0.0567,0.014,-3.923,0.000,-0.085,-0.028
mv_log,0.8444,0.042,19.965,0.000,0.761,0.928

0,1,2,3
Omnibus:,0.918,Durbin-Watson:,1.496
Prob(Omnibus):,0.632,Jarque-Bera (JB):,0.879
Skew:,-0.13,Prob(JB):,0.644
Kurtosis:,2.977,Cond. No.,9540.0


In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 33)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((216, 13), (93, 13), (216,), (93,))

# Modeling

## Linear Regression

In [0]:
reg_model = LinearRegression(normalize=True)

## ElasticNet

In [41]:
elastic_model = ElasticNetCV(n_alphas = 500, cv = 5)
elastic_model.fit(X_train, y_train)
elastic_model.score(X_test, y_test)

0.5244629226402295

## Poly reg

In [0]:
poly_predictors = PolynomialFeatures(degree = 2, interaction_only = True)
x_train_poly = poly_predictors.fit_transform(X_train)
x_test_poly = poly_predictors.fit_transform(X_test)
    
poly_model = LinearRegression()
poly_model_fit = poly_model.fit(x_train_poly, y_train)

## DTR

In [0]:
tune_tree_reg = DecisionTreeRegressor(random_state = 33)

tree_params = {
    "max_depth": np.arange(1,30),
    "min_samples_split": np.linspace(0.1, 1.0, 10, endpoint = True),
    "max_features": list(range(1,X_train.shape[1])),
}

tune_tree_reg_cv = GridSearchCV(tune_tree_reg, tree_params, cv = 5, scoring = "neg_mean_squared_error")
tune_tree_reg_cv.fit(X_train, y_train)

best_params = tune_tree_reg_cv.best_params_

tree_reg = DecisionTreeRegressor(max_depth = best_params["max_depth"], 
                                 min_samples_split= best_params["min_samples_split"],
                                 max_features = best_params["max_features"], random_state = 33)
tree_reg.fit(X_train, y_train)

tree_pred = tree_reg.predict(X_test)
rmse_tree = rmse(y_test, tree_pred)
r2_tree = r2_score(y_test, tree_pred)

## RFR

In [18]:
tune_rand_forest = RandomForestRegressor(random_state = 33)

rand_forest_params = {
    "max_depth": np.arange(1,30),
}

tune_rand_forest_cv = GridSearchCV(tune_rand_forest, rand_forest_params, cv = 5, scoring = "neg_mean_squared_error")
tune_rand_forest_cv.fit(X_train, y_train)

best_forest_params = tune_rand_forest_cv.best_params_

tuned_rand_forest = RandomForestRegressor(max_depth = best_forest_params["max_depth"], random_state = 33)
tuned_rand_forest.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=4, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=33, verbose=0, warm_start=False)

# Summary

In [0]:
voting_regressor = VotingRegressor([
    ("lr", reg_model),
    ("dt", tree_reg),
    ("rf", tuned_rand_forest),
    ("en", elastic_model)
])

In [44]:
poly_model_df = model_summary([poly_model], x_train_poly, y_train, x_test_poly, y_test)
poly_model_df.loc[0,'model'] = "PolynomialRegression"

model_summary_df = model_summary([reg_model,
                                  tree_reg,
                                  tuned_rand_forest,
                                  elastic_model, 
                                  voting_regressor],X_train, y_train, X_test, y_test)

gk_results_df = pd.concat([model_summary_df, poly_model_df])
gk_results_df = gk_results_df.sort_values(by = ["rmse_cv_mean"]).reset_index(drop=True)
gk_results_df

Unnamed: 0,model,train_r2,test_r2,train_rmse,test_rmse,rmse_cv_mean,r2_cv_mean
0,LinearRegression,0.588419,0.537297,5484805,2011413,4537298,0.59961
1,PolynomialRegression,0.616647,0.60098,5382389,1867874,4559589,0.571423
2,ElasticNetCV,0.586685,0.569935,5694579,1939175,4654375,0.573936
3,VotingRegressor,0.667315,0.606506,5018984,1854894,4700893,0.591076
4,DecisionTreeRegressor,0.632005,0.448419,5275111,2196114,5004455,0.437145
5,RandomForestRegressor,0.734828,0.460779,3700839,2171369,5359578,0.309089


In [0]:
gk_results_df.to_pickle("/content/drive/My Drive/Capstone/Data/Summary/gk_results.pkl")

In [46]:
rmse_fig = px.bar(gk_results_df.sort_values(by = "test_rmse"), x = 'test_rmse', y = 'model',
                  orientation = 'h', color = 'model', 
                  title = 'Test RMSE for each model', 
                  labels = {'rmse_cv_mean': 'RMSE(test)', 'model': 'Model used'})
rmse_fig.update_layout(showlegend=False)
rmse_fig.show()

In [47]:
rmse_fig = px.bar(gk_results_df, x = 'rmse_cv_mean', y = 'model',
                  orientation = 'h', color = 'model', 
                  title = 'CV RMSE for each model - Goalkeepers', 
                  labels = {'rmse_cv_mean': 'RMSE(cv)', 'model': 'Model used'})
rmse_fig.update_layout(showlegend=False)
rmse_fig.show()

# Visualizing the best model's predictions

In [0]:
y_best_pred = reg_model.predict(X_test)

In [0]:
coef_df = pd.DataFrame({"score":np.array(reg_model.coef_), "feature":X_test.columns})
coef_df = coef_df[coef_df.score != 0]
coef_df.sort_values(by = "score", ascending = False, inplace = True)

In [27]:
fig_ft = px.bar(coef_df, x = 'score', y = 'feature',
       orientation = 'h', color = 'feature', 
       title = 'Weight of each feature in MLR', 
       labels = {'score': 'Weight', 'feature': 'Feature'})
fig_ft.update_layout(showlegend=False)
fig_ft.show()

In [28]:
chart_regression(y_best_pred.flatten(),y_test, scaled_back = True,
                 title="Goalkeepers' actual and predicted prices",
                 y_axis="Price in Millions")

In [29]:
fig = px.scatter(x = np.expm1(y_best_pred), 
                 y = np.expm1(y_test), labels = {"x":"Predicted Price","y":"Actual Price"},
                 title = "Goalkeepers' actual transfer price vs predicted transfer price")
corr, _ = pearsonr(y_best_pred,y_test)
fig.show()
print("Correlation between the variables",corr)

Correlation between the variables 0.7173850563967604


In [30]:
imp_ft = pd.DataFrame({'feature':X_train.columns,'importance':tuned_rand_forest.feature_importances_})
imp_ft = imp_ft[imp_ft.importance != 0] 
imp_ft = imp_ft.sort_values(by = "importance", ascending = False)

fig_ft = px.bar(imp_ft, x = 'importance', y = 'feature',
       orientation = 'h', color = 'feature', 
       title = 'Importance of each feature in RFR', 
       labels = {'importance': 'Importance', 'feature': 'Feature'})
fig_ft.update_layout(showlegend=False)
fig_ft.show()

In [0]:
pd.DataFrame({"actual":y_test, "predicted":y_best_pred,"position":"Goalkeepers"}).to_pickle("/content/drive/My Drive/Capstone/Data/Summary/gk_predicted.pkl")