In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import warnings
import seaborn as sns
import statsmodels.api as sm

from scipy import stats
from scipy.stats import norm, skew, pearsonr
from sklearn.preprocessing import OneHotEncoder, scale, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, LassoCV, ElasticNetCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

import plotly.express as px

# Seed passed as `random_state` to train_test_split further down, so the
# train/test partition is reproducible across runs.
seed = 55


# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline
# warnings.filterwarnings("ignore") # warnings were turned off at the end
# Plot defaults: seaborn "white" style and a 10 x 8 default figure size.
sns.set_style("white")
pylab.rcParams['figure.figsize'] = 10, 8
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

In [4]:
# Mount Google Drive inside the Colab runtime; the data files and the helper
# module used below are read from paths under /content/drive.
# force_remount=True remounts even when the drive is already mounted.
from google.colab import drive

drive.mount("/content/drive", force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import sys

# Make the project's helper module importable from the mounted Drive.
# The membership check keeps sys.path free of duplicate entries when this
# cell is executed more than once in the same kernel.
if '/content/drive/My Drive/Capstone/Helpers/ML/' not in sys.path:
    sys.path.append('/content/drive/My Drive/Capstone/Helpers/ML/')

# Import exactly the helpers this notebook calls instead of `import *`, so it
# is clear where each name comes from and nothing else is pulled into (or
# silently overwritten in) the notebook namespace.
from ml_helpers import (
    chart_regression,
    model_summary,
    prepare_data,
    remove_corr,
    rmse,
    significant_inputs,
)

In [0]:
# Local (non-Colab) alternative for the same file:
# defenders = pd.read_pickle("../ml_data/defenders_final.pkl")

# Load the cleaned defenders table. keep=False discards *every* copy of a
# duplicated row (not just the repeats); ignore_index renumbers the rows 0..n-1.
defenders = pd.read_pickle(
    "/content/drive/My Drive/Capstone/Data/Clean/defenders_final.pkl"
).drop_duplicates(keep=False, ignore_index=True)

# Correct the misspelled "mid_seson" value wherever it occurs in the frame.
defenders = defenders.replace({"mid_seson": "mid_season"})

# Store the "type" column with plain object dtype.
defenders = defenders.assign(type=defenders["type"].astype("object"))

# Data Preparation

In [0]:
# Add log1p-transformed versions of the two market-value columns. The raw
# "mv" and "cum_mv" columns are listed in drop_cols below.
defenders = defenders.assign(
    mv_log=np.log1p(defenders["mv"]),
    cum_mv_log=np.log1p(defenders["cum_mv"]),
)

# Columns handed to prepare_data as "to drop" and "categorical" respectively.
drop_cols = [
    "tm_id",
    "transfer_season",
    "main_field_position",
    "stats_season",
    "fee",
    "year",
    "mv",
    "cum_mv",
]
cat_cols = [
    "type",
    "continent",
    "sf",
    "field_position",
]

# prepare_data comes from ml_helpers; it returns the feature frame X and the
# target y. NOTE(review): presumably it drops drop_cols and encodes cat_cols --
# confirm against the helper's source.
X, y = prepare_data(defenders, drop_cols, cat_cols)

In [0]:
# Every int64 column of X is treated as part of the encoded/categorical group
# from here on; the remaining columns are handled as numeric predictors.
encoded_cat_cols = X.select_dtypes(include="int64").columns

In [9]:
# Run remove_corr (from ml_helpers) on the non-encoded columns only, then put
# the encoded columns back in front of whatever columns it returns.
# NOTE(review): presumably remove_corr drops highly correlated predictors --
# confirm against the helper's source.
X_non_cat = remove_corr(X.drop(columns=encoded_cat_cols))
X = pd.concat([X[encoded_cat_cols], X_non_cat], axis="columns")
X

Unnamed: 0,tourn_year,x0_mid_season,x0_summer,x1_AF,x1_AS,x1_EU,x1_NA,x1_OC,x2_both,x2_left,x2_no_info,age,a,g,height,app,ppg,rc,fmpct,mpyc,cum_games_won,cum_rc,cum_mpyc,cum_fmpct,runner_up,winner,cum_runner_up,cum_winner,mv_log,cum_mv_log
0,0,0,1,0,0,1,0,0,0,0,0,21.0,3.0,2.0,195.0,34.0,0.9250,0.0,0.921262,751.750000,41.892500,0.0,294.672222,0.899774,0.0,1.0,2.0,0.0,17.034386,16.677711
1,0,0,1,0,0,1,0,0,0,0,0,18.0,0.0,0.0,192.0,26.0,0.6000,0.0,0.728423,326.333333,102.290000,0.0,1159.833333,0.762813,0.0,0.0,0.0,0.0,13.527830,12.959847
2,0,0,1,0,0,1,0,0,0,0,0,18.0,0.0,0.0,192.0,26.0,0.6000,0.0,0.728423,326.333333,102.290000,0.0,1159.833333,0.762813,0.0,0.0,0.0,0.0,13.527830,13.527830
3,1,0,1,0,0,1,0,0,0,0,0,20.0,0.0,0.0,186.0,17.0,2.4100,0.0,0.298713,243.750000,31.991111,0.0,355.912500,0.855659,1.0,0.0,0.0,0.0,13.592368,0.000000
4,1,0,1,0,0,1,0,0,0,0,0,20.0,0.0,0.0,186.0,17.0,2.4100,0.0,0.298713,243.750000,31.991111,0.0,355.912500,0.855659,1.0,0.0,0.0,0.0,13.592368,13.592368
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1279,0,0,0,0,0,1,0,0,0,0,0,28.0,0.0,1.0,191.0,8.0,2.3000,0.0,0.681818,180.000000,5.335000,0.0,478.750000,0.288518,0.0,0.0,0.0,0.0,13.815512,0.000000
1280,0,0,0,0,0,1,0,0,0,0,0,28.0,1.0,0.0,191.0,29.0,1.2400,0.0,0.937500,652.500000,6.686667,0.0,523.500000,0.226799,0.0,0.0,0.0,0.0,13.815512,0.000000
1281,0,0,1,0,0,1,0,0,0,1,0,21.0,3.0,1.0,183.0,19.0,1.8675,0.0,0.365036,806.000000,122.915833,3.0,341.958333,0.471778,0.0,0.0,0.0,0.0,14.187075,14.187075
1282,0,0,1,0,0,1,0,0,0,0,0,23.0,0.0,0.0,184.0,20.0,0.4850,0.0,0.609167,1462.000000,62.872500,1.0,578.061667,0.731671,1.0,0.0,1.0,2.0,15.068274,14.827112


# Feature selection

## Selecting features based on p-value

In [10]:
# Baseline OLS of the target on the non-encoded predictors, shown for its
# coefficient table and p-values.
# NOTE(review): no intercept column is added to the design matrix, which is
# why the summary reports an *uncentered* R-squared.
X_non_cat = X.drop(columns=encoded_cat_cols)
def_initial_ols = sm.OLS(endog=y, exog=X_non_cat).fit()
def_initial_ols.summary()

0,1,2,3
Dep. Variable:,fee_log,R-squared (uncentered):,0.996
Model:,OLS,Adj. R-squared (uncentered):,0.996
Method:,Least Squares,F-statistic:,15590.0
Date:,"Sun, 07 Jun 2020",Prob (F-statistic):,0.0
Time:,13:54:12,Log-Likelihood:,-1713.8
No. Observations:,1284,AIC:,3466.0
Df Residuals:,1265,BIC:,3564.0
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
age,-0.1010,0.009,-11.257,0.000,-0.119,-0.083
a,0.0123,0.019,0.662,0.508,-0.024,0.049
g,0.0087,0.021,0.425,0.671,-0.032,0.049
height,0.0168,0.002,8.641,0.000,0.013,0.021
app,0.0070,0.003,2.388,0.017,0.001,0.013
ppg,0.0360,0.048,0.752,0.452,-0.058,0.130
rc,0.1215,0.080,1.518,0.129,-0.036,0.278
fmpct,-0.1347,0.147,-0.915,0.360,-0.423,0.154
mpyc,8.624e-06,7.52e-05,0.115,0.909,-0.000,0.000

0,1,2,3
Omnibus:,23.024,Durbin-Watson:,1.337
Prob(Omnibus):,0.0,Jarque-Bera (JB):,39.686
Skew:,-0.107,Prob(JB):,2.41e-09
Kurtosis:,3.834,Cond. No.,5260.0


In [11]:
# significant_inputs comes from ml_helpers; element [1] of its result is used
# as the list of numeric columns to keep.
# NOTE(review): presumably these are the predictors that pass a p-value
# threshold -- confirm against the helper's source.
cols = significant_inputs(X_non_cat, y)

# Keep the selected numeric columns first, followed by all encoded columns.
X = X[[*cols[1], *encoded_cat_cols]]
X

Unnamed: 0,age,app,cum_games_won,cum_mpyc,mv_log,tourn_year,x0_mid_season,x0_summer,x1_AF,x1_AS,x1_EU,x1_NA,x1_OC,x2_both,x2_left,x2_no_info
0,21.0,34.0,41.892500,294.672222,17.034386,0,0,1,0,0,1,0,0,0,0,0
1,18.0,26.0,102.290000,1159.833333,13.527830,0,0,1,0,0,1,0,0,0,0,0
2,18.0,26.0,102.290000,1159.833333,13.527830,0,0,1,0,0,1,0,0,0,0,0
3,20.0,17.0,31.991111,355.912500,13.592368,1,0,1,0,0,1,0,0,0,0,0
4,20.0,17.0,31.991111,355.912500,13.592368,1,0,1,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1279,28.0,8.0,5.335000,478.750000,13.815512,0,0,0,0,0,1,0,0,0,0,0
1280,28.0,29.0,6.686667,523.500000,13.815512,0,0,0,0,0,1,0,0,0,0,0
1281,21.0,19.0,122.915833,341.958333,14.187075,0,0,1,0,0,1,0,0,0,1,0
1282,23.0,20.0,62.872500,578.061667,15.068274,0,0,1,0,0,1,0,0,0,0,0


In [12]:
# Hold out 20% of the rows for testing; `seed` makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed
)

# Display the shapes of the four pieces as a sanity check.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((1027, 16), (257, 16), (1027,), (257,))

# Polynomial Regression

In [0]:
# Plain linear regression. It is only instantiated here; it is passed on to
# the VotingRegressor and to model_summary in later cells.
reg_model = LinearRegression()

# Degree-2 expansion restricted to interaction terms (no squared features).
poly_predictors = PolynomialFeatures(degree = 2, interaction_only = True)

# Fit the transformer on the training data only, then apply that same fitted
# transformer to the test data. Calling fit_transform on X_test would refit
# the transformer on held-out data, which a preprocessing step must never do.
x_train_poly = poly_predictors.fit_transform(X_train)
x_test_poly = poly_predictors.transform(X_test)

# Linear regression on the expanded features. fit() returns the estimator
# itself, so poly_model_fit refers to the same object as poly_model.
poly_model = LinearRegression()
poly_model_fit = poly_model.fit(x_train_poly, y_train)

# ElasticNet

In [14]:
# Elastic net with the regularisation strength chosen by 5-fold CV over a
# path of 500 candidate alphas. fit() returns the estimator, so construction
# and fitting are chained.
elastic_model = ElasticNetCV(cv=5, n_alphas=500).fit(X_train, y_train)

# R^2 on the held-out test split.
elastic_model.score(X_test, y_test)

0.6366700704275412

# DTR

In [0]:
# Base estimator whose hyperparameters are tuned by the grid search below.
tune_tree_reg = DecisionTreeRegressor(random_state = 33)

tree_params = {
    "max_depth": np.arange(1,30),
    # Floats are interpreted by scikit-learn as a fraction of the samples.
    "min_samples_split": np.linspace(0.1, 1.0, 10, endpoint = True),
    # range() excludes its upper bound, so add 1: the grid must also try
    # max_features == number of columns, i.e. considering every feature at
    # each split. Without the +1 that setting was never evaluated.
    "max_features": list(range(1, X_train.shape[1] + 1)),
}

# Exhaustive 5-fold CV search; scored with negative MSE (higher is better).
tune_tree_reg_cv = GridSearchCV(tune_tree_reg, tree_params, cv = 5, scoring = "neg_mean_squared_error")
tune_tree_reg_cv.fit(X_train, y_train)

best_params = tune_tree_reg_cv.best_params_

# Rebuild a fresh tree from the winning configuration. Unpacking best_params
# keeps this in sync with whatever keys tree_params contains.
tree_reg = DecisionTreeRegressor(random_state = 33, **best_params)
tree_reg.fit(X_train, y_train)

# Test-set performance of the tuned tree; rmse comes from ml_helpers.
tree_pred = tree_reg.predict(X_test)
rmse_tree = rmse(y_test, tree_pred)
r2_tree = r2_score(y_test, tree_pred)

# RFR

In [16]:
# Tune only the number of trees with a 5-fold grid search (negative MSE).
tune_rand_forest = RandomForestRegressor(random_state=10)
rand_forest_params = {"n_estimators": [100, 150, 350, 450, 550]}

tune_rand_forest_cv = GridSearchCV(
    estimator=tune_rand_forest,
    param_grid=rand_forest_params,
    scoring="neg_mean_squared_error",
    cv=5,
)
tune_rand_forest_cv.fit(X_train, y_train)

best_forest_params = tune_rand_forest_cv.best_params_

# Train the final forest with the selected n_estimators.
# NOTE(review): the search above uses random_state=10 while the final model
# uses random_state=33, so the CV score that selected n_estimators was not
# computed with this exact forest -- confirm this is intentional.
tuned_rand_forest = RandomForestRegressor(random_state=33, **best_forest_params)
tuned_rand_forest.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=450, n_jobs=None, oob_score=False,
                      random_state=33, verbose=0, warm_start=False)

# Voting Regressor

In [0]:
# Ensemble over the four individual models defined above. It is only
# constructed here, not fitted.
voting_regressor = VotingRegressor(
    estimators=[
        ("lr", reg_model),
        ("dt", tree_reg),
        ("rf", tuned_rand_forest),
        ("en", elastic_model),
    ]
)

# Summary

In [18]:
# The polynomial model is summarised separately because it needs the expanded
# feature matrices. Its row is relabelled so it can be told apart from the
# plain LinearRegression row in the combined table.
poly_model_df = model_summary([poly_model], x_train_poly, y_train, x_test_poly, y_test)
poly_model_df.loc[0, "model"] = "PolynomialRegression"

# All remaining models share the unexpanded train/test matrices.
# NOTE(review): model_summary comes from ml_helpers; presumably it fits each
# estimator itself, since reg_model and voting_regressor are not fitted
# before this call -- confirm against the helper's source.
model_summary_df = model_summary(
    [reg_model, tree_reg, tuned_rand_forest, elastic_model, voting_regressor],
    X_train,
    y_train,
    X_test,
    y_test,
)

# One table, best (lowest) cross-validated RMSE first.
def_results_df = (
    pd.concat([model_summary_df, poly_model_df])
    .sort_values(by="rmse_cv_mean")
    .reset_index(drop=True)
)
def_results_df

Unnamed: 0,model,train_r2,test_r2,train_rmse,test_rmse,rmse_cv_mean,r2_cv_mean
0,RandomForestRegressor,0.946481,0.81891,1708066,3159869,3124487,0.659802
1,LinearRegression,0.559143,0.537838,3703883,5047993,3529852,0.562871
2,PolynomialRegression,0.590897,0.484096,3574205,5333422,3576742,0.552401
3,VotingRegressor,0.721328,0.535518,3422360,5060647,3612032,0.544927
4,ElasticNetCV,0.549167,0.505089,3887274,5223784,3733384,0.507511
5,DecisionTreeRegressor,0.597552,0.326928,3924977,6091895,4059426,0.422195


In [0]:
# Persist the model comparison table to the shared Drive summary folder.
def_results_df.to_pickle(
    "/content/drive/My Drive/Capstone/Data/Summary/def_results.pkl"
)

In [20]:
# Horizontal bar chart of the cross-validated RMSE, one bar per model. The
# legend is hidden because the y-axis already names each model.
rmse_fig = px.bar(
    def_results_df,
    x="rmse_cv_mean",
    y="model",
    orientation="h",
    color="model",
    title="RMSE(CV) for each model - Defenders",
    labels={"rmse_cv_mean": "RMSE(cv)", "model": "Model used"},
).update_layout(showlegend=False)
rmse_fig.show()

# Visualizing the best model's predictions

In [0]:
# Test-set predictions from the tuned random forest, the top row of the
# summary table (lowest cross-validated RMSE).
y_best_pred = tuned_rand_forest.predict(X=X_test)

In [22]:
# Plot predicted against actual prices using the ml_helpers charting routine.
# NOTE(review): scaled_back=True presumably converts the log-scale values
# back to prices before plotting -- confirm against the helper's source.
chart_regression(
    y_best_pred.flatten(),
    y_test,
    scaled_back=True,
    title="Defenders' actual and predicted prices",
    y_axis="Price in Millions",
)

In [23]:
# Number of observations in the held-out test split.
y_test.shape[0]

257

In [24]:
# Pearson correlation between predictions and targets. These values are passed
# as-is, whereas the scatter plot below applies expm1 to both series first.
corr, _ = pearsonr(y_best_pred, y_test)

# Scatter of predicted vs. actual prices after undoing the log1p transform.
fig = px.scatter(
    x=np.expm1(y_best_pred),
    y=np.expm1(y_test),
    labels={"x": "Predicted Price", "y": "Actual Price"},
    title="Defenders' actual transfer price vs predicted transfer price",
)
fig.show()
print("Correlation between the variables", corr)

Correlation between the variables 0.8487202583918848


In [25]:
# Pair each training column with the forest's importance score, drop features
# with exactly zero importance, and order from most to least important.
imp_ft = (
    pd.DataFrame(
        {
            "feature": X_train.columns,
            "importance": tuned_rand_forest.feature_importances_,
        }
    )
    .loc[lambda frame: frame["importance"] != 0]
    .sort_values(by="importance", ascending=False)
)

# Horizontal bar chart of the importances; the legend is hidden because the
# y-axis already names each feature.
fig_ft = px.bar(
    imp_ft,
    x="importance",
    y="feature",
    orientation="h",
    color="feature",
    title="Importance of each feature in RFR",
    labels={"importance": "Importance", "feature": "Feature"},
).update_layout(showlegend=False)
fig_ft.show()

In [0]:
# Save actual vs. predicted targets for the test split, tagged with the
# position group, to the shared Drive summary folder.
pd.DataFrame(
    {
        "actual": y_test,
        "predicted": y_best_pred,
        "position": "Defenders",
    }
).to_pickle("/content/drive/My Drive/Capstone/Data/Summary/def_predicted.pkl")