In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import warnings
import seaborn as sns
import statsmodels.api as sm

import plotly.express as px

from scipy import stats
from scipy.stats import norm, skew, pearsonr
from sklearn.preprocessing import OneHotEncoder, scale, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression, LassoCV, ElasticNetCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.svm import SVR

seed = 33
%matplotlib inline
# warnings.filterwarnings("ignore") # warnings were turned off at the end
sns.set_style("white")
pylab.rcParams['figure.figsize'] = 10, 8

  import pandas.util.testing as tm


In [0]:
from google.colab import drive
drive.mount('/content/drive', force_remount = True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import sys
sys.path.append('/content/drive/My Drive/Capstone/Helpers/ML/')
from ml_helpers import *

In [0]:
attackers = pd.read_pickle("/content/drive/My Drive/Capstone/Data/Clean/attackers_final.pkl").drop_duplicates(keep = False,ignore_index = True)
attackers.replace({"mid_seson": "mid_season"}, inplace = True)
attackers["type"] = attackers["type"].astype('object')

### Pipelines

In [0]:
attackers["mv_log"] = np.log1p(attackers.mv)
attackers["cum_mv_log"] = np.log1p(attackers.cum_mv)
drop_cols = ["tm_id", "transfer_season",
              "main_field_position", 'mpyc', 
              "stats_season", "fee", "year", "mv", "cum_mv"]

cat_cols = ["type", "continent", "sf", "field_position"]

X,y = prepare_data(attackers, drop_cols, cat_cols)
encoded_cat_cols = X.select_dtypes(include='int64').columns

In [0]:
X_non_cat = X.drop(encoded_cat_cols, axis = 1)
X_non_cat = remove_corr(X_non_cat)
X = pd.concat([X[encoded_cat_cols],X_non_cat],axis = 1)

In [0]:
X_non_cat = X.drop(encoded_cat_cols, axis = 1)
def_initial_ols = sm.OLS(y,X_non_cat).fit()
def_initial_ols.summary()

0,1,2,3
Dep. Variable:,fee_log,R-squared (uncentered):,0.996
Model:,OLS,Adj. R-squared (uncentered):,0.996
Method:,Least Squares,F-statistic:,21600.0
Date:,"Sun, 07 Jun 2020",Prob (F-statistic):,0.0
Time:,14:04:29,Log-Likelihood:,-2133.4
No. Observations:,1619,AIC:,4305.0
Df Residuals:,1600,BIC:,4407.0
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
age,-0.0411,0.008,-5.293,0.000,-0.056,-0.026
a,0.0040,0.011,0.371,0.710,-0.017,0.025
g,0.0056,0.005,1.094,0.274,-0.004,0.016
pg,0.0165,0.026,0.646,0.519,-0.034,0.067
height,0.0147,0.002,9.614,0.000,0.012,0.018
mpg,-0.0004,0.000,-1.615,0.107,-0.001,8.02e-05
mpa,1.489e-05,6.78e-05,0.220,0.826,-0.000,0.000
ppg,-0.0076,0.043,-0.176,0.860,-0.093,0.078
cum_games_won,6.55e-05,0.002,0.042,0.967,-0.003,0.003

0,1,2,3
Omnibus:,45.53,Durbin-Watson:,1.27
Prob(Omnibus):,0.0,Jarque-Bera (JB):,98.929
Skew:,-0.122,Prob(JB):,3.3e-22
Kurtosis:,4.186,Cond. No.,14700.0


In [0]:
cols = fsel_rforest(X_non_cat,y,10)
X = X[list(cols) + list(encoded_cat_cols)]
X

Unnamed: 0,mv_log,cum_fmpct,cum_games_won,ppg,age,mpa,height,cum_mpg,mpg,cum_mv_log,tourn_year,x0_mid_season,x0_summer,x1_AF,x1_AS,x1_EU,x1_NA,x1_OC,x2_both,x2_left,x2_no_info,x3_CF,x3_ST
0,15.319588,0.418527,28.420000,1.697500,21.0,390.875000,184.0,211.500000,111.000000,14.938816,1,0,1,0,0,1,0,0,0,0,0,1,0
1,16.118096,0.078125,18.000000,2.000000,22.0,175.900000,174.0,0.000000,460.000000,16.118096,0,0,1,0,0,1,0,0,0,0,0,0,0
2,13.122365,0.268403,0.000000,0.535000,19.0,665.000000,174.0,0.000000,0.000000,0.000000,1,0,1,0,0,1,0,0,0,0,0,0,0
3,13.458837,0.362096,89.950000,0.865000,23.0,395.833333,179.0,167.750000,67.500000,12.988834,0,0,0,0,0,1,0,0,0,0,0,0,0
4,13.458837,0.362096,89.950000,0.865000,23.0,395.833333,179.0,167.750000,67.500000,13.458837,0,0,0,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1614,13.304687,0.784528,20.981250,0.000000,29.0,1276.000000,174.0,238.125000,0.000000,14.253766,1,0,0,1,0,0,0,0,0,0,0,0,0
1615,14.220976,0.397293,34.947917,1.210000,24.0,390.600000,174.0,305.875000,93.000000,13.815512,0,0,0,1,0,0,0,0,0,0,0,0,0
1616,13.527830,0.266634,31.593333,1.000000,23.0,712.990000,178.0,99.266667,0.000000,13.217675,1,0,1,1,0,0,0,0,0,0,0,1,0
1617,13.527830,0.607292,4.372500,1.053333,23.0,534.000000,178.0,424.750000,135.333333,13.217675,1,0,1,1,0,0,0,0,0,0,0,1,0


### ML

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 33)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1295, 23), (324, 23), (1295,), (324,))

In [0]:
reg_model = LinearRegression(normalize = True)

elastic_model = ElasticNetCV(n_alphas = 500, cv = 5)
elastic_model.fit(X_train, y_train)

poly_predictors = PolynomialFeatures(degree = 2, interaction_only = True)
x_train_poly = poly_predictors.fit_transform(X_train)
x_test_poly = poly_predictors.fit_transform(X_test)
    
poly_model = LinearRegression()
poly_model_fit = poly_model.fit(x_train_poly, y_train)

In [0]:
tune_tree_reg = DecisionTreeRegressor(random_state = 33)

tree_params = {
    "max_depth": np.arange(1,30),
    "min_samples_split": np.linspace(0.1, 1.0, 10, endpoint = True),
    "max_features": list(range(1,X_train.shape[1])),
}

tune_tree_reg_cv = GridSearchCV(tune_tree_reg, tree_params, cv = 5, scoring = "neg_mean_squared_error")
tune_tree_reg_cv.fit(X_train, y_train)

best_params = tune_tree_reg_cv.best_params_

tree_reg = DecisionTreeRegressor(max_depth = best_params["max_depth"], 
                                 min_samples_split= best_params["min_samples_split"],
                                 max_features = best_params["max_features"], random_state = 33)
tree_reg.fit(X_train, y_train)

tree_pred = tree_reg.predict(X_test)
rmse_tree = rmse(y_test, tree_pred)
r2_tree = r2_score(y_test, tree_pred)

In [0]:
tune_rand_forest = RandomForestRegressor(random_state = 33)

rand_forest_params = {
    "max_depth": np.arange(1,30),
}

tune_rand_forest_cv = GridSearchCV(tune_rand_forest, rand_forest_params, cv = 5, scoring = "neg_mean_squared_error")
tune_rand_forest_cv.fit(X_train, y_train)

best_forest_params = tune_rand_forest_cv.best_params_

tuned_rand_forest = RandomForestRegressor(max_depth = best_forest_params["max_depth"], random_state = 33)
tuned_rand_forest.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=19, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=33, verbose=0, warm_start=False)

In [0]:
voting_regressor = VotingRegressor([
    ("lr", reg_model),
    ("dt", tree_reg),
    ("rf", tuned_rand_forest),
    ("en", elastic_model)
])

### Summary

In [0]:
poly_model_df = model_summary([poly_model], x_train_poly, y_train, x_test_poly, y_test)
poly_model_df.loc[0,'model'] = "PolynomialRegression"

model_summary_df = model_summary([reg_model,
                                  tree_reg,
                                  tuned_rand_forest,
                                  elastic_model, 
                                  voting_regressor],X_train, y_train, X_test, y_test)

att_results_df = pd.concat([model_summary_df, poly_model_df])
att_results_df = att_results_df.sort_values(by = ["rmse_cv_mean"]).reset_index(drop=True)
att_results_df

Unnamed: 0,model,train_r2,test_r2,train_rmse,test_rmse,rmse_cv_mean,r2_cv_mean
0,RandomForestRegressor,0.951507,0.666142,1611317,3628356,3898994,0.650713
1,VotingRegressor,0.723311,0.605439,3861062,3944451,4151339,0.6059
2,LinearRegression,0.584302,0.488157,4199891,4492597,4163750,0.599084
3,ElasticNetCV,0.573495,0.533671,4482559,4288206,4416323,0.550417
4,PolynomialRegression,0.639215,0.695313,4198732,3466223,4426038,0.518651
5,DecisionTreeRegressor,0.585943,0.58871,4662162,4027199,4573473,0.520771


In [0]:
rmse_fig = px.bar(att_results_df, x = 'rmse_cv_mean', y = 'model',
                  orientation = 'h', color = 'model', 
                  title = 'RMSE(CV) for each model - Attackers', 
                  labels = {'test_rmse': 'RMSE(CV)', 'model': 'Model used'})
rmse_fig.update_layout(showlegend=False)
rmse_fig.show()

In [0]:
results_path = "/content/drive/My Drive/Capstone/Data/Summary/att_results.pkl"
att_results_df.to_pickle(results_path)

In [0]:
y_best_pred = poly_model.predict(x_test_poly)

In [0]:
chart_regression(y_best_pred.flatten(),y_test, scaled_back = True,
                 title="Attackers' actual and predicted prices",
                 y_axis="Price in Millions")

In [0]:
fig = px.scatter(x = np.expm1(y_best_pred), 
                 y = np.expm1(y_test), labels = {"x":"Predicted Price","y":"Actual Price"},
                 title = "Attackers' actual transfer price vs predicted transfer price")
corr, _ = pearsonr(y_best_pred,y_test)
fig.show()
print("Correlation between the variables",corr)

Correlation between the variables 0.7927616605474648


In [0]:
imp_ft = pd.DataFrame({'feature':X_train.columns,'importance':tuned_rand_forest.feature_importances_})
imp_ft = imp_ft[imp_ft.importance != 0] 
imp_ft = imp_ft.sort_values(by = "importance", ascending = False)

fig_ft = px.bar(imp_ft, x = 'importance', y = 'feature',
       orientation = 'h', color = 'feature', 
       title = 'Importance of each feature in RFR', 
       labels = {'importance': 'Importance', 'feature': 'Feature'})
fig_ft.update_layout(showlegend=False)
fig_ft.show()

In [0]:
predicted_path = "/content/drive/My Drive/Capstone/Data/Summary/Machine Learning/att_predicted.pkl"
pd.DataFrame({"actual":y_test, "predicted":y_best_pred,"position":"Attackers"}).to_pickle(predicted_path)