In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import warnings
import seaborn as sns
import statsmodels.api as sm

from scipy import stats
from scipy.stats import norm, skew, pearsonr
from sklearn.preprocessing import OneHotEncoder, scale, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression, LassoCV, ElasticNetCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.svm import SVR
import plotly.express as px



seed = 33

%matplotlib inline
# warnings.filterwarnings("ignore") # warnings were turned off at the end
sns.set_style("white")
pylab.rcParams['figure.figsize'] = 10, 8

  import pandas.util.testing as tm


In [0]:
from google.colab import drive
drive.mount('/content/drive', force_remount = True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import sys
sys.path.append('/content/drive/My Drive/Capstone/Helpers/ML/')
from ml_helpers import *

In [0]:
# midfielders = pd.read_pickle("../ml_data/midfielders_final.pkl")
midfielders = pd.read_pickle("/content/drive/My Drive/Capstone/Data/Clean/midfielders_final.pkl").drop_duplicates(keep = False,ignore_index = True)
midfielders.replace({"mid_seson": "mid_season"}, inplace = True)
midfielders["type"] = midfielders["type"].astype('object')

### Pipelines

In [0]:
midfielders["mv_log"] = np.log1p(midfielders.mv)
midfielders["cum_mv_log"] = np.log1p(midfielders.cum_mv)


drop_cols = ["tm_id", "transfer_season",
          "main_field_position","mv", "cum_mv",
          "stats_season", "fee", "year"]
cat_cols = ["type", "continent", "sf", "field_position"]

X,y = prepare_data(midfielders, drop_cols, cat_cols)

In [0]:
encoded_cat_cols = X.select_dtypes(include='int64').columns

In [0]:
X_non_cat = X.drop(encoded_cat_cols, axis = 1)
X_non_cat = remove_corr(X_non_cat)
X = pd.concat([X[encoded_cat_cols],X_non_cat],axis = 1)

In [0]:
X_non_cat = X.drop(encoded_cat_cols, axis = 1)
def_initial_ols = sm.OLS(y,X_non_cat).fit()
def_initial_ols.summary()

0,1,2,3
Dep. Variable:,fee_log,R-squared (uncentered):,0.997
Model:,OLS,Adj. R-squared (uncentered):,0.996
Method:,Least Squares,F-statistic:,14970.0
Date:,"Sun, 07 Jun 2020",Prob (F-statistic):,0.0
Time:,13:58:40,Log-Likelihood:,-1335.1
No. Observations:,1066,AIC:,2710.0
Df Residuals:,1046,BIC:,2810.0
Df Model:,20,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
age,-0.0766,0.009,-8.540,0.000,-0.094,-0.059
a,0.0224,0.014,1.555,0.120,-0.006,0.051
g,0.0074,0.010,0.710,0.478,-0.013,0.028
height,0.0106,0.002,5.656,0.000,0.007,0.014
app,-0.0056,0.004,-1.357,0.175,-0.014,0.003
mpg,-0.0001,0.000,-0.972,0.331,-0.000,0.000
mpa,0.0001,6.31e-05,2.238,0.025,1.74e-05,0.000
ppg,7.105e-05,0.050,0.001,0.999,-0.097,0.097
yc,0.0180,0.014,1.245,0.213,-0.010,0.046

0,1,2,3
Omnibus:,38.64,Durbin-Watson:,1.292
Prob(Omnibus):,0.0,Jarque-Bera (JB):,97.706
Skew:,-0.102,Prob(JB):,6.0700000000000005e-22
Kurtosis:,4.469,Cond. No.,6680.0


In [0]:
cols = fsel_rforest(X_non_cat,y,10)
# cols = significant_inputs(X_non_cat, y)
# cols = list(cols) + ['mpa','cum_mpg','height','cum_mv_log']
X = X[list(cols) + list(encoded_cat_cols)]
X

Unnamed: 0,mv_log,age,mpyc,cum_fmpct,cum_games_won,cum_mpg,mpa,ppg,app,height,tourn_year,x0_mid_season,x0_summer,x1_AF,x1_AS,x1_EU,x1_NA,x1_OC,x2_both,x2_left,x2_no_info,x3_CAM,x3_CDM,x3_LM,x3_RM
0,15.424949,20.0,574.250000,0.088542,0.000000,0.000000,1148.500000,1.470000,29.0,176.0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0
1,15.424949,20.0,574.250000,0.088542,0.000000,0.000000,1148.500000,1.470000,29.0,176.0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0
2,16.906553,22.0,1302.333333,0.761937,54.657778,112.500000,558.142857,1.002000,47.0,181.0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0
3,15.830414,20.0,227.538462,0.530277,52.765694,298.175000,591.600000,1.706667,39.0,186.0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0
4,14.220976,19.0,284.666667,0.269176,6.104444,31.166667,854.000000,2.086000,21.0,186.0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1061,14.220976,24.0,251.800000,0.602387,12.503333,0.000000,629.500000,1.570000,14.0,182.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
1062,14.508658,24.0,647.500000,0.387598,25.812778,0.000000,1079.609524,1.630000,30.0,186.0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0
1063,12.429220,22.0,315.555556,0.517603,79.162750,0.000000,931.233333,1.220000,32.0,171.0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0
1064,12.429220,22.0,320.000000,0.456318,19.205000,205.500000,960.000000,1.090000,11.0,171.0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0


### ML

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 20)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((852, 25), (214, 25), (852,), (214,))

In [0]:
reg_model = LinearRegression(normalize = True)

elastic_model = ElasticNetCV(n_alphas = 500, cv = 5)
elastic_model.fit(X_train, y_train)

poly_predictors = PolynomialFeatures(degree = 2, interaction_only = True)
x_train_poly = poly_predictors.fit_transform(X_train)
x_test_poly = poly_predictors.fit_transform(X_test)
    
poly_model = LinearRegression()
poly_model_fit = poly_model.fit(x_train_poly, y_train)

In [0]:
tune_tree_reg = DecisionTreeRegressor(random_state = 20)

tree_params = {
    "max_depth": np.arange(1,30),
    "min_samples_split": np.linspace(0.1, 1.0, 10, endpoint = True),
    "max_features": list(range(1,X_train.shape[1])),
}

tune_tree_reg_cv = GridSearchCV(tune_tree_reg, tree_params, cv = 5, scoring = "neg_mean_squared_error")
tune_tree_reg_cv.fit(X_train, y_train)

best_params = tune_tree_reg_cv.best_params_

tree_reg = DecisionTreeRegressor(max_depth = best_params["max_depth"], 
                                 min_samples_split= best_params["min_samples_split"],
                                 max_features = best_params["max_features"], random_state = 20)
tree_reg.fit(X_train, y_train)

tree_pred = tree_reg.predict(X_test)
rmse_tree = rmse(y_test, tree_pred)
r2_tree = r2_score(y_test, tree_pred)

In [0]:
tune_rand_forest = RandomForestRegressor(random_state = 20)

rand_forest_params = {
    "max_depth": np.arange(1,30),
}

tune_rand_forest_cv = GridSearchCV(tune_rand_forest, rand_forest_params, cv = 5, scoring = "neg_mean_squared_error")
tune_rand_forest_cv.fit(X_train, y_train)

best_forest_params = tune_rand_forest_cv.best_params_

tuned_rand_forest = RandomForestRegressor(max_depth = best_forest_params["max_depth"], random_state = 20)
tuned_rand_forest.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=12, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=20, verbose=0, warm_start=False)

In [0]:
voting_regressor = VotingRegressor([
    ("lr", reg_model),
    ("dt", tree_reg),
    ("rf", tuned_rand_forest),
    ("en", elastic_model)
])

### Summary

In [0]:
poly_model_df = model_summary([poly_model], x_train_poly, y_train, x_test_poly, y_test)
poly_model_df.loc[0,'model'] = "PolynomialRegression"

model_summary_df = model_summary([reg_model,
                                  tree_reg,
                                  tuned_rand_forest,
                                  elastic_model, 
                                  voting_regressor],X_train, y_train, X_test, y_test)

mid_results_df = pd.concat([model_summary_df, poly_model_df])
mid_results_df = mid_results_df.sort_values(by = ["rmse_cv_mean"]).reset_index(drop=True)
mid_results_df

Unnamed: 0,model,train_r2,test_r2,train_rmse,test_rmse,rmse_cv_mean,r2_cv_mean
0,RandomForestRegressor,0.955941,0.695471,1431946,2856453,3157480,0.798863
1,VotingRegressor,0.790454,0.722966,3436275,2724451,3766141,0.713859
2,LinearRegression,0.670826,0.679398,3924088,2930862,3927258,0.680225
3,ElasticNetCV,0.656858,0.627697,4266641,3158352,4272658,0.627051
4,DecisionTreeRegressor,0.697846,0.75043,4257863,2585880,4363558,0.618961
5,PolynomialRegression,0.746573,0.487859,3769087,3704311,4411008,0.602239


In [0]:
rmse_fig = px.bar(mid_results_df, x = 'rmse_cv_mean', y = 'model',
                  orientation = 'h', color = 'model', 
                  title = 'RMSE(CV) for each model - Midfielders', 
                  labels = {'rmse_cv_mean': 'RMSE(CV)', 'model': 'Model used'})
rmse_fig.update_layout(showlegend=False)
rmse_fig.show()

In [0]:
mid_results_df.to_pickle("/content/drive/My Drive/Capstone/Data/Summary/mid_results.pkl")

In [0]:
y_best_pred = voting_regressor.predict(X_test)

In [0]:
chart_regression(y_best_pred.flatten(),y_test, scaled_back = True,
                 title="Midfielders' actual and predicted prices",
                 y_axis="Price in Millions")

In [0]:
fig = px.scatter(x = np.expm1(y_best_pred), 
                 y = np.expm1(y_test), labels = {"x":"Predicted Price","y":"Actual Price"},
                 title = "Midfielders' actual transfer price vs predicted transfer price")
corr, _ = pearsonr(y_best_pred,y_test)
fig.show()
print("Correlation between the variables",corr)

Correlation between the variables 0.8095688121470268


In [0]:
imp_ft = pd.DataFrame({'feature':X_train.columns,'importance':tuned_rand_forest.feature_importances_})
imp_ft = imp_ft[imp_ft.importance != 0] 
imp_ft = imp_ft.sort_values(by = "importance", ascending = False)


fig_ft = px.bar(imp_ft, x = 'importance', y = 'feature',
       orientation = 'h', color = 'feature', 
       title = 'Importance of each feature in RFR', 
       labels = {'importance': 'Importance', 'feature': 'Feature'})
fig_ft.update_layout(showlegend=False)
fig_ft.show()

In [0]:
pd.DataFrame({"actual":y_test, "predicted":y_best_pred,"position":"Midfielders"}).to_pickle("/content/drive/My Drive/Capstone/Data/Summary/mid_predicted.pkl")