In [None]:
### Import required libraries

import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.metrics 
import seaborn as sns
import statsmodels.api as sm

from scipy import stats
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor

import os

In [None]:
### Make sure that 'ggplot' style is used for all plots
# Applies matplotlib's 'ggplot' style sheet from this point on. Note that a later
# cell (just before the pair plots) switches the style to "fivethirtyeight".
plt.style.use('ggplot')
# plt.style.available ### To view all other available styles

In [None]:
### Set Working Directory (WD)
# Every 'dataset/...' and 'figures/...' path used below is relative to the project root.
# The root can be overridden with the CAPSTONE_PROJECT_DIR environment variable so the
# notebook also runs on other machines; the default is the original author's local path.
PROJECT_DIR = os.environ.get(
    'CAPSTONE_PROJECT_DIR',
    '/Volumes/GoogleDrive/My Drive/CEMEX/Data Translators/GitHub/rgamerosl/capstone-project',
)
os.chdir(PROJECT_DIR)

In [None]:
### Read Final Datasets using pickle
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
# Each file is opened in a `with` block so its handle is closed right after loading
# (the previous pickle.load(open(...)) form left the handles open).
# df = pickle.load(open('dataset/data_v2.pkl', 'rb'))
with open('dataset/data_12f_mileage_vf.pkl', 'rb') as fh:
    new_data = pickle.load(fh)
with open('dataset/data_42f_mileage_vf.pkl', 'rb') as fh:
    data = pickle.load(fh)

In [None]:
### Read tree-based models
# NOTE: pickle.load can execute arbitrary code - only load model files from a trusted source.
# Each file is opened in a `with` block so its handle is closed right after loading.
with open('dataset/PythonModels/RandomForest_19F.pkl', 'rb') as fh:
    new_best_rf = pickle.load(fh)
with open('dataset/PythonModels/RandomForest_36F.pkl', 'rb') as fh:
    best_rf = pickle.load(fh)
with open('dataset/PythonModels/RandomForestRFE_19F.pkl', 'rb') as fh:
    new_rfecv_RF = pickle.load(fh)
with open('dataset/PythonModels/RandomForestRFE_36F.pkl', 'rb') as fh:
    rfecv_RF = pickle.load(fh)

In [None]:
### Read LinearRegression Models
# NOTE: pickle.load can execute arbitrary code - only load model files from a trusted source.
# Each file is opened in a `with` block so its handle is closed right after loading.
with open('dataset/PythonModels/LinearRegression_19F.pkl', 'rb') as fh:
    new_est2 = pickle.load(fh)
with open('dataset/PythonModels/LinearRegression_36F.pkl', 'rb') as fh:
    est2 = pickle.load(fh)
with open('dataset/PythonModels/LinearRegressionRFE_19F.pkl', 'rb') as fh:
    new_rfecv_LR = pickle.load(fh)
with open('dataset/PythonModels/LinearRegressionRFE_36F.pkl', 'rb') as fh:
    rfecv_LR = pickle.load(fh)

In [None]:
### Pairplot that takes a LOT of time

# sns.pairplot(new_data, palette='husl', corner=True, diag_kind='kde', kind='reg', markers='.', 
#                  plot_kws={'line_kws':{'color':'red', 'alpha':0.5}}, height=1.5)
# plt.savefig(f'figures/paitplots.png')

In [None]:
# Density-normalised histogram of the 'Mileage' column using 25 bins
plt.hist(data['Mileage'], density=True, bins=25)
plt.show()

In [None]:
# Summary of the frame loaded from 'dataset/data_42f_mileage_vf.pkl'
data.info()

In [None]:
# Scatter of liters_per_hour against Mileage with a degree-1 least-squares line on top
x, y = data['Mileage'], data['liters_per_hour']

# Fit first, plot afterwards; p is the fitted polynomial as a callable
z = np.polyfit(x, y, deg=1)
p = np.poly1d(z)

plt.scatter(x, y, color='b', alpha=0.3)
plt.plot(x, p(x), "r--")

plt.show()

In [None]:
# The first ten columns of new_data followed by the target column
best10cols = new_data.columns[:10].tolist() + ['liters_per_hour']
best10cols

# Select the best 10 features and rerun both models: RandomForest and LinearRegression

## Select variables, Train/Test split and standardize numeric variables

In [None]:
# Restrict new_data to the columns listed in best10cols (ten features plus the target)
mini_data = new_data[best10cols]
mini_data.info()

In [None]:
# Shuffled 75/25 train/test split; the fixed random_state keeps the split reproducible
mini_data_train, mini_data_test = train_test_split(mini_data, test_size=0.25, random_state=42, shuffle=True)

In [None]:
# Positional selection of the columns that get standardized below.
# NOTE(review): mini_data is built from best10cols (ten features with 'liters_per_hour'
# appended last), so position 10 is the target - it is standardized together with the features.
# Positions 3, 5 and 6 are skipped - presumably non-numeric/categorical columns; confirm.
num_col = mini_data.columns[[0,1,2,4,7,8,9,10]]
print(num_col)

In [None]:
### Standardize numerical variables in Train Set
# Fit the scaler on the training rows, then write the scaled values into a deep copy
# so the unscaled training frame stays available.
scaler = StandardScaler().fit(mini_data_train[num_col].to_numpy())
mini_data_train_scale = mini_data_train.copy(deep=True)
mini_data_train_scale[num_col] = scaler.transform(mini_data_train[num_col].to_numpy())
display(mini_data_train_scale.head(10))

In [None]:
### Standardize numerical variables in Test Set
# The test set must be scaled with the statistics of the TRAINING set. Fitting a
# separate scaler on the test rows (as was done before) leaks test information and puts
# train and test on different scales, which distorts the test-set MSE/R2.
# The scaler is (re)fitted on the training rows here so this cell does not depend on
# which data `scaler` was last fitted on.
scaler = StandardScaler().fit(mini_data_train[num_col].to_numpy())
mini_data_test_scale = mini_data_test.copy(deep=True)
mini_data_test_scale[num_col] = scaler.transform(mini_data_test[num_col].to_numpy())
display(mini_data_test_scale.head(10))

In [None]:
# Feature matrices: every column of the scaled frames except the target
mini_X_train = mini_data_train_scale.drop(columns='liters_per_hour').values
mini_X_test = mini_data_test_scale.drop(columns='liters_per_hour').values

# Target vectors
mini_y_train = mini_data_train_scale['liters_per_hour'].values
mini_y_test = mini_data_test_scale['liters_per_hour'].values

## RandomForest Model

In [None]:
### Adjusting Best model to answer the following questions

# Random forest with fixed hyper-parameters (presumably the best ones found in an
# earlier search - confirm), fitted on the reduced feature matrix.
# random_state=1 fixes the seed; n_jobs=-1 lets scikit-learn use all available cores.
mini_best_rf = RandomForestRegressor(n_estimators=80, n_jobs=-1, random_state=1, max_features='sqrt',
                                min_samples_leaf=2, min_samples_split=2, max_depth=None, bootstrap=False)
mini_best_rf.fit(mini_X_train, mini_y_train)

In [None]:
# Predict on both splits first ...
mini_best_rf_y_train_pred = mini_best_rf.predict(mini_X_train)
mini_best_rf_y_test_pred = mini_best_rf.predict(mini_X_test)

# ... then score each split ...
mini_best_rf_train_MSE_score = mean_squared_error(mini_y_train, mini_best_rf_y_train_pred)
mini_best_rf_train_R2_score = r2_score(mini_y_train, mini_best_rf_y_train_pred)
mini_best_rf_test_MSE_score = mean_squared_error(mini_y_test, mini_best_rf_y_test_pred)
mini_best_rf_test_R2_score = r2_score(mini_y_test, mini_best_rf_y_test_pred)

# ... and report train before test, rounded to four decimals.
print("MSE for the Best Random Forest in the Train data:", round(mini_best_rf_train_MSE_score,4))
print("R2 for the Best Random Forest in the Train data:", round(mini_best_rf_train_R2_score,4))
print("MSE for the Best Random Forest in the Test data:", round(mini_best_rf_test_MSE_score,4))
print("R2 for the Best Random Forest in the Test data:", round(mini_best_rf_test_R2_score,4))

In [None]:
# Forest-level importances, their spread across the individual trees, and the
# ascending order used to lay out the bars.
importances = mini_best_rf.feature_importances_
std = np.array([tree.feature_importances_ for tree in mini_best_rf.estimators_]).std(axis=0)
indices = importances.argsort()

# Labels of the feature columns (everything except the target), and one bar slot per feature
rf_feature_names = mini_data_train_scale.columns[mini_data_train_scale.columns != 'liters_per_hour']
rf_bar_positions = range(mini_X_train.shape[1])

# Horizontal bar chart with the across-tree standard deviation as error bars
plt.figure(figsize=(10,10))
plt.title("Feature importances")
plt.barh(rf_bar_positions, importances[indices],
         color="r", xerr=std[indices], align="center")
plt.yticks(rf_bar_positions, rf_feature_names[indices])
plt.ylim([-1, mini_X_train.shape[1]])
plt.savefig(f'figures/mini_feature_importances_mileage.png')
plt.show()

In [None]:
### Use RFECV with the reduced data (mini_X_train holds the feature columns of best10cols)

from sklearn.feature_selection import RFECV

# Recursive feature elimination, removing one feature per step (step=1) and scoring
# each candidate subset with 5-fold cross-validated R2.
min_features_to_select = 1  # Minimum number of features to consider
mini_rfecv_RF = RFECV(estimator=mini_best_rf, step=1, cv=5,
              scoring='r2',
              min_features_to_select=min_features_to_select)

mini_rfecv_RF.fit(mini_X_train, mini_y_train)

In [None]:
print("Optimal number of features : %d" % mini_rfecv_RF.n_features_)

# Mean cross-validated score for each candidate number of features.
# `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2; newer releases
# expose the curve as cv_results_['mean_test_score'], so prefer that when available
# and fall back to `grid_scores_` on older installations.
if hasattr(mini_rfecv_RF, 'cv_results_'):
    mini_rfecv_RF_scores = mini_rfecv_RF.cv_results_['mean_test_score']
else:
    mini_rfecv_RF_scores = mini_rfecv_RF.grid_scores_

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation R2 score")
plt.plot(range(min_features_to_select,
               len(mini_rfecv_RF_scores) + min_features_to_select),
         mini_rfecv_RF_scores)
plt.title("RFE - Random Forest")
# File name aligned with the linear-regression counterpart ('mini_RFE_LR_mileage.png');
# the previous name contained an accidentally pasted column label.
plt.savefig('figures/mini_RFE_RF_mileage.png')
plt.show()


In [None]:
from sklearn import metrics
from sklearn.model_selection import cross_val_score
# 5-fold cross-validated R2 of the random forest on the training split:
# the per-fold scores first, then their mean.
scores = cross_val_score(mini_best_rf, mini_X_train, mini_y_train, cv=5, scoring='r2')
print(scores)
print(scores.mean())

## LinearRegression Model

In [None]:
# statsmodels OLS does not add an intercept by itself, so a constant column is added
# to the design matrix before fitting.
mini_X2_train = sm.add_constant(mini_X_train)
est = sm.OLS(mini_y_train, mini_X2_train)
mini_est = est.fit()
# Regression table: coefficients, standard errors, p-values, R2, ...
print(mini_est.summary())

### Doubt: How to interpret coefficients with non-standardized data?

In [None]:
# The test design matrix needs the same constant column as the training one
mini_X2_test = sm.add_constant(mini_X_test)

# Predictions of the fitted OLS model on both splits
mini_y_train_pred_lr = mini_est.predict(mini_X2_train)
mini_y_test_pred_lr = mini_est.predict(mini_X2_test)

# Scores per split
mini_lr_train_MSE_score = mean_squared_error(mini_y_train, mini_y_train_pred_lr)
mini_lr_train_R2_score = r2_score(mini_y_train, mini_y_train_pred_lr)
mini_lr_test_MSE_score = mean_squared_error(mini_y_test, mini_y_test_pred_lr)
mini_lr_test_R2_score = r2_score(mini_y_test, mini_y_test_pred_lr)

# Report train before test, rounded to four decimals
print("MSE for the Multiple Linear Regression in the Train data:", round(mini_lr_train_MSE_score,4))
print("R2 for the Multiple Linear Regression in the Train data:", round(mini_lr_train_R2_score,4))
print("MSE for the Multiple Linear Regression in the Test data:", round(mini_lr_test_MSE_score,4))
print("R2 for the Multiple Linear Regression in the Test data:", round(mini_lr_test_R2_score,4))

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calc_vif(X):
    """Compute the variance inflation factor (VIF) of every column of X.

    Parameters
    ----------
    X : DataFrame whose columns are the candidate regressors; the column labels
        and the underlying value array are used.

    Returns
    -------
    DataFrame with one row per column of X and two columns:
    "Features" (the column label) and "VIF" (its variance inflation factor).
    """
    # variance_inflation_factor addresses columns by position in the raw array
    design = X.values
    vif_values = [variance_inflation_factor(design, col_pos)
                  for col_pos in range(X.shape[1])]

    return pd.DataFrame({"Features": X.columns, "VIF": vif_values})

In [None]:
### First VIF analysis on the features of mini_data (target column excluded)
# NOTE(review): the VIFs are computed on the unscaled mini_data and without a constant
# column - confirm this is intended. mini_vif_y is not used in this cell.
mini_vif_y = mini_data['liters_per_hour']
mini_vif_X = mini_data.drop('liters_per_hour',axis=1)
mini_vif_model = calc_vif(mini_vif_X)
mini_vif_model.round(1)

In [None]:
### Use RFECV with a plain linear regression on the reduced feature set

from sklearn.feature_selection import RFECV

lr = LinearRegression()

# Recursive feature elimination, removing one feature per step (step=1) and scoring
# each candidate subset with 5-fold cross-validated R2.
min_features_to_select = 1  # Minimum number of features to consider
mini_rfecv_LR = RFECV(estimator=lr, step=1, cv=5,
              scoring='r2',
              min_features_to_select=min_features_to_select)

# Fit on mini_X_train, not mini_X2_train: the latter carries the constant column added
# for statsmodels. LinearRegression already fits its own intercept, so that column would
# only show up as an extra, meaningless "feature" in the elimination and shift the
# reported feature counts. This also matches the random-forest RFECV run above.
mini_rfecv_LR.fit(mini_X_train, mini_y_train)

In [None]:
print("Optimal number of features : %d" % mini_rfecv_LR.n_features_)

# Mean cross-validated score for each candidate number of features.
# `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2; newer releases
# expose the curve as cv_results_['mean_test_score'], so prefer that when available
# and fall back to `grid_scores_` on older installations.
if hasattr(mini_rfecv_LR, 'cv_results_'):
    mini_rfecv_LR_scores = mini_rfecv_LR.cv_results_['mean_test_score']
else:
    mini_rfecv_LR_scores = mini_rfecv_LR.grid_scores_

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation R2 score")
plt.plot(range(min_features_to_select,
               len(mini_rfecv_LR_scores) + min_features_to_select),
         mini_rfecv_LR_scores)
plt.title("RFE - Linear Regression")
plt.savefig(f'figures/mini_RFE_LR_mileage.png')
plt.show()

In [None]:
from sklearn import metrics
from sklearn.model_selection import cross_val_score
# 5-fold cross-validated R2 of the linear regression on the training split:
# the per-fold scores first, then their mean.
# NOTE(review): mini_X2_train includes the constant column added for statsmodels, while
# LinearRegression fits its own intercept - the extra column looks redundant; confirm.
scores = cross_val_score(lr, mini_X2_train, mini_y_train, cv=5, scoring='r2')
print(scores)
print(scores.mean())

### In both cases, taking the next 10 features into account only improves the R2 score by ~5%, so it does not seem worth it

## Pairplots

In [None]:
# Summary of the reduced frame that feeds the pair plots below
mini_data.info()

In [None]:
# Labels of the first ten columns of mini_data
print(mini_data.columns[0:10])

In [None]:
# Pair plots are expensive, so work on a reproducible random sample of at most 80,000 rows.
# DataFrame.sample raises ValueError when n exceeds the number of rows (sampling without
# replacement), so n is capped at the frame length.
sampling = mini_data.sample(n=min(80000, len(mini_data)), random_state=42)
# Switches the matplotlib style for every plot drawn from here on
plt.style.use("fivethirtyeight")

In [None]:
### Pairplot that takes a LOT of time

# Lower-triangle (corner=True) grid over the sampled rows: regression scatter plots
# off the diagonal with a red fit line, KDE curves on the diagonal. The figure is
# saved to 'figures/pairplots_mileage.png'.
sns.pairplot(sampling, palette='husl', corner=True, diag_kind='kde', kind='reg', markers='.', 
                 plot_kws={'line_kws':{'color':'red', 'alpha':0.5}}, height=1.5)
plt.savefig(f'figures/pairplots_mileage.png')

In [None]:
# One figure per feature: the sampled target against that feature, with a degree-1
# least-squares line drawn on top. Each figure is saved under the feature's name.
y = sampling['liters_per_hour']
for i in sampling.columns[:10]:
    x = sampling[i]

    # Straight-line fit of the target on this feature
    z = np.polyfit(x, y, deg=1)
    p = np.poly1d(z)

    plt.scatter(x, y, color='b', alpha=0.3)
    plt.plot(x, p(x), "r--")
    plt.title(f'{i}')
    plt.savefig(f'figures/{i}-feature_mileage.png', bbox_inches='tight')
    plt.show()

## Simple LinearRegression using only Mileage and Idle_Time

In [None]:
# First five columns of the training design matrix built with sm.add_constant.
# NOTE(review): the section title mentions only Mileage and Idle_Time, but this slice
# keeps five columns - confirm that these are the intended ones.
MileF = mini_X2_train[:,0:5]

In [None]:
# OLS restricted to the MileF columns; note that `est` is reused for each model object
est = sm.OLS(mini_y_train, MileF)
mini_est_S = est.fit()
print(mini_est_S.summary())

### Doubt: How to interpret coefficients with non-standardized data?

In [None]:
# OLS on only the first two columns of the training design matrix
est = sm.OLS(mini_y_train, mini_X2_train[:,0:2])
mini_est_S1 = est.fit()
print(mini_est_S1.summary())

### Doubt: How to interpret coefficients with non-standardized data?

In [None]:
# Same column slice of the test design matrix as was used for MileF on the training side
MileF_test = mini_X2_test[:,0:5]

# Predictions of the reduced OLS model (mini_est_S) on both splits
mini_y_train_pred_lrS = mini_est_S.predict(MileF)
mini_y_test_pred_lrS = mini_est_S.predict(MileF_test)

# Scores per split
mini_lrS_train_MSE_score = mean_squared_error(mini_y_train, mini_y_train_pred_lrS)
mini_lrS_train_R2_score = r2_score(mini_y_train, mini_y_train_pred_lrS)
mini_lrS_test_MSE_score = mean_squared_error(mini_y_test, mini_y_test_pred_lrS)
mini_lrS_test_R2_score = r2_score(mini_y_test, mini_y_test_pred_lrS)

# Report train before test, rounded to four decimals
print("MSE for the Simple Linear Regression in the Train data:", round(mini_lrS_train_MSE_score,4))
print("R2 for the Simple Linear Regression in the Train data:", round(mini_lrS_train_R2_score,4))
print("MSE for the Simple Linear Regression in the Test data:", round(mini_lrS_test_MSE_score,4))
print("R2 for the Simple Linear Regression in the Test data:", round(mini_lrS_test_R2_score,4))

In [None]:
### Train Test split
data_train, data_test = train_test_split(data, test_size=0.25, random_state=42, shuffle=True)

In [None]:
# The first twenty columns of `data` are the ones that get standardized
col_indexes = data.columns[:20]
### Standardize numerical variables in Train Set
# Fit the scaler on the training rows, then write the scaled values into a deep copy
# so the unscaled training frame stays available.
scaler = StandardScaler().fit(data_train[col_indexes].to_numpy())
data_train_scale = data_train.copy(deep=True)
data_train_scale[col_indexes] = scaler.transform(data_train[col_indexes].to_numpy())
display(data_train_scale.head(10))

In [None]:
### Standardize numerical variables in Test Set
# The test set must be scaled with the statistics of the TRAINING set. Fitting a
# separate scaler on the test rows (as was done before) leaks test information and puts
# train and test on different scales.
# The scaler is (re)fitted on the training rows here so this cell does not depend on
# which data `scaler` was last fitted on.
scaler = StandardScaler().fit(data_train[col_indexes].to_numpy())
data_test_scale = data_test.copy(deep=True)
data_test_scale[col_indexes] = scaler.transform(data_test[col_indexes].to_numpy())
display(data_test_scale.head(10))

In [None]:
# Feature matrices: every column of the scaled frames except the target
X_train = data_train_scale.drop(columns='liters_per_hour').values
X_test = data_test_scale.drop(columns='liters_per_hour').values

# Target vectors
y_train = data_train_scale['liters_per_hour'].values
y_test = data_test_scale['liters_per_hour'].values

In [None]:
from sklearn import linear_model
# Single Lasso fit with a fixed penalty (alpha=0.05); the printed vector holds one
# coefficient per column of X_train.
clf = linear_model.Lasso(alpha=0.05)
clf.fit(X_train, y_train)
print(clf.coef_)

In [None]:
# Per-fold R2 of the alpha=0.05 Lasso under 5-fold cross-validation on the training split
cross_val_score(clf,X_train,y_train,scoring='r2',cv=5)

In [None]:
from sklearn.model_selection import cross_val_score 
from sklearn.metrics import mean_squared_log_error
def get_ridge_lasso_coeff(x,y,min_alpha_exp, max_alpha_exp, reg='Lasso',cv=True):
    '''
    Fit a Lasso or Ridge model for each of 50 alphas spaced logarithmically between
    10^(min_alpha_exp) and 10^(max_alpha_exp).

    Parameters:
        x, y: feature matrix (indexable so that x[0] is one row) and target vector
        min_alpha_exp, max_alpha_exp: base-10 exponents of the smallest/largest alpha
        reg: 'Lasso' selects Lasso; any other value selects Ridge
        cv: when True, also record the mean 5-fold cross-validated R2 per alpha;
            when False the returned scores stay at zero

    Returns (alphas, coefs, scores): the alpha grid, a (50, n_features) array with the
    fitted coefficients of each model, and the per-alpha score.

    NOTE(review): the `normalize` argument was removed from Lasso/Ridge in
    scikit-learn 1.2 - this function needs an older release as written.
    '''
    n_alphas = 50
    alphas = np.logspace(min_alpha_exp, max_alpha_exp, n_alphas)
    coefs = np.zeros((n_alphas, len(x[0])))
    scores = np.zeros(n_alphas)

    # Estimator class and iteration budget depend only on `reg`, so pick them once
    if reg == 'Lasso':
        estimator_cls, iteration_cap = Lasso, 2000000
    else:
        estimator_cls, iteration_cap = Ridge, 200000

    for row, alpha in enumerate(alphas):
        model = estimator_cls(alpha=alpha, normalize=True, max_iter=iteration_cap)
        # Fit on the full data to obtain the coefficients for this alpha
        model.fit(x, y)
        if cv:
            fold_r2 = cross_val_score(model, x, y, scoring='r2', cv=5)
            scores[row] = fold_r2.mean()
        coefs[row] = model.coef_
    return alphas, coefs, scores

In [None]:
def plot_alpha_v_coef(ax, alphas, coefs, column_names, method='Lasso',
                      min_abs_coef=0.1, always_plot=(2,)):
    '''
    Plot the regression coefficients (beta) as a function of alpha on a log x-axis.

    Parameters:
        ax: axes-like object to draw on
        alphas: sequence of alpha values (x-axis)
        coefs: 2-D array-like, one row per alpha and one column per feature
        column_names: feature labels, indexable by column position
        method: name shown in the title ('Lasso', 'Ridge', ...)
        min_abs_coef: a feature is drawn only if the absolute value of its coefficient
            at the first alpha exceeds this threshold (default 0.1, as before)
        always_plot: column positions that are drawn regardless of the threshold
            (default (2,), which reproduces the previously hard-coded behaviour;
            pass () to rely on the threshold alone)
    '''
    coefs = np.asarray(coefs)  # also accept nested lists
    for feature in range(len(coefs[0])):
        if np.absolute(coefs[0, feature]) > min_abs_coef or feature in always_plot:
            ax.plot(alphas, coefs[:, feature],
                    label="$\\beta_{{{}}}$".format(column_names[feature]))
    ax.set_xscale('log')
    ax.set_title("$\\beta$ as a function of $\\alpha$ for {} regression".format(method))
    ax.set_xlabel("$\\alpha$")
    ax.set_ylabel("$\\beta$")
    # Legend is placed outside the axes, to the right
    ax.legend(loc="upper left",bbox_to_anchor=(1,1))

def plot_alpha_v_scores(ax, alphas, scores):
    '''
    Plot the model score (R2) against alpha on a log x-axis, mark the highest
    score with a large dot and print its value.
    '''
    best_idx = np.argmax(scores)
    best_score = scores.max()

    # Score curve plus a marker on its maximum
    ax.plot(alphas, scores, c='k')
    ax.scatter(alphas[best_idx], best_score, c='k', s=100)
    print('The best R2 is {}'.format(best_score))

    # Axis cosmetics
    ax.set_xscale('log')
    ax.set_xlabel("$\\alpha$")
    ax.set_ylabel('$R^{2}$')
    ax.set_title('Model score as a function of $\\alpha$')

In [None]:
# Lasso path for alphas between 10^-5.5 and 10^-2.5: coefficient traces on the upper
# axes, cross-validated score on the lower axes.
column_names = data_train.columns.drop('liters_per_hour')

alphas, coefs, scores = get_ridge_lasso_coeff(
    X_train, y_train, min_alpha_exp=-5.5, max_alpha_exp=-2.5, reg='Lasso', cv=True)

fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(10,8))
plot_alpha_v_coef(ax[0], alphas, coefs, column_names=column_names, method='Lasso')
plot_alpha_v_scores(ax[1], alphas=alphas, scores=scores)
fig.tight_layout()
fig.savefig(f'figures/Lasso_coefficients_mileage.png')


In [None]:
# Ridge path for alphas between 10^-2 and 10^1: coefficient traces on the upper
# axes, cross-validated score on the lower axes.
column_names = data_train.columns.drop('liters_per_hour')

alphas, coefs, scores = get_ridge_lasso_coeff(
    X_train, y_train, min_alpha_exp=-2, max_alpha_exp=1, reg='Ridge', cv=True)

fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(10,8))
plot_alpha_v_coef(ax[0], alphas, coefs, column_names=column_names, method='Ridge')
plot_alpha_v_scores(ax[1], alphas=alphas, scores=scores)
fig.tight_layout()
fig.savefig(f'figures/Ridge_coefficients_mileage.png')

In [None]:
# Show the label of the column at position 10 of the training frame
data_train.columns[10]

In [None]:
### Rename specific columns
# Replaces the '%' in this column label; the rename mutates data_train in place.
# NOTE(review): the new label reads '50p on vehículo' where the old one read '50% con vehículo' -
# the missing 'c' looks like a typo; confirm before other code starts relying on the new name.
data_train.rename(columns = {'CEM - Pedal del acelerador arriba del 50% con vehículo detenido':'CEM - Pedal del acelerador arriba del 50p on vehículo detenido'}, inplace = True) 