# Regression

## Objectives

*   Fit and evaluate a regression model to predict tenure levels for a prospect that will likely churn


## Inputs

* outputs/datasets/collection/TelcoCustomerChurn.csv
* Instructions on which variables to use for data cleaning and feature engineering. They are found in their respective notebooks.

## Outputs

* Train set (features and target)
* Test set (features and target)
* ML pipeline to predict tenure
* labels map
* Feature Importance Plot



---

# Change working directory

We need to change the working directory from its current folder to its parent folder
* We access the current directory with os.getcwd()

In [1]:
import os
current_dir = os.getcwd()
current_dir

'/workspace/filter-maintenance-predictor/jupyter_notebooks'

We want to make the parent of the current directory the new current directory
* os.path.dirname() gets the parent directory
* os.chir() defines the new current directory

In [2]:
os.chdir(os.path.dirname(current_dir))
print("You set a new current directory")

You set a new current directory


Confirm the new current directory

In [3]:
current_dir = os.getcwd()
current_dir

'/workspace/filter-maintenance-predictor'

---

# Load Data

In [4]:
import numpy as np
import pandas as pd
import xgboost as xgb

df = pd.read_csv(f'outputs/datasets/transformed/dfTransformedTotal.csv') # data with all negative log_EWM values removed
df_total = pd.read_csv(f'outputs/datasets/transformed/dfTransformedTotal.csv') # data with all negative log_EWM values removed
df_total_model = (pd.read_csv('outputs/datasets/transformed/dfTransformedTotal.csv')
        .drop(labels=['4point_EWM', 'change_DP', 'change_EWM'], axis=1)
    )
df_train_even_dist = (pd.read_csv(f'outputs/datasets/transformed/dfTransformedTrain.csv')
        .drop(labels=['4point_EWM', 'change_DP', 'change_EWM', 'std_DP', 'median_DP', 'bin_size'], axis=1)
    )
print(df.shape, '= df')
print(df_total.shape, '= df_total')
print(df_total_model.shape, '= df_total_model')
print(df_train_even_dist.shape, '= df_train_even_dist')
df_total


(69686, 15) = df
(69686, 15) = df_total
(69686, 12) = df_total_model
(20931, 12) = df_train_even_dist


Unnamed: 0,Data_No,Differential_pressure,4point_EWM,log_EWM,Flow_rate,Time,Dust_feed,Dust,RUL,change_DP,change_EWM,mass_g,cumulative_mass_g,Tt,filter_balance
0,1,1.537182,1.046296,0.045257,54.143527,5.5,236.428943,1.025,,0.000000,0.327257,0.242340,13.328682,44.9,99.74
1,1,1.537182,1.242651,0.217247,54.518255,5.6,236.428943,1.025,,0.000000,0.196354,0.242340,13.571021,44.9,99.74
2,1,1.537182,1.360463,0.307825,54.658781,5.7,236.428943,1.025,,0.000000,0.117813,0.242340,13.813361,44.9,99.74
3,1,3.345631,2.154530,0.767573,54.780562,5.8,236.428943,1.025,,1.808449,0.794067,0.242340,14.055701,44.9,99.44
4,1,5.244502,3.390519,1.220983,54.574466,5.9,236.428943,1.025,,1.898871,1.235989,0.242340,14.298040,44.9,99.13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69681,100,465.494800,457.888170,6.126625,82.675521,52.0,316.985065,1.200,8.2,6.329500,5.071087,0.380382,197.798681,52.4,22.42
69682,100,464.228900,460.424462,6.132149,82.421873,52.1,316.985065,1.200,8.1,-1.265900,2.536292,0.380382,198.179063,52.4,22.63
69683,100,466.037300,462.669597,6.137013,82.743156,52.2,316.985065,1.200,8.0,1.808400,2.245135,0.380382,198.559445,52.4,22.33
69684,100,472.276500,466.512358,6.145285,82.785427,52.3,316.985065,1.200,7.9,6.239200,3.842761,0.380382,198.939827,52.4,21.29


Remove NaN Values

In [5]:
# n = df_total_model['Data_No'].iloc[0:len(df_total)]
# df_train = df_total_model[n < 51].reset_index(drop=True) # larger dataset
# df_test = df_total_model[n > 50].reset_index(drop=True)
n = df_total['Data_No'].iloc[0:len(df_total)]
df_train = df_total[n < 51].reset_index(drop=True) # larger dataset
df_test = df_total[n > 50].reset_index(drop=True)
print(df_train.shape, '= df_train')
print(df_test.shape, '= df_test')
df = df_test
print(df.shape, '= df')
df


(33324, 15) = df_train
(36362, 15) = df_test
(36362, 15) = df


Unnamed: 0,Data_No,Differential_pressure,4point_EWM,log_EWM,Flow_rate,Time,Dust_feed,Dust,RUL,change_DP,change_EWM,mass_g,cumulative_mass_g,Tt,filter_balance
0,51,2.622251,1.159577,0.148056,55.524146,0.4,236.428943,1.025,58.6,2.622251,0.975116,0.242340,0.969359,36.6,99.56
1,51,3.888165,2.251012,0.811380,55.852018,0.5,236.428943,1.025,58.5,1.265914,1.091435,0.242340,1.211698,36.6,99.35
2,51,4.521122,3.159056,1.150273,56.130203,0.6,236.428943,1.025,58.4,0.632957,0.908044,0.242340,1.454038,36.6,99.25
3,51,4.521122,3.703883,1.309382,56.150070,0.7,236.428943,1.025,58.3,0.000000,0.544826,0.242340,1.696378,36.6,99.25
4,51,4.521122,4.030778,1.393959,56.090457,0.8,236.428943,1.025,58.2,0.000000,0.326896,0.242340,1.938717,36.6,99.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36357,100,465.494800,457.888170,6.126625,82.675521,52.0,316.985065,1.200,8.2,6.329500,5.071087,0.380382,197.798681,52.4,22.42
36358,100,464.228900,460.424462,6.132149,82.421873,52.1,316.985065,1.200,8.1,-1.265900,2.536292,0.380382,198.179063,52.4,22.63
36359,100,466.037300,462.669597,6.137013,82.743156,52.2,316.985065,1.200,8.0,1.808400,2.245135,0.380382,198.559445,52.4,22.33
36360,100,472.276500,466.512358,6.145285,82.785427,52.3,316.985065,1.200,7.9,6.239200,3.842761,0.380382,198.939827,52.4,21.29


# MP Pipeline: Regressor

## Create ML pipeline

The target and all requirements are already in a numerical format (float and integer)

In [6]:
from sklearn.pipeline import Pipeline

# Feature Engineering
from feature_engine.encoding import OrdinalEncoder
from feature_engine.selection import SmartCorrelatedSelection

# Feat Scaling
from sklearn.preprocessing import StandardScaler

# Feat Selection
from sklearn.feature_selection import SelectFromModel

# ML algorithms
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor


def PipelineOptimization(model):
    pipeline_base = Pipeline([

        # ("OrdinalCategoricalEncoder", OrdinalEncoder(encoding_method='arbitrary',
        #                                              variables=['Data_No', '4point_EWM', 'log_EWM',
        #                                                         'Flow_rate', 'Time', 'Dust_feed', 'Dust', 'RUL',
        #                                                         'change_DP', 'change_EWM', 'mass_g',
        #                                                         'cumulative_mass_g', 'Tt','filter_balance'])),

        ("SmartCorrelatedSelection", SmartCorrelatedSelection(variables=None,
         method="spearman", threshold=0.6, selection_method="variance")),

        ("feat_scaling", StandardScaler()),

        ("feat_selection",  SelectFromModel(model)),

        ("model", model),

    ])

    return pipeline_base


Custom Class for hyperparameter optimisation

In [7]:
from sklearn.model_selection import GridSearchCV


class HyperparameterOptimizationSearch:

    def __init__(self, models, params):
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv, n_jobs, verbose=1, scoring=None, refit=False):
        for key in self.keys:
            print(f"\nRunning GridSearchCV for {key} \n")
            model = PipelineOptimization(self.models[key])

            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring)
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score (R²)'):
        def row(key, scores, params):
            d = {
                'estimator': key,
                'min_score': min(scores),
                'max_score': max(scores),
                'mean_score (R²)': np.mean(scores),
                'std_score': np.std(scores),
            }
            return pd.Series({**params, **d})

        rows = []
        for k in self.grid_searches:
            params = self.grid_searches[k].cv_results_['params']
            scores = []
            for i in range(self.grid_searches[k].cv):
                key = "split{}_test_score".format(i)
                r = self.grid_searches[k].cv_results_[key]
                scores.append(r.reshape(len(params), 1))

            all_scores = np.hstack(scores)
            for p, s in zip(params, all_scores):
                rows.append((row(k, s, p)))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = ['estimator', 'min_score',
                   'mean_score (R²)', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns], self.grid_searches


## Split Train Test Set

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    # df.drop(['Differential_pressure'], axis=1),
    df.drop(['RUL'], axis=1),
    # df['Differential_pressure'],
    df['RUL'],
    test_size=0.2,
    random_state=8,
    shuffle=True
)

X_train, X_validate, y_train, y_validate = train_test_split(
    # df.drop(['Differential_pressure'], axis=1),
    df.drop(['RUL'], axis=1),
    # df['Differential_pressure'],
    df['RUL'],
    test_size=0.25,
    random_state=8,
    shuffle=True
)

print("* Train set:", X_train.shape, y_train.shape,
      "\n* Validate set:",  X_validate.shape, y_validate.shape,
      "\n* Test set:",  X_test.shape, y_test.shape)


* Train set: (27271, 14) (27271,) 
* Validate set: (9091, 14) (9091,) 
* Test set: (7273, 14) (7273,)


In [9]:
X_train

Unnamed: 0,Data_No,Differential_pressure,4point_EWM,log_EWM,Flow_rate,Time,Dust_feed,Dust,change_DP,change_EWM,mass_g,cumulative_mass_g,Tt,filter_balance
16214,69,103.624100,100.920282,4.614331,81.432657,95.4,59.107236,0.900,2.44140,1.802545,0.053197,50.749473,145.7,82.73
25091,79,318.558300,317.685856,5.761063,81.035291,249.7,59.107236,1.200,0.63290,0.581629,0.070929,177.108921,258.1,46.91
25469,80,13.653790,13.794567,2.624275,80.786058,60.8,59.107236,1.200,-1.17549,-0.093852,0.070929,43.124639,193.9,97.72
15670,69,43.312350,41.457182,3.724661,81.280469,41.0,59.107236,0.900,3.70732,1.236779,0.053197,21.810570,145.7,92.78
24504,79,142.505800,143.883742,4.969006,80.528166,191.0,59.107236,1.200,-1.26590,-0.918628,0.070929,135.473784,258.1,76.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10376,66,8.138021,8.225592,2.107250,57.920776,14.5,118.214472,1.025,0.00000,-0.058380,0.121170,17.569626,55.3,98.64
34949,99,99.645550,99.186213,4.596999,80.579744,150.5,59.107236,1.200,0.00000,0.306225,0.070929,106.747668,248.2,83.39
18417,73,20.887590,20.667816,3.028578,58.930376,31.5,118.214472,1.025,0.00000,0.146516,0.121170,38.168498,114.5,96.52
25940,80,37.525320,37.714611,3.630048,81.627117,107.9,59.107236,1.200,-1.26591,-0.126194,0.070929,76.532049,193.9,93.75


## Grid Search CV - Sklearn

### Use default hyperparameters to find most suitable algorithm

In [10]:
models_quick_search = {
    "AdaBoostRegressor": AdaBoostRegressor(random_state=0),
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=0),
    "ExtraTreesRegressor": ExtraTreesRegressor(random_state=0),
    "GradientBoostingRegressor": GradientBoostingRegressor(random_state=0),
    'LinearRegression': LinearRegression(),
    "RandomForestRegressor": RandomForestRegressor(random_state=0),
    "SGDRegressor": SGDRegressor(random_state=0),
    "XGBRegressor": XGBRegressor(random_state=0),
}

params_quick_search = {
    "AdaBoostRegressor": {},
    "DecisionTreeRegressor": {},
    "ExtraTreesRegressor": {},
    "GradientBoostingRegressor": {},
    'LinearRegression': {},
    "RandomForestRegressor": {},
    "SGDRegressor": {},
    "XGBRegressor": {},
}

Do a hyperparameter optimisation search using default hyperparameters

In [11]:
search = HyperparameterOptimizationSearch(models=models_quick_search, params=params_quick_search)
search.fit(X_train, y_train, scoring='r2', n_jobs=-1, cv=5)


Running GridSearchCV for AdaBoostRegressor 

Fitting 5 folds for each of 1 candidates, totalling 5 fits

Running GridSearchCV for DecisionTreeRegressor 

Fitting 5 folds for each of 1 candidates, totalling 5 fits

Running GridSearchCV for ExtraTreesRegressor 

Fitting 5 folds for each of 1 candidates, totalling 5 fits

Running GridSearchCV for GradientBoostingRegressor 

Fitting 5 folds for each of 1 candidates, totalling 5 fits

Running GridSearchCV for LinearRegression 

Fitting 5 folds for each of 1 candidates, totalling 5 fits

Running GridSearchCV for RandomForestRegressor 

Fitting 5 folds for each of 1 candidates, totalling 5 fits

Running GridSearchCV for SGDRegressor 

Fitting 5 folds for each of 1 candidates, totalling 5 fits

Running GridSearchCV for XGBRegressor 

Fitting 5 folds for each of 1 candidates, totalling 5 fits


Check results

In [12]:
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score (R²)')
grid_search_summary

Unnamed: 0,estimator,min_score,mean_score (R²),max_score,std_score
5,RandomForestRegressor,0.997893,0.997974,0.998044,5.2e-05
7,XGBRegressor,0.997875,0.997925,0.998039,5.9e-05
2,ExtraTreesRegressor,0.997779,0.997872,0.99795,5.5e-05
1,DecisionTreeRegressor,0.997692,0.99776,0.997832,6.1e-05
3,GradientBoostingRegressor,0.982586,0.983843,0.985559,0.001303
0,AdaBoostRegressor,0.914072,0.917311,0.922712,0.00319
4,LinearRegression,0.749667,0.755462,0.759751,0.00378
6,SGDRegressor,0.749423,0.755308,0.759548,0.003767


---

## Remove Calculated Requirements
These are naturally be cross correlated to the base requirement they are calculated and may unduly skew the model.

In [13]:
df.head(3)

Unnamed: 0,Data_No,Differential_pressure,4point_EWM,log_EWM,Flow_rate,Time,Dust_feed,Dust,RUL,change_DP,change_EWM,mass_g,cumulative_mass_g,Tt,filter_balance
0,51,2.622251,1.159577,0.148056,55.524146,0.4,236.428943,1.025,58.6,2.622251,0.975116,0.24234,0.969359,36.6,99.56
1,51,3.888165,2.251012,0.81138,55.852018,0.5,236.428943,1.025,58.5,1.265914,1.091435,0.24234,1.211698,36.6,99.35
2,51,4.521122,3.159056,1.150273,56.130203,0.6,236.428943,1.025,58.4,0.632957,0.908044,0.24234,1.454038,36.6,99.25


In [14]:
df = df.drop(['4point_EWM', 'log_EWM', 'change_DP', 'change_EWM', 'mass_g', 'cumulative_mass_g', 'Tt', 'filter_balance'], axis=1)
df

Unnamed: 0,Data_No,Differential_pressure,Flow_rate,Time,Dust_feed,Dust,RUL
0,51,2.622251,55.524146,0.4,236.428943,1.025,58.6
1,51,3.888165,55.852018,0.5,236.428943,1.025,58.5
2,51,4.521122,56.130203,0.6,236.428943,1.025,58.4
3,51,4.521122,56.150070,0.7,236.428943,1.025,58.3
4,51,4.521122,56.090457,0.8,236.428943,1.025,58.2
...,...,...,...,...,...,...,...
36357,100,465.494800,82.675521,52.0,316.985065,1.200,8.2
36358,100,464.228900,82.421873,52.1,316.985065,1.200,8.1
36359,100,466.037300,82.743156,52.2,316.985065,1.200,8.0
36360,100,472.276500,82.785427,52.3,316.985065,1.200,7.9


## Re-Split Train, Test & Validation Sets

In [15]:
# from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    # df.drop(['Differential_pressure'], axis=1),
    df.drop(['RUL'], axis=1),
    # df['Differential_pressure'],
    df['RUL'],
    test_size=0.2,
    random_state=8,
    shuffle=True
)

X_train, X_validate, y_train, y_validate = train_test_split(
    # df.drop(['Differential_pressure'], axis=1),
    df.drop(['RUL'], axis=1),
    # df['Differential_pressure'],
    df['RUL'],
    test_size=0.25,
    random_state=8,
    shuffle=True
)

print("* Train set:", X_train.shape, y_train.shape,
      "\n* Validate set:",  X_validate.shape, y_validate.shape,
      "\n* Test set:",  X_test.shape, y_test.shape)

* Train set: (27271, 6) (27271,) 
* Validate set: (9091, 6) (9091,) 
* Test set: (7273, 6) (7273,)


Re-Run hyperparameter optimisation search using default hyperparameters

In [16]:
search = HyperparameterOptimizationSearch(models=models_quick_search, params=params_quick_search)
search.fit(X_train, y_train, scoring='r2', n_jobs=-1, cv=5)


Running GridSearchCV for AdaBoostRegressor 

Fitting 5 folds for each of 1 candidates, totalling 5 fits

Running GridSearchCV for DecisionTreeRegressor 

Fitting 5 folds for each of 1 candidates, totalling 5 fits

Running GridSearchCV for ExtraTreesRegressor 

Fitting 5 folds for each of 1 candidates, totalling 5 fits

Running GridSearchCV for GradientBoostingRegressor 

Fitting 5 folds for each of 1 candidates, totalling 5 fits

Running GridSearchCV for LinearRegression 

Fitting 5 folds for each of 1 candidates, totalling 5 fits

Running GridSearchCV for RandomForestRegressor 

Fitting 5 folds for each of 1 candidates, totalling 5 fits

Running GridSearchCV for SGDRegressor 

Fitting 5 folds for each of 1 candidates, totalling 5 fits

Running GridSearchCV for XGBRegressor 

Fitting 5 folds for each of 1 candidates, totalling 5 fits


Check Results

In [17]:
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score (R²)')
grid_search_summary

Unnamed: 0,estimator,min_score,mean_score (R²),max_score,std_score
5,RandomForestRegressor,0.967787,0.970117,0.972005,0.00153
2,ExtraTreesRegressor,0.967149,0.969339,0.970627,0.001418
1,DecisionTreeRegressor,0.965764,0.967928,0.969448,0.001542
3,GradientBoostingRegressor,0.954645,0.956254,0.958379,0.001257
0,AdaBoostRegressor,0.931043,0.933675,0.935846,0.001536
4,LinearRegression,0.714387,0.717906,0.721581,0.003035
6,SGDRegressor,0.714236,0.717847,0.721591,0.003063
7,XGBRegressor,0.514137,0.519242,0.530274,0.005662


### Observations
* The average **R² score** (mean_score) indicates how well a model of the data fits the actual data. It ranges from **0.51** to **0.97**, which is exceptional, as value of R² score = 1 represents a perfect fit.
* The R² score of most estimators is much higher than the **0.7** tolerance we decided in the business case.
    * We could use this information to feedback to the business team to review the business model.
    * A tolerance level between **0.85** to **0.95** may be suitable for this dataset / business case.


* From this quick review, we have a range of choices between most regressors. The best result is the **RandomForestRegressor Regressor** however any of the ExtraTree, DecisionTree or GradientBoosting regressors perform above the current business requirement tolerance.

* The GradientBoosting regressor under performs and would not be included for further consideration in this business case.

In [18]:
# stop

### Do an extensive search on the most suitable model to find the best hyperparameter configuration.

Define model and parameters, for Extensive Search

#### Random Forest Regressor (8min)

In [19]:
# documentation to help on hyperparameter list: 
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

models_search = {
    'RandomForestRegressor': RandomForestRegressor(),
}

params_search = {
    'RandomForestRegressor':{
        # 'model__criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
        'model__criterion': ['poisson'],
        # # 'model__max_depth': [None],
        # 'model__max_depth': [3,10,None],
        'model__max_features': [1.0, 'sqrt', 'log2'],
        # 'model__n_estimators': [100,300,600,29089],
        'model__n_estimators': [100,400,800],
        # 'model__n_jobs': [None, 1],
        # 'model__n_jobs': [None],
    }
}

Extensive GridSearch CV

In [20]:
search_rfr = HyperparameterOptimizationSearch(models=models_search, params=params_search)
search_rfr.fit(X_train, y_train, scoring='r2', n_jobs=-1, cv=5)


Running GridSearchCV for RandomForestRegressor 

Fitting 5 folds for each of 9 candidates, totalling 45 fits




Check results

In [21]:
grid_search_summary, grid_search_pipelines = search_rfr.score_summary(sort_by='mean_score (R²)')
grid_search_summary

Unnamed: 0,estimator,min_score,mean_score (R²),max_score,std_score,model__criterion,model__max_features,model__n_estimators
8,RandomForestRegressor,0.96787,0.970215,0.972034,0.001545,poisson,log2,600
1,RandomForestRegressor,0.967894,0.970209,0.972032,0.001519,poisson,1.0,300
2,RandomForestRegressor,0.96788,0.970208,0.972005,0.001525,poisson,1.0,600
5,RandomForestRegressor,0.967817,0.970205,0.972016,0.001549,poisson,sqrt,600
7,RandomForestRegressor,0.967819,0.970204,0.972048,0.001551,poisson,log2,300
4,RandomForestRegressor,0.96781,0.970179,0.971949,0.001538,poisson,sqrt,300
0,RandomForestRegressor,0.967741,0.970138,0.971942,0.001562,poisson,1.0,100
6,RandomForestRegressor,0.967849,0.970131,0.971978,0.001521,poisson,log2,100
3,RandomForestRegressor,0.967821,0.970121,0.971956,0.001526,poisson,sqrt,100


In [22]:
# grid_search_summary_RForest, grid_search_pipelines_RForest = search_rfr.score_summary(sort_by='mean_score (R²)')
# grid_search_summary = grid_search_summary_RForest
# grid_search_pipelines = grid_search_pipelines_RForest
# grid_search_summary_RForest

In [23]:
# stop

---

#### Extra Trees Regressor (48min)

In [24]:
# documentation to help on hyperparameter list: 
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesRegressor.html

models_search = {
    'ExtraTreesRegressor': ExtraTreesRegressor(),
}

params_search = {
    'ExtraTreesRegressor':{
        'model__criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
        # # 'model__max_depth': [None],
        # 'model__max_depth': [3,10,None],
        # 'model__max_features': [1.0, 'sqrt', 'log2'],
        # model__min_samples_split': [2,4,6],
        # 'model__n_estimators': [100,200,300],
        # 'model__n_jobs': [None, 1],
    }
}

Extensive GridSearch CV

In [25]:
# search_et = HyperparameterOptimizationSearch(models=models_search, params=params_search)
# search_et.fit(X_train, y_train, scoring='r2', n_jobs=-1, cv=2)

Check Results

In [26]:
# grid_search_summary_ExtraTrees, grid_search_pipelines_ExtraTrees = search_et.score_summary(sort_by='mean_score (R²)')
# grid_search_summary_ExtraTrees

Concatenation into a summary

In [27]:
# grid_search_summary = pd.concat([grid_search_summary_RForest, grid_search_summary_ExtraTrees], ignore_index=True)
# grid_search_pipelines = dict(grid_search_summary_RForest); grid_search_pipelines.update(grid_search_summary_ExtraTrees)

---

#### Check the best model

In [28]:
best_model = grid_search_summary.iloc[0, 0]
best_model

'RandomForestRegressor'

Parameters for best model

In [29]:
grid_search_pipelines[best_model].best_params_

{'model__criterion': 'poisson',
 'model__max_features': 'log2',
 'model__n_estimators': 600}

Define the best regressor, based on search

In [30]:
best_regressor_pipeline = grid_search_pipelines[best_model].best_estimator_
best_regressor_pipeline

In [31]:
type(X_train)

pandas.core.frame.DataFrame

In [32]:
stop

NameError: name 'stop' is not defined

## Assess feature importance

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

# after data cleaning and feature engineering, the features may have changes
# how many data cleaning and feature engineering steps does your pipeline have?
data_cleaning_feat_eng_steps = 2
columns_after_data_cleaning_feat_eng = (Pipeline(best_regressor_pipeline.steps[:data_cleaning_feat_eng_steps])
                                        .transform(X_train)
                                        .columns)

best_features = columns_after_data_cleaning_feat_eng[best_regressor_pipeline['feat_selection'].get_support(
)].to_list()

# create DataFrame to display feature importance
df_feature_importance = (pd.DataFrame(data={
    'Feature': columns_after_data_cleaning_feat_eng[best_regressor_pipeline['feat_selection'].get_support()],
    'Importance': best_regressor_pipeline['model'].feature_importances_})
    .sort_values(by='Importance', ascending=False)
)

# Most important features statement and plot
print(f"* These are the {len(best_features)} most important features in descending order. "
      f"The model was trained on them: \n{df_feature_importance['Feature'].to_list()}")

df_feature_importance.plot(kind='bar', x='Feature', y='Importance')
plt.show()


## Evaluate on Train and Test Sets

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np


def regression_performance(X_train, y_train, X_test, y_test, pipeline):
    print("Model Evaluation \n")
    print("* Train Set")
    regression_evaluation(X_train, y_train, pipeline)
    print("* Test Set")
    regression_evaluation(X_test, y_test, pipeline)


def regression_evaluation(X, y, pipeline):
    prediction = pipeline.predict(X)
    print('R2 Score:', r2_score(y, prediction).round(3))
    print('Mean Absolute Error:', mean_absolute_error(y, prediction).round(3))
    print('Mean Squared Error:', mean_squared_error(y, prediction).round(3))
    print('Root Mean Squared Error:', np.sqrt(
        mean_squared_error(y, prediction)).round(3))
    print("\n")


def regression_evaluation_plots(X_train, y_train, X_test, y_test, pipeline, alpha_scatter=0.5):
    pred_train = pipeline.predict(X_train)
    pred_test = pipeline.predict(X_test)

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
    sns.scatterplot(x=y_train, y=pred_train, alpha=alpha_scatter, ax=axes[0])
    sns.lineplot(x=y_train, y=y_train, color='red', ax=axes[0])
    axes[0].set_xlabel("Actual")
    axes[0].set_ylabel("Predictions")
    axes[0].set_title("Train Set")

    sns.scatterplot(x=y_test, y=pred_test, alpha=alpha_scatter, ax=axes[1])
    sns.lineplot(x=y_test, y=y_test, color='red', ax=axes[1])
    axes[1].set_xlabel("Actual")
    axes[1].set_ylabel("Predictions")
    axes[1].set_title("Test Set")

    plt.show()


Evaluate Performance

In [None]:
regression_performance(X_train, y_train, X_test, y_test, best_regressor_pipeline)
regression_evaluation_plots(X_train, y_train, X_test, y_test, best_regressor_pipeline)

# Regressor with PCA

Let's explore potential values for PCA n_components.

In [None]:
pipeline = PipelineOptimization(model=LinearRegression())
pipeline_pca = Pipeline(pipeline.steps[:3])
df_pca = pipeline_pca.fit_transform(df.drop(['tenure'], axis=1))

print(df_pca.shape,'\n', type(df_pca))

Apply PCA separately to the scaled data

In [None]:
import numpy as np
from sklearn.decomposition import PCA

n_components = 17


def pca_components_analysis(df_pca, n_components):
    pca = PCA(n_components=n_components).fit(df_pca)
    x_PCA = pca.transform(df_pca)  # array with transformed PCA

    ComponentsList = ["Component " + str(number)
                      for number in range(n_components)]
    dfExplVarRatio = pd.DataFrame(
        data=np.round(100 * pca.explained_variance_ratio_, 3),
        index=ComponentsList,
        columns=['Explained Variance Ratio (%)'])

    dfExplVarRatio['Accumulated Variance'] = dfExplVarRatio['Explained Variance Ratio (%)'].cumsum(
    )

    PercentageOfDataExplained = dfExplVarRatio['Explained Variance Ratio (%)'].sum(
    )

    print(
        f"* The {n_components} components explain {round(PercentageOfDataExplained,2)}% of the data \n")
    plt.figure(figsize=(12, 5))
    sns.lineplot(data=dfExplVarRatio,  marker="o")
    plt.xticks(rotation=90)
    plt.yticks(np.arange(0, 110, 10))
    plt.show()


pca_components_analysis(df_pca=df_pca, n_components=n_components)


In [None]:
n_components = 7
pca_components_analysis(df_pca=df_pca, n_components=n_components)

## Rewrite ML Pipeline for Modelling

In [None]:
# PCA
from sklearn.decomposition import PCA


def PipelineOptimization(model):
    pipeline_base = Pipeline([

        ("OrdinalCategoricalEncoder", OrdinalEncoder(encoding_method='arbitrary',
                                                     variables=['gender', 'Partner', 'Dependents', 'PhoneService',
                                                                'MultipleLines', 'InternetService', 'OnlineSecurity',
                                                                'OnlineBackup', 'DeviceProtection', 'TechSupport',
                                                                'StreamingTV', 'StreamingMovies', 'Contract',
                                                                'PaperlessBilling', 'PaymentMethod'])),


        ("SmartCorrelatedSelection", SmartCorrelatedSelection(variables=None,
         method="spearman", threshold=0.6, selection_method="variance")),


        ("feat_scaling", StandardScaler()),

        # PCA replace Feature Selection
        ("PCA", PCA(n_components=7, random_state=0)),

        ("model", model),

    ])

    return pipeline_base


## Grid Search CV – Sklearn

In [None]:
print("* Train set:", X_train.shape, y_train.shape, "\n* Test set:",  X_test.shape, y_test.shape)

### Use standard hyperparameters to find the most suitable model.

In [None]:
models_quick_search = {
    'LinearRegression': LinearRegression(),
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=0),
    "RandomForestRegressor": RandomForestRegressor(random_state=0),
    "ExtraTreesRegressor": ExtraTreesRegressor(random_state=0),
    "AdaBoostRegressor": AdaBoostRegressor(random_state=0),
    "GradientBoostingRegressor": GradientBoostingRegressor(random_state=0),
    "XGBRegressor": XGBRegressor(random_state=0),
}

params_quick_search = {
    'LinearRegression': {},
    "DecisionTreeRegressor": {},
    "RandomForestRegressor": {},
    "ExtraTreesRegressor": {},
    "AdaBoostRegressor": {},
    "GradientBoostingRegressor": {},
    "XGBRegressor": {},
}


Do a quick optimisation search 

In [None]:
quick_search = HyperparameterOptimizationSearch(models=models_quick_search, params=params_quick_search)
quick_search.fit(X_train, y_train, scoring='r2', n_jobs=-1, cv=5)

Check results

In [None]:
grid_search_summary, grid_search_pipelines = quick_search.score_summary(sort_by='mean_score')
grid_search_summary

### Do an extensive search on the most suitable model to find the best hyperparameter configuration.

Define model and parameters for extensive search

In [None]:
models_search = {
    "GradientBoostingRegressor":GradientBoostingRegressor(random_state=0),
}

# documentation to help on hyperparameter list: 
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html

# We will not conduct an extensive search, since the focus
# is on how to combine all knowledge in an applied project.
# In a workplace project, you may spend more time in this step
params_search = {
    "GradientBoostingRegressor":{
        'model__n_estimators': [100,300],
        'model__learning_rate': [1e-1,1e-2,1e-3], 
        'model__max_depth': [3,10, None],
    }
}

Extensive GridSearch CV

In [None]:
search = HyperparameterOptimizationSearch(models=models_search, params=params_search)
search.fit(X_train, y_train, scoring = 'r2', n_jobs=-1, cv=5)

Check results

In [None]:
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
grid_search_summary

Check the best model

In [None]:
best_model = grid_search_summary.iloc[0,0]
best_model

Parameters for best model

In [None]:
grid_search_pipelines[best_model].best_params_

Define the best regressor

In [None]:
best_regressor_pipeline = grid_search_pipelines[best_model].best_estimator_
best_regressor_pipeline

## Evaluate Regressor on Train and Tests Sets

In [None]:
regression_performance(X_train, y_train, X_test, y_test,best_regressor_pipeline)
regression_evaluation_plots(X_train, y_train, X_test, y_test,
                            best_regressor_pipeline)

# Convert Regression to Classification

### Convert numerical target to bins, and check if it is balanced

In [None]:
from feature_engine.discretisation import EqualFrequencyDiscretiser
disc = EqualFrequencyDiscretiser(q=3, variables=['tenure'])  # we will try q as 2, and 3
df_clf = disc.fit_transform(df)

print(f"* The classes represent the following ranges: \n{disc.binner_dict_} \n")
sns.countplot(data=df_clf, x='tenure')
plt.show()

In [None]:
df_clf.head(3)

## Rewrite ML Pipeline for Modelling

In [None]:
def PipelineOptimization(model):
    pipeline_base = Pipeline([

        ("OrdinalCategoricalEncoder", OrdinalEncoder(encoding_method='arbitrary',
                                                     variables=['gender', 'Partner', 'Dependents', 'PhoneService',
                                                                'MultipleLines', 'InternetService', 'OnlineSecurity',
                                                                'OnlineBackup', 'DeviceProtection', 'TechSupport',
                                                                'StreamingTV', 'StreamingMovies', 'Contract',
                                                                'PaperlessBilling', 'PaymentMethod'])),


        ("SmartCorrelatedSelection", SmartCorrelatedSelection(variables=None,
         method="spearman", threshold=0.6, selection_method="variance")),

        ("feat_scaling", StandardScaler()),

        ("feat_selection",  SelectFromModel(model)),

        ("model", model),

    ])

    return pipeline_base


## Load algorithms for classification

In [None]:
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier 
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

## Split Train Test Sets

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    df_clf.drop(['tenure'], axis=1),
    df_clf['tenure'],
    test_size=0.2,
    random_state=0
)

print("* Train set:", X_train.shape, y_train.shape,
      "\n* Test set:",  X_test.shape, y_test.shape)


## Grid Seach CV – Sklearn

### Use standard hyper parameters to find most suitable model

In [None]:
models_quick_search = {
    "XGBClassifier": XGBClassifier(random_state=0),
    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=0),
    "RandomForestClassifier": RandomForestClassifier(random_state=0),
    "GradientBoostingClassifier": GradientBoostingClassifier(random_state=0),
    "ExtraTreesClassifier": ExtraTreesClassifier(random_state=0),
    "AdaBoostClassifier": AdaBoostClassifier(random_state=0),
}

params_quick_search = {
    "XGBClassifier":{},
    "DecisionTreeClassifier":{},
    "RandomForestClassifier":{},
    "GradientBoostingClassifier":{},
    "ExtraTreesClassifier":{},
    "AdaBoostClassifier":{},
}

GridSearch CV

In [None]:
from sklearn.metrics import make_scorer, recall_score
quick_search = HyperparameterOptimizationSearch(models=models_quick_search, params=params_quick_search)
quick_search.fit(X_train, y_train,
                 scoring = make_scorer(recall_score, labels=[0], average=None),
                 n_jobs=-1,
                 cv=5)

Check results

In [None]:
grid_search_summary, grid_search_pipelines = quick_search.score_summary(sort_by='mean_score')
grid_search_summary

### Do an extensive search on the most suitable model to find the best hyperparameter configuration.

Define models and parameters

In [None]:
models_search = {
    "AdaBoostClassifier": AdaBoostClassifier(random_state=0),
}

# documentation to help on hyperparameter list:
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
params_search = {
    "AdaBoostClassifier": {
        'model__n_estimators': [50, 100, 300],
        'model__learning_rate': [1e-1, 1e-2, 1e-3],
    }
}


Extensive GridSearch CV

In [None]:
from sklearn.metrics import make_scorer,  recall_score
search = HyperparameterOptimizationSearch(
    models=models_search, params=params_search)
search.fit(X_train, y_train,
           scoring=make_scorer(recall_score, labels=[0], average=None),
           n_jobs=-1, cv=5)


Check results

In [None]:
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
grid_search_summary


Check the best model

In [None]:
best_model = grid_search_summary.iloc[0,0]
best_model

Parameters for best model
* We are saving this content for later

In [None]:
best_parameters = grid_search_pipelines[best_model].best_params_
best_parameters

Define the best clf pipeline

In [None]:
pipeline_clf = grid_search_pipelines[best_model].best_estimator_
pipeline_clf

## Assess feature importance

We can assess feature importance for this model with `.feature_importances_`

In [None]:
# after data cleaning and feat engine, the feature may space changes
# how much data cleaning and feature engineering does your pipeline have?
data_cleaning_feat_eng_steps = 2
columns_after_data_cleaning_feat_eng = (Pipeline(pipeline_clf.steps[:data_cleaning_feat_eng_steps])
                                        .transform(X_train)
                                        .columns)

# best_features = columns_after_data_cleaning_feat_eng
best_features = columns_after_data_cleaning_feat_eng[pipeline_clf['feat_selection'].get_support(
)].to_list()

# create DataFrame to display feature importance
df_feature_importance = (pd.DataFrame(data={
    'Feature': columns_after_data_cleaning_feat_eng[pipeline_clf['feat_selection'].get_support()],
    'Importance': pipeline_clf['model'].feature_importances_})
    .sort_values(by='Importance', ascending=False)
)

# reassign best features in order
best_features = df_feature_importance['Feature'].to_list()

# Most important features statement and plot
print(f"* These are the {len(best_features)} most important features in descending order. "
      f"The model was trained on them: \n{best_features}")

df_feature_importance.plot(kind='bar', x='Feature', y='Importance')
plt.show()


## Evaluate Classifier on Train and Test Sets

Custom Function

In [None]:
from sklearn.metrics import classification_report, confusion_matrix


def confusion_matrix_and_report(X, y, pipeline, label_map):

    prediction = pipeline.predict(X)

    print('---  Confusion Matrix  ---')
    print(pd.DataFrame(confusion_matrix(y_true=prediction, y_pred=y),
          columns=[["Actual " + sub for sub in label_map]],
          index=[["Prediction " + sub for sub in label_map]]
          ))
    print("\n")

    print('---  Classification Report  ---')
    print(classification_report(y, prediction, target_names=label_map), "\n")


def clf_performance(X_train, y_train, X_test, y_test, pipeline, label_map):
    print("#### Train Set #### \n")
    confusion_matrix_and_report(X_train, y_train, pipeline, label_map)

    print("#### Test Set ####\n")
    confusion_matrix_and_report(X_test, y_test, pipeline, label_map)


List that relates the classes and tenure interval

In [None]:
disc.binner_dict_['tenure']

We can create manually

In [None]:
label_map = ['<4.0', '4.0 to 20.0','+20.0']
label_map

In [None]:
clf_performance(X_train=X_train, y_train=y_train,
                        X_test=X_test, y_test=y_test,
                        pipeline=pipeline_clf,
                        label_map= label_map )

# Which pipeline to choose?

We fitted 3 pipelines:
* Regression
* Regression with PCA
* Classifier

The regressor pipelines didn't reach the expected performance threshold (0.7 R2 score) for the train and test set.

The classifier was tuned on Recall for class 0 (tenure <4 months), since we are interested to detect prospects that may churn soon. 
* It has reasonable performance for class 0 (<4 months) and class 2 (+20 months)
* Class 1 (4 to 20 months) has weak performance.

In [None]:
pipeline_clf

# Refit pipeline with best features

## Rewrite Pipeline

In [None]:
def PipelineOptimization(model):
    pipeline_base = Pipeline([

        ("OrdinalCategoricalEncoder", OrdinalEncoder(encoding_method='arbitrary',
                                                     variables=['Contract', 'PaymentMethod'])),

        ("feat_scaling", StandardScaler()),

        # feature selection is not needed

        ("model", model),

    ])

    return pipeline_base


## Split Train Test Set, only with best features

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    df_clf.drop(['tenure'], axis=1),
    df_clf['tenure'],
    test_size=0.2,
    random_state=0
)

print("* Train set:", X_train.shape, y_train.shape,
      "\n* Test set:",  X_test.shape, y_test.shape)


Subset Best Features

In [None]:
X_train = X_train.filter(best_features)
X_test = X_test.filter(best_features)

print("* Train set:", X_train.shape, y_train.shape, "\n* Test set:",  X_test.shape, y_test.shape)
X_train.head(3)

## Grid Search CV – Sklearn

We are using the same model from the previous GridCV search

In [None]:
models_search

And the best parameters from the previous GridCV search

In [None]:
best_parameters

You will need to type in manually since the hyperparameter values have to be a list. The previous dictionary is not in this format.

In [None]:
params_search = {'AdaBoostClassifier':  {
    'model__learning_rate': [0.001],   # the value should be in []
    'model__n_estimators': [50]       # the value should be in []
}
}
params_search

In [None]:
# params_search = {f'{models_search}': best_parameters}
# params_search

GridSearch CV

In [None]:
from sklearn.metrics import make_scorer, recall_score
search = HyperparameterOptimizationSearch(models=models_search, params=params_search)
search.fit(X_train, y_train,
           scoring = make_scorer(recall_score, labels=[0], average=None),
           n_jobs=-1,cv=5)


Check results

In [None]:
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
grid_search_summary

Check the best model

In [None]:
best_model = grid_search_summary.iloc[0,0]
best_model

Define the best clf pipeline

In [None]:
pipeline_clf = grid_search_pipelines[best_model].best_estimator_
pipeline_clf

## Assess feature importance

In [None]:
# how many data cleaning and feature engineering does your pipeline have?
data_cleaning_feat_eng_steps = 1
columns_after_data_cleaning_feat_eng = (Pipeline(pipeline_clf.steps[:data_cleaning_feat_eng_steps])
                                        .transform(X_train)
                                        .columns)

best_features = columns_after_data_cleaning_feat_eng

# create DataFrame to display feature importance
df_feature_importance = (pd.DataFrame(data={
    'Feature': columns_after_data_cleaning_feat_eng,
    'Importance': pipeline_clf['model'].feature_importances_})
    .sort_values(by='Importance', ascending=False)
)

# Most important features statement and plot
print(f"* These are the {len(best_features)} most important features in descending order. "
      f"The model was trained on them: \n{df_feature_importance['Feature'].to_list()}")

df_feature_importance.plot(kind='bar', x='Feature', y='Importance')
plt.show()


## Evaluate Classifier on Train and Test Sets

In [None]:
clf_performance(X_train=X_train, y_train=y_train,
                        X_test=X_test, y_test=y_test,
                        pipeline=pipeline_clf,
                        label_map= label_map )

# Push files to the repo

We will generate the following files

* Train set
* Test set
* Modeling pipeline
* label map
* features importance plot

In [None]:
import joblib
import os

version = 'v1'
file_path = f'outputs/ml_pipeline/predict_tenure/{version}'

try:
  os.makedirs(name=file_path)
except Exception as e:
  print(e)

## Train Set: features and target

In [None]:
X_train.head()

In [None]:
X_train.to_csv(f"{file_path}/X_train.csv", index=False)

In [None]:
y_train

In [None]:
y_train.to_csv(f"{file_path}/y_train.csv", index=False)

## Test Set: features and target

In [None]:
X_test.head()

In [None]:
X_test.to_csv(f"{file_path}/X_test.csv", index=False)

In [None]:
y_test

In [None]:
y_test.to_csv(f"{file_path}/y_test.csv", index=False)

## Modelling pipeline

ML pipeline for predicting tenure

In [None]:
pipeline_clf

In [None]:
joblib.dump(value=pipeline_clf, filename=f"{file_path}/clf_pipeline.pkl")

## List  mapping target levels to ranges

Map for converting numerical variable to categorical variable

In [None]:
label_map

In [None]:
joblib.dump(value=label_map, filename=f"{file_path}/label_map.pkl")

## Feature importance plot

In [None]:
df_feature_importance.plot(kind='bar', x='Feature', y='Importance')
plt.show()

In [None]:
df_feature_importance.plot(kind='bar',x='Feature',y='Importance')
plt.savefig(f'{file_path}/features_importance.png', bbox_inches='tight')

Good job! Clear cell's outputs, push to the repo using git commands and move on to the next notebook

---