# Preprocessing, modeling and fine-tuning

We want to be able to predict churning.

Ideas for models:

1. start only with non-spatial features
2. if necessary, incorporate location

In [7]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

In [8]:
import pandas as pd
import pathlib
from sklearn.model_selection import train_test_split
import nmmn.misc

## Import dataset

### Read files

Plans, dependents, charges

In [3]:
plans=pd.read_excel(pathlib.Path('../data/services.xlsx'))

Churn reason, category and satisfaction score

In [4]:
status=pd.read_excel(pathlib.Path('../data/churn_status.xlsx'))

Location

In [5]:
loc=pd.read_excel(pathlib.Path('../data/location.xlsx'))

### Join

Join all tables into one for convenience

In [6]:
tmp = pd.merge(plans, status, on='Customer ID', how='inner')  # 'inner' join by default
df = pd.merge(tmp, loc, on='Customer ID', how='inner')

## 4. Prepare the data

No new features are necessary

### Train/validation split

In [35]:
train, val = train_test_split(df, test_size=0.2, random_state=42)

### Split features and labels

Let's separate the labels from the features: `X`=features, `Y`=labels.

In [36]:
labels=["Churn Value"]
X=train.drop(columns=labels)
Xval=val.drop(columns=labels)
Y=train[labels]
Yval=val[labels]

### Pipeline

The idea here in the spirit of scikit-learn is to create a pipeline that performs all necessary transformations on the raw data, be them log, ratios, etc etc. It will also include the standardization for ML training. 

In [15]:
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder

In [40]:
# Features to keep
binary_categorical_features = ['Phone Service',	'Online Security', 'Online Backup',	'Premium Tech Support']  
multi_class_categorical_features = ['Contract'] 
numerical_features = ['Number of Referrals', 'Tenure in Months', 'Monthly Charge', 'Satisfaction Score']

# Define preprocessing steps
preprocess = ColumnTransformer([
    ('binary_cat', OneHotEncoder(drop='if_binary'), binary_categorical_features),  # Binary encoded
    ('multi_cat', OneHotEncoder(), multi_class_categorical_features),  # One-hot encoding for multi-class
    ('num', StandardScaler(), numerical_features)  # Scaling for numerical features
], remainder='drop')

Inspect to see what came out

In [41]:
preprocess

In [42]:
temp=preprocess.fit_transform(X)

In [43]:
preprocess.get_feature_names_out()

array(['binary_cat__Phone Service_Yes', 'binary_cat__Online Security_Yes',
       'binary_cat__Online Backup_Yes',
       'binary_cat__Premium Tech Support_Yes',
       'multi_cat__Contract_Month-to-Month',
       'multi_cat__Contract_One Year', 'multi_cat__Contract_Two Year',
       'num__Number of Referrals', 'num__Tenure in Months',
       'num__Monthly Charge', 'num__Satisfaction Score'], dtype=object)

# 5. Select and train model

In [67]:
import nmmn.ml

### a. Logistic regression

In [68]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

Incorporates model into pipeline

In [107]:
logistic = make_pipeline(preprocess, LogisticRegression(solver='lbfgs') )

Train the model

In [108]:
%%time
logistic.fit(X, Y)

CPU times: user 339 ms, sys: 29.6 ms, total: 368 ms
Wall time: 192 ms


  y = column_or_1d(y, warn=True)


#### Accuracy

Training error

In [109]:
from sklearn.metrics import accuracy_score

In [110]:
accuracy_score(Y, logistic.predict(X))

0.9559815406460774

Validation error

In [111]:
accuracy_score(Yval, logistic.predict(Xval))

0.9481902058197303

#### Coefficients

Inspect coefficients of logistic regression to understand what are the most important factors driving churn.

In [123]:
# Access the LogisticRegression model and feature names from the pipeline
logistic_model = logistic.named_steps['logisticregression']
feature_names = logistic.named_steps['columntransformer'].get_feature_names_out() # Assuming you have a ColumnTransformer

# Get the coefficients
coefficients = logistic_model.coef_[0]  # Access the first row (for binary classification)

# Create a DataFrame for better visualization
coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# Print the DataFrame
coef_df.sort_values(by='Coefficient', ascending=False, key=abs)

Unnamed: 0,Feature,Coefficient
10,num__Satisfaction Score,-6.935611
1,binary_cat__Online Security_Yes,-2.87566
0,binary_cat__Phone Service_Yes,-1.035499
9,num__Monthly Charge,1.026976
7,num__Number of Referrals,-1.016859
6,multi_cat__Contract_Two Year,-0.881557
4,multi_cat__Contract_Month-to-Month,0.69062
3,binary_cat__Premium Tech Support_Yes,-0.581608
8,num__Tenure in Months,-0.568135
2,binary_cat__Online Backup_Yes,-0.368813


### b. Random forest

In [75]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

In [76]:
forest = make_pipeline(preprocess, RandomForestClassifier(random_state=42))

Convert `Y` to one column for fitting

In [78]:
Ytrain1=np.argmax(Y.values, axis=1)
Yval1=np.argmax(Yval.values, axis=1)

In [79]:
%%time
# Train the classifier
forest.fit(X, Ytrain1)

CPU times: user 93.4 ms, sys: 9.78 ms, total: 103 ms
Wall time: 102 ms


#### Accuracy

Training error

In [84]:
accuracy_score(Y, forest.predict(X))

0.7392616258430955

Validation error

In [85]:
accuracy_score(Yval, forest.predict(Xval))

0.71611071682044

### c. XGBoost

In [87]:
from xgboost import XGBClassifier

In [88]:
# Set up the parameters for the XGBoost model
params = {
    'objective': 'binary:logistic',  
    'max_depth': 6,                # Depth of the trees
    'learning_rate': 0.1,          # Step size shrinkage
    'eval_metric': 'logloss',     
    'seed': 42                     # For reproducibility
}

#### Train

In [89]:
gbm = make_pipeline(preprocess, XGBClassifier(**params))#, use_label_encoder=False))

In [90]:
%%time
# Train the classifier
gbm.fit(X, Y)

CPU times: user 292 ms, sys: 114 ms, total: 407 ms
Wall time: 125 ms


#### Accuracy

Training error

In [91]:
accuracy_score(Y, gbm.predict(X))

0.9746183883564076

Validation error

In [92]:
accuracy_score(Yval, gbm.predict(Xval))

0.9446415897799858

### Model comparison

#### Accuracy (%)

| Model    | Training | Validation | 
|----------|----------|------------|
| Random   | 50       |            | 
| Logistic | 96       | 95         | 
| RF       | 74       | 72         | 
| GBM      | 97       | 94         |

## 6. Fine-tune

We selected *logistic* as the best model. Let's pick the best hyperparameters in a Monte Carlo fashion.

In [112]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats
from sklearn.pipeline import Pipeline

### Hyperparameters

In [126]:
params = {
    'C': 0.1
}

In [127]:
tuningPipeline = Pipeline([
    ("preprocess", preprocess),
    ("logit", LogisticRegression(**params)),
])

In [128]:
param_distribs = {'logit__C': scipy.stats.loguniform(0.01, 10)                 
                 }

### Number crunching

Let's try 100 random combinations of hyperparameters: `niter * cv`

In [129]:
niter=10
cv=10

In [130]:
search = RandomizedSearchCV(
    tuningPipeline, param_distributions=param_distribs, n_iter=niter, cv=cv,
    scoring='accuracy', random_state=42)

In [133]:
import warnings
warnings.filterwarnings("ignore", module="sklearn.*") # Suppress warnings from the sklearn module

In [134]:
%%time
search.fit(X, Y)

CPU times: user 13.8 s, sys: 6.43 s, total: 20.2 s
Wall time: 2.69 s


In [135]:
res = pd.DataFrame(search.cv_results_)
res.sort_values(by="mean_test_score", ascending=False, inplace=True)
res.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_logit__C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
3,0.019575,0.004854,0.002831,0.000642,0.625137,{'logit__C': 0.6251373574521749},0.955674,0.950355,0.960993,0.953901,0.94849,0.952043,0.952043,0.968028,0.950266,0.957371,0.954916,0.005628,1
8,0.020411,0.00465,0.004833,0.00253,0.635836,{'logit__C': 0.6358358856676253},0.955674,0.950355,0.960993,0.953901,0.946714,0.953819,0.952043,0.968028,0.950266,0.957371,0.954916,0.005794,1
7,0.017804,0.004313,0.005013,0.001865,3.967605,{'logit__C': 3.9676050770529883},0.957447,0.952128,0.95922,0.953901,0.94849,0.952043,0.952043,0.964476,0.950266,0.957371,0.954738,0.004585,3
1,0.02324,0.004473,0.003313,0.001348,7.114476,{'logit__C': 7.114476009343421},0.955674,0.952128,0.95922,0.953901,0.94849,0.952043,0.952043,0.964476,0.950266,0.957371,0.954561,0.004511,4
9,0.02113,0.00454,0.004549,0.001886,1.331122,{'logit__C': 1.3311216080736887},0.955674,0.950355,0.95922,0.955674,0.944938,0.950266,0.952043,0.966252,0.952043,0.957371,0.954383,0.005567,5


### Results

Best hyperparameters

In [138]:
cols=[4,16]
res2=res.iloc[:, cols]
res2.head()

Unnamed: 0,param_logit__C,mean_test_score
3,0.625137,0.954916
8,0.635836,0.954916
7,3.967605,0.954738
1,7.114476,0.954561
9,1.331122,0.954383


There is very little room for tuning with logistic regression, so I will not bother.

## 7. Present solution