# Ugly Duckings Churn Prediction

## Import required packages

In [333]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression # is the package to build the linear regression model
from sklearn import metrics # contains functions for module evaluation
from sklearn.utils.class_weight import compute_class_weight

# train/test split lib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.feature_selection import SelectFromModel

# Statsmodels
import statsmodels.api as sm # models
import statsmodels.tools # metrics

# Smote
from imblearn.over_sampling import SMOTE

import ast

## Load Data

In [171]:
pd.options.display.max_columns = None

In [173]:
df = pd.read_csv("../swan_data.csv")

## Cleaning and Feature Engineering Functions

In [176]:
def data_cleaning(df):
    #Making a copy of the dataset
    df = df.copy()

    #Using CustomerID as index
    df.set_index("CustomerID", inplace=True)

    #Fixing Total Charges column - inserting zeroes for blank columns and casting to float
    df["Total Charges"] = df["Total Charges"].str.replace(" ", "0")
    df = df.astype({"Total Charges": float})

    #Dropping unnecessary columns
    df.drop(columns=["Count", "Country", "State", "City", "Zip Code", "Lat Long", "Churn Label", "Churn Reason"], inplace=True)

    #Mapping columns to numeric values
    #General case, where values are "yes" and "no". Results like "no phone service" are mapped to 0 since the lack of phone service is contained in a different column
    general_mapper = {"No":0, "Yes":1, "No phone service": 0, "No internet service": 0}
    for col in df.columns:
        if "No" in df[col].unique() and "Yes" in df[col].unique():
            df[col] = df[col].map(general_mapper)
    
    #Mapping male to 0 and female to 1
    gender_mapper = {"Male":0, "Female":1}
    df["Gender"] = df["Gender"].map(gender_mapper)

    #Mapping contract lengths into a value representing the length of the contract term in years
    contract_mapper = {"Month-to-month": 1/12, "Two year": 2, "One year":1}
    df["Contract"] = df["Contract"].map(contract_mapper)

    return df


In [178]:
def min_max_scaling(df, train, col):
    minimum = train[col].min() # Min of train
    maximum = train[col].max() # Max of train
    df[col] = (df[col] - minimum) / (maximum-minimum) # Applying scaling
    return df[col] ## Returned scaled column


In [180]:
def metrics_func(actual, pred):
    accuracy = metrics.accuracy_score(actual, pred)
    precision = metrics.precision_score(actual, pred)
    recall = metrics.recall_score(actual, pred)
    f1 = metrics.f1_score(actual, pred)

    print(f'accuracy: {accuracy*100}%')
    print(f'precision: {precision*100}%')
    print(f'recall: {recall*100}%')
    print(f'f1: {f1*100}%')

In [182]:
def cutoff_change(target, data, n):
    data['y_pred'] = np.where(data.prob_churn > n, # Threshold
                                   1, # Condition if met
                                   0) # Condition if not met
    return metrics_func(target, data['y_pred'])

In [184]:
def feature_engineering(df):
    # Columns to scale
    #scale_cols = ['Monthly Charges','Total Charges','Tenure Months']
    
    # Apply min-max scaling to columns listed above
   # for col in scale_cols:
      #  df[col] = min_max_scaling(df, train, col)
    
    # OHE Payment Method and Internet Service columns
    df = pd.get_dummies(df, 
                        columns = ['Payment Method', 'Internet Service'], 
                        prefix = ['pay','is'], 
                        drop_first = True, 
                        dtype = int)
    
    
    return df.drop(columns = ['Partner', 'Paperless Billing'])

# Logistic Regression Model

In [187]:
df_clean = data_cleaning(df)

In [189]:
X = df_clean.drop(columns = 'Churn Value').copy()
y = df_clean['Churn Value'].copy()

X_train, X_test, y_train, y_test = train_test_split(X, # features
                                                    y, # target
                                                    test_size=0.2, # What proportion of data is for testing
                                                    random_state=1204) ## Seed for random split

In [191]:
X_train_fe = feature_engineering(X_train)

In [193]:
all(X_train_fe.index == y_train.index) # sanity check

True

### Initial model

In [197]:
first_logreg = LogisticRegression(max_iter=10000, 
                                    penalty='l1', 
                                    solver='liblinear')

In [199]:
first_logreg.fit(X_train_fe, y_train)

### Feature selection

In [230]:
# Initialising selected model
selection = SelectFromModel(first_logreg, # initial model
                            prefit=True, # Has first_logreg already been fit
                            importance_getter='coef_',
                            threshold='median') # Determines selection of features based on coefficients

# 
X_train_sel = selection.fit_transform(X_train_fe)

### Stack exchange
feature_idx = selection.get_support()
feature_name = X_train_fe.columns[feature_idx]

X_train_sel = pd.DataFrame(X_train_sel, columns = feature_name)

In [266]:
X_train_sel.head()

Unnamed: 0,Dependents,Phone Service,Multiple Lines,Online Security,Online Backup,Tech Support,Streaming Movies,Contract,pay_Electronic check,is_Fiber optic,is_No
0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.083333,0.0,1.0,0.0
1,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.083333,1.0,1.0,0.0
2,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.083333,1.0,1.0,0.0
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0


### Class imbalance

In [232]:
# Define the SMOTE model
smote = SMOTE()

# Apply SMOTE to the training data
X_train_smote, y_train_smote = smote.fit_resample(X_train_sel, y_train)

In [274]:
y_train_smote.value_counts()

Churn Value
0    4141
1    4141
Name: count, dtype: int64

### Optimise recall

In [234]:
log_reg = LogisticRegression(penalty = 'l1', solver='liblinear')

In [236]:
# Define the parameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'max_iter': [500,1000,5000,10000]
}

# Implement GridSearchCV
grid_search = GridSearchCV(sk_results, param_grid, cv=10, scoring='recall')
grid_search.fit(X_train_smote, y_train_smote)

In [237]:
best_accuracy = grid_search.best_estimator_

### Model in use

In [238]:
X_train_smote[['prob_retain','prob_churn']] = best_accuracy.predict_proba(X_train_smote)

In [260]:
cutoff_change(y_train_smote, X_train_smote, 0.5)

accuracy: 77.07075585607342%
precision: 74.52954048140043%
recall: 82.25066409079932%
f1: 78.19997704052348%


In [262]:
X_test_fe = feature_engineering(X_test)
X_test_sel = selection.transform(X_test_fe)
X_test_sel = pd.DataFrame(X_test_sel, columns=feature_name)

X_test_sel[['prob_retain','prob_churn']] = best_accuracy.predict_proba(X_test_sel)

cutoff_change(y_test, X_test_sel, 0.5)

accuracy: 73.59829666430092%
precision: 50.330033003300336%
recall: 81.11702127659575%
f1: 62.11812627291242%


In [280]:
X_train_smote

Unnamed: 0,Dependents,Phone Service,Multiple Lines,Online Security,Online Backup,Tech Support,Streaming Movies,Contract,pay_Electronic check,is_Fiber optic,is_No,prob_retain,prob_churn,y_pred
0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.083333,0.0,1.0,0.0,0.253254,0.746746,1
1,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.083333,1.0,1.0,0.0,0.186281,0.813719,1
2,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.083333,1.0,1.0,0.0,0.186281,0.813719,1
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.000000,0.0,0.0,0.0,0.761589,0.238411,0
4,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.000000,1.0,1.0,0.0,0.551568,0.448432,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8277,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,1.0,0.603424,0.396576,0
8278,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.083333,1.0,1.0,0.0,0.207590,0.792410,1
8279,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.083333,1.0,1.0,0.0,0.178404,0.821596,1
8280,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.083333,1.0,1.0,0.0,0.135582,0.864418,1


## 500 Most likely to churn

In [317]:
X_fe = feature_engineering(X)
X_sel = selection.transform(X_fe)
X_sel = pd.DataFrame(X_sel, columns=feature_name)

X_sel = X_sel.set_index(y.index)

In [319]:
X_sel[['prob_retain','prob_churn']] = best_accuracy.predict_proba(X_sel)
X_sel['Churn Value'] = y
X_sel.reset_index(inplace = True)

In [323]:
risk_500 = X_sel[X_sel['Churn Value'] == 0][['CustomerID','prob_churn']]\
    .sort_values(by = 'prob_churn',
                 ascending = False)[:500]

In [370]:
risk_500.to_csv("Logistic_500.csv", index=False)

### Compare with decision tree list

In [327]:
lst_500 = list(risk_500['CustomerID'])

In [335]:
f = open("500_most.txt", "r")
matthew_lst = f.read()

matthew_lst = ast.literal_eval(matthew_lst)

In [338]:
len(set(lst_500).intersection(set(matthew_lst)))

349

## Churn risk

In [357]:
churn_risk = X_sel[X_sel['Churn Value'] == 0][['CustomerID','prob_churn']]\
    .sort_values(by = 'prob_churn',
                 ascending = False)

In [366]:
churn_risk.to_csv("Churn Risk.csv", index=False)

In [1077]:
# Define the SMOTE model
smote = SMOTE()

# Apply SMOTE to the training data
X_train_smote, y_train_smote = smote.fit_resample(X_train_fe, y_train)

In [1079]:
logreg = sm.Logit(y_train_smote, X_train_smote)

In [1081]:
results = logreg.fit()

Optimization terminated successfully.
         Current function value: 0.338731
         Iterations 8


In [1083]:
results.summary()

0,1,2,3
Dep. Variable:,Churn Value,No. Observations:,8282.0
Model:,Logit,Df Residuals:,8260.0
Method:,MLE,Df Model:,21.0
Date:,"Wed, 05 Jun 2024",Pseudo R-squ.:,0.5113
Time:,11:19:19,Log-Likelihood:,-2805.4
converged:,True,LL-Null:,-5740.6
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Latitude,0.0635,0.018,3.485,0.000,0.028,0.099
Longitude,0.0659,0.008,8.623,0.000,0.051,0.081
Gender,-0.2772,0.068,-4.090,0.000,-0.410,-0.144
Senior Citizen,-0.1950,0.090,-2.159,0.031,-0.372,-0.018
Dependents,-1.8723,0.123,-15.221,0.000,-2.113,-1.631
Tenure Months,-0.0933,0.006,-14.778,0.000,-0.106,-0.081
Phone Service,-7.6740,0.409,-18.777,0.000,-8.475,-6.873
Multiple Lines,-1.6322,0.123,-13.235,0.000,-1.874,-1.390
Online Security,-2.4161,0.127,-18.971,0.000,-2.666,-2.167


In [1085]:
y_pred = results.predict(X_train_smote)

In [1087]:
first_results = X_train_smote.copy()

In [1108]:
first_results['prob_churn'] = y_pred

In [1110]:
def cutoff_change(target, data, n):
    data['y_pred'] = np.where(data.prob_churn > n, # Threshold
                                   1, # Condition if met
                                   0) # Condition if not met
    return metrics_func(target, data['y_pred'])

In [1112]:
cutoff_change(y_train_smote, first_results, 0.5)

accuracy: 83.40980439507365%
precision: 82.91696407328098%
recall: 84.15841584158416%
f1: 83.53307766059443%


In [1114]:
X_test_fe = feature_engineering(X_test)

In [1116]:
y_pred_test = results.predict(X_test_fe)

In [1124]:
test_results = X_test_fe.copy()

test_results['prob_churn'] = y_pred_test

In [1126]:
cutoff_change(y_test, test_results, 0.5)

accuracy: 78.06955287437899%
precision: 57.772621809744784%
recall: 66.22340425531915%
f1: 61.7100371747212%


In [1128]:
test_results['y_pred'].unique()

array([1, 0])

In [93]:
type(y)

pandas.core.series.Series

In [645]:
X = df_clean.drop(columns = 'Churn Value').copy()
y = df_clean['Churn Value'].copy()

In [647]:
X_fe = feature_engineering(X)

In [649]:
X_fe[['prob_retain','prob_churn']] = model_selected.predict_proba(X_fe)

In [651]:
cutoff_change(y, X_fe, 0.45)

accuracy: 77.42439301434048%
precision: 55.691554467564266%
recall: 73.03370786516854%
f1: 63.19444444444445%


In [653]:
X_fe['Churn Value'] = y

In [655]:
X_fe.reset_index(inplace = True)

In [657]:
risk_500 = X_fe[X_fe['Churn Value'] == 0][['CustomerID','prob_churn']][:500]\
    .sort_values(by = 'prob_churn',
                 ascending = False)

In [659]:
lst_500 = list(risk_500.CustomerID)

In [661]:
f = open("500_most.txt", "r")
matthew_lst = f.read()

In [663]:
import ast
matthew_lst = ast.literal_eval(matthew_lst)

In [665]:
len(set(lst_500).intersection(set(matthew_lst)))

48

In [435]:
matthew_lst

['5542-TBBWB',
 '7577-SWIFR',
 '5043-TRZWM',
 '9603-OAIHC',
 '7439-DKZTW',
 '3318-NMQXL',
 '1452-VOQCH',
 '8622-ZLFKO',
 '4912-PIGUY',
 '7465-ZZRVX',
 '8161-QYMTT',
 '6630-UJZMY',
 '2545-EBUPK',
 '3878-AVSOQ',
 '5150-ITWWB',
 '4234-XTNEA',
 '2789-HQBOU',
 '6357-JJPQT',
 '2262-SLNVK',
 '3841-CONLJ',
 '4132-KALRO',
 '8714-CTZJW',
 '8739-XNIKG',
 '2215-ZAFGX',
 '8266-VBFQL',
 '8087-LGYHQ',
 '6856-RAURS',
 '3462-BJQQA',
 '4749-OJKQU',
 '4090-KPJIP',
 '9605-WGJVW',
 '8035-PWSEV',
 '8040-MNRTF',
 '7994-UYIVZ',
 '9840-EFJQB',
 '1640-PLFMP',
 '1197-BVMVG',
 '4633-MKHYU',
 '9094-AZPHK',
 '1628-BIZYP',
 '3320-VEOYC',
 '5935-FCCNB',
 '3411-WLRSQ',
 '0722-SVSFK',
 '2038-OEQZH',
 '7225-CBZPL',
 '9919-KNPOO',
 '5348-CAGXB',
 '3096-GKWEB',
 '4927-WWOOZ',
 '0021-IKXGC',
 '6969-MVBAI',
 '3452-GWUIN',
 '2371-JQHZZ',
 '9957-YODKZ',
 '5429-LWCMV',
 '7668-XCFYV',
 '0644-OQMDK',
 '7858-GTZSP',
 '1173-XZPYF',
 '0187-QSXOE',
 '6496-JDSSB',
 '3422-GALYP',
 '5032-MIYKT',
 '2081-VEYEH',
 '9360-OMDZZ',
 '6435-SRW

In [433]:
risk_500

Unnamed: 0,CustomerID,prob_churn
2189,4360-PNRQB,0.889637
2182,5605-IYGFG,0.889132
2036,6496-JDSSB,0.888574
2215,1452-VOQCH,0.883144
2265,4234-XTNEA,0.881224
...,...,...
2046,7083-MIOPC,0.000480
2029,7157-SMCFK,0.000227
2330,2804-ETQDK,0.000183
2297,7293-LSCDV,0.000149
