Context

"Predict behavior to retain customers. You can analyze all relevant customer data and develop focused customer retention programs." [IBM Sample Data Sets]

Content

Each row represents a customer, each column contains customer’s attributes described on the column Metadata.

The data set includes information about:

 - Customers who left within the last month – the column is called Churn
 - Services that each customer has signed up for – phone, multiple lines, internet, online security, online backup, device protection, tech support, and streaming TV and movies
 - Customer account information – how long they’ve been a customer, contract, payment method, paperless billing, monthly charges, and total charges
 - Demographic info about customers – gender, age range, and if they have partners and dependents

In [None]:
#Importing libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the "../input/" directory.
import os
import matplotlib.pyplot as plt#visualization
%matplotlib inline
import pandas as pd
from scipy import stats
import seaborn as sns#visualization
import itertools
import warnings
warnings.filterwarnings("ignore")
import io
sns.set_style(style='whitegrid')

In [None]:
#open the dataset
df = pd.read_csv("/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv")
#first few rows
df.head()

In [None]:
def resumetable(df):
    '''
    Returns a few key metrics of a dataframe, one summary row per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        Columns: Name, dtypes, Missing (null count), Uniques (distinct
        non-null values), First/Second/Third Value (the first three rows by
        position; NaN when the frame has fewer rows) and Entropy (Shannon
        entropy in bits of the column's value distribution, rounded to 2
        decimals).
    '''
    print(f"Dataset Shape: {df.shape}")
    summary = pd.DataFrame({'Name': df.columns, 'dtypes': df.dtypes.values})
    summary['Missing'] = df.isnull().sum().values
    summary['Uniques'] = df.nunique().values

    # Positional access (iloc) so the sample rows are found even when the
    # index is not the default 0..n-1 (e.g. after filtering or set_index).
    for position, label in enumerate(['First Value', 'Second Value', 'Third Value']):
        summary[label] = df.iloc[position].values if len(df) > position else np.nan

    # Entropy of each column's normalised value counts, in bits (base 2).
    summary['Entropy'] = [
        round(stats.entropy(df[name].value_counts(normalize=True), base=2), 2)
        for name in summary['Name']
    ]

    return summary

In [None]:
resumetable(df)

 - customerID Customer ID
 - gender Whether the customer is a male or a female
 - SeniorCitizen Whether the customer is a senior citizen or not (1, 0)
 - Partner Whether the customer has a partner or not (Yes, No)
 - Dependents Whether the customer has dependents or not (Yes, No)
 - tenure Number of months the customer has stayed with the company
 - PhoneService Whether the customer has a phone service or not (Yes, No)
 - MultipleLines Whether the customer has multiple lines or not (Yes, No, No phone service)
 - InternetService Customer’s internet service provider (DSL, Fiber optic, No)
 - OnlineSecurity Whether the customer has online security or not (Yes, No, No internet service)
 - OnlineBackup Whether the customer has online backup or not (Yes, No, No internet service)
 - DeviceProtection Whether the customer has device protection or not (Yes, No, No internet service)
 - TechSupport Whether the customer has tech support or not (Yes, No, No internet service)
 - StreamingTV Whether the customer has streaming TV or not (Yes, No, No internet service)
 - StreamingMovies Whether the customer has streaming movies or not (Yes, No, No internet service)
 - Contract The contract term of the customer (Month-to-month, One year, Two year)
 - PaperlessBilling Whether the customer has paperless billing or not (Yes, No)
 - PaymentMethod The customer’s payment method (Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic))
 - MonthlyCharges The amount charged to the customer monthly
 - TotalCharges The total amount charged to the customer
 - Churn Whether the customer churned or not (Yes or No)

In [None]:
#Replacing blank entries (empty or whitespace-only strings) with null values in total charges column
df['TotalCharges'] = df["TotalCharges"].replace(r'^\s*$', np.nan, regex=True)
#convert to float type
df["TotalCharges"] = df["TotalCharges"].astype(float)

In [None]:
#Relabel 1 and 0 as "Yes" and "No", respectively, for consistency with the other yes/no columns
df['SeniorCitizen'] = df['SeniorCitizen'].replace({0: "No", 1: "Yes"})

# EDA

## Customer Churn by Category

Let's extract the category columns to perform EDA on all items at once. Some categories also have 3 unique values. We'll see why.

In [None]:
df.columns[1:-3]

In [None]:
categories = list(df.columns[1:-3])

In [None]:
categories.pop(4)

In [None]:
#categorical features are:
categories

In [None]:
# Every column that is not in the categorical list, kept in original column order
non_cats = [column for column in df.columns if column not in categories]

In [None]:
non_cats.pop(0)

In [None]:
non_cats.pop(-1)

In [None]:
#Non-categorical features
non_cats

Plotting each categorical feature split by Churn: No/Yes and percentage of customers in each group.

In [None]:
# One stacked bar chart per categorical feature: a bar per sub-category, split into
# non-churned (bottom) and churned (top) customers, each segment labelled with its
# share of that sub-category.
for cat in categories:
    fig, ax = plt.subplots(figsize = (10,10))

    #each category has a sub-category: e.g. Gender category has male and female
    #count customers per (sub-category, Churn) once and look the counts up by the
    #'No'/'Yes' labels; combinations that never occur are filled with 0
    counts = (df.groupby([cat, 'Churn']).size()
                .unstack(fill_value=0)
                .reindex(columns=['No', 'Yes'], fill_value=0))
    #keep the bars in order of first appearance in the data
    counts = counts.reindex(df[cat].dropna().unique())
    x = list(counts.index)
    churn_no = counts['No'].tolist()
    churn_yes = counts['Yes'].tolist()

    p1 = plt.bar(x, churn_no)
    p2 = plt.bar(x, churn_yes, bottom=churn_no)

    #Plotting the bar labels inside the bars, as percentage
    for r1, r2 in zip(p1, p2):
        height1 = r1.get_height()
        height2 = r2.get_height()
        plt.text(r1.get_x() + r1.get_width() / 2., #x
                 height1 / 2., #y: middle of the bottom (no churn) segment
                 f'{round(height1 / (height1 + height2)*100,1)} %', #s
                 ha="center", va="center", color="white", fontsize=12)
        plt.text(r2.get_x() + r2.get_width() / 2., #x
                 height1 + height2 / 2., #y: middle of the top (churn) segment
                 f'{round(height2 / (height1 + height2)*100,1)} %', #s
                 ha="center", va="center", color="white", fontsize=12)

    plt.xlabel('Category', fontsize=12)
    plt.ylabel('Number of Customers in Category', fontsize=12)
    plt.legend(['No Churn','Churn'])
    plt.title(cat, fontsize=16)
    plt.show()

 - Gender: Gender has almost a 50/50 split and negligible difference in churn percentages.
 -	Senior Citizens: Senior citizens have close to double the rate of churn; however, they make up only 1/6th of the total customer population
 -	Partner: The split between population is almost equal with ~3,500 customers in each group. The percentage of customers that churn, however, is 30% higher if the customer is without a partner.
 -	Dependents: Customers are more likely to churn if they have no dependents, however, the split in customers based on dependents is imbalanced. 
 -	Phone Service: Rate of churn is similar regardless whether a customer has phone service or not.
 -	Multiple Lines: Of the customers that have phone service, those with multiple lines have churn rates similar to those who only have one line.
 -	Internet Service: Out of the customers that have internet service, customers with Fiber Optic service have double the rate of churn than those with DSL.
 -	Online Security: Of the customers that have internet service, those without online security have triple the churn rate of those who opted in for online security. May be worth investigating this feature further. 
 -	Online Backup / Device Protection: Of the customers that have internet service, customers with these features have half the rate of churn than those that don’t
 -	Tech Support: Of the customers that have internet service, those that use tech support have a significantly lower rate of churn. Note, the number of customers that use tech support is significantly less than those who don’t.
 -	Streaming TV / Streaming Movies: Of the customers that have internet service, no significant change in churn rate can be seen whether a customer uses the streaming service.
 -	Contract: Customers on contract have significantly lower churn rates than those that are month to month. This is expected.
 -	Paperless Billing: Customers on paperless billing have higher rates of churn. 
 -	Payment Method: Electronic Check customers have double the rate of churn than those using other payment methods.



## Customer Churn by non-categorical items

In [None]:
non_cats

In [None]:
len(df.tenure.unique())

In [None]:
# Overlaid tenure histograms: churned customers first, retained customers on top (semi-transparent)
fig, ax = plt.subplots(figsize=(10, 10))
df.loc[df['Churn'] == 'Yes', 'tenure'].hist(bins=20, ax=ax)
df.loc[df['Churn'] == 'No', 'tenure'].hist(bins=20, alpha=0.5, ax=ax)
ax.legend(['Churn Customers', 'Non-Churn Customers'])
ax.set_title('Customer Tenure')
ax.set_xlabel('Tenure')
ax.set_ylabel('Count of Customers')

The longer a customer stays, the less likely they are to churn (tenure by month)

In [None]:
# Overlaid monthly-charge histograms: churned customers first, retained customers on top (semi-transparent)
fig, ax = plt.subplots(figsize=(10, 10))
df.loc[df['Churn'] == 'Yes', 'MonthlyCharges'].hist(bins=20, ax=ax)
df.loc[df['Churn'] == 'No', 'MonthlyCharges'].hist(bins=20, alpha=0.5, ax=ax)
ax.legend(['Churn Customers', 'Non-Churn Customers'])
ax.set_title('Customer Monthly Charges')
ax.set_xlabel('Monthly Charge Amount')
ax.set_ylabel('Count of Customers')

In [None]:
# Overlaid total-charge histograms: churned customers first, retained customers on top (semi-transparent)
fig, ax = plt.subplots(figsize=(10, 10))
df.loc[df['Churn'] == 'Yes', 'TotalCharges'].hist(bins=20, ax=ax)
df.loc[df['Churn'] == 'No', 'TotalCharges'].hist(bins=20, alpha=0.5, ax=ax)
ax.legend(['Churn Customers', 'Non-Churn Customers'])
ax.set_title('Customer Total Charges')
ax.set_xlabel('Total Charge Amount')
ax.set_ylabel('Count of Customers')

## Churn Imbalance

In [None]:
df.groupby('Churn').size().values

In [None]:
df.groupby('Churn').size()

In [None]:
# Pie chart of the class balance; counts are sorted by Churn label ('No' then 'Yes'),
# which is the order the two display labels are written in
fig, ax = plt.subplots(figsize=(10, 10))
labels = ('No Churn', 'Churn')
x = df['Churn'].value_counts().sort_index().values
ax.pie(x, labels=labels, autopct='%1.1f%%')
plt.show()

## Correlation

Correlation will indicate which variables are related to one another and to the target

### Encode the dataframe

The dataframe needs to be encoded into numerical values in order to make the comparison. e.g. Yes/No will need to be converted to 1 / 0. Scikit-Learn's LabelEncoder will be utilized

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [None]:
df.columns

In [None]:
categories

In [None]:
non_cats

In [None]:
other = ['customerID', 'Churn']

In [None]:
df.dtypes

We'll have to encode all categorical values in the dataframe. We already defined the lists 'categories' and 'other'.

In [None]:
encoded_df = pd.DataFrame()

In [None]:
for item in categories:
    encoded_df[item] = le.fit_transform(df[item].values)

In [None]:
for item in non_cats:
    encoded_df[item] = df[item]

In [None]:
for item in other:
    encoded_df[item] = le.fit_transform(df[item].values)

In [None]:
encoded_df.head()

In [None]:
def heatMap(df):
    '''
    Plot an annotated heat map of the pairwise correlations between the
    numeric (and boolean) columns of a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are correlated with DataFrame.corr() defaults.

    Returns
    -------
    None
        The figure is displayed with plt.show().
    '''
    #Create Correlation df from the numeric columns only, so a text or
    #categorical column cannot make corr() fail
    corr = df.select_dtypes(include=['number', 'bool']).corr()
    #Plot figsize
    fig, ax = plt.subplots(figsize=(15, 15))
    #Generate Heat Map, allow annotations and place floats in map.
    #The tick labels are handed to seaborn, which centres them on the cells;
    #setting ticks at 0..n-1 by hand puts them on the cell edges instead.
    sns.heatmap(corr, cmap="Blues", annot=True, fmt=".2f", linewidths=.2,
                xticklabels=corr.columns, yticklabels=corr.columns, ax=ax)
    #show plot
    plt.show()

In [None]:
heatMap(encoded_df)

### Correlation to Churn

In [None]:
encoded_df.columns[0:]

In [None]:
# Correlation of every encoded column with the Churn target
corr_df = encoded_df.corr()['Churn']

In [None]:
np.abs(corr_df).sort_values(ascending=False)[1:6]

From the heatmap we can see a number of negatively and positively correlated features. We can focus on these during the feature engineering stage. Contract, tenure, OnlineSecurity, TechSupport, TotalCharges.

## Scatterplot

Next we can see if there is any indication of churn in the amount a customer spends. Since TotalCharges is correlated with Churn, it's a good idea to look at this variable to see if there is any signal.

In [None]:
sns.set(font_scale=2)
sns.set_style(style='whitegrid')
sns.pairplot(df, hue="Churn", height=9)

## Binning the data

In [None]:
df.tenure.describe()

In [None]:
# Bins are right-closed, so without include_lowest=True the first bin is (0, 10] and a
# tenure of exactly 0 would fall outside every bin and become NaN.
df['tenure_bin'] = pd.cut(df.tenure, bins=[0,10,20,30,40,50,60,70,80], labels=[0,10,20,30,40,50,60,70], include_lowest=True)

In [None]:
sns.set(font_scale=1)
fig, ax = plt.subplots(figsize = (15,10))
# x is passed by keyword: recent seaborn releases no longer accept it positionally
sns.countplot(x='tenure_bin', hue='Churn', data=df, ax=ax)
plt.xlabel('Tenure bin, months')

In [None]:
#adding the tenure_bins to the encoded dataframe
encoded_df['tenure_bin'] = df.tenure_bin

# Preprocessing

In [None]:
encoded_df.head()

In [None]:
encoded_df.isnull().sum()

In [None]:
encoded_df = encoded_df.fillna(0)

In [None]:
encoded_df.head()

## Define features and target

In [None]:
features = encoded_df.drop(columns=['Churn', 'tenure_bin', 'customerID', 'gender','SeniorCitizen','PhoneService']).columns

In [None]:
features

In [None]:
target = ['Churn']

## Dealing with imbalance by downsampling

Imbalance in the data can bias the model to favor the majority class. Since we have more customers (~75%) that do not churn, our model will fit to the non-churn customers more favorably. In order to control for this, we can down-sample the majority class (non-churn) to match the minority class (churn).

 - 1 = Churn
 - 0 = non-Churn

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

In [None]:
# Class sizes: `down` counts churned rows (Churn == 1), `up` counts retained rows (Churn == 0).
# `down` is reused later as the target size when down-sampling the majority class.
down = encoded_df.loc[encoded_df['Churn'] == 1, 'Churn'].count()
up = encoded_df.loc[encoded_df['Churn'] == 0, 'Churn'].count()
print(f'Churn Fraction: {down/(up+down)}')

In [None]:
#let's first separate majority class and minority class and resample
# Churn == 0 (non-churn) is treated as the majority class, Churn == 1 (churn) as the minority.

df_majority = encoded_df[encoded_df.Churn == 0]
df_minority = encoded_df[encoded_df.Churn == 1]


# Downsample majority class
# `down` is the churned-customer count computed in the previous cell.
df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=down,     # to match minority class
                                 random_state=42) # reproducible results
# combine the new dataframes
# (down-sampled majority rows first, then all minority rows; the result is not shuffled here)
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Displayed as the cell output to confirm both classes now have the same size.
df_downsampled.Churn.value_counts()

## Define X and y

In [None]:
from sklearn.model_selection import train_test_split
y = df_downsampled[target]
X = df_downsampled[features]

Since we don't have a significant amount of data to train on, we'll use a 90/10 train/test split.

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.10, random_state=42, stratify=y)

# Initial Modeling

Our goal is to not only predict whether a customer will churn but also get insight into why so we can intervene and change the trajectory of a customer. 

In [None]:
#import metrics
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve,\
precision_score, recall_score, precision_recall_curve, classification_report, roc_auc_score

## Logistic Regression Baseline

In [None]:
from sklearn.linear_model import LogisticRegression
logr = LogisticRegression(random_state=42, max_iter=1000, )

In [None]:
logr.fit(X_train, y_train)
y_pred_logr = logr.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred_logr))
print(f'Accuracy: {accuracy_score(y_test, y_pred_logr)}')
print(f'Precision: {precision_score(y_test, y_pred_logr)}')
print(f'Recall: {recall_score(y_test, y_pred_logr)}')
print(f'Confusion Matrix: {confusion_matrix(y_test, y_pred_logr)}')

In [None]:
# ROC curve of the logistic regression baseline on the held-out test split.
fig, ax = plt.subplots(figsize = (10,10))
# Column 1 of predict_proba is the predicted probability of class 1 (churn).
y_prob = logr.predict_proba(X_test)[::,1]
y_pred_proba = y_prob
# The "_rand" suffix is only a naming leftover; these are this model's FPR/TPR values.
fpr_rand, tpr_rand, _ = roc_curve(y_test,  y_pred_proba)
auc_rand = roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr_rand,tpr_rand,label=f"Logistic Regression, AUC="+str(auc_rand))
# loc=4 places the legend in the lower-right corner.
plt.legend(loc=4)
plt.title('ROC Curve')
plt.show()

### Classification model interpretation

Our initial logistic regression model did relatively well. For our test dataset we had 187 non-churn customers and 187 churn customers. Looking at the confusion matrix output, our model was able to predict non-churn and churn with a 75% (140 out of 187) and 79% (147 out of 187) accuracy, respectively. 

## LightGBM

In [None]:
import lightgbm as lgb

In [None]:
# LightGBM classifier used for the initial model and for every feature-engineering test below.
# importance_type='gain' makes feature_importances_ report total split gain.
# NOTE(review): per the LightGBM parameter docs several arguments here are aliases of each
# other but are given different values: min_data_in_leaf=800 vs min_child_samples=55,
# bagging_fraction=0.9 vs subsample=0.75, feature_fraction=0.1 vs colsample_bytree=1.0,
# min_sum_hessian_in_leaf=1 vs min_child_weight=5.0. Confirm which value LightGBM applies.
# NOTE(review): drop_rate is documented as a dart-only option; no boosting_type is set here,
# so it presumably has no effect. Confirm.
clf = lgb.LGBMClassifier(drop_rate=0.9, min_data_in_leaf=800, max_bin=500,
                         n_estimators=5000, min_sum_hessian_in_leaf=1, importance_type='gain',
                         learning_rate=0.4, bagging_fraction=0.9, colsample_bytree=1.0,
                         feature_fraction=0.1, lambda_l1=5.0, lambda_l2=3.0, max_depth=9,
                         min_child_samples=55, min_child_weight=5.0, min_split_gain=0.1,
                         num_leaves=4000, subsample = 0.75)  

In [None]:
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Precision: {precision_score(y_test, y_pred)}')
print(f'Recall: {recall_score(y_test, y_pred)}')
print(f'Confusion Matrix: {confusion_matrix(y_test, y_pred)}')

In [None]:
# ROC curve of the LightGBM model on the held-out test split.
fig, ax = plt.subplots(figsize = (10,10))
sns.set_style(style='whitegrid')
# Column 1 of predict_proba is the predicted probability of class 1 (churn).
y_prob = clf.predict_proba(X_test)[::,1]
y_pred_proba = y_prob
# The "_rand" suffix is only a naming leftover; these are this model's FPR/TPR values.
fpr_rand, tpr_rand, _ = roc_curve(y_test,  y_pred_proba)
auc_rand = roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr_rand,tpr_rand,label="LightGBM, AUC="+str(auc_rand))
# loc=4 places the legend in the lower-right corner.
plt.legend(loc=4)
plt.title('ROC Curve')
plt.show()

In [None]:
# Pair each importance value with its feature name (X.columns is the training column order)
# and plot them from most to least important.
feature_imp = pd.DataFrame(sorted(zip(clf.feature_importances_, X.columns)), columns=['Value','Feature'])
plt.figure(figsize=(20, 10))
sns.set(font_scale=1.5)
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
# NOTE(review): the title says "avg over folds", but the importances come from the single
# clf fit above; no cross-validation folds are involved in this notebook section.
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.show()

# Feature Engineering

From our correlation analysis, we can see the feature importance associated with the LightGBM model resembles our initial intuition for features to consider for feature engineering.
 - Contract          0.396713
 - tenure            0.352229
 - OnlineSecurity    0.289309
 - TechSupport       0.282492
 - TotalCharges      0.199484

LightGBM

In [None]:
feature_imp.sort_values(by="Value", ascending=False).head()

## Total number of services enrolled in

In [None]:
services = ['PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 
            'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling']

In [None]:
# Number of services each customer is enrolled in, counted from the raw answers in df.
# Summing the label-encoded columns instead would add up arbitrary codes (LabelEncoder
# numbers the sorted labels, so e.g. "No internet service" and InternetService == "No"
# get non-zero codes) rather than counting subscriptions.
# PaperlessBilling is a billing preference, not a service, so it is left out of the count.
# NOTE(review): relies on df and encoded_df sharing the same row index. Confirm.
flag_services = [col for col in services if col not in ('InternetService', 'PaperlessBilling')]
encoded_df['services'] = (df[flag_services].eq('Yes').sum(axis=1)
                          + df['InternetService'].ne('No').astype(int))

In [None]:
encoded_df.head()

### Model test

In [None]:
# Model test for the new 'services' feature: rebuild the balanced data set, refit the
# same LightGBM configuration and report metrics, ROC curve and feature importances.
# Every column of encoded_df that is not dropped here (including 'services') is a feature.
features = encoded_df.drop(columns=['Churn','tenure_bin','customerID','gender',
                                    'SeniorCitizen','PhoneService']).columns
target = ['Churn']
df_majority = encoded_df[encoded_df.Churn == 0]
df_minority = encoded_df[encoded_df.Churn == 1]


# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=down,     # to match minority class
                                 random_state=42) # reproducible results
# combine the new dataframes
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Not the last expression of the cell, so this value is computed but not displayed.
df_downsampled.Churn.value_counts()
# target is a one-element list, so y is a single-column DataFrame rather than a Series.
y = df_downsampled[target]
X = df_downsampled[features]

# 90/10 split, stratified so both splits keep the balanced class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.10, random_state=42, stratify=y)

#LightGBM
# Same hyper-parameters as the initial LightGBM model.
# NOTE(review): per the LightGBM parameter docs several arguments are aliases of each other
# with different values (e.g. min_data_in_leaf vs min_child_samples). Confirm which applies.
clf = lgb.LGBMClassifier(drop_rate=0.9, min_data_in_leaf=800, max_bin=500,
                         n_estimators=5000, min_sum_hessian_in_leaf=1, importance_type='gain',
                         learning_rate=0.4, bagging_fraction=0.9, colsample_bytree=1.0,
                         feature_fraction=0.1, lambda_l1=5.0, lambda_l2=3.0, max_depth=9,
                         min_child_samples=55, min_child_weight=5.0, min_split_gain=0.1,
                         num_leaves=4000, subsample = 0.75)  

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Precision: {precision_score(y_test, y_pred)}')
print(f'Recall: {recall_score(y_test, y_pred)}')
print(f'Confusion Matrix: {confusion_matrix(y_test, y_pred)}')

# ROC curve from the predicted probability of class 1 (churn).
fig, ax = plt.subplots(figsize = (10,10))
sns.set_style(style='whitegrid')
y_prob = clf.predict_proba(X_test)[::,1]
y_pred_proba = y_prob
fpr_rand, tpr_rand, _ = roc_curve(y_test,  y_pred_proba)
auc_rand = roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr_rand,tpr_rand,label="LightGBM, AUC="+str(auc_rand))
plt.legend(loc=4)
plt.title('ROC Curve')
plt.show()

# Feature importances of this single fit, plotted from most to least important.
# NOTE(review): the "avg over folds" title does not match the code; no folds are used here.
feature_imp = pd.DataFrame(sorted(zip(clf.feature_importances_, X.columns)), columns=['Value','Feature'])
plt.figure(figsize=(20, 10))
sns.set(font_scale=1.5)
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.show()

## Average monthly price per service

In [None]:
# Average monthly price per enrolled service. A services value of 0 would make the
# division produce inf, which the models fitted below cannot handle, so a zero count is
# masked before dividing and the resulting missing price is set to 0.
encoded_df['avg_price'] = (encoded_df.MonthlyCharges / encoded_df.services.replace(0, np.nan)).fillna(0)

In [None]:
encoded_df.head()

### Model test

In [None]:
# Model test for the new 'avg_price' feature: rebuild the balanced data set, refit the
# same LightGBM configuration and report metrics, ROC curve and feature importances.
# 'services' is dropped here, so of the engineered columns only 'avg_price' is a feature.
features = encoded_df.drop(columns=['Churn','tenure_bin','customerID','gender','services',
                                    'SeniorCitizen','PhoneService']).columns
target = ['Churn']
df_majority = encoded_df[encoded_df.Churn == 0]
df_minority = encoded_df[encoded_df.Churn == 1]


# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=down,     # to match minority class
                                 random_state=42) # reproducible results
# combine the new dataframes
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Not the last expression of the cell, so this value is computed but not displayed.
df_downsampled.Churn.value_counts()
# target is a one-element list, so y is a single-column DataFrame rather than a Series.
y = df_downsampled[target]
X = df_downsampled[features]

# 90/10 split, stratified so both splits keep the balanced class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.10, random_state=42, stratify=y)

#LightGBM
# Same hyper-parameters as the initial LightGBM model.
# NOTE(review): per the LightGBM parameter docs several arguments are aliases of each other
# with different values (e.g. min_data_in_leaf vs min_child_samples). Confirm which applies.
clf = lgb.LGBMClassifier(drop_rate=0.9, min_data_in_leaf=800, max_bin=500,
                         n_estimators=5000, min_sum_hessian_in_leaf=1, importance_type='gain',
                         learning_rate=0.4, bagging_fraction=0.9, colsample_bytree=1.0,
                         feature_fraction=0.1, lambda_l1=5.0, lambda_l2=3.0, max_depth=9,
                         min_child_samples=55, min_child_weight=5.0, min_split_gain=0.1,
                         num_leaves=4000, subsample = 0.75)  

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Precision: {precision_score(y_test, y_pred)}')
print(f'Recall: {recall_score(y_test, y_pred)}')
print(f'Confusion Matrix: {confusion_matrix(y_test, y_pred)}')

# ROC curve from the predicted probability of class 1 (churn).
fig, ax = plt.subplots(figsize = (10,10))
sns.set_style(style='whitegrid')
y_prob = clf.predict_proba(X_test)[::,1]
y_pred_proba = y_prob
fpr_rand, tpr_rand, _ = roc_curve(y_test,  y_pred_proba)
auc_rand = roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr_rand,tpr_rand,label="LightGBM, AUC="+str(auc_rand))
plt.legend(loc=4)
plt.title('ROC Curve')
plt.show()

# Feature importances of this single fit, plotted from most to least important.
# NOTE(review): the "avg over folds" title does not match the code; no folds are used here.
feature_imp = pd.DataFrame(sorted(zip(clf.feature_importances_, X.columns)), columns=['Value','Feature'])
plt.figure(figsize=(20, 10))
sns.set(font_scale=1.5)
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.show()

## Extra Charges

In [None]:
# Gap between what tenure x monthly charge would predict and what was actually billed in total
encoded_df['extra_charges'] = encoded_df['MonthlyCharges'].mul(encoded_df['tenure']).sub(encoded_df['TotalCharges'])

In [None]:
encoded_df.head()

### Model test

In [None]:
# Model test for the new 'extra_charges' feature: rebuild the balanced data set, refit the
# same LightGBM configuration and report metrics, ROC curve and feature importances.
# 'services' and 'avg_price' are dropped, so only 'extra_charges' is tested here.
features = encoded_df.drop(columns=['Churn','tenure_bin','customerID','gender','services','avg_price',
                                    'SeniorCitizen','PhoneService']).columns
target = ['Churn']
df_majority = encoded_df[encoded_df.Churn == 0]
df_minority = encoded_df[encoded_df.Churn == 1]


# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=down,     # to match minority class
                                 random_state=42) # reproducible results
# combine the new dataframes
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Not the last expression of the cell, so this value is computed but not displayed.
df_downsampled.Churn.value_counts()
# target is a one-element list, so y is a single-column DataFrame rather than a Series.
y = df_downsampled[target]
X = df_downsampled[features]

# 90/10 split, stratified so both splits keep the balanced class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.10, random_state=42, stratify=y)

#LightGBM
# Same hyper-parameters as the initial LightGBM model.
# NOTE(review): per the LightGBM parameter docs several arguments are aliases of each other
# with different values (e.g. min_data_in_leaf vs min_child_samples). Confirm which applies.
clf = lgb.LGBMClassifier(drop_rate=0.9, min_data_in_leaf=800, max_bin=500,
                         n_estimators=5000, min_sum_hessian_in_leaf=1, importance_type='gain',
                         learning_rate=0.4, bagging_fraction=0.9, colsample_bytree=1.0,
                         feature_fraction=0.1, lambda_l1=5.0, lambda_l2=3.0, max_depth=9,
                         min_child_samples=55, min_child_weight=5.0, min_split_gain=0.1,
                         num_leaves=4000, subsample = 0.75)  

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Precision: {precision_score(y_test, y_pred)}')
print(f'Recall: {recall_score(y_test, y_pred)}')
print(f'Confusion Matrix: {confusion_matrix(y_test, y_pred)}')

# ROC curve from the predicted probability of class 1 (churn).
fig, ax = plt.subplots(figsize = (10,10))
sns.set_style(style='whitegrid')
y_prob = clf.predict_proba(X_test)[::,1]
y_pred_proba = y_prob
fpr_rand, tpr_rand, _ = roc_curve(y_test,  y_pred_proba)
auc_rand = roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr_rand,tpr_rand,label="LightGBM, AUC="+str(auc_rand))
plt.legend(loc=4)
plt.title('ROC Curve')
plt.show()

# Feature importances of this single fit, plotted from most to least important.
# NOTE(review): the "avg over folds" title does not match the code; no folds are used here.
feature_imp = pd.DataFrame(sorted(zip(clf.feature_importances_, X.columns)), columns=['Value','Feature'])
plt.figure(figsize=(20, 10))
sns.set(font_scale=1.5)
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.show()

We'll keep extra_charges and avg_price as new features

In [None]:
# Final feature set for the modeling section: 'services' is dropped, while the engineered
# 'avg_price' and 'extra_charges' columns stay in because they are not in the drop list.
features = encoded_df.drop(columns=['Churn','tenure_bin','customerID','gender','services',
                                    'SeniorCitizen','PhoneService']).columns
target = ['Churn']
df_majority = encoded_df[encoded_df.Churn == 0]
df_minority = encoded_df[encoded_df.Churn == 1]


# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=down,     # to match minority class
                                 random_state=42) # reproducible results
# combine the new dataframes
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Not the last expression of the cell, so this value is computed but not displayed.
df_downsampled.Churn.value_counts()
# target is a one-element list, so y is a single-column DataFrame rather than a Series.
y = df_downsampled[target]
X = df_downsampled[features]

# 90/10 split, stratified so both splits keep the balanced class ratio.
# These X_train/X_test/y_train/y_test are the ones used by all models in the next section.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.10, random_state=42, stratify=y)

# Modeling

## Scikit-Learn and XGBoost Classifiers

In [None]:
#Import classifiers
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier

In [None]:
#instantiate models
# One shared seed for the scikit-learn / XGBoost models; the LightGBM model is not given one.
seed = 42
# NOTE: `clf` is rebound here from the earlier LightGBM model to a random forest.
clf = RandomForestClassifier(random_state=seed)
xgb = XGBClassifier(random_state=seed)
xtr = ExtraTreesClassifier(random_state=seed)
# Same LightGBM hyper-parameters as in the feature-engineering tests.
# NOTE(review): per the LightGBM parameter docs several arguments are aliases of each other
# with different values (e.g. min_data_in_leaf vs min_child_samples). Confirm which applies.
lgbm = lgb.LGBMClassifier(drop_rate=0.9, min_data_in_leaf=800, max_bin=500,
                         n_estimators=5000, min_sum_hessian_in_leaf=1, importance_type='gain',
                         learning_rate=0.4, bagging_fraction=0.9, colsample_bytree=1.0,
                         feature_fraction=0.1, lambda_l1=5.0, lambda_l2=3.0, max_depth=9,
                         min_child_samples=55, min_child_weight=5.0, min_split_gain=0.1,
                         num_leaves=4000, subsample = 0.75)

In [None]:
classifiers = [('Random Forest', clf), ('XGBoost', xgb), ('ExtraTrees', xtr),
              ('Logistic Regression', logr), ('LightGBM', lgbm)]

In [None]:
# Iterate over the pre-defined list of classifiers
# Each (name, estimator) pair is fitted on the training split, scored on the test split,
# and given its own ROC figure.
# NOTE: clf_name and clf_algo stay bound to the last pair after the loop finishes.
for clf_name, clf_algo in classifiers:    
 
    # Fit clf to the training set
    clf_algo.fit(X_train, y_train)    
   
    # Predict y_pred
    y_pred = clf_algo.predict(X_test)
    
    # Evaluate
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
   
    # Print the model name followed by the full per-class report
    print(f'name: {clf_name}')
    print(classification_report(y_test, y_pred))
    # Print the individual summary metrics computed above
    print(f'accuracy: {accuracy}')
    print(f'precision: {precision}')
    print(f'recall: {recall}')
    print(f'confusion matrix: {cm}')
    print('------------------------------')
    
    # ROC curve from the predicted probability of class 1 (churn)
    fig, ax = plt.subplots(figsize = (10,10))
    y_prob = clf_algo.predict_proba(X_test)[::,1]
    y_pred_proba = y_prob
    fpr_rand, tpr_rand, _ = roc_curve(y_test,  y_pred_proba)
    auc_rand = roc_auc_score(y_test, y_pred_proba)
    plt.plot(fpr_rand,tpr_rand,label=f"{clf_name}, AUC="+str(auc_rand))
    plt.legend(loc=4)
    plt.title('ROC Curve')
    plt.show()

In [None]:
# Import VotingClassifier from sklearn.ensemble
from sklearn.ensemble import VotingClassifier

# Instantiate a VotingClassifier vc (default voting: majority vote over the members' labels)
vc = VotingClassifier(estimators=classifiers)     

# Fit vc to the training set
vc.fit(X_train, y_train)   

# Evaluate the test set predictions
y_pred = vc.predict(X_test)

# Calculate the summary metrics

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)


# Print the model name followed by the full per-class report
print(f'name: Voting Classifier')
print(classification_report(y_test, y_pred))
# Print the individual summary metrics computed above
print(f'accuracy: {accuracy}')
print(f'precision: {precision}')
print(f'recall: {recall}')
print(f'confusion matrix: {cm}')
print('------------------------------')

fig, ax = plt.subplots(figsize = (10,10))
# The ROC curve must come from the ensemble itself, not from clf_algo / clf_name, which are
# leftovers of the previous loop and point at its last classifier.
# A majority-vote ensemble exposes no predict_proba, so each customer is scored with the
# mean churn probability of the fitted member models (vc.estimators_).
y_prob = np.mean([member.predict_proba(X_test)[::,1] for member in vc.estimators_], axis=0)
y_pred_proba = y_prob
fpr_rand, tpr_rand, _ = roc_curve(y_test,  y_pred_proba)
auc_rand = roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr_rand,tpr_rand,label="Voting Classifier, AUC="+str(auc_rand))
plt.legend(loc=4)
plt.title('ROC Curve')
plt.show()

### Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint

In [None]:
# Candidate values shared by the tree-ensemble hyperparameter searches below.

# Number of trees for tree ensembles
n_estimators = [int(x) for x in np.linspace(start=200, stop=1000, num=3)]
# Number of features to consider at every split.
# 'auto' is intentionally not listed: for classifiers it was only another name
# for 'sqrt' (so the search held the same setting twice), and scikit-learn 1.3
# removed it, which makes every candidate using it fail to fit.
max_features = ['sqrt', 'log2']
# Maximum number of levels in a tree; None leaves the depth unlimited
max_depth = [int(x) for x in np.linspace(100, 200, num=5)] + [None]
# Minimum number of samples required to split a node
min_samples_split = [10, 20, 30]
# Minimum number of samples required at each leaf node
min_samples_leaf = [6, 8, 12]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid

#### Random Forest

In [None]:
# Assemble the random forest search space from the candidate lists
# defined earlier, then print it for inspection.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)
pprint(random_grid)


In [None]:
# Randomised hyperparameter search around the base model `clf`:
# 20 candidates are sampled from random_grid and each one is scored
# with 3-fold cross-validation.
rf_random = RandomizedSearchCV(
    estimator=clf,
    param_distributions=random_grid,
    n_iter=20,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
# Run the search on the training split
rf_random.fit(X_train, y_train)

In [None]:
# Display the parameter combination selected by the random forest search
rf_random.best_params_

In [None]:
# Keep the random forest search's selected parameters under a dedicated name
rf_params = rf_random.best_params_

#### XGBoost

In [None]:
# Candidate values for the XGBoost hyperparameter search
parameters = {'nthread': [4],  # with hyperthreading, more threads may make xgboost slower
              #'objective':['reg:linear'],
              'learning_rate': [0.01, 0.1],
              'max_depth': [6, 12],
              'min_child_weight': [2, 6],
              #'verbosity': [1],
              'subsample': [1.0],
              'colsample_bytree': [0.3, 0.5],
              'gamma': [0],
              'n_estimators': [100, 500, 1000]}

# Randomised search with 3-fold cross-validation. n_iter is left at the
# scikit-learn default. random_state is fixed, as in the other searches of
# this notebook, so the sampled candidates (and therefore best_params_) are
# the same on every run.
xgb_rand = RandomizedSearchCV(xgb,
                              parameters,
                              cv=3,
                              n_jobs=-1,
                              verbose=True,
                              random_state=42)

# Run the search on the training split
xgb_rand.fit(X_train, y_train)

In [None]:
# Display the parameter combination selected by the XGBoost search
xgb_rand.best_params_

In [None]:
# Keep the XGBoost search's selected parameters under a dedicated name
xgb_params = xgb_rand.best_params_

#### Extremely Randomized Trees (ExtraTrees)

In [None]:
# Search space for the extra-trees model, built key by key from the
# candidate lists defined earlier.
param_dist = {}
param_dist['n_estimators'] = n_estimators
param_dist['max_features'] = max_features
param_dist['max_depth'] = max_depth
param_dist['min_samples_split'] = min_samples_split
param_dist['min_samples_leaf'] = min_samples_leaf
param_dist['bootstrap'] = bootstrap

In [None]:
# Randomised search around the base extra-trees model `xtr`:
# 50 candidates sampled from param_dist, each scored with 3-fold
# cross-validation.
xtr_tune = RandomizedSearchCV(
    estimator=xtr,
    param_distributions=param_dist,
    n_iter=50,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
# Run the search on the training split
xtr_tune.fit(X_train, y_train)
# Display the selected parameter combination
xtr_tune.best_params_

In [None]:
# Keep the extra-trees search's selected parameters under a dedicated name
xtr_params = xtr_tune.best_params_

### Tuned Model

In [None]:
# Instantiate the tuned models.
# One seed is shared by every model so that reruns give comparable results.
seed = 42
# These three are rebuilt from the parameters chosen by their searches
new_clf = RandomForestClassifier(**rf_random.best_params_, random_state=seed)
new_xgb = XGBClassifier(**xgb_rand.best_params_, random_state=seed)
new_xtr = ExtraTreesClassifier(**xtr_tune.best_params_, random_state=seed)
# LightGBM uses hand-picked settings rather than a search result. It is
# seeded like the other models; previously it was the only one left unseeded.
# NOTE(review): several keyword pairs below look like LightGBM aliases of the
# same parameter given different values (bagging_fraction / subsample,
# feature_fraction / colsample_bytree, min_data_in_leaf / min_child_samples,
# min_sum_hessian_in_leaf / min_child_weight). Confirm which value is
# intended and drop the other.
new_lgbm = lgb.LGBMClassifier(drop_rate=0.9, min_data_in_leaf=800, max_bin=500,
                         n_estimators=5000, min_sum_hessian_in_leaf=1, importance_type='gain',
                         learning_rate=0.4, bagging_fraction=0.9, colsample_bytree=1.0,
                         feature_fraction=0.1, lambda_l1=5.0, lambda_l2=3.0, max_depth=9,
                         min_child_samples=55, min_child_weight=5.0, min_split_gain=0.1,
                         num_leaves=4000, subsample = 0.75, random_state=seed)

In [None]:
# (display name, estimator) pairs for the tuned models
classifiers = [
    ('Random Forest', new_clf),
    ('XGBoost', new_xgb),
    ('ExtraTrees', new_xtr),
    ('LightGBM', new_lgbm),
]

In [None]:
# Fit, score and plot each classifier in `classifiers`, one after another
for clf_name, clf_algo in classifiers:

    # Train on the training split, then label the test rows
    clf_algo.fit(X_train, y_train)
    y_pred = clf_algo.predict(X_test)

    # Metrics computed from the predicted labels
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # Text report for this classifier
    print(f'name: {clf_name}')
    print(classification_report(y_test, y_pred))
    print(f'accuracy: {accuracy}')
    print(f'precision: {precision}')
    print(f'recall: {recall}')
    print(f'confusion matrix: {cm}')

    # ROC curve on a fresh figure, built from column 1 of predict_proba
    fig, ax = plt.subplots(figsize=(10, 10))
    y_prob = clf_algo.predict_proba(X_test)[:, 1]
    y_pred_proba = y_prob
    fpr_rand, tpr_rand, _ = roc_curve(y_test, y_pred_proba)
    auc_rand = roc_auc_score(y_test, y_pred_proba)
    ax.plot(fpr_rand, tpr_rand, label=f"{clf_name}, AUC=" + str(auc_rand))
    ax.legend(loc=4)
    ax.set_title('ROC Curve')
    plt.show()

In [None]:
# Weighted soft-voting ensemble over `classifiers`;
# one weight per entry of `classifiers`, in the same order.
vc = VotingClassifier(estimators=classifiers, voting='soft', weights=[9, 1, 1, 5])

# Train the ensemble on the training split
vc.fit(X_train, y_train)

# Label the test rows
y_pred = vc.predict(X_test)

# Metrics computed from the predicted labels
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

# Text report for the ensemble
print('Voting Classifier: ')
print(classification_report(y_test, y_pred))
print(f'accuracy: {accuracy}')
print(f'precision: {precision}')
print(f'recall: {recall}')
print(f'confusion matrix: {cm}')

# ROC curve on a fresh figure, built from column 1 of the ensemble's predict_proba
fig, ax = plt.subplots(figsize=(10, 10))
y_prob = vc.predict_proba(X_test)[:, 1]
y_pred_proba = y_prob
fpr_rand, tpr_rand, _ = roc_curve(y_test, y_pred_proba)
auc_rand = roc_auc_score(y_test, y_pred_proba)
ax.plot(fpr_rand, tpr_rand, label="Voting Classifier, AUC=" + str(auc_rand))
ax.legend(loc=4)
ax.set_title('ROC Curve')
plt.show()