<a href="https://colab.research.google.com/github/richlee-Lee/richlee-code-book/blob/main/customer_churn_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
import tensorflow as tf

In [None]:
# Report the library versions in use so results can be tied to a specific stack.
# (Label corrected: the package is "scikit-learn".)
print("scikit-learn version:", sklearn.__version__)
print("Tensorflow version:", tf.__version__)

In [None]:
# Read the churn dataset directly from its public Cloud Storage URL.
file_loc = "https://storage.googleapis.com/low-code-ai-book/churn_dataset.csv"
df_raw = pd.read_csv(file_loc)
# Preview the first rows of the raw frame.
df_raw.head()

In [None]:
# Column names, dtypes and non-null counts of the raw frame.
df_raw.info()

In [None]:
# Summary of TotalCharges as loaded.
# NOTE(review): presumably parsed as strings rather than numbers, given the
# blank-string check and float cast in the following cells - confirm from the output.
df_raw['TotalCharges'].describe()

In [None]:
# Flag the rows whose TotalCharges is a single blank space and preview them.
mask = df_raw['TotalCharges'].eq(' ')
df_raw.loc[mask].head()

In [None]:
# Flag zero-tenure customers and show their tenure next to TotalCharges.
mask = df_raw['tenure'].eq(0)
df_raw.loc[mask, ['tenure', 'TotalCharges']]

In [None]:
# Work on a copy so df_raw stays untouched for later reference.
df1 = df_raw.copy()
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: the in-place form mutates an intermediate object, which
# pandas warns about and which does not update df1 under Copy-on-Write.
df1['TotalCharges'] = df1['TotalCharges'].replace(' ', 0.0)
# Show the zero-tenure rows after the replacement. The condition is rebuilt
# here rather than relying on whichever `mask` an earlier cell left behind.
df1.loc[df1['tenure'] == 0, ['tenure', 'TotalCharges']]

In [None]:
# Cast TotalCharges to float now that blanks are gone, then re-check the dtypes.
df1 = df1.astype({'TotalCharges': 'float'})
df1.info()

In [None]:
# Summary statistics with describe()'s default column selection.
df1.describe()

In [None]:
# Same summary, widened to every column via include='all'.
df1.describe(include='all')

In [None]:
# Count each (PhoneService, MultipleLines) combination to see how the two columns relate.
df1[['PhoneService', 'MultipleLines']].value_counts()

In [None]:
# Stacked bars: share of each Churn value within every contract type.
(
    df1.groupby('Contract')['Churn']
    .value_counts(normalize=True)
    .unstack('Churn')
    .plot(kind='bar', stacked=True)
)

In [None]:
def plot_cat_dist(feature_name, df=None):
    """Draw a stacked bar chart of the Churn split within each value of a feature.

    Args:
        feature_name: Column to group by; one bar is drawn per distinct value.
        df: DataFrame containing `feature_name` and a 'Churn' column. When
            omitted, the notebook-level `df1` is used, so existing
            one-argument calls behave as before.

    Returns:
        None. The chart is produced as a side effect.
    """
    data = df1 if df is None else df
    churn_share = (
        # observed=False pins the current handling of categorical groupers
        # (e.g. columns built with pd.cut) and avoids the pandas FutureWarning
        # raised when the argument is left unspecified.
        data.groupby(feature_name, observed=False)['Churn']
        .value_counts(normalize=True)
        .unstack('Churn')
    )
    churn_share.plot.bar(stacked=True)

In [None]:
# Churn split for each payment method.
plot_cat_dist('PaymentMethod')

In [None]:
# Average charge per month of tenure, and how far the current monthly charge
# sits from that average.
# NOTE(review): rows with tenure == 0 divide by zero here, so their
# average_charge/diff will not be finite - confirm that is acceptable.
df1['average_charge'] = df1['TotalCharges'].div(df1['tenure'])
df1['diff'] = df1['MonthlyCharges'].sub(df1['average_charge'])
df1['diff'].describe()

In [None]:
# Split MonthlyCharges into three equal-width bins and plot the churn split per bin.
df1 = df1.assign(month_bin=pd.cut(df1['MonthlyCharges'], bins=3))
plot_cat_dist('month_bin')

In [None]:
# Start the modelling frame from a copy of df1, minus the first set of
# features being excluded.
df2 = df1.copy().drop(
    columns=[
        'gender',
        'PhoneService',
        'StreamingTV',
        'StreamingMovies',
    ]
)
df2.columns

In [None]:
# Remove the identifier, TotalCharges and the helper columns added during exploration.
df2 = df2.drop(
    ['customerID', 'TotalCharges', 'average_charge', 'diff', 'month_bin'],
    axis=1,
)
df2.columns

In [None]:
# dtypes of the columns that remain after the drops.
df2.dtypes

In [None]:
# Summary of every remaining column (include='all' covers non-numeric ones too).
df2.describe(include='all')

In [None]:
# Frequency of each InternetService value.
df2['InternetService'].value_counts()

In [None]:
# Fold 'No internet service' into 'No' across the whole frame, then check how
# many distinct values the four add-on service columns are left with.
df_prep = df2.replace(to_replace='No internet service', value='No')
df_prep.loc[:, ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport']].nunique()

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Columns used as-is as numeric features.
numeric_columns = ['SeniorCitizen', 'tenure', 'MonthlyCharges']
# Columns to one-hot encode. 'diffbuckets' was removed from this list: no cell
# in the notebook creates such a column (and 'diff' is dropped from df2), so
# selecting it from df_prep raised a KeyError.
category_columns = ['Partner', 'Dependents', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'Contract', 'PaperlessBilling', 'PaymentMethod']

x_num = df_prep[numeric_columns]
x_cat = df_prep[category_columns]
# drop='if_binary' keeps a single indicator column for two-valued features
# and leaves features with more categories fully expanded.
ohe = OneHotEncoder(drop='if_binary')
x_cat_trans = ohe.fit_transform(x_cat)

In [None]:
# First row of the encoded categorical matrix, converted to a dense array for display.
x_cat_trans.toarray()[0]

In [None]:
# Map the encoded rows back to category labels and show the first one, as a
# round-trip check against the encoded row displayed in the previous cell.
ohe.inverse_transform(x_cat_trans.toarray())[0]

In [None]:
# Feature matrix: numeric columns followed by the dense one-hot columns, side by side.
X = np.hstack((x_num.to_numpy(), x_cat_trans.toarray()))
# Target vector: the raw Churn labels.
y = df_prep['Churn'].values

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=113)
# Size of the training partition.
X_train.shape

In [None]:
from sklearn.linear_model import LogisticRegression
# Baseline classifier with default hyperparameters.
cls = LogisticRegression()
# Fit on the features as built; no scaling has been applied at this point.
cls.fit(X_train, y_train)

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Learn the per-feature min/max from the training split only, then rescale it.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [None]:
# Refit the same estimator object, this time on the min-max scaled training features.
cls.fit(X_train_scaled, y_train)

In [None]:
# NOTE(review): cls was last fit on X_train_scaled, but X_test here is still
# unscaled, so this accuracy reflects mismatched feature scales. The next cell
# scores against the scaled test features instead.
cls.score(X_test, y_test)

In [None]:
# Apply the scaler fitted on the training split (transform only, no refit)
# so test features share the training scale.
X_test_scaled = scaler.transform(X_test)
# Accuracy on the scaled test features.
cls.score(X_test_scaled, y_test)

In [None]:
from sklearn.metrics import confusion_matrix
# Predicted labels for the scaled test features.
y_pred = cls.predict(X_test_scaled)
# Rows are true labels and columns are predicted labels, both ordered
# 'Yes' then 'No' as given by `labels`.
confusion_matrix(y_test, y_pred, labels=['Yes','No'])

In [None]:
from sklearn.metrics import precision_score, recall_score
# Precision and recall on the test predictions, treating 'Yes' as the positive class.
print("Precision:",precision_score(y_test,y_pred,labels=['Yes','No'], pos_label='Yes'))
print("Recall:",recall_score(y_test,y_pred,labels=['Yes','No'],pos_label='Yes'))