In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from pandas_profiling import ProfileReport
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the churn workbook into the working frame `df`.
# Raw string: the Windows backslashes must not be parsed as escape sequences
# ("\p", "\I" and "\C" are invalid escapes and trigger warnings on recent Python).
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
df = pd.read_excel(r"E:\python\Insaid test\Churn.xlsx")

In [None]:
# Display the loaded frame.
df

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Column labels of the frame.
df.columns

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# Summary statistics as produced by DataFrame.describe().
df.describe()

In [None]:
# Boolean frame marking which cells are missing.
df.isnull()

In [None]:
# Count of missing cells per column.
df.isnull().sum()

In [None]:
import missingno as msno

In [None]:
msno.matrix(df)

In [None]:
df = df.fillna(value=0)

In [None]:
df.isna().sum()

In [None]:
msno.matrix(df)

In [None]:
# Lower-case the text in every string column; report the columns that hold no text.
for item in df.columns:
    column = df[item]
    try:
        lowered = column.str.lower()
    except AttributeError:
        # The .str accessor is unavailable on non-text columns (numeric, datetime, ...).
        print(item, "couldn't convert")
        continue
    # .str.lower() yields NaN for non-string cells in a mixed column; put the
    # original values back so numbers/blanks are not silently destroyed.
    is_text = column.map(lambda value: isinstance(value, str))
    df[item] = lowered.where(is_text, column)
df.head()

In [None]:
# Recode the binary columns as integers: 'yes' -> 1, 'no' -> 0.
columns_to_convert = ['Partner', 
                      'Dependents', 
                      'PhoneService', 
                      'PaperlessBilling', 
                      'Churn']

for item in columns_to_convert:
    # Assign the result back instead of calling replace(inplace=True) on df[item]:
    # that form is chained assignment and no longer updates df under copy-on-write.
    # infer_objects() turns the recoded column into an integer dtype when every
    # value was replaced.
    df[item] = df[item].replace({'yes': 1, 'no': 0}).infer_objects()
df.head()

In [None]:
df.columns.to_series().groupby(df.dtypes).groups

In [None]:
df['TotalCharges'] = df['TotalCharges'].astype(float)

In [None]:
# Balance the labels so we have the same number of non-churners as churners.
churners = df[df['Churn'] == 1]
churners_number = len(churners)
print("Number of churners", churners_number)

# Down-sample the non-churners to the churner count. A fixed seed keeps the
# balanced subset (and everything trained on it) identical across runs.
non_churners = df[df['Churn'] == 0].sample(n=churners_number, random_state=42)
print("Number of non-churners", len(non_churners))
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same stacked frame.
df2 = pd.concat([churners, non_churners])

In [None]:
# Nested pie chart: outer ring = churn yes/no, inner ring = gender split inside each churn group.
# NOTE(review): all counts below are hard-coded rather than computed from df - confirm they
# still match the data whenever the input file changes.
plt.figure(figsize=(3, 3))
labels =["Churn: Yes","Churn:No"]
values = [1869,5163]
labels_gender = ["F","M","F","M"]
sizes_gender = [939,930 , 2544,2619]
colors = ['#ff6666', '#66b3ff']
colors_gender = ['#c2c2f0','#ffb3e6', '#c2c2f0','#ffb3e6']
explode = (0.3,0.3) 
explode_gender = (0.1,0.1,0.1,0.1)
textprops = {"fontsize":15}
# Plot: both pies are drawn on the same implicit current axes; the second (radius 7)
# sits inside the first (radius 10).
plt.pie(values, labels=labels,autopct='%1.1f%%',pctdistance=1.08, labeldistance=0.8,colors=colors, startangle=90,frame=True, explode=explode,radius=10, textprops =textprops, counterclock = True, )
plt.pie(sizes_gender,labels=labels_gender,colors=colors_gender,startangle=90, explode=explode_gender,radius=7, textprops =textprops, counterclock = True, )
# Draw a white circle over the centre to turn the pies into a donut.
centre_circle = plt.Circle((0,0),5,color='black', fc='white',linewidth=0)
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

plt.title('Churn Distribution w.r.t Gender: Male(M), Female(F)', fontsize=15, y=1.1)

# Show the plot with an equal aspect ratio so the rings are circular.
 
plt.axis('equal')
plt.tight_layout()
plt.show()
# There is a negligible difference between genders in the share/count of customers who changed
# service provider; both genders behaved similarly when migrating to another provider/firm.

In [None]:
# Pairwise correlation of the numeric columns.
# numeric_only=True: pandas >= 2.0 raises on text columns instead of silently dropping them.
corelation = df.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(8,8))
# Reuse the matrix computed above (no second corr() pass) and draw on the axes just created.
sns.heatmap(corelation, annot=True, ax=ax)

In [None]:
plt.figure(figsize=(25, 10))

# Integer-encode every column with pd.factorize, then correlate the codes.
corr = df.apply(lambda column: pd.factorize(column)[0]).corr()

# Boolean mask covering the upper triangle (diagonal included) so each pair is shown once.
mask = np.triu(np.ones(corr.shape, dtype=bool))

ax = sns.heatmap(
    corr,
    mask=mask,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    annot=True,
    linewidths=.2,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
)

In [None]:
# Build the profiling report for the frame and render it as the cell output.
profile = ProfileReport(
    df,
    title="Churn detection",
    explorative=True,
)
profile

In [None]:
# Histograms of the numeric columns. The size is passed to hist() itself: calling
# plt.figure(figsize=...) afterwards only opened a second, empty figure and left
# the histograms at the default size.
df.hist(figsize=(6, 6))
plt.show()

In [None]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Take labels and values from the same value_counts() result so every slice is paired
# with its own count. unique() returns first-appearance order while value_counts()
# sorts by frequency, so mixing the two mislabels the slices.
payment_counts = df['PaymentMethod'].value_counts()
labels = payment_counts.index.tolist()
values = payment_counts.tolist()

fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.3)])
fig.update_layout(title_text="<b>Payment Method Distribution</b>")
fig.show()


In [None]:
# Count of customers per Churn value, coloured by payment method.
fig = px.histogram(
    df,
    x="Churn",
    color="PaymentMethod",
    title="<b>Customer Payment Method distribution w.r.t. Churn</b>",
)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

Most of the customers who churned paid by Electronic Check.
Customers who paid by Credit Card (automatic), Bank Transfer (automatic) or Mailed Check were less likely to churn.

In [None]:
# Take the slice labels from the counts' own index so labels and values can never be
# paired in the wrong order (value_counts() sorts by frequency, not by a fixed order).
gender_counts = df['gender'].value_counts()
churn_counts = df['Churn'].value_counts()
churn_names = {0: 'No', 1: 'Yes', 'no': 'No', 'yes': 'Yes'}
g_labels = [str(value).title() for value in gender_counts.index]
c_labels = [churn_names.get(value, str(value)) for value in churn_counts.index]
# Create subplots: use 'domain' type for Pie subplot
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels=g_labels, values=gender_counts, name="Gender"),
              1, 1)
fig.add_trace(go.Pie(labels=c_labels, values=churn_counts, name="Churn"),
              1, 2)

# Use `hole` to create a donut-like pie chart
fig.update_traces(hole=.4, hoverinfo="label+percent+name", textfont_size=8)

fig.update_layout(
    title_text="Gender and Churn Distributions",
    # Add annotations in the center of the donut pies.
    annotations=[dict(text='Gender', x=0.16, y=0.5, font_size=20, showarrow=False),
                 dict(text='Churn', x=0.84, y=0.5, font_size=20, showarrow=False)])
fig.show()
# 26.6 % of customers switched to another firm. Customers are 49.5 % female and 50.5 % male.

In [None]:
 df[df["gender"]=="Male"][["InternetService", "Churn"]].value_counts()

In [None]:
df[df["gender"]=="Female"][["InternetService", "Churn"]].value_counts()

In [None]:
# Grouped bars: one trace per internet-service type, over (churn, gender) categories.
# NOTE(review): the counts are hard-coded - confirm they match the value_counts() output.
fig = go.Figure()

service_counts = [
    ('DSL', [965, 992, 219, 240]),
    ('Fiber optic', [889, 910, 664, 633]),
    ('No Internet', [690, 717, 56, 57]),
]

for service_name, counts in service_counts:
    # Two-level category axis: outer level = churn, inner level = gender.
    fig.add_trace(go.Bar(
        x=[['Churn:No', 'Churn:No', 'Churn:Yes', 'Churn:Yes'],
           ["Female", "Male", "Female", "Male"]],
        y=counts,
        name=service_name,
    ))

fig.update_layout(title_text="<b>Churn Distribution w.r.t. Internet Service and Gender</b>")

fig.show()

Many customers choose the Fiber optic service, and customers who use Fiber optic also have a high churn rate, which may suggest dissatisfaction with this type of internet service.
DSL customers form a large group and have a lower churn rate than Fiber optic customers.

In [None]:
color_map = {"Yes": "Red", "No": "Yellow"}
# Dependents no longer holds "Yes"/"No" here (text was lower-cased and the yes/no
# columns recoded to 1/0 earlier), so the colour-map keys never matched.
# Relabel on a plotting copy so the keys match and the legend reads Yes/No.
yes_no_labels = {1: "Yes", 0: "No", "yes": "Yes", "no": "No"}
plot_df = df.assign(Dependents=df["Dependents"].map(lambda value: yes_no_labels.get(value, value)))
fig = px.histogram(plot_df, x="Churn", color="Dependents", barmode="group", title="<b>Dependents distribution</b>", color_discrete_map=color_map)
fig.update_layout(width=500, height=600, bargap=0.1)
fig.show()
# Customers without dependents are more likely to churn.

In [None]:
color_map = {"Yes": 'yellow', "No": 'Green'}
# Partner no longer holds "Yes"/"No" here (text was lower-cased and the yes/no
# columns recoded to 1/0 earlier), so the colour-map keys never matched.
# Relabel on a plotting copy so the keys match and the legend reads Yes/No.
yes_no_labels = {1: "Yes", 0: "No", "yes": "Yes", "no": "No"}
plot_df = df.assign(Partner=df["Partner"].map(lambda value: yes_no_labels.get(value, value)))
fig = px.histogram(plot_df, x="Churn", color="Partner", barmode="group", title="<b>Chrun distribution w.r.t. Partners</b>", color_discrete_map=color_map)
fig.update_layout(width=200, height=300, bargap=0.1)
fig.show()
# Customers who don't have partners are more likely to churn.

In [None]:
color_map = {"Yes": 'Red', "No": 'Yellow'}
# Whether SeniorCitizen is stored as numbers or as (lower-cased) text, it cannot
# contain the literal keys "Yes"/"No", so the colour map was never applied.
# Relabel on a plotting copy so the keys match and the legend reads Yes/No.
yes_no_labels = {1: "Yes", 0: "No", "yes": "Yes", "no": "No"}
plot_df = df.assign(SeniorCitizen=df["SeniorCitizen"].map(lambda value: yes_no_labels.get(value, value)))
fig = px.histogram(plot_df, x="Churn", color="SeniorCitizen", title="<b>Chrun distribution w.r.t. Senior Citizen</b>", color_discrete_map=color_map)
fig.update_layout(width=300, height=400, bargap=0.1)
fig.show()
# It can be observed that the fraction of senior citizens is very small. Most of the senior citizens churn.

In [None]:
color_map = {"Yes": "Green", "No": "Yellow"}
# Text columns were lower-cased earlier, so OnlineSecurity holds "yes"/"no" and the
# capitalised colour-map keys never matched. Relabel on a plotting copy; any other
# category is passed through unchanged and keeps a default colour.
yes_no_labels = {1: "Yes", 0: "No", "yes": "Yes", "no": "No"}
plot_df = df.assign(OnlineSecurity=df["OnlineSecurity"].map(lambda value: yes_no_labels.get(value, value)))
fig = px.histogram(plot_df, x="Churn", color="OnlineSecurity", barmode="group", title="<b>Churn w.r.t Online Security</b>", color_discrete_map=color_map)
fig.update_layout(width=300, height=400, bargap=0.1)
fig.show() 
# Most customers churn in the absence of online security.

In [None]:
color_map = {"Yes": 'Green', "No": 'Yellow'}
# PaperlessBilling no longer holds "Yes"/"No" here (text was lower-cased and the
# yes/no columns recoded to 1/0 earlier), so the colour-map keys never matched.
# Relabel on a plotting copy so the keys match and the legend reads Yes/No.
yes_no_labels = {1: "Yes", 0: "No", "yes": "Yes", "no": "No"}
plot_df = df.assign(PaperlessBilling=df["PaperlessBilling"].map(lambda value: yes_no_labels.get(value, value)))
fig = px.histogram(plot_df, x="Churn", color="PaperlessBilling",  title="<b>Chrun distribution w.r.t. Paperless Billing</b>", color_discrete_map=color_map)
fig.update_layout(width=200, height=250, bargap=0.1)
fig.show()
# Customers with Paperless Billing are most likely to churn.

In [None]:
# Box plot of tenure for each Churn value.
fig = px.box(df, x='Churn', y='tenure')

# Axis titles for the single plot cell.
fig.update_xaxes(title_text='Churn', row=1, col=1)
fig.update_yaxes(title_text='Tenure (Months)', row=1, col=1)

fig.update_layout(
    autosize=True,
    width=350,
    height=300,
    title_font=dict(size=25, family='Courier'),
    title='<b>Tenure vs Churn</b>',
)

fig.show()


In [None]:
# Correlation of every numeric column with the Churn label, strongest positive first.
# No plt.figure() here: nothing is drawn in this cell, so opening a figure would only
# render an empty canvas. numeric_only=True because pandas >= 2.0 raises on text columns.
df.corr(numeric_only=True)['Churn'].sort_values(ascending = False)

In [None]:
# Keep the customer IDs in `customer_id` and remove the column from the modelling frame.
# An explicit membership check replaces the bare `except:`, which would also have
# hidden unrelated errors (e.g. df2 not being defined).
if 'customerID' in df2.columns:
    customer_id = df2.pop('customerID')
else:
    print("already removed customerID")

In [None]:
df2

In [None]:
# Use one-hot encoding to convert categorical data to binary (0 or 1)
ml_dummies = pd.get_dummies(df2)
ml_dummies.fillna(value=0, inplace=True)
ml_dummies.head()

In [None]:
ml_dummies['---randomColumn---'] = np.random.randint(0,1000, size=len(ml_dummies)) # Add a random column to the dataframe

In [None]:
try:
    label = ml_dummies['Churn'] # Remove the label before training the model
    del ml_dummies['Churn']
except:
    print("label already removed.")

In [None]:
feature_train, feature_test, label_train, label_test = train_test_split(ml_dummies, label, test_size=0.3)

In [None]:
classifiers = [
    KNeighborsClassifier(5),
    DecisionTreeClassifier(max_depth=5),
]

# Fit each candidate on the training split and print its score on the test split.
# After the loop, `clf` and `pred` refer to the last classifier in the list.
for item in classifiers:
    classifier_name = type(item).__name__
    print (classifier_name)

    clf = item
    clf.fit(feature_train, label_train)
    pred = clf.predict(feature_test)
    score = clf.score(feature_test, label_test)
    print (round(score,3),"\n", "- - - - - ", "\n")

In [None]:
# Feature importances of `clf`, one row per feature column of ml_dummies.
feature_df = pd.DataFrame({
    'features': ml_dummies.columns,
    'importance': clf.feature_importances_,
})
# Horizontal bar chart, sorted ascending so the most important feature ends up at the top.
# (A separate sort_values() call whose result was thrown away has been dropped.)
feature_df.set_index(keys='features').sort_values(by='importance', ascending=True).plot(kind='barh', figsize=(15, 15))

In [None]:
# Confusion matrix
from sklearn.metrics import confusion_matrix
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print the confusion matrix and draw it on the current matplotlib axes.

    Parameters
    ----------
    cm : 2-D array of counts, rows = true class, columns = predicted class.
    classes : sequence of class names used as tick labels, in matrix order.
    normalize : if True, divide each row by its sum before printing/plotting.
    title : title of the plot.
    cmap : matplotlib colormap used for the cells.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    # Previously the function stopped here, so `classes`, `title` and `cmap` were
    # ignored and the figures opened by the caller stayed empty.
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Annotate every cell; switch the text colour on dark cells so it stays readable.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [None]:
# Confusion matrix of the test labels against `pred` (the predictions left over from
# the last classifier fitted in the classifier loop).
cnf_matrix = confusion_matrix(label_test, pred)
# Print arrays with two decimals.
np.set_printoptions(precision=2)

# Non-normalized confusion matrix, in its own figure
plt.figure()
class_names = ['Not churned','churned']

plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix, without normalization')

# Normalized confusion matrix, in a second figure
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')

plt.show()


# Per-class precision / recall / F1 for the same predictions.
from sklearn.metrics import classification_report
eval_metrics = classification_report(label_test, pred, target_names=class_names)
print(eval_metrics)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate hyper-parameter values for a grid search.
# NOTE(review): n_estimators is an ensemble parameter that neither classifier fitted
# above accepts, and param_grid is not used in the cells visible here - confirm intent.
max_depth_range = range(2, 20, 2)
leaf_range = range(1, 10, 2)
n_estimators_range = range(10, 200, 10)
max_features_range = range(1, len(ml_dummies.columns), 5)


param_grid = {
    'max_depth': max_depth_range,
    'min_samples_leaf': leaf_range,
    'n_estimators': n_estimators_range,
    'max_features': max_features_range,
}

In [None]:
# Preprocessing original dataframe
def preprocess_df(dataframe):
    """
    Turn a raw frame into model features.

    Works on a copy, so `dataframe` itself is left untouched.

    Returns
    -------
    (features, customer_id, label)
        features    : one-hot encoded frame with missing values filled with 0 and a
                      '---randomColumn---' column of random integers in [0, 1000).
        customer_id : the 'customerID' column, or None when the input has none.
        label       : the 'Churn' column, or None when it is not present after encoding.
    """
    x = dataframe.copy()

    # Default to None so a missing column no longer ends in UnboundLocalError at return.
    customer_id = None
    label = None

    if 'customerID' in x.columns:
        customer_id = x.pop('customerID')  # Don't need in ML DF
    else:
        print("already removed customerID")

    features = pd.get_dummies(x).fillna(value=0)

    features['---randomColumn---'] = np.random.randint(0,1000, size=len(features))

    if 'Churn' in features.columns:
        label = features.pop('Churn')
    else:
        print("label already removed.")
    return features, customer_id, label

original_df = preprocess_df(df)

In [None]:
output_df = original_df[0].copy()
output_df['---randomColumn---']
output_df['prediction'] = clf.predict_proba(output_df)[:,1]
output_df['churn'] = original_df[2]
output_df['customerID'] = original_df[1]
print('Mean predict proba of churn:',round(output_df[output_df['churn'] == 1]['prediction'].mean(),2))
print('Mean predict proba of NON-churn:',round(output_df[output_df['churn'] == 0]['prediction'].mean(),2))

In [None]:
# Customers who have not churned; their predicted churn probability shows who may
# want to leave and could be targeted through media.
activate = output_df.loc[output_df['churn'] == 0]
activate.loc[:, ['customerID','churn','prediction']]

In [None]:
# import the classification module 
from pycaret import classification
from pycaret.classification import *

In [None]:
# import the classification module 
from pycaret import classification
# Model the churn label, like the rest of the notebook: the experiment previously
# targeted 'PhoneService'. 'Churn' was removed from ml_dummies when the label was
# split off, so it is re-attached here on a separate frame.
# NOTE(review): this rebinds `clf`, which earlier cells use for the sklearn classifier.
pycaret_data = ml_dummies.assign(Churn=label)
clf = setup(data = pycaret_data, target = 'Churn', silent = True, session_id = 123, train_size = 0.8)

In [None]:
# Compare the available models on the experiment configured by setup().
compare_models()

In [None]:
knn = create_model('knn')    # train a k-nearest-neighbours model

In [None]:
# Each of the following cells tunes `knn` again and overwrites `tuned_knn`;
# only the result of the last cell that was run is kept.
tuned_knn = tune_model(knn, n_iter = 50)   

In [None]:
tuned_knn = tune_model(knn, n_iter = 150)   

In [None]:
tuned_knn = tune_model(knn, optimize = 'AUC') #default is 'Accuracy'

In [None]:
tuned_knn = tune_model(knn, optimize = 'Accuracy') #default is 'Accuracy'

In [None]:
svm = create_model('svm')    # train a support-vector-machine model

In [None]:
# As above: each cell re-tunes `svm` and overwrites `tuned_svm`.
tuned_svm = tune_model(svm, n_iter = 50)   

In [None]:
tuned_svm = tune_model(svm, n_iter = 100)   

In [None]:
tuned_svm = tune_model(svm, optimize = 'Accuracy') #default is 'Accuracy'

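Best results from the model:

| Model      | Accuracy | AUC    | Recall | Prec.  | F1     | Kappa  | MCC    |
|------------|----------|--------|--------|--------|--------|--------|--------|
| knn (mean) | 0.9050   | 0.7899 | 1.0000 | 0.9048 | 0.9500 | 0.0421 | 0.1134 |
| svm (mean) | 0.9391   | 0.0000 | 1.0000 | 0.9370 | 0.9674 | 0.5105 | 0.5856 |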

In [None]:
from sklearn.svm import SVC
svc_model = SVC(random_state = 1)
svc_model.fit(feature_train,label_train)
predict_y = svc_model.predict(feature_test)
accuracy_svc = svc_model.score(feature_test,label_test)
print("SVM accuracy is :",accuracy_svc)

In [None]:
print(classification_report(label_test, predict_y))

In [None]:
plt.figure(figsize=(4,3))
# NOTE(review): this uses `pred` (from the classifier loop), not the SVC predictions
# `predict_y` computed just above - confirm which model is meant to be "final".
final_cm = confusion_matrix(label_test, pred)
sns.heatmap(
    final_cm,
    annot=True,
    fmt="d",
    linecolor="k",
    linewidths=3,
)

plt.title("FINAL CONFUSION MATRIX",fontsize=14)
plt.show()

From the confusion matrix we can see that there are 570 actual non-churn values in total; the algorithm is reported as predicting 435 of them as non-churn and 156 of them as churn (note that 435 + 156 = 591, not 570, so one of these two figures should be re-checked against the plot). There are 128 + 424 = 552 actual churn values; the algorithm predicts 128 of them as non-churn and 424 of them as churn.