# Problem Statement: 
* Recommend steps to retain existing telecom customers. In this industry, retaining customers is becoming more challenging day by day, and acquiring new customers is even more draining, so retaining existing customers is worth far more effort than acquiring new ones.
* Know the customer behavior and recommend the steps to retain existing customers and build the model to know which existing customers may leave.
1. Recommend the steps to retain existing telecom customers.
2. Build a customer churn prediction model.


# Index
1. Imports
2. Info 
3. EDA 
4. ML Model 
    1. Feature Engineering 
    2. Decision Tree | Confusion matrix, Classification Report
    3. Random Forest | Confusion matrix, Classification Report
    4. Gradient Boost | Confusion matrix, Classification Report
    5. KNN | Confusion matrix, Classification Report

5. Evaluation AUC ROC Curve for all models 
6. Conclusions
    1. Final Steps to Retain Customers

# Imports

In [None]:
# EDA
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
## Feature Engineering
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

## ML Models: Different Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

## Metrics
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix

## Evaluation and Comparison of Performance of all Algorithms
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

In [None]:
# Load the telecom churn dataset from the notebook's input directory and
# display it. `df` is the base frame that every later cell reads from.
df = pd.read_csv('../input/telecom-churn/telecom_churn.csv')
df

# INFO

In [None]:
# Print the column names, dtypes and non-null counts of the dataset.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for each column.
df.describe()

In [None]:
# NO MISSING VALUES

# EDA

In [None]:
# One 30-bin histogram per column, drawn on a large grid.
df.hist(bins=30, figsize=(15, 15))

In [None]:
# Pairwise scatter plots of all columns, coloured by the churn flag.
sns.pairplot(data=df, hue="Churn")

In [None]:
# Annotated heatmap of the pairwise column correlations.
plt.figure(figsize=(15, 15))
correlations = df.corr()
sns.heatmap(correlations, cmap="Blues", annot=True)

In [None]:
# Overall retained-vs-churned share as a pie chart.
# The slice order is pinned to [0 (retained), 1 (churned)] so each label always
# matches its slice: value_counts() orders by frequency, which would silently
# swap the labels if churned customers ever outnumbered retained ones.
# Assumes Churn is coded 0/1, as the comparisons elsewhere in this notebook do.
churn_counts = df["Churn"].value_counts().reindex([0, 1], fill_value=0)
churn_pct = churn_counts * 100 / len(df)
churn_counts.plot(kind="pie", title="All Over Churned Ratio",
                  labels=["Retained {:.2f} %".format(churn_pct.loc[0]),
                          "Churned {:.2f} %".format(churn_pct.loc[1])],
                  figsize=(4, 4))

In [None]:
# Bar plot of customer-service calls per contract-renewal flag, split by churn.
sns.catplot(
    data=df,
    kind="bar",
    x="ContractRenewal",
    y="CustServCalls",
    hue="Churn",
)

In [None]:
# Data usage per contract-renewal flag, coloured by churn.
sns.catplot(
    data=df,
    x="ContractRenewal",
    y="DataUsage",
    hue="Churn",
)

In [None]:
# Monthly charge per data-plan flag, coloured by churn.
sns.catplot(
    data=df,
    x="DataPlan",
    y="MonthlyCharge",
    hue="Churn",
)

## 1. Daily Min vs Monthly Charge 

In [None]:
# df0 / df1 are scratch frames (retained / churned); later cells rebind them.
df1 = df[df["Churn"] == 1]
df0 = df[df["Churn"] == 0]

plt.figure(figsize=(20, 5))

# Left panel: churned customers only.
plt.subplot(1, 3, 1)
plt.plot(df1.MonthlyCharge, df1.DayMins, "co")
plt.title("Churn=1")
plt.xlabel("Monthly Charge")
plt.ylabel("Day Mins")

# Middle panel: retained customers only.
plt.subplot(1, 3, 2)
plt.plot(df0.MonthlyCharge, df0.DayMins, "o")
plt.title("Churn=0")
plt.xlabel("Monthly Charge")
plt.ylabel("Day Mins")

# Right panel: every customer.
plt.subplot(1, 3, 3)
plt.plot(df.MonthlyCharge, df.DayMins, "o")
plt.xlabel("Monthly Charge")
plt.ylabel("Day Mins")
plt.title("Churn=0&1")

## 2. Data Usage vs Monthly Charge

In [None]:
# df0 / df1 are scratch frames (retained / churned); later cells rebind them.
df1 = df[df["Churn"] == 1]
df0 = df[df["Churn"] == 0]

plt.figure(figsize=(20, 5))

# Left panel: churned customers only.
plt.subplot(1, 3, 1)
plt.plot(df1.MonthlyCharge, df1.DataUsage, "co")
plt.title("Churn=1")
plt.xlabel("MonthlyCharge")
plt.ylabel("DataUsage")

# Middle panel: retained customers only.
plt.subplot(1, 3, 2)
plt.plot(df0.MonthlyCharge, df0.DataUsage, "o")
plt.title("Churn=0")
plt.xlabel("MonthlyCharge")
plt.ylabel("DataUsage")

# Right panel: every customer.
plt.subplot(1, 3, 3)
plt.plot(df.MonthlyCharge, df.DataUsage, "o")
plt.xlabel("MonthlyCharge")
plt.ylabel("DataUsage")
plt.title("Churn=0&1")

**In the data usage case we can also easily identify two segments.
Consider the segments as below:**
1. Customers whose monthly charges are up to 85 and whose data usage is around 0-1.
2. Customers whose monthly charges are 30 onwards and whose data usage is >1.

### Manual Customer Segmentation

In [None]:
# Manual customer segmentation on monthly charge and data usage.
# seg1: charge below 85 with data usage of at most 1.
seg1 = df[df["MonthlyCharge"].lt(85) & df["DataUsage"].le(1)]
# seg2: charge above 30 with data usage above 1.
seg2 = df[df["MonthlyCharge"].gt(30) & df["DataUsage"].gt(1)]

### Segment 1. Customers whose monthly charges are up to 85 and data usage is around 0-1.

In [None]:
# Display the rows that fall into segment 1.
seg1

In [None]:
# Retained vs churned counts inside segment 1.
seg1.Churn.value_counts()

In [None]:
# Churn rate (%) within segment 1: share of rows with Churn == 1.
# Computed from the frame instead of hand-copied counts so the figure stays
# correct if the data or the segment thresholds change.
seg1["Churn"].eq(1).mean() * 100

In [None]:
# How many segment-1 customers have / do not have a data plan.
seg1.DataPlan.value_counts()

In [None]:
# Segment-1 customers whose DataPlan flag is 1.
seg1[seg1["DataPlan"].eq(1)]

In [None]:
# df0 / df1 are scratch frames (seg1 retained / churned); later cells rebind them.
df1 = seg1[seg1["Churn"] == 1]
df0 = seg1[seg1["Churn"] == 0]

plt.figure(figsize=(20, 5))

# Left panel: churned segment-1 customers.
plt.subplot(1, 3, 1)
plt.plot(df1.MonthlyCharge, df1.DayMins, "co")
plt.title("Churn=1")
plt.xlabel("Monthly Charge")
plt.ylabel("Day Mins")

# Middle panel: retained segment-1 customers.
plt.subplot(1, 3, 2)
plt.plot(df0.MonthlyCharge, df0.DayMins, "o")
plt.title("Churn=0")
plt.xlabel("Monthly Charge")
plt.ylabel("Day Mins")

# Right panel: all of segment 1.
plt.subplot(1, 3, 3)
plt.plot(seg1.MonthlyCharge, seg1.DayMins, "o")
plt.xlabel("Monthly Charge")
plt.ylabel("Day Mins")
plt.title("Churn=0&1")

In [None]:
# df0 / df1 are scratch frames (seg1 retained / churned); the next cell reuses df1.
df1 = seg1[seg1["Churn"] == 1]
df0 = seg1[seg1["Churn"] == 0]

plt.figure(figsize=(20, 5))

# Left panel: churned segment-1 customers.
plt.subplot(1, 3, 1)
plt.plot(df1.MonthlyCharge, df1.DataUsage, "co")
plt.title("Churn=1")
plt.xlabel("MonthlyCharge")
plt.ylabel("DataUsage")

# Middle panel: retained segment-1 customers.
plt.subplot(1, 3, 2)
plt.plot(df0.MonthlyCharge, df0.DataUsage, "o")
plt.title("Churn=0")
plt.xlabel("MonthlyCharge")
plt.ylabel("DataUsage")

# Right panel: all of segment 1.
plt.subplot(1, 3, 3)
plt.plot(seg1.MonthlyCharge, seg1.DataUsage, "o")
plt.xlabel("MonthlyCharge")
plt.ylabel("DataUsage")
plt.title("Churn=0&1")

In [None]:
# All of segment 1 first, then the churned subset drawn on top of it.
# df1 is whatever the preceding cell left bound (seg1 rows with Churn == 1).
plt.plot(seg1.MonthlyCharge, seg1.DataUsage, "o")
plt.plot(df1.MonthlyCharge, df1.DataUsage, "o")

### More on zero data usage customers

In [None]:
# Segment-1 customers with exactly zero data usage.
dataus0 = seg1[seg1["DataUsage"] == 0]

# df0 / df1 are scratch frames (retained / churned); later cells rebind them.
df0 = dataus0[dataus0["Churn"] == 0]
df1 = dataus0[dataus0["Churn"] == 1]

plt.figure(figsize=(20, 5))

# Left panel: churned zero-data users.
plt.subplot(1, 3, 1)
plt.plot(df1.MonthlyCharge, df1.DayMins, "co")
plt.title("Churn=1")
plt.xlabel("MonthlyCharge")
plt.ylabel("DayMins")

# Middle panel: retained zero-data users.
plt.subplot(1, 3, 2)
plt.plot(df0.MonthlyCharge, df0.DayMins, "o")
plt.title("Churn=0")
plt.xlabel("MonthlyCharge")
plt.ylabel("DayMins")

# Right panel: all zero-data users.
plt.subplot(1, 3, 3)
plt.plot(dataus0.MonthlyCharge, dataus0.DayMins, "o")
plt.xlabel("MonthlyCharge")
plt.ylabel("DayMins")
plt.title("Churn=0&1")

In [None]:
# Display the segment-1 customers with zero data usage.
dataus0

In [None]:
# Overlaid, semi-transparent AccountWeeks histograms for the whole dataset,
# segment 1, the zero-data-usage subset, and segment 2.
plt.figure(figsize=(8, 5))
df.AccountWeeks.hist(label='Account weeks all users', alpha=0.5, bins=30)
seg1.AccountWeeks.hist(label='Seg1 account weeks', alpha=0.5, bins=30)
dataus0.AccountWeeks.hist(label='Data usage zero account weeks', alpha=0.5, bins=30)
seg2.AccountWeeks.hist(label='Seg2 account weeks', alpha=0.5, bins=30)
plt.legend()

In [None]:
# Retained vs churned counts among zero-data-usage customers.
dataus0.Churn.value_counts()

In [None]:
# Churn rate (%) among zero-data-usage customers of segment 1.
# Computed from the frame instead of hand-copied counts so the figure stays
# correct if the data or the segment thresholds change.
dataus0["Churn"].eq(1).mean() * 100

In [None]:
# Number of customers in segment 1.
seg1.shape[0]

In [None]:
# Retained vs churned counts inside segment 1 (repeated for comparison).
seg1.Churn.value_counts()

In [None]:
# Churn rate (%) within segment 1: share of rows with Churn == 1.
# Computed from the frame instead of hand-copied counts so the figure stays
# correct if the data or the segment thresholds change.
seg1["Churn"].eq(1).mean() * 100

### Data users within segment 1

In [None]:
# Segment-1 customers with non-zero data usage.
dataus = seg1[seg1["DataUsage"].gt(0)]
dataus

In [None]:
# AccountWeeks of segment-1 data users drawn over the whole-dataset histogram.
plt.figure(figsize=(8, 5))
df.AccountWeeks.hist(label='Account Weeks All Users', bins=30)
dataus.AccountWeeks.hist(label='Account Weeks Data Users', bins=30)
plt.legend()

In [None]:
# Retained vs churned counts among segment-1 data users.
dataus.Churn.value_counts()

In [None]:
# Churn rate (%) among segment-1 customers who use data.
# Computed from the frame instead of hand-copied counts so the figure stays
# correct if the data or the segment thresholds change.
dataus["Churn"].eq(1).mean() * 100

### Segment 2. Customers whose monthly charges are 30 onwards and data usage is >1.

In [None]:
# Display the rows that fall into segment 2.
seg2

In [None]:
# Retained vs churned counts inside segment 2.
seg2.Churn.value_counts()

In [None]:
# Churn rate (%) within segment 2: share of rows with Churn == 1.
# Computed from the frame instead of hand-copied counts so the figure stays
# correct if the data or the segment thresholds change.
seg2["Churn"].eq(1).mean() * 100

In [None]:
# How many segment-2 customers have / do not have a data plan.
seg2.DataPlan.value_counts()

In [None]:
# df0 / df1 are scratch frames (seg2 retained / churned); later cells rebind them.
df1 = seg2[seg2["Churn"] == 1]
df0 = seg2[seg2["Churn"] == 0]

# Retained first, churned drawn on top.
df0.DayMins.hist(label='Seg2 DayMins of Retained', bins=30)
df1.DayMins.hist(label='Seg2 DayMins of Churned', bins=30)
plt.legend()

### Segment 2 Data Usage

In [None]:
# df0 / df1 are scratch frames (seg2 retained / churned); the following
# segment-2 cells keep using these bindings.
df1 = seg2[seg2["Churn"] == 1]
df0 = seg2[seg2["Churn"] == 0]

plt.figure(figsize=(20, 5))

# Left panel: churned segment-2 customers.
plt.subplot(1, 3, 1)
plt.plot(df1.MonthlyCharge, df1.DataUsage, "co")
plt.title("Churn=1")
plt.xlabel("MonthlyCharge")
plt.ylabel("DataUsage")

# Middle panel: retained segment-2 customers.
plt.subplot(1, 3, 2)
plt.plot(df0.MonthlyCharge, df0.DataUsage, "o")
plt.title("Churn=0")
plt.xlabel("MonthlyCharge")
plt.ylabel("DataUsage")

# Right panel: all of segment 2.
plt.subplot(1, 3, 3)
plt.plot(seg2.MonthlyCharge, seg2.DataUsage, "o")
plt.xlabel("MonthlyCharge")
plt.ylabel("DataUsage")
plt.title("Churn=0&1")

In [None]:
# All of segment 2 first, then the churned subset drawn on top of it.
# df1 is whatever the preceding cell left bound (seg2 rows with Churn == 1).
plt.plot(seg2.MonthlyCharge, seg2.DataUsage, "o")
plt.plot(df1.MonthlyCharge, df1.DataUsage, "o")

In [None]:
# Monthly-charge distribution in segment 2, one box per churn value.
sns.boxplot(data=seg2, x="Churn", y="MonthlyCharge")

In [None]:
# Data-usage histograms for segment 2; df0 / df1 come from the earlier
# segment-2 split (retained / churned).
plt.figure(figsize=(8, 5))
df0.DataUsage.hist(label='Seg2 Data Usage for Churn=0', bins=30)
df1.DataUsage.hist(label='Seg2 Data Usage for Churn=1', bins=30)
plt.legend()

In [None]:
# Monthly-charge histograms for segment 2; df0 / df1 come from the earlier
# segment-2 split (retained / churned).
plt.figure(figsize=(8, 5))
df0.MonthlyCharge.hist(label='Seg2 Monthly Charge for Churn=0', bins=30)
df1.MonthlyCharge.hist(label='Seg2 Monthly Charge for Churn=1', bins=30)
plt.legend()

In [None]:
# Roaming-minute histograms for segment 2; df0 / df1 come from the earlier
# segment-2 split (retained / churned).
plt.figure(figsize=(8, 5))
df0.RoamMins.hist(label='Seg2 Roam Min for Churn=0', bins=30)
df1.RoamMins.hist(label='Seg2 Roam Min for Churn=1', bins=30)
plt.legend()

In [None]:
# Account-weeks histograms for segment 2; df0 / df1 come from the earlier
# segment-2 split (retained / churned).
plt.figure(figsize=(8, 5))
df0.AccountWeeks.hist(label='Seg2 Account Weeks for Churn=0', bins=30)
df1.AccountWeeks.hist(label='Seg2 Account Weeks for Churn=1', bins=30)
plt.legend()

## Seg 1 & Seg2 DayMins

In [None]:
# Day-minute histograms of the two segments on shared axes.
seg1.DayMins.hist(label='Seg1 Day Mins', bins=30)
seg2.DayMins.hist(label='Seg2 Day Mins', bins=30)
plt.legend()

In [None]:
# Average day minutes in segment 1.
seg1.DayMins.mean()

In [None]:
# Average day minutes in segment 2.
seg2.DayMins.mean()

In [None]:
# df0 / df1 are scratch frames (seg1 retained / churned); later cells rebind them.
df0 = seg1[seg1["Churn"] == 0]
df1 = seg1[seg1["Churn"] == 1]

plt.figure(figsize=(15, 5))

# Left: day minutes, semi-transparent so both groups stay visible.
plt.subplot(1, 2, 1)
df0.DayMins.hist(label='Seg1 Day Mins Churn=0', alpha=0.5, bins=30)
df1.DayMins.hist(label='Seg1 Day Mins Churn=1', alpha=0.5, bins=30)
plt.legend()

# Right: data usage with the default bin count.
plt.subplot(1, 2, 2)
df0.DataUsage.hist(label='Seg1 data usage Churn=0')
df1.DataUsage.hist(label='Seg1 data usage Churn=1')
plt.legend()

In [None]:
# df0 / df1 are scratch frames (seg2 retained / churned); later cells rebind them.
df0 = seg2[seg2["Churn"] == 0]
df1 = seg2[seg2["Churn"] == 1]

plt.figure(figsize=(15, 5))

# Left: day minutes.
plt.subplot(1, 2, 1)
df0.DayMins.hist(label='Seg2 Day Mins Churn=0', bins=30)
df1.DayMins.hist(label='Seg2 Day Mins Churn=1', bins=30)
plt.legend()

# Right: data usage with the default bin count.
plt.subplot(1, 2, 2)
df0.DataUsage.hist(label='Seg2 Data Usage Churn=0')
df1.DataUsage.hist(label='Seg2 Data Usage Churn=1')
plt.legend()

## Seg1 & Seg2 Monthly Charges

In [None]:
# Horizontal box plots of the monthly charge, one panel per segment.
plt.figure(figsize=(20, 5))
plt.subplot(1, 2, 1)
seg1.MonthlyCharge.plot.box(vert=False, title="Segment 1 Monthly charges")
plt.subplot(1, 2, 2)
seg2.MonthlyCharge.plot.box(vert=False, title="Segment 2 Monthly charges")

In [None]:
# Monthly-charge distribution in segment 1, one box per churn value.
sns.boxplot(data=seg1, x="Churn", y="MonthlyCharge")

In [None]:
# Monthly-charge distribution in segment 2, one box per churn value.
sns.boxplot(data=seg2, x="Churn", y="MonthlyCharge")

### SUM of Monthly Charges seg1 & 2

In [None]:
# Monthly-charge histograms: whole dataset (faint) with both segments on top.
df.MonthlyCharge.hist(label='Monthly Charge all Users', alpha=0.3, bins=30)
seg1.MonthlyCharge.hist(label='Monthly Charge Seg1 Users', alpha=0.5, bins=30)
seg2.MonthlyCharge.hist(label='Monthly Charge Seg2 Users', alpha=0.5, bins=30)
plt.legend()

# Total monthly charge carried by each segment.
seg1_total = seg1["MonthlyCharge"].sum()
seg2_total = seg2["MonthlyCharge"].sum()
print("Sum of Monthly Charge in seg1 =", seg1_total)
print("Sum of Monthly Charge in seg2 =", seg2_total)

## Roaming

In [None]:
# Roaming-minute histograms: whole dataset with both segments overlaid.
df.RoamMins.hist(label='Roam Mins all Users', alpha=0.5, bins=30)
seg1.RoamMins.hist(label='Roam Mins Seg1 Users', alpha=0.5, bins=30)
seg2.RoamMins.hist(label='Roam Mins Seg2 Users', alpha=0.5, bins=30)
plt.legend()

In [None]:
# df0 / df1 are scratch frames (seg1 retained / churned); later cells rebind them.
df0 = seg1[seg1["Churn"] == 0]
df1 = seg1[seg1["Churn"] == 1]

# Roaming-minute histograms per churn group.
df0.RoamMins.hist(label='Roam Mins Seg1 Churn=0', bins=30)
df1.RoamMins.hist(label='Roam Mins Seg1 Churn=1', bins=30)
plt.legend()

# Total roaming minutes and head-count for each group.
print(f"{df0.RoamMins.sum()} mins, per {len(df0.RoamMins)} customers who not churned.")
print(f"{df1.RoamMins.sum()} mins, per {len(df1.RoamMins)} customers who churned.")

In [None]:
# df0 / df1 are scratch frames (seg2 retained / churned); later cells rebind them.
df0 = seg2[seg2["Churn"] == 0]
df1 = seg2[seg2["Churn"] == 1]

# Roaming-minute histograms per churn group.
df0.RoamMins.hist(label='Roam Mins Seg2 Churn=0', bins=30)
df1.RoamMins.hist(label='Roam Mins Seg2 Churn=1', bins=30)
plt.legend()

# Total roaming minutes and head-count for each group.
print(f"{df0.RoamMins.sum()} mins, per {len(df0.RoamMins)} customers who not churned.")
print(f"{df1.RoamMins.sum()} mins, per {len(df1.RoamMins)} customers who churned.")

## Customer Service Calls

In [None]:
# df0 / df1 are scratch frames (seg1 retained / churned); later cells rebind them.
df0 = seg1[seg1["Churn"] == 0]
df1 = seg1[seg1["Churn"] == 1]

# Customer-service-call histograms, semi-transparent so both groups show.
df0.CustServCalls.hist(label='Cust. Serv. cals Seg1 churn=0', alpha=0.5)
df1.CustServCalls.hist(label='Cust. Serv. cals Seg1 churn=1', alpha=0.5)
plt.legend()

In [None]:
# df0 / df1 are scratch frames; later cells rebind them.
df0 = seg2[seg2["Churn"] == 0]  # segment 2, retained
df1 = seg2[seg2["Churn"] == 1]  # segment 2, churned

# Customer-service-call histograms, semi-transparent so both groups show.
df0.CustServCalls.hist(label='Cust. Serv. cals Seg2 churn=0', alpha=0.5)
df1.CustServCalls.hist(label='Cust. Serv. cals Seg2 churn=1', alpha=0.5)
plt.legend()

## Contract Renewal

In [None]:
# df0 / df1 are scratch frames; later cells rebind them.
df0 = seg1[seg1["Churn"] == 0]  # segment 1, retained
df1 = seg1[seg1["Churn"] == 1]  # segment 1, churned

# Contract-renewal flag histograms per churn group.
df0.ContractRenewal.hist(label='Contract Renewal Seg1 Churn=0')
df1.ContractRenewal.hist(label='Contract Renewal Seg1 Churn=1')
plt.legend()

In [None]:
# df0 / df1 are scratch frames; later cells rebind them.
df0 = seg2[seg2["Churn"] == 0]  # segment 2, retained
df1 = seg2[seg2["Churn"] == 1]  # segment 2, churned

# Contract-renewal flag histograms per churn group.
df0.ContractRenewal.hist(label='Contract Renewal Seg2 Churn=0')
df1.ContractRenewal.hist(label='Contract Renewal Seg2 Churn=1')
plt.legend()

# ML Model

In [None]:
# Display the full dataset once more before building the models.
df

In [None]:
# Target is the Churn column; every other column is used as a feature.
y = df["Churn"]
X = df.drop(columns="Churn")

In [None]:
# Hold out 33% of the rows for evaluation; the seed is fixed so the split
# is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

## Feature Engineering

In [None]:
# Learn the per-feature min/max on the training rows, then rescale them.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_train

In [None]:
# Rescale the test rows with the min/max already learned from the training
# rows. Refitting the scaler here (fit_transform) would scale train and test
# with different parameters and leak test-set statistics into preprocessing.
X_test = scaler.transform(X_test)
X_test

## Decision Tree 

In [None]:
# Decision tree baseline, fitted on the scaled training rows.
clfd = DecisionTreeClassifier(random_state=0)
clfd.fit(X_train, y_train)

# The predictions are deliberately NOT stored in a variable called `pd`:
# that would overwrite the pandas alias for every cell executed afterwards.
pred_d = clfd.predict(X_test)
print(classification_report(y_test, pred_d))
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; newer
# environments need ConfusionMatrixDisplay.from_estimator - confirm the
# target scikit-learn version before changing the import.
plot_confusion_matrix(clfd, X_test, y_test)

## Random Forest

In [None]:
# Random forest with 100 trees; fit() returns the estimator, so it is chained.
clfr = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)

# Hard predictions on the held-out rows, then report and confusion matrix.
pr = clfr.predict(X_test)
report_rf = classification_report(y_test, pr)
print(report_rf)
plot_confusion_matrix(clfr, X_test, y_test)

## Gradient Boosting

In [None]:
# Gradient boosting: 250 shallow (depth-2) trees with learning rate 1.0;
# fit() returns the estimator, so it is chained.
clfg = GradientBoostingClassifier(
    n_estimators=250,
    learning_rate=1.0,
    max_depth=2,
    random_state=0,
).fit(X_train, y_train)

# Hard predictions on the held-out rows, then report and confusion matrix.
pg = clfg.predict(X_test)
report_gb = classification_report(y_test, pg)
print(report_gb)
plot_confusion_matrix(clfg, X_test, y_test)

## KNN

In [None]:
# Distance-weighted 2-nearest-neighbours; fit() returns the estimator, so it
# is chained.
clfk = KNeighborsClassifier(n_neighbors=2, weights='distance').fit(X_train, y_train)

# Hard predictions on the held-out rows, then report and confusion matrix.
pk = clfk.predict(X_test)
report_knn = classification_report(y_test, pk)
print(report_knn)
plot_confusion_matrix(clfk, X_test, y_test)

## AUC ROC

In [None]:
# Class-probability predictions on the held-out rows, in the order
# decision tree, random forest, gradient boosting, KNN.
p1, p2, p3, p4 = (
    model.predict_proba(X_test) for model in (clfd, clfr, clfg, clfk)
)

In [None]:
# ROC AUC per model, scored on the second probability column ([:, 1]).
auc_score1, auc_score2, auc_score3, auc_score4 = (
    roc_auc_score(y_test, proba[:, 1]) for proba in (p1, p2, p3, p4)
)

print(auc_score1, auc_score2, auc_score3, auc_score4)

In [None]:
# ROC points (fpr, tpr, thresholds) for each model, with label 1 as positive.
(
    (fpr1, tpr1, thresh1),
    (fpr2, tpr2, thresh2),
    (fpr3, tpr3, thresh3),
    (fpr4, tpr4, thresh4),
) = (roc_curve(y_test, proba[:, 1], pos_label=1) for proba in (p1, p2, p3, p4))

# Reference curve (tpr = fpr) obtained from a constant score for every row.
random_probs = [0] * len(y_test)
p_fpr, p_tpr, _ = roc_curve(y_test, random_probs, pos_label=1)

In [None]:
# One dashed ROC curve per model, plus the unlabeled tpr = fpr reference line.
for roc_x, roc_y, model_label in (
    (fpr1, tpr1, 'Decision Tree'),
    (fpr2, tpr2, 'Random Forest'),
    (fpr3, tpr3, 'Gradient Boost'),
    (fpr4, tpr4, 'KNN'),
):
    plt.plot(roc_x, roc_y, linestyle='--', label=model_label)
plt.plot(p_fpr, p_tpr, linestyle='--')

plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.legend(loc='best')

# Conclusion
## Final Steps to Retain Customers
* Optimise price of talk time for segment 1 customers.
* Introduce data plan to those customers who are using data without data plans ASAP.
* Introduce exciting data plans to segment 1 customers.
* If possible optimising price of data plans can retain segment 2 customers also.
---
