# Importing Libraries:

In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt 
import matplotlib.dates as md
%matplotlib inline
import seaborn as sns
import statsmodels.api as sm    

# Setting the Dataset Path:

In [None]:
# Optional local override of the working directory.
# The original cell unconditionally ran `%cd` into an absolute path that only
# exists on the author's machine; on any other machine the change of directory
# fails. Only switch directories when that folder is actually present, so the
# relative path used by the loading cell keeps working elsewhere.
import os
from pathlib import Path

local_project_dir = Path(r"C:\Musfique\Springboard Data Analytics CT\Capstone 2\Telco Customer Churn")
if local_project_dir.is_dir():
    os.chdir(local_project_dir)
print(Path.cwd())

# Reading the Dataset in the Notebook:

In [None]:
# Load the raw Telco churn CSV; the path is resolved relative to the current
# working directory.
csv_path = '../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'
telco_df = pd.read_csv(csv_path)
telco_df.head()

# Exploring & Cleaning the Dataset:

In [None]:
# Show the dtype pandas inferred for every column, to spot columns parsed with an unexpected type.
telco_df.dtypes

* The 'TotalCharges' column is supposed to be numeric.

# Converting 'TotalCharges' to a Numeric Data Type:

In [None]:
# Coerce TotalCharges to a numeric dtype; values that cannot be parsed become NaN
# (errors='coerce'). The second line displays the resulting dtype.
telco_df['TotalCharges'] = pd.to_numeric(telco_df['TotalCharges'], errors='coerce')
telco_df['TotalCharges'].dtype

# Checking Missing Values in the Dataset:

In [None]:
# Number of missing values per column.
telco_df.isna().sum()

# Removing Rows having Missing Values from the Dataset:

In [None]:
# Discard every row that contains at least one missing value, then re-check
# that no nulls remain in any column.
telco_df = telco_df.dropna()
telco_df.isna().sum()

# Removing 'customerID' from the Dataset

In [None]:
# Build the working frame without the 'customerID' column.
df_cleaned = telco_df.drop(columns=['customerID'])
df_cleaned.head()

# Converting All Strings to Lowercase:

In [None]:
# Lowercase the values of every string column in df_cleaned.
# Columns that do not hold strings have no `.str` accessor and raise
# AttributeError; those are reported and left untouched. Only that error is
# caught, so unrelated failures (typos, KeyboardInterrupt, ...) are no longer
# silently swallowed by a bare `except`.
for column in df_cleaned.columns:
    try:
        df_cleaned[column] = df_cleaned[column].str.lower()
    except AttributeError:
        print(column, "couldn't convert")
df_cleaned.head(20)

# Converting All 'yes/no' Variables to '1/0':

In [None]:
# Encode the binary 'yes'/'no' columns as integers 1/0.
columns_to_convert = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']
yes_no_codes = {'yes': 1, 'no': 0}
for column in columns_to_convert:
    # Assign the converted Series back to the frame. The previous
    # `df_cleaned[col].replace(..., inplace=True)` form is chained assignment:
    # it is deprecated and does not modify df_cleaned under pandas Copy-on-Write.
    # Values other than 'yes'/'no' pass through the mapping unchanged, which
    # keeps the cell safe to re-run; the explicit int64 cast guarantees a
    # numeric column and raises if an unexpected non-numeric value is present.
    df_cleaned[column] = (
        df_cleaned[column]
        .map(lambda value: yes_no_codes.get(value, value))
        .astype('int64')
    )
df_cleaned.head()

# Exploratory Data Analysis (EDA)

# Correlation Heatmap:

In [None]:
# Pairwise correlations between the numeric columns only. df_cleaned still
# contains string-valued categorical columns, and DataFrame.corr() raises on
# those in pandas >= 2.0, so they are excluded explicitly.
corr_mat = df_cleaned.select_dtypes(include='number').corr()
plt.figure(figsize=(16, 6))
# Fix the colour scale to the full [-1, 1] correlation range and print each value.
heat_map = sns.heatmap(corr_mat, vmin=-1, vmax=1, annot=True)
heat_map.set_title('Correlation Heatmap');

Insights: Churn clearly shows some correlation with all variables shown on the heatmap except phone service.

# Analyzing Binary Categorical Variables:

In [None]:
# Count plot of SeniorCitizen, drawn twice: overall, then split by Churn.
# Every bar is annotated with its share of ALL rows in df_cleaned (this is not
# the within-group churn rate).
column = 'SeniorCitizen'
total = len(df_cleaned[column])
for hue in (None, 'Churn'):
    # Open the figure before drawing. Calling plt.figure() after countplot, as
    # the cell did previously, created an additional empty figure per chart.
    plt.figure()
    ax = sns.countplot(x=column, data=df_cleaned, hue=hue)
    for bar in ax.patches:
        percentage = '{:.1f}%'.format(100 * bar.get_height() / total)
        # Place the label slightly left of the bar centre, just above its top.
        label_x = bar.get_x() + bar.get_width() / 2.5
        label_y = bar.get_y() + bar.get_height() + 1
        ax.annotate(percentage, (label_x, label_y))
    plt.show()

In [None]:
# Count plot of Partner, drawn twice: overall, then split by Churn.
# Every bar is annotated with its share of ALL rows in df_cleaned (this is not
# the within-group churn rate).
column = 'Partner'
total = len(df_cleaned[column])
for hue in (None, 'Churn'):
    # Open the figure before drawing. Calling plt.figure() after countplot, as
    # the cell did previously, created an additional empty figure per chart.
    plt.figure()
    ax = sns.countplot(x=column, data=df_cleaned, hue=hue)
    for bar in ax.patches:
        percentage = '{:.1f}%'.format(100 * bar.get_height() / total)
        # Place the label slightly left of the bar centre, just above its top.
        label_x = bar.get_x() + bar.get_width() / 2.5
        label_y = bar.get_y() + bar.get_height() + 1
        ax.annotate(percentage, (label_x, label_y))
    plt.show()

In [None]:
# Count plot of Dependents, drawn twice: overall, then split by Churn.
# Every bar is annotated with its share of ALL rows in df_cleaned (this is not
# the within-group churn rate).
column = 'Dependents'
total = len(df_cleaned[column])
for hue in (None, 'Churn'):
    # Open the figure before drawing. Calling plt.figure() after countplot, as
    # the cell did previously, created an additional empty figure per chart.
    plt.figure()
    ax = sns.countplot(x=column, data=df_cleaned, hue=hue)
    for bar in ax.patches:
        percentage = '{:.1f}%'.format(100 * bar.get_height() / total)
        # Place the label slightly left of the bar centre, just above its top.
        label_x = bar.get_x() + bar.get_width() / 2.5
        label_y = bar.get_y() + bar.get_height() + 1
        ax.annotate(percentage, (label_x, label_y))
    plt.show()

In [None]:
# Count plot of PhoneService, drawn twice: overall, then split by Churn.
# Every bar is annotated with its share of ALL rows in df_cleaned (this is not
# the within-group churn rate).
column = 'PhoneService'
total = len(df_cleaned[column])
for hue in (None, 'Churn'):
    # Open the figure before drawing. Calling plt.figure() after countplot, as
    # the cell did previously, created an additional empty figure per chart.
    plt.figure()
    ax = sns.countplot(x=column, data=df_cleaned, hue=hue)
    for bar in ax.patches:
        percentage = '{:.1f}%'.format(100 * bar.get_height() / total)
        # Place the label slightly left of the bar centre, just above its top.
        label_x = bar.get_x() + bar.get_width() / 2.5
        label_y = bar.get_y() + bar.get_height() + 1
        ax.annotate(percentage, (label_x, label_y))
    plt.show()

In [None]:
# Count plot of PaperlessBilling, drawn twice: overall, then split by Churn.
# Every bar is annotated with its share of ALL rows in df_cleaned (this is not
# the within-group churn rate).
column = 'PaperlessBilling'
total = len(df_cleaned[column])
for hue in (None, 'Churn'):
    # Open the figure before drawing. Calling plt.figure() after countplot, as
    # the cell did previously, created an additional empty figure per chart.
    plt.figure()
    ax = sns.countplot(x=column, data=df_cleaned, hue=hue)
    for bar in ax.patches:
        percentage = '{:.1f}%'.format(100 * bar.get_height() / total)
        # Place the label slightly left of the bar centre, just above its top.
        label_x = bar.get_x() + bar.get_width() / 2.5
        label_y = bar.get_y() + bar.get_height() + 1
        ax.annotate(percentage, (label_x, label_y))
    plt.show()

In [None]:
# Count plot of gender, drawn twice: overall, then split by Churn.
# Every bar is annotated with its share of ALL rows in df_cleaned (this is not
# the within-group churn rate).
column = 'gender'
total = len(df_cleaned[column])
for hue in (None, 'Churn'):
    # Open the figure before drawing. Calling plt.figure() after countplot, as
    # the cell did previously, created an additional empty figure per chart.
    plt.figure()
    ax = sns.countplot(x=column, data=df_cleaned, hue=hue)
    for bar in ax.patches:
        percentage = '{:.1f}%'.format(100 * bar.get_height() / total)
        # Place the label slightly left of the bar centre, just above its top.
        label_x = bar.get_x() + bar.get_width() / 2.5
        label_y = bar.get_y() + bar.get_height() + 1
        ax.annotate(percentage, (label_x, label_y))
    plt.show()

Insights:

1. 16.2% of the customers are Senior Citizens. Churn is significantly higher in Senior Citizens (42%) compared to customers who aren't Senior Citizens (23.6%).

2. 48.3% of the customers have Partners. Churn is significantly higher in customers that don't have partners (33.1%) compared to those having Partners (19.7%).

3. 29.8% of customers have Dependents. Churn is significantly lower in customers having Dependents (15.4%) compared to those not having Dependents (31.2%).

4. 59.3% of the customers have Paperless Billing. Churn is significantly higher in customers having paperless billing (33.6%) compared to those not having Paperless Billing (16.5%).

5. 90.3% of the customers have Phone Service. Churn is not much different irrespective of customers enjoying a Phone Service (26.8%) or not (24.7%).

6. Male - Female ratio is 50.5% vs 49.5%. Churn is almost unbiased in terms of gender.

# Analyzing Other Categorical Variables:

In [None]:
# PaymentMethod counts: one chart for all customers, one broken down by Churn.
# Each bar is labelled with its percentage of all rows in df_cleaned.
n_rows = len(df_cleaned['PaymentMethod'])
for churn_hue in (None, 'Churn'):
    plt.figure(figsize=(9, 4))
    ax = sns.countplot(x='PaymentMethod', data=df_cleaned, hue=churn_hue)
    for bar in ax.patches:
        share = '{:.1f}%'.format(100 * bar.get_height() / n_rows)
        anchor = (bar.get_x() + bar.get_width() / 2.5,
                  bar.get_y() + bar.get_height() + 1)
        ax.annotate(share, anchor)
    plt.show()

In [None]:
# Contract counts: one chart for all customers, one broken down by Churn.
# Each bar is labelled with its percentage of all rows in df_cleaned.
n_rows = len(df_cleaned['Contract'])
for churn_hue in (None, 'Churn'):
    plt.figure(figsize=(7, 4))
    ax = sns.countplot(x='Contract', data=df_cleaned, hue=churn_hue)
    for bar in ax.patches:
        share = '{:.1f}%'.format(100 * bar.get_height() / n_rows)
        anchor = (bar.get_x() + bar.get_width() / 2.5,
                  bar.get_y() + bar.get_height() + 1)
        ax.annotate(share, anchor)
    plt.show()

In [None]:
# MultipleLines counts: one chart for all customers, one broken down by Churn.
# Each bar is labelled with its percentage of all rows in df_cleaned.
n_rows = len(df_cleaned['MultipleLines'])
for churn_hue in (None, 'Churn'):
    plt.figure(figsize=(7, 4))
    ax = sns.countplot(x='MultipleLines', data=df_cleaned, hue=churn_hue)
    for bar in ax.patches:
        share = '{:.1f}%'.format(100 * bar.get_height() / n_rows)
        anchor = (bar.get_x() + bar.get_width() / 2.5,
                  bar.get_y() + bar.get_height() + 1)
        ax.annotate(share, anchor)
    plt.show()

In [None]:
# InternetService counts: one chart for all customers, one broken down by Churn.
# Each bar is labelled with its percentage of all rows in df_cleaned.
n_rows = len(df_cleaned['InternetService'])
for churn_hue in (None, 'Churn'):
    plt.figure(figsize=(7, 4))
    ax = sns.countplot(x='InternetService', data=df_cleaned, hue=churn_hue)
    for bar in ax.patches:
        share = '{:.1f}%'.format(100 * bar.get_height() / n_rows)
        anchor = (bar.get_x() + bar.get_width() / 2.5,
                  bar.get_y() + bar.get_height() + 1)
        ax.annotate(share, anchor)
    plt.show()

In [None]:
# OnlineSecurity counts: one chart for all customers, one broken down by Churn.
# Each bar is labelled with its percentage of all rows in df_cleaned.
n_rows = len(df_cleaned['OnlineSecurity'])
for churn_hue in (None, 'Churn'):
    plt.figure(figsize=(7, 4))
    ax = sns.countplot(x='OnlineSecurity', data=df_cleaned, hue=churn_hue)
    for bar in ax.patches:
        share = '{:.1f}%'.format(100 * bar.get_height() / n_rows)
        anchor = (bar.get_x() + bar.get_width() / 2.5,
                  bar.get_y() + bar.get_height() + 1)
        ax.annotate(share, anchor)
    plt.show()

In [None]:
# OnlineBackup counts: one chart for all customers, one broken down by Churn.
# Each bar is labelled with its percentage of all rows in df_cleaned.
n_rows = len(df_cleaned['OnlineBackup'])
for churn_hue in (None, 'Churn'):
    plt.figure(figsize=(7, 4))
    ax = sns.countplot(x='OnlineBackup', data=df_cleaned, hue=churn_hue)
    for bar in ax.patches:
        share = '{:.1f}%'.format(100 * bar.get_height() / n_rows)
        anchor = (bar.get_x() + bar.get_width() / 2.5,
                  bar.get_y() + bar.get_height() + 1)
        ax.annotate(share, anchor)
    plt.show()

In [None]:
# DeviceProtection counts: one chart for all customers, one broken down by Churn.
# Each bar is labelled with its percentage of all rows in df_cleaned.
n_rows = len(df_cleaned['DeviceProtection'])
for churn_hue in (None, 'Churn'):
    plt.figure(figsize=(7, 4))
    ax = sns.countplot(x='DeviceProtection', data=df_cleaned, hue=churn_hue)
    for bar in ax.patches:
        share = '{:.1f}%'.format(100 * bar.get_height() / n_rows)
        anchor = (bar.get_x() + bar.get_width() / 2.5,
                  bar.get_y() + bar.get_height() + 1)
        ax.annotate(share, anchor)
    plt.show()

In [None]:
# TechSupport counts: one chart for all customers, one broken down by Churn.
# Each bar is labelled with its percentage of all rows in df_cleaned.
n_rows = len(df_cleaned['TechSupport'])
for churn_hue in (None, 'Churn'):
    plt.figure(figsize=(7, 4))
    ax = sns.countplot(x='TechSupport', data=df_cleaned, hue=churn_hue)
    for bar in ax.patches:
        share = '{:.1f}%'.format(100 * bar.get_height() / n_rows)
        anchor = (bar.get_x() + bar.get_width() / 2.5,
                  bar.get_y() + bar.get_height() + 1)
        ax.annotate(share, anchor)
    plt.show()

In [None]:
# StreamingTV counts: one chart for all customers, one broken down by Churn.
# Each bar is labelled with its percentage of all rows in df_cleaned.
n_rows = len(df_cleaned['StreamingTV'])
for churn_hue in (None, 'Churn'):
    plt.figure(figsize=(7, 4))
    ax = sns.countplot(x='StreamingTV', data=df_cleaned, hue=churn_hue)
    for bar in ax.patches:
        share = '{:.1f}%'.format(100 * bar.get_height() / n_rows)
        anchor = (bar.get_x() + bar.get_width() / 2.5,
                  bar.get_y() + bar.get_height() + 1)
        ax.annotate(share, anchor)
    plt.show()

In [None]:
# StreamingMovies counts: one chart for all customers, one broken down by Churn.
# Each bar is labelled with its percentage of all rows in df_cleaned.
n_rows = len(df_cleaned['StreamingMovies'])
for churn_hue in (None, 'Churn'):
    plt.figure(figsize=(7, 4))
    ax = sns.countplot(x='StreamingMovies', data=df_cleaned, hue=churn_hue)
    for bar in ax.patches:
        share = '{:.1f}%'.format(100 * bar.get_height() / n_rows)
        anchor = (bar.get_x() + bar.get_width() / 2.5,
                  bar.get_y() + bar.get_height() + 1)
        ax.annotate(share, anchor)
    plt.show()

Insights:

1. 33.6% of the customers pay through Electronic Checks. Churn is significantly higher in customers paying through Electronic Checks (45.2%) compared to those using other Payment Methods (15-19%).

2. 55.1% of the customers have Month-to-Month Contract. Churn is significantly higher in customers having a month-to-month contract (42.6%) compared to those having other Contracts (3-11%).

3. 9.7% of the customers don't have Phone Service. Of the 90.3% having Phone Service, 46.7% have Multiple Lines. Churn is observed to be a bit lower in customers not having Multiple Lines (25.1%) compared to those having Multiple Lines (28.7%).

4. 78.4% of the customers have Internet Service. A higher proportion of the customers who subscribed to Internet Service through Fiber Optic (41.8%) have churned compared to those who subscribed to the service through DSL (18.9%).

5. Churn is significantly higher in customers who haven't subscribed to these services - Online Security, Online Backup, Device Protection and Tech Support (41.9%, 39.9%, 39.1% and 41.7% respectively) - compared to those who subscribed to these services (14.6%, 21.4%, 22.7% and 15.2% respectively).

6. Churn is also slightly higher (less than 3%) in customers who haven't subscribed to Streaming TV or Streaming Movies.

# Analyzing Continuous Variables:

Plotting Distributions:

In [None]:
# Step histogram of MonthlyCharges, coloured by Churn.
sns.displot(data=df_cleaned, x='MonthlyCharges', hue='Churn', element='step')

In [None]:
# Step histogram of TotalCharges, coloured by Churn.
sns.displot(data=df_cleaned, x='TotalCharges', hue='Churn', element='step')

In [None]:
# Step histogram of tenure, coloured by Churn.
sns.displot(data=df_cleaned, x='tenure', hue='Churn', element='step')

Pair Plots:

In [None]:
# Pairwise scatter/distribution grid of the three continuous columns, coloured by Churn.
continuous_columns = ['tenure', 'MonthlyCharges', 'TotalCharges']
sns.pairplot(data=df_cleaned, vars=continuous_columns, hue="Churn")

Insights: 

Both the distributions and the pair plot confirm the following:

1. The lower the total charges and tenure, the higher the churn.

2. Churn, conversely, is higher for the higher bands of monthly charges.

# Testing Logistic Regression as a Predictive Model:

Step 1: Get data

In [None]:
# Modelling frame: df_cleaned without the 'gender' column.
# `columns=` replaces the positional axis argument (`drop('gender', 1)`), which
# was deprecated in pandas 1.3 and raises TypeError in pandas >= 2.0.
df_lr = df_cleaned.drop(columns='gender')
df_lr.head()

- Since the gender is seen to remain unbiased towards churn, gender has been dropped.

Let's convert all the categorical variables into dummy variables

In [None]:
# One-hot encode the remaining categorical columns of df_lr.
df_dummies = pd.get_dummies(data=df_lr)
df_dummies.head()

In [None]:
# Feature matrix (every column except 'Churn') and target vector.
# `columns=` replaces the positional axis argument, which raises TypeError in
# pandas >= 2.0.
# NOTE(review): lowercase `x` looks redundant with the uppercase `X` defined in
# the following cell, which is the name the model cells use - confirm and remove.
x = df_dummies.drop(columns='Churn')
y = df_dummies['Churn']

In [None]:
# Feature matrix X (every column except 'Churn') and target vector y.
# `columns=` replaces the positional axis argument (`drop('Churn', 1)`), which
# was deprecated in pandas 1.3 and raises TypeError in pandas >= 2.0.
X = df_dummies.drop(columns='Churn')
y = df_dummies['Churn']

In [None]:
# Hold out 25% of the rows as a test set; the fixed random_state makes the
# split reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

In [None]:
# import the class
from sklearn.linear_model import LogisticRegression

In [None]:
# Instantiate the logistic regression classifier with scikit-learn's default parameters.
# NOTE(review): the features are not scaled before fitting; with the default
# iteration limit the solver may emit a ConvergenceWarning - confirm, and
# consider scaling the inputs or raising max_iter.
logreg = LogisticRegression()

In [None]:
# Fit the classifier on the training split (X_train features, y_train churn labels).
logreg.fit(X_train,y_train)

In [None]:
# Predict class labels for the held-out test features.
y_pred=logreg.predict(X_test)

In [None]:
# Confusion matrix comparing the true test labels with the predicted ones.
from sklearn import metrics

cnf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cnf_matrix

In [None]:
# Draw the confusion matrix as an annotated heatmap.
class_names = [0, 1]  # churn labels: 0 = 'no', 1 = 'yes'
fig, ax = plt.subplots()
# Label rows/columns through the DataFrame and draw on the explicit axes.
# The earlier plt.xticks/plt.yticks calls ran before sns.heatmap and were
# overwritten by it, so the class names are attached to the data instead.
sns.heatmap(
    pd.DataFrame(cnf_matrix, index=class_names, columns=class_names),
    annot=True,
    cmap="YlGnBu",
    fmt='g',
    ax=ax,
)
ax.xaxis.set_label_position("top")
ax.set_title('Confusion matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
# Run the layout pass last so the title and axis labels are taken into account.
fig.tight_layout();

In [None]:
# Report accuracy, precision and recall of the test-set predictions.
scorers = (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
)
for label, scorer in scorers:
    print(label, scorer(y_test, y_pred))

In [None]:
# ROC curve and AUC, computed from the predicted probability in column 1 of
# predict_proba's output.
y_pred_proba = logreg.predict_proba(X_test)[:, 1]
auc = metrics.roc_auc_score(y_test, y_pred_proba)
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
plt.plot(fpr, tpr, label=f"data 1, auc={auc!s}")
plt.legend(loc='lower right')
plt.show()

Insights:

The accuracy of predicting churn using logistic regression looks quite high (80.77%).