In [None]:
import numpy as np # linear algebra
from scipy import stats # statistic library
import pandas as pd # To table manipulations
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the Telco customer-churn CSV into the working DataFrame `df`.
df = pd.read_csv(
    filepath_or_buffer="../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv",
)

![](http://www.google.com/url?sa=i&url=https%3A%2F%2Fwww.quantiphi.com%2Fportfolio-posts%2Fcustomer-churn-analytics%2F&psig=AOvVaw13mN5rNbiZ_LtInd5xgyO7&ust=1596297447169000&source=images&cd=vfe&ved=0CAIQjRxqFwoTCMiOo8zt9-oCFQAAAAAdAAAAABAV)
# Data Overview

In [None]:
# Quick structural overview: size, column names, total NA count and
# per-column cardinality.
n_rows, n_cols = df.shape
print("Rows     : ", n_rows)
print("Columns  : ", n_cols)
print("\nFeatures : \n", list(df.columns))
# Sum the per-column NA counts into one overall total.
print("\nMissing values :  ", df.isnull().sum().sum())
print("\nUnique values :  \n", df.nunique())

In [None]:
# Check for NA values in the dataset

In [None]:
# Convert TotalCharges to a numeric dtype; with errors='coerce', entries that
# cannot be parsed as numbers become NaN instead of raising.
# NOTE(review): presumably the column is read as strings — confirm with df.dtypes.
total_charges_numeric = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges_numeric

# Per-column count of missing values after the conversion.
df.isna().sum()

In [None]:
# Remove NA values: drop every row that has at least one missing entry,
# then display the per-column NA counts to confirm none remain.
df = df.dropna(axis=0, how='any')
df.isna().sum()

In [None]:
# Build a separate copy of the data without customerID, MonthlyCharges,
# TotalCharges and tenure; `df` itself is left untouched.
columns_to_exclude = ['customerID', 'MonthlyCharges', 'TotalCharges', 'tenure']
df_copy = df.copy().drop(columns=columns_to_exclude)
df_copy.head()

In [None]:
# One crosstab (feature value x Churn) per column except the last one,
# stacked into a single table keyed by the feature name.
feature_columns = df_copy.columns[:-1]
per_feature_tables = [
    pd.crosstab(df_copy[column], df_copy['Churn'])
    for column in feature_columns
]
summary = pd.concat(per_feature_tables, keys=feature_columns)
summary

In [None]:
# Share of churned customers (in percent) within each feature value.
# Looking at the percentage churn we can decide where to focus in order to
# prevent or reduce the churn of customers.
churned = summary['Yes']
group_total = summary['No'] + summary['Yes']
summary['Churn_Percentage'] = churned * 100 / group_total
summary

# Exploratory Data Analysis

In [None]:
sns.set(style="ticks", color_codes=True)

# Categorical columns to plot, listed in row-major order of the 3x5 grid.
categorical_features = [
    "gender", "Partner", "Dependents", "PhoneService", "MultipleLines",
    "InternetService", "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport",
    "StreamingTV", "StreamingMovies", "Contract", "PaperlessBilling", "PaymentMethod",
]

fig, axes = plt.subplots(nrows=3, ncols=5, figsize=(25, 15))
for feature, panel in zip(categorical_features, axes.flat):
    sns.countplot(x=feature, data=df, ax=panel)

# Rotate the tick labels of the last panel (PaymentMethod) by 90 degrees,
# as the original did, without re-setting the label texts.
axes[2][4].tick_params(axis="x", labelrotation=90)

# plt.show() displays all open figures; it does not take a figure argument
# (its only parameter, `block`, is keyword-only).
plt.show()

There are only three numerical columns: tenure, monthly charges and total charges.
The probability density distribution of each can be estimated using the seaborn `kdeplot` function.
From the plots below we can conclude that:
- Recent clients are more likely to churn
- Clients with higher MonthlyCharges are also more likely to churn
- Tenure and MonthlyCharges are probably important features

In [None]:

def kdeplot(feature):
    """Plot the kernel density estimate of a column, split by churn status.

    Opens a new 9x4 figure and draws one KDE curve for the rows of the
    notebook-level DataFrame ``df`` where ``Churn == 'No'`` (navy) and one
    for the rows where ``Churn == 'Yes'`` (orange).

    Parameters
    ----------
    feature : str
        Name of the column of ``df`` whose distribution is plotted.

    Returns
    -------
    None
    """
    plt.figure(figsize=(9, 4))
    plt.title("KDE for {}".format(feature))
    for churn_value, color in (('No', 'navy'), ('Yes', 'orange')):
        values = df.loc[df['Churn'] == churn_value, feature].dropna()
        sns.kdeplot(values, color=color, label='Churn: {}'.format(churn_value))
    # Recent seaborn releases do not draw a legend from `label` alone, so
    # request it explicitly; otherwise the two curves cannot be told apart.
    plt.legend()
# Draw one KDE figure per numerical column.
for numeric_feature in ('tenure', 'MonthlyCharges', 'TotalCharges'):
    kdeplot(numeric_feature)

In [None]:
# Horizontal count plot of the Churn column (one bar per Churn value).
analysis1 = sns.catplot(
    data=df,
    y="Churn",
    kind="count",
    orient='h',
    height=2.6,
    aspect=2.5,
)

In [None]:
# Count of customers per category, split by Churn, for six columns laid out
# on a 3x2 grid in row-major order.
customer_features = [
    'gender', 'SeniorCitizen',
    'Partner', 'Dependents',
    'PhoneService', 'PaperlessBilling',
]

fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(15, 15))
for feature, panel in zip(customer_features, axes.flat):
    # Pass the column as `x=`: recent seaborn releases accept only `data`
    # positionally, so sns.countplot('gender', data=df) raises a TypeError.
    sns.countplot(x=feature, data=df, hue='Churn', ax=panel)
plt.show()

In [None]:
# Overlaid histograms of each numerical column for churned vs. retained
# customers. Each entry is (column name, x-axis label).
numeric_panels = [
    ('tenure', 'Number of months with company'),
    ('MonthlyCharges', 'Monthly Payment'),
    ('TotalCharges', 'Total Payment'),
]

fig = plt.figure(figsize=(14, 14))
for position, (column, x_label) in enumerate(numeric_panels, start=1):
    # Same 3x2 grid positions (1, 2, 3) as before, so the layout is unchanged.
    panel = fig.add_subplot(3, 2, position)
    for churn_value in ('No', 'Yes'):
        df.loc[df['Churn'] == churn_value, column].hist(
            bins=35, alpha=0.6, label='Churn={}'.format(churn_value), ax=panel
        )
    panel.set_xlabel(x_label)
    panel.set_ylabel('Number of customers')
    panel.legend()
plt.show()

In [None]:
# Count of customers per category, split by Churn, for six service-related
# columns laid out on a 3x2 grid in row-major order.
service_features = [
    'OnlineBackup', 'OnlineSecurity',
    'InternetService', 'MultipleLines',
    'DeviceProtection', 'TechSupport',
]

fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(15, 15))
for feature, panel in zip(service_features, axes.flat):
    # Pass the column as `x=`: recent seaborn releases accept only `data`
    # positionally, so sns.countplot('OnlineBackup', data=df) raises a TypeError.
    sns.countplot(x=feature, data=df, hue='Churn', ax=panel)
plt.show()

In [None]:
# Count of customers per category, split by Churn, for the streaming,
# payment-method and contract columns.
# Each entry is (column name, x tick label rotation in degrees).
contract_panels = [
    ('StreamingMovies', 0),
    ('StreamingTV', 0),
    ('PaymentMethod', 45),
    ('Contract', 45),
]

# Four panels fit a 2x2 grid; the height is scaled from the former 3-row
# figure (18 * 2 / 3) so each panel keeps its size and no row is left empty.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 12))
for (feature, rotation), panel in zip(contract_panels, axes.flat):
    # Pass the column as `x=`: recent seaborn releases accept only `data`
    # positionally, so sns.countplot('Contract', data=df) raises a TypeError.
    sns.countplot(x=feature, data=df, hue='Churn', ax=panel)
    if rotation:
        # Rotate tick labels without re-setting the label texts.
        panel.tick_params(axis='x', labelrotation=rotation)
fig.tight_layout()
plt.show()