In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Our Question: Which service has the most churn and what are the trends?

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

# Load the Telco customer-churn CSV into the dataframe used by the rest of the notebook.
csv_path = '../input/../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(csv_path)

In [None]:
# Print the column names, dtypes, non-null counts and memory usage.
# `info` is a method: without the call parentheses the cell would only show
# the bound-method repr instead of the summary.
df.info()

# Exploring the data

In [None]:
# Number of missing (NaN/None) entries in each column.
df.isna().sum()

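`isnull()` finds no missing (NaN) values in any column of the dataframe. Note that this check only detects NaN/None, so placeholder values such as blank strings would not be counted.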

In [None]:
# Preview the first five rows.
df.head(5)

This dataset has the following information:

**Demographics**: Gender, senior citizen, partner, dependents

**Services:** Phone service, multiple lines, internet service, online security, online backup, tech support, streaming TV, streaming movies

**Account info:** Contract, paperless billing, monthly charges, payment method, total charges, tenure

We will focus on the following services as these are likely to be the main services customers signed up for.
1. Phone Service
2. Internet Service
3. Streaming TV
4. Streaming Movies

In [None]:
# Share of customers in each Churn category (fractions that sum to 1).
churn_share = df['Churn'].value_counts(normalize=True)
churn_share

Around 25% of the customers in the dataset have churned.

In [None]:
# Histogram of tenure (in months) for churned customers only, in 5-month bins.
filt = df['Churn'] == 'Yes'

# Derive the bin edges from the data instead of hard-coding them: values that
# fall beyond the last edge are dropped from a histogram, so a fixed
# np.arange(0, 75, 5) (last edge 70) would silently exclude any tenure above 70.
bin_width = 5
# First multiple of bin_width strictly above the largest tenure in the data.
upper_edge = (df['tenure'].max() // bin_width + 1) * bin_width
bins = np.arange(0, upper_edge + bin_width, bin_width)

plt.hist(df.loc[filt, 'tenure'], bins=bins)
plt.xlabel('Tenure (months)')
plt.ylabel('Count of churned customers')
plt.title('Churn based on tenure');

Looking at churn by tenure, we see that the largest number of churned customers falls in the first bin: customers with a tenure of less than 5 months.

Now we will look at the number of services each customer signed up for and whether they churned.

In [None]:
# Count how many of the four main services (Phone, Streaming TV, Streaming
# Movies, Internet) each customer signed up for.

# Values that mean "subscribed": 'Yes' for the yes/no service columns and
# 'DSL' / 'Fiber optic' for InternetService.
SUBSCRIBED_VALUES = ['Yes', 'DSL', 'Fiber optic']
SERVICE_COLUMNS = ['PhoneService', 'StreamingMovies', 'InternetService', 'StreamingTV']


def isY(x):
    """Return 1 if ``x`` is one of the "subscribed" values, otherwise 0."""
    return int(x in SUBSCRIBED_VALUES)


def countFactors(row):
    """Return how many of the four service columns in ``row`` are subscribed."""
    return sum(isY(row[column]) for column in SERVICE_COLUMNS)


# Vectorised equivalent of df.apply(countFactors, axis=1): flag the subscribed
# cells in the four service columns, then add the flags up across each row.
df['Services'] = df[SERVICE_COLUMNS].isin(SUBSCRIBED_VALUES).sum(axis=1)
df[SERVICE_COLUMNS + ['Churn', 'Services']].head(10)

In [None]:
# Churn counts and churn rate by the number of services signed up for.
# Rows: value of 'Services'; columns: 'No' / 'Yes' counts plus the churn rate.
churn_count = (
    df[['Services', 'Churn']]
    .value_counts()
    # A Services/Churn combination that never occurs counts as 0 rather than
    # NaN, so the rate below stays a number for every row.
    .unstack(fill_value=0)
)
churn_count['Churn%'] = (
    churn_count['Yes'] / (churn_count['No'] + churn_count['Yes'])
).round(decimals=4)
churn_count

# Comparing Churn %
* The lowest churn % comes from customers who signed up for 1 service
* The highest churn comes from customers with 2 services (35%)
* At least 30% of customers with 2 or more services churned

In [None]:
# Distribution of monthly charges over all accounts.
# Bin edges run 0, 5, ..., 125 (26 edges, width 5).
bins = np.arange(26) * 5

plt.hist(x=df['MonthlyCharges'], bins=bins)
plt.ylabel('Count of accounts')
plt.xlabel('Monthly Charges')
plt.title('Distribution of Monthly Charges')

The most common monthly charges are 15 - 25 and 70 - 90 per month.

In [None]:
# Distribution of monthly charges restricted to churned customers.
filt = df['Churn'].eq('Yes')
# Bin edges run 0, 5, ..., 140 (29 edges, width 5).
bins = np.arange(29) * 5

churned_charges = df[filt]['MonthlyCharges']
plt.hist(x=churned_charges, bins=bins)
plt.ylabel('Count of accounts')
plt.xlabel('Monthly Charges')
plt.title('Distribution of Monthly Charges for Churned customers')

A large amount of churn comes from customers paying 70 - 105 per month.

# Looking at the churn for each service - Phone, Internet, Streaming

In [None]:
# Keep only customers with phone service and count them, split by Churn.
df_phoneservice = df[df['PhoneService'].eq('Yes')]

v = sns.catplot(data=df_phoneservice, x='PhoneService', hue='Churn', kind='count')
v.set(xlabel=None, title='Churn for Phone Service')

# Write each bar's height just above it (9 points up from the bar top).
ax = v.facet_axis(0, 0)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(bar_height, xy=(bar_center, bar_height),
                xytext=(0, 9), textcoords='offset points',
                ha='center', va='center')

In [None]:
# Keep only customers whose InternetService is DSL or Fiber optic and count
# them per internet type, split by Churn.
df_is = df[df['InternetService'].isin(['DSL', 'Fiber optic'])]

v2 = sns.catplot(data=df_is, x='InternetService', hue='Churn', kind='count')
v2.set(xlabel=None, title='Churn for Internet Service')

# Write each bar's height just above it (9 points up from the bar top).
ax = v2.facet_axis(0, 0)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(bar_height, xy=(bar_center, bar_height),
                xytext=(0, 9), textcoords='offset points',
                ha='center', va='center')

In [None]:
# Keep only Streaming TV subscribers and count them, split by Churn.
dfst = df[df['StreamingTV'].eq('Yes')]

v6 = sns.catplot(data=dfst, x='StreamingTV', hue='Churn', kind='count')
v6.set(xlabel=None, title='Churn for Streaming TV')

# Write each bar's height just above it (9 points up from the bar top).
ax = v6.facet_axis(0, 0)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(bar_height, xy=(bar_center, bar_height),
                xytext=(0, 9), textcoords='offset points',
                ha='center', va='center')

In [None]:
# Keep only Streaming Movies subscribers and count them, split by Churn.
dfsm = df[df['StreamingMovies'] == 'Yes']

v7 = sns.catplot(x='StreamingMovies', kind='count', hue='Churn', data=dfsm)
# Title the chart and drop the x-axis label, matching the Phone Service,
# Internet Service and Streaming TV charts in this section.
v7.set(title='Churn for Streaming Movies', xlabel=None)

# Write each bar's height just above it (9 points up from the bar top).
ax = v7.facet_axis(0, 0)
for p in ax.patches:
    ax.annotate(p.get_height(),
                (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center',
                xytext=(0, 9), textcoords='offset points')

Observations on churn %:
* Churn is around 25% for phone service
* Churn is a lot higher for Fiber optic compared to DSL users
* About 30% of Streaming TV customers churn
* About 30% of Streaming Movies customers churn

# **Churn % for services based on:**
1. Gender
2. Payment method
3. Contract type

## **Streaming Movies**

In [None]:
# Restrict to Streaming Movies subscribers, then count them per gender,
# split by Churn.
df_movies = df[df['StreamingMovies'].eq('Yes')]
plt_movie = sns.catplot(data=df_movies, x='gender', hue='Churn', kind='count')
plt_movie.set(xlabel=None, title='Churn based on Gender (Streaming Movies)')

# Write each bar's height just above it (9 points up from the bar top).
ax = plt_movie.facet_axis(0, 0)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(bar_height, xy=(bar_center, bar_height),
                xytext=(0, 9), textcoords='offset points',
                ha='center', va='center')

In [None]:
# Streaming Movies subscribers counted per payment method, split by Churn.
# aspect=2 makes the facet twice as wide as it is tall.
plt_movie2 = sns.catplot(data=df_movies, x='PaymentMethod', hue='Churn',
                         kind='count', aspect=2)
plt_movie2.set(xlabel=None, title='Churn based on Payment Method (Streaming Movies)')

# Write each bar's height just above it (9 points up from the bar top).
ax = plt_movie2.facet_axis(0, 0)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(bar_height, xy=(bar_center, bar_height),
                xytext=(0, 9), textcoords='offset points',
                ha='center', va='center')

# Observation: churn is high for customers who pay by electronic check;
# mailed check has the next-highest churn %.

In [None]:
# Streaming Movies subscribers counted per contract type, split by Churn.
plt_movie3 = sns.catplot(data=df_movies, x='Contract', hue='Churn', kind='count')
plt_movie3.set(xlabel=None, title='Churn based on Contract type (Streaming Movies)')

# Write each bar's height just above it (9 points up from the bar top).
ax = plt_movie3.facet_axis(0, 0)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(bar_height, xy=(bar_center, bar_height),
                xytext=(0, 9), textcoords='offset points',
                ha='center', va='center')

In [None]:
# Streaming Movies subscribers counted per TechSupport value, split by Churn.
plt_movie4 = sns.catplot(data=df_movies, x='TechSupport', hue='Churn', kind='count')
plt_movie4.set(xlabel=None, title='Churn based on TechSupport (Streaming Movies)')

# Write each bar's height just above it (9 points up from the bar top).
ax = plt_movie4.facet_axis(0, 0)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(bar_height, xy=(bar_center, bar_height),
                xytext=(0, 9), textcoords='offset points',
                ha='center', va='center')

## **Streaming TV**

In [None]:
# Subset of customers whose StreamingTV value is 'Yes'.
is_tv_subscriber = df['StreamingTV'].eq('Yes')
df_tv = df[is_tv_subscriber]

In [None]:
# Streaming TV subscribers counted per gender, split by Churn.
plt_tv = sns.catplot(data=df_tv, x='gender', hue='Churn', kind='count')
plt_tv.set(xlabel=None, title='Churn based on Gender (Streaming TV)')

# Write each bar's height just above it (9 points up from the bar top).
ax = plt_tv.facet_axis(0, 0)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(bar_height, xy=(bar_center, bar_height),
                xytext=(0, 9), textcoords='offset points',
                ha='center', va='center')

In [None]:
# Streaming TV subscribers counted per payment method, split by Churn.
# aspect=2 makes the facet twice as wide as it is tall.
plt_tv2 = sns.catplot(data=df_tv, x='PaymentMethod', hue='Churn',
                      kind='count', aspect=2)
plt_tv2.set(xlabel=None, title='Churn based on Payment Method (Streaming TV)')

# Write each bar's height just above it (9 points up from the bar top).
ax = plt_tv2.facet_axis(0, 0)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(bar_height, xy=(bar_center, bar_height),
                xytext=(0, 9), textcoords='offset points',
                ha='center', va='center')

# Observation: same trend as for movies -- high churn for electronic check,
# followed by mailed check. These customers need to be moved to more
# automatic payment modes.

In [None]:
# Streaming TV subscribers counted per contract type, split by Churn.
plt_tv3 = sns.catplot(data=df_tv, x='Contract', hue='Churn', kind='count')
plt_tv3.set(xlabel=None, title='Churn based on Contract type (Streaming TV)')

# Write each bar's height just above it (9 points up from the bar top).
ax = plt_tv3.facet_axis(0, 0)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(bar_height, xy=(bar_center, bar_height),
                xytext=(0, 9), textcoords='offset points',
                ha='center', va='center')

In [None]:
# Streaming TV subscribers counted per TechSupport value, split by Churn.
plt_tv4 = sns.catplot(data=df_tv, x='TechSupport', hue='Churn', kind='count')
plt_tv4.set(xlabel=None, title='Churn based on Tech Support (Streaming TV)')

# Write each bar's height just above it (9 points up from the bar top).
ax = plt_tv4.facet_axis(0, 0)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(bar_height, xy=(bar_center, bar_height),
                xytext=(0, 9), textcoords='offset points',
                ha='center', va='center')

## **Internet (Fiber & DSL)**

In [None]:
# Split internet customers into one subset per connection type.
internet_type = df['InternetService']
df_internet_FO = df[internet_type.eq('Fiber optic')]
df_internet_DSL = df[internet_type.eq('DSL')]

In [None]:
# Fiber optic customers counted per gender, split by Churn.
plt_internet_FO = sns.catplot(data=df_internet_FO, x='gender', hue='Churn', kind='count')
plt_internet_FO.set(xlabel=None, title='Churn based on Gender (Fiber Optic)')

# Write each bar's height just above it (9 points up from the bar top).
ax = plt_internet_FO.facet_axis(0, 0)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(bar_height, xy=(bar_center, bar_height),
                xytext=(0, 9), textcoords='offset points',
                ha='center', va='center')

In [None]:
# DSL customers counted per gender, split by Churn.
plt_internet_DSL = sns.catplot(data=df_internet_DSL, x='gender', hue='Churn', kind='count')
plt_internet_DSL.set(xlabel=None, title='Churn based on Gender (DSL)')

# Write each bar's height just above it (9 points up from the bar top).
ax = plt_internet_DSL.facet_axis(0, 0)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(bar_height, xy=(bar_center, bar_height),
                xytext=(0, 9), textcoords='offset points',
                ha='center', va='center')

In [None]:
# Fiber optic customers counted per payment method, split by Churn.
# aspect=2 makes the facet twice as wide as it is tall.
plt_internet_FO2 = sns.catplot(data=df_internet_FO, x='PaymentMethod', hue='Churn',
                               kind='count', aspect=2)
plt_internet_FO2.set(xlabel=None, title='Churn based on Payment Method (Fiber)')

# Write each bar's height just above it (9 points up from the bar top).
ax = plt_internet_FO2.facet_axis(0, 0)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(bar_height, xy=(bar_center, bar_height),
                xytext=(0, 9), textcoords='offset points',
                ha='center', va='center')

In [None]:
# DSL customers counted per payment method, split by Churn.
# aspect=2 makes the facet twice as wide as it is tall.
plt_internet_DSL2 = sns.catplot(data=df_internet_DSL, x='PaymentMethod', hue='Churn',
                                kind='count', aspect=2)
plt_internet_DSL2.set(xlabel=None, title='Churn based on Payment Method (DSL)')

# Write each bar's height just above it (9 points up from the bar top).
ax = plt_internet_DSL2.facet_axis(0, 0)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(bar_height, xy=(bar_center, bar_height),
                xytext=(0, 9), textcoords='offset points',
                ha='center', va='center')

In [None]:
# Fiber optic customers counted per contract type, split by Churn.
plt_internet_FO3 = sns.catplot(data=df_internet_FO, x='Contract', hue='Churn', kind='count')
plt_internet_FO3.set(xlabel=None, title='Churn based on Contract Type(Fiber)')

# Write each bar's height just above it (9 points up from the bar top).
ax = plt_internet_FO3.facet_axis(0, 0)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(bar_height, xy=(bar_center, bar_height),
                xytext=(0, 9), textcoords='offset points',
                ha='center', va='center')

In [None]:
# DSL customers counted per contract type, split by Churn.
plt_internet_DSL3 = sns.catplot(data=df_internet_DSL, x='Contract', hue='Churn', kind='count')
plt_internet_DSL3.set(xlabel=None, title='Churn based on Contract Type (DSL)')

# Write each bar's height just above it (9 points up from the bar top).
ax = plt_internet_DSL3.facet_axis(0, 0)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(bar_height, xy=(bar_center, bar_height),
                xytext=(0, 9), textcoords='offset points',
                ha='center', va='center')

In [None]:
# Fiber optic customers counted per TechSupport value, split by Churn.
plt_internet_FO4 = sns.catplot(data=df_internet_FO, x='TechSupport', hue='Churn', kind='count')
plt_internet_FO4.set(xlabel=None, title='Churn based on Tech Support (Fiber Optic)')

# Write each bar's height just above it (9 points up from the bar top).
ax = plt_internet_FO4.facet_axis(0, 0)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(bar_height, xy=(bar_center, bar_height),
                xytext=(0, 9), textcoords='offset points',
                ha='center', va='center')

In [None]:
# DSL customers counted per TechSupport value, split by Churn.
plt_internet_DSL4 = sns.catplot(data=df_internet_DSL, x='TechSupport', hue='Churn', kind='count')
plt_internet_DSL4.set(xlabel=None, title='Churn based on Tech Support (DSL)')

# Write each bar's height just above it (9 points up from the bar top).
ax = plt_internet_DSL4.facet_axis(0, 0)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(bar_height, xy=(bar_center, bar_height),
                xytext=(0, 9), textcoords='offset points',
                ha='center', va='center')

## **Phone Service**

In [None]:
# Subset of customers whose PhoneService value is 'Yes'.
has_phone_service = df['PhoneService'].eq('Yes')
df_phone = df[has_phone_service]

In [None]:
# Phone service customers counted per gender, split by Churn.
plt_phone = sns.catplot(data=df_phone, x='gender', hue='Churn', kind='count')
plt_phone.set(xlabel=None, title='Churn based on Gender (Phone)')

# Write each bar's height just above it (9 points up from the bar top).
ax = plt_phone.facet_axis(0, 0)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(bar_height, xy=(bar_center, bar_height),
                xytext=(0, 9), textcoords='offset points',
                ha='center', va='center')

In [None]:
# Phone service customers counted per payment method, split by Churn.
# aspect=2 makes the facet twice as wide as it is tall.
plt_phone2 = sns.catplot(data=df_phone, x='PaymentMethod', hue='Churn',
                         kind='count', aspect=2)
plt_phone2.set(xlabel=None, title='Churn based on Payment Method (Phone)')

# Write each bar's height just above it (9 points up from the bar top).
ax = plt_phone2.facet_axis(0, 0)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(bar_height, xy=(bar_center, bar_height),
                xytext=(0, 9), textcoords='offset points',
                ha='center', va='center')

In [None]:
# Phone service customers counted per contract type, split by Churn.
plt_phone3 = sns.catplot(data=df_phone, x='Contract', hue='Churn', kind='count')
plt_phone3.set(xlabel=None, title='Churn based on Contract type (Phone)')

# Write each bar's height just above it (9 points up from the bar top).
ax = plt_phone3.facet_axis(0, 0)
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(bar_height, xy=(bar_center, bar_height),
                xytext=(0, 9), textcoords='offset points',
                ha='center', va='center')

Overall trends observed across the 4 services

1. Gender does not impact churn rates
2. Automatic payment methods have low churn rates. Manual payment methods see a much higher churn %, with electronic check having the highest churn %
3. Customers that also signed up for tech support have a lower churn compared to customers without tech support
4. By contract type, churn is most common among month-to-month customers