In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the Telco customer-churn CSV; the path is relative to the working directory.
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [3]:
# Show the loaded frame as this cell's output.
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [4]:
# Number of columns in the loaded frame.
len(df.columns)

21

In [5]:
# Per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [1]:
def value_count_viz(column):
    """Draw a bar chart of how often each distinct value occurs in ``df[column]``.

    Reads the notebook-level DataFrame ``df``; the frame is not passed in.

    Parameters
    ----------
    column
        Label of the column in ``df`` to summarise.

    Returns
    -------
    None
        The chart is drawn on a new 10x12 inch figure as a side effect.
    """
    # Count once and reuse: bar heights, tick labels and tick positions must
    # all come from the same Series so they stay aligned.
    counts = df[column].value_counts()

    fig, ax = plt.subplots(figsize=(10,12))

    x_ticks = list(range(1, len(counts) + 1))  # bars sit at 1..n
    x_ticklabels = counts.index.tolist()
    y_values = counts.tolist()
    bar_width = 0.8

    ax.bar(x_ticks, y_values, bar_width)

    ax.set_xticks(x_ticks)
    ax.set_xticklabels(x_ticklabels)
    ax.set_xlabel(column,fontsize=15)
    ax.set_ylabel('Value Counts',fontsize=15)
    ax.set_title(f'{column} Value Counts',fontsize=20)

In [3]:
# Plots the value counts of every categorical column from a single call,
# unlike value_count_viz, which has to be called once per column.

def category_counts(df, figsize=(10, 140)):
    """Draw one value-count bar chart per categorical column of ``df``.

    Columns of dtype ``object`` or ``category`` are selected; a ``customerID``
    column, if present among them, is left out. The charts are stacked
    vertically in a single figure, one subplot per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose categorical columns are plotted.
    figsize : tuple, optional
        ``(width, height)`` of the whole figure in inches. The default matches
        the size this function has always used.

    Returns
    -------
    None
        The figure is drawn as a side effect.

    Raises
    ------
    ValueError
        If ``df`` has no categorical columns to plot.
    """
    # errors='ignore': a frame without an object-typed customerID column is
    # still a valid input, so do not fail on the missing label.
    cat_df = df.select_dtypes(include=['object','category']).drop(columns='customerID', errors='ignore')
    n_cat_cols = len(cat_df.columns)

    if n_cat_cols == 0:
        # matplotlib would also raise ValueError for zero rows, but with a
        # message about grid dimensions; say what is actually wrong instead.
        raise ValueError('category_counts: no object/category columns to plot')

    # squeeze=False always returns a 2-D array of Axes. Without it a single
    # categorical column yields a bare Axes object that cannot be indexed.
    fig, axes = plt.subplots(n_cat_cols, figsize=figsize, squeeze=False)

    bar_width = 0.8
    y_axis_label = 'Value Counts'

    for col_index, ax in enumerate(axes[:, 0]):
        # Positional access keeps working even if column labels are duplicated.
        counts = cat_df.iloc[:, col_index].value_counts()

        x_ticks = list(range(1, len(counts) + 1))  # bars sit at 1..n
        x_axis_label = cat_df.columns[col_index]

        ax.bar(x_ticks, counts.tolist(), bar_width, color='mediumpurple')

        ax.set_xticks(x_ticks)
        ax.set_xticklabels(counts.index.tolist(),fontsize=10)
        ax.set_xlabel(x_axis_label,fontsize=12)
        ax.set_ylabel(y_axis_label,fontsize=12)
        ax.set_title(f'{x_axis_label} Value Counts',fontsize=20)

In [4]:
def compare_churn(column):
    """Compare total and churned customer counts for each value of ``column``.

    Displays a table with, per distinct value of ``column``, the total number
    of customers, the number of churned customers and the churned share as a
    percentage string, then draws the two counts as overlaid bars.

    Reads the notebook-level frames ``df`` and ``churn_df``; neither is passed
    in. NOTE(review): ``churn_df`` is not defined in the visible part of the
    notebook - presumably the rows of ``df`` with ``Churn == 'Yes'``; confirm.

    Parameters
    ----------
    column : str
        Name of the column to group by; it must exist in both ``df`` and
        ``churn_df``.

    Returns
    -------
    None
        The table and the figure are produced as side effects.
    """
    df_total = df.groupby(column)['customerID'].count().to_frame('Total Customer Count')
    df_churn = churn_df.groupby(column)['customerID'].count().to_frame('Customers Churned Count')

    # Left join on the group index: a value of `column` that has customers but
    # no churners must stay in the result with a churn count of 0. An inner
    # join would drop that group from both the table and the chart.
    final_df = df_total.join(df_churn, how='left')
    final_df['Customers Churned Count'] = final_df['Customers Churned Count'].fillna(0).astype(int)
    final_df['% of Customers Churned'] = [
        f'{round((a/b)*100,2)}%'
        for a,b in zip(final_df['Customers Churned Count'],final_df['Total Customer Count'])
    ]

    display(final_df)

    fig, ax = plt.subplots(figsize = (20,12))

    all_y_values = final_df['Total Customer Count'].tolist()
    churn_y_values = final_df['Customers Churned Count'].tolist()
    x_labels = list(final_df.index)
    x_ticks = list(range(len(x_labels)))

    # Churned bars are drawn second so they overlay the totals at the same x.
    ax.bar(x_ticks,all_y_values,label="All Customers",color='darkblue',edgecolor='black')
    ax.bar(x_ticks,churn_y_values,label="Churned Customers",color='darkorange',edgecolor='black')

    # Total label sits just above each total bar.
    for a,b in zip(x_ticks,all_y_values):
        ax.text(a,b+20,f'Total Count: {b}',fontsize=15)

    # Churn label is placed at a fixed height (y=100) rather than at the bar top.
    for a,b in zip(x_ticks,churn_y_values):
        ax.text(a,100,f'Churn Count: {b}',fontsize=15,fontweight='extra bold',color='dimgray')

    ax.set_xticks(x_ticks)
    ax.set_xticklabels(x_labels,fontsize=15)
    ax.set_xlabel(column.title(),fontsize=20)
    ax.set_ylabel('Customer Count',fontsize=20)
    ax.set_title(f'Customer Churn comparison: {column.title()}',fontsize=30)
    ax.legend(fontsize=15,loc='upper center')