In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
pd.options.display.max_columns = 50
from plotly.offline import iplot
from plotly.subplots import make_subplots

from scipy.stats import chi2_contingency

In [4]:
def load_data(path):
    """Read the churn CSV at ``path`` and return a cleaned DataFrame.

    Processing order:

    1. ``TotalCharges`` is converted with ``pd.to_numeric(errors='coerce')``,
       so values that cannot be parsed become NaN.
    2. The column labels are overwritten *positionally* with the 21 names
       below (the file must therefore have exactly 21 columns in this order).
    3. ``customerID`` is dropped.
    4. Every remaining object-dtype column is title-cased via ``.str.title()``.
    5. Rows containing any NaN are dropped; the original index labels are kept.
    """
    column_names = ['customerID', 'Gender', 'SeniorCitizen', 'Partner', 'Dependents',
                    'Tenure', 'PhoneService', 'MultipleLines', 'InternetService',
                    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
                    'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
                    'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']

    raw = pd.read_csv(path)
    # Must happen before relabelling: the lookup uses the name found in the file.
    raw['TotalCharges'] = pd.to_numeric(raw['TotalCharges'], errors='coerce')
    raw.columns = column_names

    frame = raw.drop(columns=['customerID'])
    text_columns = frame.select_dtypes('object').columns.to_list()
    for name in text_columns:
        frame[name] = frame[name].str.title()

    return frame.dropna()

In [5]:
df = load_data('data/WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [75]:
# Inputs for the donut charts: the column to group by, the column to sum,
# and the maximum number of groups kept by nlargest().
df_cat = 'InternetService'
df_value = 'MonthlyCharges'
limit = 15


# Encode Churn as 1/0 in place.
# NOTE(review): this mutates the shared `df`; any cell that compares
# df['Churn'] with "Yes"/"No" only matches rows if it runs before this one.
df["Churn"] = df['Churn'].replace({"Yes":1, "No":0})
# Sum of `df_value` per `df_cat` for churned customers, largest `limit` groups first.
tmp_churn = df[df['Churn'] == 1].groupby(df_cat)[df_value].sum().nlargest(limit).to_frame().reset_index()
# Same aggregation for retained customers.
tmp_no_churn = df[df['Churn'] == 0].groupby(df_cat)[df_value].sum().nlargest(limit).to_frame().reset_index()

In [None]:
              domain= {'x': [.52, 1]})

    trace2 = go.Pie(labels=tmp_no_churn[df_cat], 
                        values=tmp_no_churn[df_value], name= "No-Churn", hole= .5, 
                        hoverinfo="label+percent+name+value", showlegend=False,
                        domain= {'x': [0, .48]})


In [102]:
tmp_churn

Unnamed: 0,InternetService,MonthlyCharges
0,Fiber Optic,114300.05
1,Dsl,22529.2
2,No,2301.6


In [None]:
(114300)

In [101]:

# Two donut charts of summed `df_value` per `df_cat`: churned customers occupy the
# left half of the figure (x domain 0-0.5), retained customers the right half (0.5-1).
# Requires tmp_churn / tmp_no_churn in their aggregated form (columns df_cat, df_value).
p1= go.Pie(labels = tmp_churn[df_cat], values=tmp_churn[df_value], name='CHurn', hole=0.5, domain= {'x': [0, .5]})
p2 = go.Pie(labels = tmp_no_churn[df_cat], values=tmp_no_churn[df_value], name='No CHurn', hole=0.5,domain= {'x': [.5, 1]} )
# Layout with one text annotation per donut, placed at mid-height (y=.5).
# NOTE(review): the title "some" looks like a placeholder, and the trace names are spelled 'CHurn'.
layout = dict(title= "some", height=450,width=1200, font=dict(size=15),
                  annotations = [
                    dict(
                          x=.2, y=.5,
                          text='Churn', 
                          showarrow=False,
                          font=dict(size=20)
                      ),

                      dict(
                          x=.8, y=.5,
                          text='No Churn', 
                          showarrow=False,
                          font=dict(size=20)
                      ),
                      
        ])

# The figure is passed to iplot as a plain dict of traces + layout.
fig = dict(data=[p1, 
p2
], layout=layout)
iplot(fig)

In [10]:
# Churned customers only. Churn is matched in both its raw ("Yes") and its
# encoded (1) form: other cells replace the column with 1/0 in place, after
# which a plain `== "Yes"` comparison selects no rows at all.
ch_df = df[df['Churn'].isin(["Yes", 1])]
# Number of churned customers per internet-service type.
ch_df['InternetService'].value_counts()

Fiber Optic    1297
Dsl             459
No              113
Name: InternetService, dtype: int64

In [11]:
1297/(1297+459+113)

0.6939539860888175

In [22]:
# Split customers by churn status. Both the raw ("Yes"/"No") and the encoded
# (1/0) labels are accepted, because other cells encode df['Churn'] in place
# and a string-only comparison would then leave both frames empty.
churn_df = df[df['Churn'].isin(["Yes", 1])]
no_churn_df = df[df['Churn'].isin(["No", 0])]
# Summed MonthlyCharges of retained customers, of churned customers, and overall.
retained_revenue = no_churn_df['MonthlyCharges'].sum()
not_retained = churn_df['MonthlyCharges'].sum()
total_revenue = df['MonthlyCharges'].sum()

In [24]:
retained_revenue, not_retained, total_revenue

(316530.15, 139130.85, 455661.0)

Unnamed: 0,Gender,SeniorCitizen,Partner,Dependents,Tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No Phone Service,Dsl,No,Yes,No,No,No,No,Month-To-Month,Yes,Electronic Check,29.85,29.85,0
1,Male,0,No,No,34,Yes,No,Dsl,Yes,No,Yes,No,No,No,One Year,No,Mailed Check,56.95,1889.5,0
2,Male,0,No,No,2,Yes,No,Dsl,Yes,Yes,No,No,No,No,Month-To-Month,Yes,Mailed Check,53.85,108.15,1
3,Male,0,No,No,45,No,No Phone Service,Dsl,Yes,No,Yes,Yes,No,No,One Year,No,Bank Transfer (Automatic),42.3,1840.75,0
4,Female,0,No,No,2,Yes,No,Fiber Optic,No,No,No,No,No,No,Month-To-Month,Yes,Electronic Check,70.7,151.65,1


In [36]:
# Encode Churn as 1/0 in place; only the "Yes"/"No" keys are replaced, so
# values that are already 1/0 are left untouched.
df["Churn"] = df['Churn'].replace({"Yes":1, "No":0})
    
# Full-row subsets of churned (1) and retained (0) customers.
# NOTE(review): these names are also used elsewhere for aggregated frames.
tmp_churn = df[df['Churn'] == 1]
tmp_no_churn = df[df['Churn'] == 0]    

In [37]:
corr = df['Churn'].corr(df['MonthlyCharges'])
corr = np.round(corr,3)
corr

0.193

In [38]:
tmp1 = tmp_churn['MonthlyCharges'].dropna()
tmp2 = tmp_no_churn['MonthlyCharges'].dropna()

In [41]:
hist_data = [tmp1, tmp2]

In [43]:
group_labels = ['Yes_churn', 'No_churn']
colors = ['seagreen','indianred', ]

In [46]:
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
import plotly.figure_factory as ff
from plotly.offline import iplot

In [52]:
# Encode Churn as 1/0 in place, then split on the *encoded* values.
# (Filtering on "Yes"/"No" after the encoding matched no rows, which left
# both groups empty.)
df["Churn"] = df['Churn'].replace({"Yes":1, "No":0})
tmp_churn = df[df['Churn'] == 1]
tmp_no_churn = df[df['Churn'] == 0]
# Pearson correlation between the encoded target and MonthlyCharges, 3 decimals.
corr = df['Churn'].corr(df['MonthlyCharges'])
corr = np.round(corr,3)
# One MonthlyCharges series per group, in the order [churned, retained].
tmp1 = tmp_churn['MonthlyCharges'].dropna()
tmp2 = tmp_no_churn['MonthlyCharges'].dropna()
hist_data = [tmp1, tmp2]

In [54]:
corr

0.193

In [47]:
def plot_distribution(df, var_select=None, bins=1.0):
    """Show a histogram + KDE of one numeric column, churned vs. retained.

    The earlier version encoded ``Churn`` to 1/0 and then filtered on the
    strings "Yes"/"No"; both groups were therefore always empty and
    ``ff.create_distplot`` failed with "min() arg is an empty sequence".
    It also overwrote ``df['Churn']`` on the caller's frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``var_select`` and a ``'Churn'`` column holding either
        "Yes"/"No" or 1/0. The frame is not modified.
    var_select : str
        Name of the numeric column to plot (required despite the ``None``
        default, which is kept for signature compatibility).
    bins : float, default 1.0
        Passed to ``create_distplot`` as ``bin_size``.

    Returns
    -------
    The return value of ``iplot``.

    Notes
    -----
    Relies on ``plotly.figure_factory`` being imported as ``ff`` and on
    ``iplot`` from ``plotly.offline``.
    """
    # Accept raw and already-encoded labels alike; anything else becomes NaN
    # and is ignored by both the split and the correlation.
    churn_flag = df['Churn'].map({"Yes": 1, "No": 0, 1: 1, 0: 0})

    # Pearson correlation between the encoded target and the selected column.
    corr = np.round(churn_flag.corr(df[var_select]), 3)

    tmp1 = df.loc[churn_flag == 1, var_select].dropna()
    tmp2 = df.loc[churn_flag == 0, var_select].dropna()
    hist_data = [tmp1, tmp2]

    group_labels = ['Yes_churn', 'No_churn']
    colors = ['seagreen', 'indianred']

    fig = ff.create_distplot(hist_data,
                             group_labels,
                             colors=colors,
                             show_hist=True,
                             curve_type='kde',
                             bin_size=bins,
                             )

    fig['layout'].update(title = var_select+' '+'(corr target ='+ str(corr)+')')

    return iplot(fig, filename = 'Density plot')

In [49]:
# plot_distribution(df, var_select='MonthlyCharges', bins=4)

ValueError: min() arg is an empty sequence

In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Gender            7032 non-null   object 
 1   SeniorCitizen     7032 non-null   int64  
 2   Partner           7032 non-null   object 
 3   Dependents        7032 non-null   object 
 4   Tenure            7032 non-null   int64  
 5   PhoneService      7032 non-null   object 
 6   MultipleLines     7032 non-null   object 
 7   InternetService   7032 non-null   object 
 8   OnlineSecurity    7032 non-null   object 
 9   OnlineBackup      7032 non-null   object 
 10  DeviceProtection  7032 non-null   object 
 11  TechSupport       7032 non-null   object 
 12  StreamingTV       7032 non-null   object 
 13  StreamingMovies   7032 non-null   object 
 14  Contract          7032 non-null   object 
 15  PaperlessBilling  7032 non-null   object 
 16  PaymentMethod     7032 non-null   object 


In [58]:
cat_col = 'Gender'

In [102]:
# Single-cell subplot grid (rows=1, cols=1).
# NOTE(review): two subplot titles are supplied for a one-cell grid, and
# make_subplots is already imported in the notebook's first cell.
from plotly.subplots import make_subplots
fig = make_subplots(rows=1,cols=1,
print_grid=True,
horizontal_spacing=0.15, 
subplot_titles=("Distribution of and % Churn", f'Mean Monthly Charges of {cat_col}') )

# Encode Churn as 1/0 in place and split into churned / retained rows.
df["Churn"] = df['Churn'].replace({"Yes":1, "No":0})
tmp_churn = df[df['Churn'] == 1]
tmp_no_churn = df[df['Churn'] == 0]  

This is the format of your plot grid:
[ (1,1) x,y ]



In [66]:
# Churn rate per category of `cat_col`, in percent: churned count / total count,
# rounded to 2 decimals before being scaled by 100.
tmp_attr = round(tmp_churn[cat_col].value_counts().sort_index() / df[cat_col].value_counts().sort_index(),2)*100

In [67]:
tmp_attr

Female    27.0
Male      26.0
Name: Gender, dtype: float64

In [97]:
x=tmp_churn[cat_col].value_counts().sort_index().index
y=tmp_churn[cat_col].value_counts().sort_index().values

In [122]:
# 1x2 subplot grid; this cell builds the three traces for the left subplot.
# NOTE(review): make_subplots is already imported in the notebook's first cell.
from plotly.subplots import make_subplots
fig = make_subplots(rows=1,cols=2,
print_grid=True,
horizontal_spacing=0.15, 
subplot_titles=("Distribution of and % Churn", f'Mean Monthly Charges of {cat_col}') )

# Encode Churn as 1/0 in place and split into churned / retained rows.
df["Churn"] = df['Churn'].replace({"Yes":1, "No":0})
tmp_churn = df[df['Churn'] == 1]
tmp_no_churn = df[df['Churn'] == 0]  
# Bar trace: number of churned customers per category of `cat_col`.
trace1 = go.Bar(
        x=tmp_churn[cat_col].value_counts().sort_index().index,
        y=tmp_churn[cat_col].value_counts().sort_index().values,
        name='Yes_Churn',opacity = 0.8, marker=dict(
            color='seagreen',
            line=dict(color='#000000',width=1)))

# Bar trace: number of retained customers per category.
trace2 = go.Bar(
    x=tmp_no_churn[cat_col].value_counts().sort_index().index,
    y=tmp_no_churn[cat_col].value_counts().sort_index().values,
    name='No_Churn', opacity = 0.8, 
    marker=dict(
        color='indianred',
        line=dict(color='#000000',
                    width=1)
    )
)

# Line trace: churn rate in percent, taken from `tmp_attr`.
# NOTE(review): `tmp_attr` is assigned in a separate cell and is not recomputed
# here, so it may describe a different `cat_col` than the bars.
trace3 =  go.Scatter(   
        x=tmp_attr.sort_index().index,
        y=tmp_attr.sort_index().values,
        yaxis = 'y2',
        name='% Churn', opacity = 0.6, 
        marker=dict(
            color='black',
            line=dict(color='#000000',
                      width=2 )
        )
    )

This is the format of your plot grid:
[ (1,1) x,y   ]  [ (1,2) x2,y2 ]



In [123]:
# Place both bar traces and the churn-rate line in subplot (1, 1).
# add_trace(..., row=, col=) replaces the deprecated append_trace.
# NOTE: re-running this cell adds the same traces to `fig` a second time.
fig.add_trace(trace1, row=1, col=1)
fig.add_trace(trace2, row=1, col=1)
fig.add_trace(trace3, row=1, col=1)
# Move the line (third trace) onto its own y-axis, defined just below.
fig['data'][2].update(yaxis='y3')
fig['layout']['yaxis3']=dict(range= [0, 100], #right y-axis in subplot (1,1)
                            overlaying= 'y', 
                            anchor= 'x', 
                            side= 'right', 
                            showgrid= False, 
                            title= '%Churn Ratio'
                            )


In [124]:
fig

In [None]:
def binary_ploting_distributions(df, cat_col):
    """Plot churn counts/rate and mean monthly charges for one categorical column.

    Left subplot: bar counts of churned and retained customers per category
    of ``cat_col``, with the churn rate (%) drawn as a line on a right-hand
    axis. Right subplot: mean ``MonthlyCharges`` per category for both
    groups, with the relative difference churned-vs-retained (%) on a
    right-hand axis.

    Every statistic is computed from the ``df`` argument. The earlier version
    read the totals and the mean charges from the global ``df_train``, so the
    result silently ignored the frame that was passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``cat_col``, ``'MonthlyCharges'`` and a ``'Churn'`` column
        encoded as 1 (churned) / 0 (retained). Both values must occur,
        because the difference rate divides the column for 1 by the column
        for 0.
    cat_col : str
        Categorical column to break the statistics down by.

    Returns
    -------
    None
        The figure is displayed with ``iplot``.
    """
    # plotly.subplots.make_subplots (imported at the top of the notebook)
    # replaces the deprecated plotly.tools.make_subplots.
    fig = make_subplots(rows=1,
                        cols=2,
                        print_grid=True,
                        horizontal_spacing=0.15,
                        subplot_titles=("Distribution of and % Churn",
                                        f'Mean Monthly Charges of {cat_col}')
                        )

    # --- left subplot: counts and churn rate -------------------------------
    churn_counts = df.loc[df['Churn'] == 1, cat_col].value_counts().sort_index()
    no_churn_counts = df.loc[df['Churn'] == 0, cat_col].value_counts().sort_index()
    total_counts = df[cat_col].value_counts().sort_index()
    # Share of churned customers per category, in percent.
    churn_pct = (round(churn_counts / total_counts, 2) * 100).sort_index()

    trace1 = go.Bar(
        x=churn_counts.index,
        y=churn_counts.values,
        name='Yes_Churn', opacity=0.8,
        marker=dict(color='seagreen',
                    line=dict(color='#000000', width=1)))

    trace2 = go.Bar(
        x=no_churn_counts.index,
        y=no_churn_counts.values,
        name='No_Churn', opacity=0.8,
        marker=dict(color='indianred',
                    line=dict(color='#000000', width=1)))

    trace3 = go.Scatter(
        x=churn_pct.index,
        y=churn_pct.values,
        name='% Churn', opacity=0.6,
        marker=dict(color='black',
                    line=dict(color='#000000', width=2)))

    # --- right subplot: mean monthly charges -------------------------------
    mean_charges = df.groupby(['Churn', cat_col])['MonthlyCharges'].mean()
    mean_long = mean_charges.reset_index()
    mean_churn = mean_long[mean_long['Churn'] == 1]
    mean_no_churn = mean_long[mean_long['Churn'] == 0]

    # Wide form: one column per Churn value (0 and 1), one row per category.
    mean_wide = mean_charges.unstack('Churn').reset_index()
    mean_wide['diff_rate'] = round((mean_wide[1] / mean_wide[0]) - 1, 2) * 100

    trace4 = go.Bar(
        x=mean_churn[cat_col],
        y=mean_churn['MonthlyCharges'], showlegend=False,
        name='Mean Charge Churn', opacity=0.8,
        marker=dict(color='seagreen',
                    line=dict(color='#000000', width=1)))

    trace5 = go.Bar(
        x=mean_no_churn[cat_col],
        y=mean_no_churn['MonthlyCharges'], showlegend=False,
        name='Mean Charge NoChurn', opacity=0.8,
        marker=dict(color='indianred',
                    line=dict(color='#000000', width=1)))

    trace6 = go.Scatter(
        x=mean_wide[cat_col],
        y=mean_wide['diff_rate'],
        name='% Diff Churn', opacity=0.6,
        marker=dict(color='black',
                    line=dict(color='#000000', width=5)))

    # add_trace(..., row=, col=) replaces the deprecated append_trace.
    fig.add_trace(trace1, row=1, col=1)
    fig.add_trace(trace2, row=1, col=1)
    fig.add_trace(trace3, row=1, col=1)
    fig.add_trace(trace4, row=1, col=2)
    fig.add_trace(trace5, row=1, col=2)
    fig.add_trace(trace6, row=1, col=2)

    # The two line traces get their own right-hand axes, defined below.
    fig['data'][2].update(yaxis='y3')
    fig['data'][5].update(yaxis='y4')

    fig['layout']['xaxis'].update(autorange=True,
                                   tickfont=dict(size= 10), 
                                   title= f'{cat_col}', 
                                   type= 'category',
                                  )
    fig['layout']['yaxis'].update(title= 'Count')

    fig['layout']['xaxis2'].update(autorange=True,
                                   tickfont=dict(size= 10), 
                                   title= f'{cat_col}', 
                                   type= 'category',
                                  )
    fig['layout']['yaxis2'].update( title= 'Mean Monthly Charges' )

    fig['layout']['yaxis3']=dict(range= [0, 100], #right y-axis in subplot (1,1)
                              overlaying= 'y', 
                              anchor= 'x', 
                              side= 'right', 
                              showgrid= False, 
                              title= '%Churn Ratio'
                             )

    #Insert a new key, yaxis4, and the associated value:
    fig['layout']['yaxis4']=dict(range= [-20, 100], #right y-axis in the subplot (1,2)
                              overlaying= 'y2', 
                              anchor= 'x2', 
                              side= 'right', 
                              showgrid= False, 
                              title= 'Monhtly % Difference'
                             )
    fig['layout']['title'] = f"{cat_col} Distributions"
    fig['layout']['height'] = 500
    fig['layout']['width'] = 1000

    iplot(fig)
    
def plot_dist_churn(df, col, binary=None):
    """Plot per-category counts of churned/retained customers plus the churn rate.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``col`` and the target column named by ``binary``,
        encoded as 1 (churned) / 0 (retained).
    col : str
        Column whose distinct values form the x-axis categories.
    binary : str, optional
        Name of the 1/0 target column. ``None`` (the default) now falls back
        to ``'Churn'``; previously the default led to ``df[None]`` and a
        ``KeyError``.

    Returns
    -------
    None
        Prints a one-line header and displays the figure with ``iplot``.
    """
    if binary is None:
        binary = 'Churn'

    # Each count series is computed once and reused for both x and y.
    churn_counts = df.loc[df[binary] == 1, col].value_counts().sort_index()
    no_churn_counts = df.loc[df[binary] == 0, col].value_counts().sort_index()
    total_counts = df[col].value_counts().sort_index()
    # Share of churned customers per category, in percent.
    churn_pct = (round(churn_counts / total_counts, 2) * 100).sort_index()

    print(f'Distribution of {col}: ')

    trace1 = go.Bar(
        x=churn_counts.index,
        y=churn_counts.values,
        name='Yes_Churn', opacity=0.8,
        marker=dict(color='seagreen',
                    line=dict(color='#000000', width=1)))

    trace2 = go.Bar(
        x=no_churn_counts.index,
        y=no_churn_counts.values,
        name='No_Churn', opacity=0.8,
        marker=dict(color='indianred',
                    line=dict(color='#000000', width=1)))

    # The rate is drawn against the right-hand axis (yaxis2 in the layout).
    trace3 = go.Scatter(
        x=churn_pct.index,
        y=churn_pct.values,
        yaxis='y2',
        name='% Churn', opacity=0.6,
        marker=dict(color='black',
                    line=dict(color='#000000', width=2)))

    layout = dict(title =  f'Distribution of {str(col)} feature by Target - With Churn Rates',
              xaxis=dict(), 
              yaxis=dict(title= 'Count'), 
              yaxis2=dict(range= [0, 100], 
                          overlaying= 'y', 
                          anchor= 'x', 
                          side= 'right',
                          zeroline=False,
                          showgrid= False, 
                          title= 'Percentual Churn Ratio'
                         ))

    fig = go.Figure(data=[trace1, trace2, trace3], layout=layout)
    iplot(fig)
    
    
def plot_distribution(df, var_select=None, bins=1.0):
    """Show a histogram + KDE of one numeric column, churned vs. retained.

    The correlation shown in the title is computed from the ``df`` argument;
    the earlier version took it from the global ``df_train``, so the title
    could describe a different frame than the plotted data.

    NOTE(review): a function of the same name is defined earlier in this
    notebook; whichever definition runs last is the one in effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``var_select`` and a ``'Churn'`` column encoded as
        1 (churned) / 0 (retained).
    var_select : str
        Name of the numeric column to plot (required despite the ``None``
        default).
    bins : float, default 1.0
        Passed to ``create_distplot`` as ``bin_size``.

    Returns
    -------
    None
        The figure is displayed with ``iplot``.

    Notes
    -----
    Relies on ``plotly.figure_factory`` being imported as ``ff``.
    """
    # Pearson correlation between the encoded target and the selected column.
    corr = np.round(df['Churn'].corr(df[var_select]), 3)

    tmp1 = df.loc[df['Churn'] == 1, var_select].dropna()
    tmp2 = df.loc[df['Churn'] == 0, var_select].dropna()
    hist_data = [tmp1, tmp2]

    group_labels = ['Yes_churn', 'No_churn']
    colors = ['seagreen', 'indianred']

    fig = ff.create_distplot(hist_data,
                             group_labels,
                             colors=colors,
                             show_hist=True,
                             curve_type='kde',
                             bin_size=bins
                            )

    fig['layout'].update(title = var_select+' '+'(corr target ='+ str(corr)+')')

    iplot(fig, filename = 'Density plot')
    
def monthly_charges(df, col, binary=None):
    """Plot mean ``MonthlyCharges`` per category for churned vs. retained customers.

    Two bar series show the mean charge per value of ``col`` for each group;
    a line on the right-hand axis shows the relative difference
    ``mean(churned) / mean(retained) - 1`` in percent.

    Fixes over the earlier version: statistics come from the ``df`` argument
    rather than the global ``df_train``; the target column is addressed
    through ``binary`` everywhere (``'Churn'`` was hard-coded in the filters
    and in ``unstack``); and ``binary=None`` falls back to ``'Churn'``
    instead of failing in ``groupby``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``col``, ``'MonthlyCharges'`` and the target column,
        encoded as 1 (churned) / 0 (retained). Both values must occur.
    col : str
        Column to break the means down by.
    binary : str, optional
        Name of the 1/0 target column; defaults to ``'Churn'``.

    Returns
    -------
    None
        The figure is displayed with ``iplot``.
    """
    if binary is None:
        binary = 'Churn'

    mean_charges = df.groupby([binary, col])['MonthlyCharges'].mean()

    mean_long = mean_charges.reset_index()
    tmp_churn = mean_long[mean_long[binary] == 1]
    tmp_no_churn = mean_long[mean_long[binary] == 0]

    # Wide form: one column per target value (0 and 1), one row per category.
    mean_wide = mean_charges.unstack(binary).reset_index()
    mean_wide['diff_rate'] = round((mean_wide[1] / mean_wide[0]) - 1, 2) * 100

    trace1 = go.Bar(
        x=tmp_churn[col],
        y=tmp_churn['MonthlyCharges'],
        name='Mean Charge\nChurn', opacity=0.8,
        marker=dict(color='seagreen',
                    line=dict(color='#000000', width=1)))

    trace2 = go.Bar(
        x=tmp_no_churn[col],
        y=tmp_no_churn['MonthlyCharges'],
        name='Mean Charge No Churn', opacity=0.8,
        marker=dict(color='indianred',
                    line=dict(color='#000000', width=1)))

    # NOTE(review): yaxis2 below is fixed to [0, 100], so negative difference
    # rates fall outside the visible range.
    trace3 = go.Scatter(
        x=mean_wide[col],
        y=mean_wide['diff_rate'],
        yaxis='y2',
        name='% Diff Churn', opacity=0.6,
        marker=dict(color='black',
                    line=dict(color='#000000', width=5)))

    layout = dict(title =  f'Mean Monthly Charges of {str(col)} feature by Churn or Not Churn Customers - With Churn Ratio',
              xaxis=dict(), 
              yaxis=dict(title= 'Mean Monthly Charges'), 
              yaxis2=dict(range= [0, 100], 
                          overlaying= 'y', 
                          anchor= 'x', 
                          side= 'right',
                          zeroline=False,
                          showgrid= False, 
                          title= '% diff Monthly Charges Mean'
                         ))

    fig = go.Figure(data=[trace1, trace2, trace3], layout=layout)
    iplot(fig)

In [24]:

# Contingency table of GENDER against APPROVE_LOAN.
# NOTE(review): `LoanData` is not defined in the visible part of this notebook.
CrosstabResult=pd.crosstab(index=LoanData['GENDER'],columns=LoanData['APPROVE_LOAN'])
print(CrosstabResult)
 
# chi2_contingency is imported from scipy.stats in the notebook's import cell.
 
# Chi-square test of independence on the contingency table.
ChiSqResult = chi2_contingency(CrosstabResult)
 
# Element [1] of the result is the p-value: the probability, assuming H0 (the two
# variables are independent), of a test statistic at least as extreme as the one observed.
# If the p-value is > 0.05 we fail to reject H0.
 
print('The P-Value of the ChiSq Test is:', ChiSqResult[1])

   CIBIL  AGE GENDER  SALARY APPROVE_LOAN
0    480   28      M  610000          Yes
1    480   42      M  140000           No
2    480   29      F  420000           No
3    490   30      M  420000           No
4    500   27      M  420000           No
APPROVE_LOAN  No  Yes
GENDER               
F              2    5
M              3   10
The P-Value of the ChiSq Test is: 1.0


In [187]:
# Chi-square test of independence between each categorical column and Churn.
# NOTE(review): in the visible code `cat_cols` is only assigned inside load_data();
# this cell needs a notebook-level `cat_cols` whose last entry is skipped by [:-1].
ch_scores  = []
for col in cat_cols[:-1]:
    # Contingency table: categories of `col` (rows) by Churn value (columns).
    crstb = pd.crosstab(index = df[col], columns=df['Churn'] )
    hyp = chi2_contingency(crstb)
    # hyp[1] is the p-value of the test (not the chi-square statistic), rounded
    # to 5 decimals, so very small p-values are shown as 0.0.
    chiscore= np.round(hyp[1], 5)
    ch_scores.append(chiscore)
# One row per column, sorted ascending by p-value.
pd.DataFrame(zip(cat_cols[:-1], ch_scores), columns=['col', 'ch_score']).set_index('col').sort_values(by='ch_score')


Unnamed: 0_level_0,ch_score
col,Unnamed: 1_level_1
Partner,0.0
Dependents,0.0
InternetService,0.0
OnlineSecurity,0.0
OnlineBackup,0.0
DeviceProtection,0.0
TechSupport,0.0
StreamingTV,0.0
StreamingMovies,0.0
Contract,0.0


In [84]:
# Correlation between an integer coding of each categorical column and df['Churn_No'].
# NOTE(review): `factorize` is not imported in the visible cells (pandas exposes it
# as pd.factorize), and no visible cell creates `cat_cols` at notebook level or a
# 'Churn_No' column — confirm where these come from.
for col in cat_cols[:-1]:
    # Assuming this is pandas' factorize: `l` holds integer codes, `c` the unique values.
    l, c = factorize(df[col])
    # NOTE(review): pd.Series(l) gets a fresh 0..n-1 index, while Series.corr pairs
    # values by index label; df.info() shows an index with gaps (7032 entries,
    # 0 to 7042), so rows may be paired incorrectly — verify.
    res = pd.Series(l)
    cor = df['Churn_No'].corr(res)
    print(col, cor)

Gender -0.008612095078997883
Partner 0.15044754495917667
Dependents -0.16422140157972476
PhoneService 0.01194198002900294
MultipleLines 0.0363104366547957
InternetService -0.04729138768314237
OnlineSecurity -0.33281919168942703
OnlineBackup -0.07420530149434537
DeviceProtection -0.2814648246574705
TechSupport -0.32985226446993626
StreamingTV -0.20574215693991318
StreamingMovies -0.20725609227308128
Contract -0.396712629209844
PaperlessBilling -0.19182533166646865
PaymentMethod -0.2628182020893576


In [188]:
# NOTE(review): third-party import placed mid-notebook rather than in the import cell.
from feature_engine.encoding import OrdinalEncoder
# Integer-encode the categorical columns (all of cat_cols except the last).
# NOTE(review): with encoding_method='arbitrary' the integer codes presumably carry
# no meaningful order — confirm against the feature_engine docs before reading
# correlations computed on them.
oe = OrdinalEncoder(encoding_method='arbitrary')
encoded_df = oe.fit_transform(df[cat_cols[:-1]])

In [194]:
encoded_df.corrwith(df['Churn'].replace({'Yes':1, 'No':0})).sort_values().T

Contract           -0.396713
OnlineSecurity     -0.332819
TechSupport        -0.329852
DeviceProtection   -0.281465
PaymentMethod      -0.262818
StreamingMovies    -0.207256
StreamingTV        -0.205742
PaperlessBilling   -0.191825
Dependents         -0.164221
OnlineBackup       -0.074205
InternetService    -0.047291
Gender             -0.008612
PhoneService        0.011942
MultipleLines       0.036310
Partner             0.150448
dtype: float64

In [195]:
pd.DataFrame(zip(cat_cols[:-1], ch_scores), columns=['col', 'ch_score']).set_index('col').sort_values(by='ch_score').T


col,Partner,Dependents,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MultipleLines,PhoneService,Gender
ch_score,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00346,0.33878,0.48658


In [130]:
# Churned / retained subsets of the full frame.
# NOTE(review): relies on df['Churn'] having been encoded as 1/0 by another cell.
churn = df[df['Churn'] ==1]
nchurn = df[df['Churn'] == 0]

In [134]:
churn[churn['SeniorCitizen'] == 1].shape, churn[churn['SeniorCitizen'] == 0].shape

((476, 20), (1393, 20))

In [137]:
gr = df.groupby(["SeniorCitizen", 'Churn'])['Churn'].agg('count')

In [149]:
gr

SeniorCitizen  Churn
0              0        4497
               1        1393
1              0         666
               1         476
Name: Churn, dtype: int64

In [147]:
(4497+1393)

5890

In [161]:
cat_col= "SeniorCitizen"

In [162]:
# Encode Churn as 1/0 in place, then split into churned / retained rows.
df["Churn"] = df['Churn'].replace({"Yes":1, "No":0})
tmp_churn = df[df['Churn'] == 1]
tmp_no_churn = df[df['Churn'] == 0]
# Disabled: churn rate per category (churned count / total count, in percent).
# tmp_attr = round(tmp_churn[cat_col].value_counts().sort_index() / df[cat_col].value_counts().sort_index(),2)*100

In [169]:
tmp_churn[cat_col].value_counts().sort_index()

0    1393
1     476
Name: SeniorCitizen, dtype: int64

In [170]:
df[cat_col].value_counts().sort_index()

0    5890
1    1142
Name: SeniorCitizen, dtype: int64

In [171]:
1393/5890

0.2365025466893039

In [172]:
139131/455661

0.3053388374251911

In [173]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Gender            7032 non-null   object 
 1   SeniorCitizen     7032 non-null   int64  
 2   Partner           7032 non-null   object 
 3   Dependents        7032 non-null   object 
 4   Tenure            7032 non-null   int64  
 5   PhoneService      7032 non-null   object 
 6   MultipleLines     7032 non-null   object 
 7   InternetService   7032 non-null   object 
 8   OnlineSecurity    7032 non-null   object 
 9   OnlineBackup      7032 non-null   object 
 10  DeviceProtection  7032 non-null   object 
 11  TechSupport       7032 non-null   object 
 12  StreamingTV       7032 non-null   object 
 13  StreamingMovies   7032 non-null   object 
 14  Contract          7032 non-null   object 
 15  PaperlessBilling  7032 non-null   object 
 16  PaymentMethod     7032 non-null   object 


In [174]:
df_train = df.copy()

In [175]:
# 'internet' is 'Yes' for every InternetService value other than 'No', else 'No'.
df_train['internet']= np.where(df_train.InternetService != 'No', 'Yes', 'No')

# Number of the eight listed service columns whose value is exactly 'Yes', per row.
df_train['num_services'] = (df_train[['PhoneService', 'OnlineSecurity',
                                      'OnlineBackup', 'DeviceProtection', 
                                      'TechSupport', 'StreamingTV', 
                                      'StreamingMovies', 'internet']] == 'Yes').sum(axis=1)


In [177]:
df_train['InternetService'].value_counts()

Fiber Optic    3096
Dsl            2416
No             1520
Name: InternetService, dtype: int64

In [182]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Gender            7032 non-null   object 
 1   SeniorCitizen     7032 non-null   int64  
 2   Partner           7032 non-null   object 
 3   Dependents        7032 non-null   object 
 4   Tenure            7032 non-null   int64  
 5   PhoneService      7032 non-null   object 
 6   MultipleLines     7032 non-null   object 
 7   InternetService   7032 non-null   object 
 8   OnlineSecurity    7032 non-null   object 
 9   OnlineBackup      7032 non-null   object 
 10  DeviceProtection  7032 non-null   object 
 11  TechSupport       7032 non-null   object 
 12  StreamingTV       7032 non-null   object 
 13  StreamingMovies   7032 non-null   object 
 14  Contract          7032 non-null   object 
 15  PaperlessBilling  7032 non-null   object 
 16  PaymentMethod     7032 non-null   object 


In [None]:
# Palette of 20 hex colour codes.
# NOTE(review): not referenced by any code visible in this notebook.
color_op = ['#5527A0', '#BB93D7', '#834CF7', '#6C941E', '#93EAEA', '#7425FF', '#F2098A', '#7E87AC', 
            '#EBE36F', '#7FD394', '#49C35D', '#3058EE', '#44FDCF', '#A38F85', '#C4CEE0', '#B63A05', 
            '#4856BF', '#F0DB1B', '#9FDBD9', '#B123AC']

def plot_pie(dataframe, df_cat, df_value, title, limit=15):
    """Show two donut charts of summed ``df_value`` per ``df_cat`` category.

    The left donut covers retained customers (``Churn == 0``), the right one
    churned customers (``Churn == 1``). For each group only the ``limit``
    categories with the largest sums are kept. The input frame is copied
    before use; the figure is displayed with ``iplot`` and nothing is
    returned.
    """
    working = dataframe.copy()

    def top_sums(flag):
        # Sum of df_value per category for one churn group, largest first.
        rows = working[working['Churn'] == flag]
        totals = rows.groupby(df_cat)[df_value].sum()
        return totals.nlargest(limit).to_frame().reset_index()

    def donut(table, name, in_legend, x_span):
        # One donut trace restricted to the horizontal span `x_span`.
        return go.Pie(labels=table[df_cat],
                      values=table[df_value],
                      name=name,
                      hole=.5,
                      hoverinfo="label+percent+name+value",
                      showlegend=in_legend,
                      domain={'x': x_span})

    def centre_label(x_pos, text):
        # Text annotation placed at mid-height of the figure.
        return dict(x=x_pos, y=.5, text=text, showarrow=False, font=dict(size=20))

    churned = top_sums(1)
    retained = top_sums(0)

    traces = [
        donut(retained, "No-Churn", True, [0, .48]),
        donut(churned, "Churn", False, [.52, 1]),
    ]
    layout = dict(title=title, height=450, font=dict(size=15),
                  annotations=[centre_label(.20, 'No Churn'),
                               centre_label(.80, 'Churn')])

    iplot(dict(data=traces, layout=layout))

In [33]:
df.select_dtypes('object').columns

Index(['Gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod'],
      dtype='object')

In [37]:
df['PaymentMethod'].value_counts()

Electronic Check             2365
Mailed Check                 1604
Bank Transfer (Automatic)    1542
Credit Card (Automatic)      1521
Name: PaymentMethod, dtype: int64