### Importing necessary libraries
#### We are using Logistic Regression in Python with the statsmodels package (statsmodels.formula.api). 

In [1]:
import pandas as pd
import plotly.express as px
from statsmodels.formula.api import logit
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
from sklearn.preprocessing import quantile_transform
import numpy as np
import seaborn as sns

import ipywidgets as widgets

import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [3]:
# Load the churn dataset; the three identifier columns become a MultiIndex
# so that only the features and the target remain as regular columns.
index_columns = ["RowNumber", "CustomerId", "Surname"]
df = pd.read_csv("data/Churn_Modelling.csv", index_col=index_columns)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
RowNumber,CustomerId,Surname,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
# Columns explored as discrete groups.
categorical_cols = [
    "Geography",
    "Gender",
    "Tenure",
    "NumOfProducts",
    "HasCrCard",
]
# Columns explored as continuous measurements.
non_categorical_cols = [
    "CreditScore",
    "Age",
    "Balance",
    "EstimatedSalary",
]

In [5]:
# Dropdown used to pick the continuous feature plotted in the next cell.
# NOTE: the plot below reads `w.value`, so its content depends on the
# selection made here at the time that cell is executed.
dropdown_settings = dict(
    options=non_categorical_cols,
    value="CreditScore",
    description="Task:",
)
w = widgets.Dropdown(**dropdown_settings)
display(w)

Dropdown(description='Task:', options=('CreditScore', 'Age', 'Balance', 'EstimatedSalary'), value='CreditScore…

In [9]:
# Feature currently selected in the dropdown above.
non_categorical_col = w.value

layout = go.Layout(plot_bgcolor='#F0E9E6')
fig = go.Figure(layout=layout)

# One box per outcome (churners first), each with its own colour.
box_specs = (
    (1, "Churn", "indianred"),
    (0, "Non Churn", "lightseagreen"),
)
for exited_flag, trace_name, trace_color in box_specs:
    feature_values = df.loc[df["Exited"] == exited_flag, non_categorical_col]
    fig.add_trace(
        go.Box(
            y=feature_values,
            marker={"color": trace_color},
            name=trace_name,
        )
    )

# NOTE(review): the y-axis shows the selected feature's values while being
# titled 'IQR', and the x-axis holds the two outcome groups -- confirm the
# axis titles are the intended ones.
fig.update_layout(
    title='Continuous Regressor to Target',
    xaxis_title=f"{non_categorical_col}",
    yaxis_title='IQR',
    xaxis_showgrid=False,
    yaxis_showgrid=False,
)

fig.show()

In [6]:
# Regressors used by the model. NOTE: this rebinds the two column lists
# defined earlier in the notebook to a smaller subset.
categorical_cols = ["Geography", "Gender", "HasCrCard"]
non_categorical_cols = ["Age"]

# Patsy-style formula: categorical columns are wrapped in C(...), and the
# trailing "-1" removes the intercept term.
categorical_terms = [f"C({column_name})" for column_name in categorical_cols]
right_hand_side = "+".join(non_categorical_cols + categorical_terms)
formula = "Exited~" + right_hand_side + "-1"
formula

'Exited~Age+C(Geography)+C(Gender)+C(HasCrCard)-1'

In [7]:
# Build the logistic-regression model from the formula, then fit it on the
# full data frame.
logit_model = logit(formula=str(formula), data=df)
logitfit = logit_model.fit()

Optimization terminated successfully.
         Current function value: 0.449401
         Iterations 6


In [8]:
# Predicted values from the fitted model for every row.
df["proba"] = logitfit.predict(df)

# A threshold of 0.3 helps in taking care of the class imbalance:
# rows above it are labelled 1, all others 0.
df["predicted"] = (df["proba"] > 0.3).astype(int)

In [9]:
# Dropdown used to pick the categorical feature cross-tabulated in the next
# cell, which reads `w_cat.value` when it is executed.
categorical_dropdown_settings = dict(
    options=categorical_cols,
    value="Geography",
    description="Task:",
)
w_cat = widgets.Dropdown(**categorical_dropdown_settings)
display(w_cat)

Dropdown(description='Task:', options=('Geography', 'Gender', 'HasCrCard'), value='Geography')

In [10]:
# Count rows per level of the selected categorical feature, split by the
# Exited flag, then give the two count columns readable names.
churn_counts = pd.crosstab(df[w_cat.value], df["Exited"])
churn_counts.columns = ["No Churn", "Churn"]

# Move the feature levels out of the index into a regular column.
ct = churn_counts.reset_index()
ct

Unnamed: 0,Geography,No Churn,Churn
0,France,4204,810
1,Germany,1695,814
2,Spain,2064,413


In [11]:
# Pull the coefficient table out of the fitted model's summary and order
# its rows by ascending absolute coefficient value.
coefficient_table = logitfit.summary2().tables[1]
rows_by_magnitude = coefficient_table["Coef."].abs().sort_values().index
logit_coeffs = coefficient_table.reindex(rows_by_magnitude)

In [12]:
# Show up to the first ten rows of the re-ordered coefficient table.
logit_coeffs.head(10)

Unnamed: 0,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
C(HasCrCard)[T.1],-0.031968,0.057911,-0.55202,0.5809346,-0.145472,0.081536
Age,0.063294,0.002415,26.207172,2.201518e-151,0.058561,0.068028
C(Gender)[T.Male],-0.529893,0.05313,-9.973583,1.989203e-23,-0.634025,-0.425761
C(Geography)[Germany],-3.021432,0.119103,-25.368235,5.6552389999999994e-142,-3.25487,-2.787995
C(Geography)[Spain],-3.913336,0.126652,-30.898339,1.2574400000000002e-209,-4.161569,-3.665103
C(Geography)[France],-3.935134,0.119654,-32.887697,3.295416e-237,-4.169651,-3.700617


In [13]:
# Horizontal bar chart of the fitted coefficients, one bar per model term.
# The values are passed directly as x/y; no data frame argument is given
# because the crosstab `ct` is unrelated to the coefficients and has a
# different number of rows.
coefficient_values = logit_coeffs['Coef.']

fig = px.bar(
    x=coefficient_values,
    y=coefficient_values.index,
    orientation="h",
    color_discrete_sequence=['lightseagreen'],
)

fig.update_layout(
    plot_bgcolor='#F0E9E6',
    title='Feature Importances',
    xaxis_title='Coefficient Importance',
    yaxis_title='Features',
    xaxis_showgrid=False,
    yaxis_showgrid=False,
)

fig.show()

In [14]:
# Point estimate per model term with asymmetric error bars spanning the
# [0.025, 0.975] interval columns of the coefficient table.
coef_estimates = logit_coeffs['Coef.']
distance_to_upper = logit_coeffs['0.975]'] - coef_estimates
distance_to_lower = coef_estimates - logit_coeffs['[0.025']

interval_bars = dict(
    type='data',
    symmetric=False,
    array=distance_to_upper,
    arrayminus=distance_to_lower,
    color='#8793c4',
)

coefficient_trace = go.Scatter(
    x=coef_estimates,
    y=coef_estimates.index,
    line=dict(color='#42C4F7', width=2),
    mode='markers',
    error_x=interval_bars,
)

layout = go.Layout(plot_bgcolor='#F0E9E6')
fig = go.Figure(layout=layout)
fig.add_trace(coefficient_trace)

fig.update_layout(
    title='Regression Meta Analysis',
    xaxis_title='Weight Estimates',
    yaxis_title='Variable',
    xaxis_showgrid=False,
    yaxis_showgrid=False,
)

fig.show()

In [15]:
# Numeric design matrix matching the model terms: Geography is one-hot
# encoded and Gender is mapped to 0/1.
# - dtype=int keeps the indicator columns as integers on every pandas
#   version (newer releases default to bool).
# - .assign() builds a new frame instead of writing into a column subset,
#   which avoids SettingWithCopyWarning.
model_columns = [
    "HasCrCard",
    "Age",
    "Gender",
    "Geography_Germany",
    "Geography_Spain",
    "Geography_France",
]

dummy_encoded_df = (
    pd.get_dummies(
        df[non_categorical_cols + categorical_cols],
        columns=["Geography"],
        dtype=int,
    )
    .loc[:, model_columns]
    .assign(Gender=lambda frame: frame["Gender"].map({"Female": 0, "Male": 1}))
)
dummy_encoded_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,HasCrCard,Age,Gender,Geography_Germany,Geography_Spain,Geography_France
RowNumber,CustomerId,Surname,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,15634602,Hargrave,1,42,0,0,0,1
2,15647311,Hill,0,41,0,0,1,0
3,15619304,Onio,1,42,0,0,0,1
4,15701354,Boni,0,39,0,0,0,1
5,15737888,Mitchell,1,43,0,0,1,0


In [16]:
#Compute effects
effects = dummy_encoded_df * logit_coeffs['Coef.'].to_numpy()
effects.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,HasCrCard,Age,Gender,Geography_Germany,Geography_Spain,Geography_France
RowNumber,CustomerId,Surname,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,15634602,Hargrave,-0.031968,2.658365,-0.0,-0.0,-0.0,-3.935134
2,15647311,Hill,-0.0,2.595071,-0.0,-0.0,-3.913336,-0.0
3,15619304,Onio,-0.031968,2.658365,-0.0,-0.0,-0.0,-3.935134
4,15701354,Boni,-0.0,2.468482,-0.0,-0.0,-0.0,-3.935134
5,15737888,Mitchell,-0.031968,2.72166,-0.0,-0.0,-3.913336,-0.0


In [17]:
# Effect plot: one horizontal box per feature showing how its effect
# values are spread across all rows.
layout = go.Layout(plot_bgcolor='#F0E9E6')
fig = go.Figure(layout=layout)

for feature_name, feature_effects in effects.items():
    effect_box = go.Box(
        x=feature_effects,
        marker_color='lightseagreen',
        name=feature_name,
    )
    fig.add_trace(effect_box)

fig.update_layout(
    title='Effect Plot',
    xaxis_title="Effects",
    yaxis_title='Features',
    xaxis_showgrid=False,
    yaxis_showgrid=False,
)

fig.show()

In [18]:
# Flag, for the first 30 rows, whether the stored prediction exceeds 0.5.
df["proba"].gt(0.5).head(30)

RowNumber  CustomerId  Surname   
1          15634602    Hargrave      False
2          15647311    Hill          False
3          15619304    Onio          False
4          15701354    Boni          False
5          15737888    Mitchell      False
6          15574012    Chu           False
7          15592531    Bartlett      False
8          15656148    Obinna        False
9          15792365    He            False
10         15592389    H?            False
11         15767821    Bearce        False
12         15737173    Andrews       False
13         15632264    Kay           False
14         15691483    Chin          False
15         15600882    Scott         False
16         15643966    Goforth       False
17         15737452    Romeo          True
18         15788218    Henderson     False
19         15661507    Muldrow       False
20         15568982    Hao           False
21         15577657    McDonald      False
22         15597945    Dellucci      False
23         15699309 

In [19]:
# Zero-based row position of the single data point examined below.
SET_INDEX_DF = 16

# Encoded feature values and per-feature effects for that one row.
local_data = dummy_encoded_df.iloc[SET_INDEX_DF]
local_effects = effects.iloc[SET_INDEX_DF]
(local_data, local_effects)

(HasCrCard             1
 Age                  58
 Gender                1
 Geography_Germany     1
 Geography_Spain       0
 Geography_France      0
 Name: (17, 15737452, Romeo), dtype: int64,
 HasCrCard           -0.031968
 Age                  3.671076
 Gender              -0.529893
 Geography_Germany   -3.021432
 Geography_Spain     -0.000000
 Geography_France    -0.000000
 Name: (17, 15737452, Romeo), dtype: float64)

In [20]:
# Show the selected row as a one-row frame (slice keeps the DataFrame shape).
dummy_encoded_df.iloc[SET_INDEX_DF:SET_INDEX_DF + 1]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,HasCrCard,Age,Gender,Geography_Germany,Geography_Spain,Geography_France
RowNumber,CustomerId,Surname,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
17,15737452,Romeo,1,58,1,1,0,0


In [21]:
# Local effect plot: the distribution of every feature's effect (boxes)
# with the selected data point's own effects overlaid as red markers.
layout = go.Layout(plot_bgcolor='#F0E9E6')
fig = go.Figure(layout=layout)

for feature_name in effects.columns:
    fig.add_trace(
        go.Box(
            x=effects[feature_name],
            marker_color='lightseagreen',
            name=feature_name,
        )
    )

# Overlay the selected row; hovering a marker shows the encoded feature value.
fig.add_trace(
    go.Scatter(
        x=local_effects.to_numpy(),
        y=local_effects.index,
        hovertext=local_data.to_numpy(),
        hoverinfo="text",
        marker=dict(color="red", symbol="square-x"),
        mode="markers",
        showlegend=False,
    )
)

# `proba` holds the output of logitfit.predict, i.e. a probability, so the
# title labels it as such rather than as log odds.
predicted_proba = df["proba"].iloc[SET_INDEX_DF]

fig.update_layout(
    title=(
        f'Local Effects of Data Point {SET_INDEX_DF}, '
        f'Predicted Probability: {predicted_proba:.3f}'
    ),
    xaxis_title="Effects",
    yaxis_title='Features',
    xaxis_showgrid=False,
    yaxis_showgrid=False,
)

fig.show()

#### Task 1: Write code for a customer whose predicted churn probability (`proba`) is less than 0.5. Create the box plot and provide an interpretation.

#### Task 2: Consider both the data point and its interpretation. What further interpretation can you derive about how representative this data point is of the overall data distribution?