In [None]:
#In this notebook we implement logistic regression using the statsmodels and scikit-learn packages.
#We also compute the lift curve and gain curve manually and with the scikit-plot library.

In [None]:
import numpy as np
import pandas as pd

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read bank.csv into a DataFrame and preview its first rows
bank = pd.read_csv(filepath_or_buffer='../input/beginner-datasets/beginner_datasets/bank.csv')
bank.head()

In [None]:
bank.info()

In [None]:
bank.shape

In [None]:
bank['deposit'].value_counts()

In [None]:
# Share of each 'deposit' value, as a percentage rounded to two decimals
bank['deposit'].value_counts(normalize=True).mul(100).round(2)


In [None]:
#The output above shows that the data is highly imbalanced.
#Now let's look at some visualizations to understand the data better.

In [None]:
# Row count for each value of 'deposit'
sns.countplot(data=bank, x='deposit')

In [None]:
# 'balance' aggregated per 'deposit' value (barplot's default estimator is the mean)
sns.barplot(data=bank, x='deposit', y='balance')

In [None]:
#The following graph shows that as the number of campaigns increases, the number of participants decreases
sns.countplot(data=bank, x='campaign')

In [None]:
#Relationship between previous campaigns and the deposit column
# (bar height is 'previous' aggregated per 'deposit' value; the default estimator is the mean)
sns.barplot(data=bank, x='deposit', y='previous')

In [None]:
#Let us see the distribution of the customers' balances
# sns.distplot is deprecated (seaborn >= 0.11); histplot on a density scale with a KDE
# overlay is the replacement given in seaborn's migration notes.
sns.histplot(bank['balance'], kde=True, stat='density', kde_kws=dict(cut=3));

In [None]:
#Feature columns: every column except 'deposit', the dependent column we have to predict
X_features = bank.columns.tolist()
X_features.remove('deposit')
X_features

In [None]:
#Converting all categorical values into one-hot encodings
# drop_first=True omits the first level of every categorical column.
# dtype=int forces 0/1 integer dummies: pandas >= 2.0 returns bool columns by default, and a
# frame mixing bool and numeric columns converts to an object array that sm.Logit rejects.
encoded_bank_df = pd.get_dummies(bank[X_features], drop_first=True, dtype=int)

In [None]:
#Design matrix, and binary target: 1 where deposit is 'yes', otherwise 0
X = encoded_bank_df
Y = (bank['deposit'] == 'yes').astype(int)

In [None]:
#Importing statsmodel package to perform Logistic Regression
import statsmodels.api as sm

In [None]:
# Fit on every encoded feature; add_constant supplies the intercept column
logit_model = sm.Logit(endog=Y, exog=sm.add_constant(X)).fit()

In [None]:
#Now,observe the output of model
logit_model.summary2()

In [None]:
#In the summary you can notice that statsmodels gives us more statistical values than sklearn

In [None]:
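#In the summary, the p-values show how much significance a variable has for the prediction.
#Here we keep the variables with the larger p-values.
#NOTE(review): a smaller p-value (commonly < 0.05) indicates a more significant variable - confirm this selection rule is intended.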

In [None]:
# Reduced feature subset used for the second model
X_features = [
    'previous', 'pdays', 'job_unknown',
    'poutcome_unknown', 'default_yes', 'age',
]

In [None]:
# Refit using only the selected columns plus the intercept column
logit_model_2 = sm.Logit(endog=Y, exog=sm.add_constant(X[X_features])).fit()

In [None]:
logit_model_2.summary2()

In [None]:
# Pair each observed label with the probability the reduced model predicts for that same row
y_pred_df = pd.DataFrame({
    'actual': Y,
    'predicted_prob': logit_model_2.predict(sm.add_constant(X[X_features])),
})

In [None]:
y_pred_df

In [None]:
#Now sort the predicted probabilities in descending order
# y_pred_df is already built in an earlier cell and pandas is imported at the top of the
# notebook, so neither is repeated here.
sorted_predict_df = y_pred_df[['predicted_prob', 'actual']].sort_values(
    'predicted_prob', ascending=False
)

In [None]:
sorted_predict_df 

In [None]:
# Rows per decile; floor division drops any remainder
num_per_decile = len(sorted_predict_df) // 10
print("Number of observation per decile: ",num_per_decile)

In [None]:
#Function to get decile(split data into ten equal parts)
def get_deciles(df, n_bins=10):
    """Label the rows of ``df`` with a 1-based ``decile`` column, by row position.

    Rows are binned in their current order, so the caller is expected to have
    sorted the frame already; the first rows receive decile 1. Every bin holds
    ``len(df) // n_bins`` rows, and any leftover rows at the end are placed in
    the last bin.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to label. It is modified in place: a ``decile`` column is added
        (or overwritten).
    n_bins : int, default 10
        Number of bins to split the rows into.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, now carrying the ``decile`` column.

    Raises
    ------
    ValueError
        If ``n_bins`` is smaller than 1.
    """
    if n_bins < 1:
        raise ValueError("n_bins must be a positive integer")

    n_rows = len(df)
    # Derive the bin size from this frame rather than from a notebook-level global;
    # keep it at least 1 so frames shorter than n_bins do not divide by zero.
    rows_per_bin = max(n_rows // n_bins, 1)
    # Cap at the last bin: without the cap, leftover rows (when n_rows is not a
    # multiple of n_bins) would fall outside the valid range of labels.
    zero_based_bin = np.minimum(np.arange(n_rows) // rows_per_bin, n_bins - 1)
    df['decile'] = zero_based_bin + 1
    return df

In [None]:
deciles_predict_df = get_deciles(sorted_predict_df)

In [None]:
deciles_predict_df

In [None]:
# Sum of the 0/1 'actual' labels within each decile, i.e. positives captured per decile
gain_lift_df = deciles_predict_df.groupby('decile')['actual'].sum().to_frame().reset_index()
gain_lift_df.columns = ['decile', 'gain']

In [None]:
gain_lift_df

In [None]:
# Cumulative share (in %) of all positives captured up to and including each decile
gain_lift_df['gain_percentage'] = (
    gain_lift_df['gain'].cumsum().mul(100).div(gain_lift_df['gain'].sum())
)
gain_lift_df

In [None]:
#Above you can see that more than 70% of the subscribers are found in the first seven deciles

In [None]:
# Gain chart: cumulative percentage of positives captured against the decile number
plt.figure(figsize=(8, 4))
plt.plot(gain_lift_df['decile'], gain_lift_df['gain_percentage'], '-')
# Label the figure so it can be read on its own when the notebook is skimmed
plt.xlabel('Decile')
plt.ylabel('Cumulative gain (%)')
plt.title('Gain chart')
plt.show()

In [None]:
#Now calculate the lift to see how good the deciles are
# Lift = cumulative gain (%) divided by the share of rows covered so far (10% per decile)
gain_lift_df['lift'] = gain_lift_df['gain_percentage'].div(gain_lift_df['decile'].mul(10))
gain_lift_df

In [None]:
# Lift chart: cumulative lift against the decile number
plt.figure(figsize=(8, 4))
plt.plot(gain_lift_df['decile'], gain_lift_df['lift'], '-')
# Label the figure so it can be read on its own when the notebook is skimmed
plt.xlabel('Decile')
plt.ylabel('Lift')
plt.title('Lift chart')
plt.show()

In [None]:
#Developing model using Logistic Regression SKLearn Package

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with scikit-learn's default settings, trained on the full feature matrix.
# NOTE(review): scikit-learn regularises by default (unlike sm.Logit), so coefficients need not
# match the statsmodels fit; if a ConvergenceWarning appears, consider scaling X or raising max_iter.
logit = LogisticRegression()
#Fitting model with X and Y values of dataset
logit.fit(X,Y)

In [None]:
# Per-class probabilities for the same rows the model was fitted on;
# column 1 is used later as the probability of the positive class.
pred_y = logit.predict_proba(X)

In [None]:
pred_y

In [None]:
type(pred_y)

In [None]:
# Observed label next to the second predict_proba column, taken as the positive-class probability
y_pred_df_new = pd.DataFrame({
    'actual': Y,
    'predicted_prob': pred_y[:, 1],
})

In [None]:
y_pred_df_new.head()

In [None]:
# Rank rows from the highest to the lowest predicted probability
sorted_predict_df_new = (
    y_pred_df_new[['predicted_prob', 'actual']]
    .sort_values(by='predicted_prob', ascending=False)
)

In [None]:
deciles_predict_df_new = get_deciles(sorted_predict_df_new)

In [None]:
deciles_predict_df_new[0:20]

In [None]:
# Sum of the 0/1 'actual' labels within each decile for the scikit-learn model
gain_lift_df_new = deciles_predict_df_new.groupby('decile')['actual'].sum().to_frame().reset_index()
gain_lift_df_new.columns = ['decile', 'gain']

In [None]:
# Cumulative share (in %) of all positives captured up to and including each decile
gain_lift_df_new['gain_percentage'] = (
    gain_lift_df_new['gain'].cumsum().mul(100).div(gain_lift_df_new['gain'].sum())
)

In [None]:
gain_lift_df_new

In [None]:
# Lift = cumulative gain (%) divided by the share of rows covered so far (10% per decile)
gain_lift_df_new['lift'] = gain_lift_df_new['gain_percentage'].div(gain_lift_df_new['decile'].mul(10))
gain_lift_df_new

In [None]:
#We can also plot the lift and gain curves by writing just a single line of code for each

In [None]:
import scikitplot as skplt
#The following line plots the cumulative gains curve from the true labels and the per-class probabilities
skplt.metrics.plot_cumulative_gain(y_true=Y, y_probas=pred_y)

In [None]:
#The following line plots the lift curve from the true labels and the per-class probabilities
skplt.metrics.plot_lift_curve(y_true=Y, y_probas=pred_y)