In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Importing all the libraries required for this analysis

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings('ignore')
pd.set_option('display.width', 1000)
pd.set_option('display.expand_frame_repr', False)

#Importing train_test_split 
from sklearn.model_selection import train_test_split

# Importing StandardScaler 
from sklearn.preprocessing import StandardScaler

# Importing RFE and LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Importing statsmodel
import statsmodels.api as sm 

# Importing VIF for calculation
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Importing r2_score for calculation
from sklearn.metrics import r2_score 

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import precision_recall_curve

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 150)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Step 1. Reading and Understanding Data 

In [None]:
data=pd.read_csv('/kaggle/input/banking-subscription/bank-additional-full.csv')

In [None]:
data.head()

In [None]:
data.info()

In [None]:
data.shape

#### The column `y` records whether the client subscribed or not, so it is renamed to 'subscription'

In [None]:
data=data.rename(columns={'y':'subscription'})

#### Checking Null values in the required dataset

In [None]:
data.isnull().sum()

#### Checking Data types of columns

In [None]:
data.select_dtypes(include='number').columns

In [None]:
data.select_dtypes(include='object').columns

#### Checking unique data in all columns

In [None]:
data.apply(pd.Series.nunique, axis = 0)

In [None]:
def uniqueDataView(data,col):
    """Print the distinct values of each requested column.

    One numbered line is printed per column, followed by a blank line.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are inspected.
    col : sequence of str
        Labels of the columns to report, in print order.

    Returns
    -------
    None
    """
    # Gather everything up front so an unknown label fails before any output.
    distinct_by_column = {name: data[name].unique() for name in col}
    for position, (name, values) in enumerate(distinct_by_column.items(), start=1):
        print(position, ".", name, ' : ', list(values))
        print()

In [None]:
uniqueDataView(data,list(data.columns.difference(['euribor3m','age','duration'])))

#### Observation:
        1. No column contains NaN/null values.
        2. The following columns are of object type, so they will be treated as categorical data in the analysis:
               'job','marital','education','default','housing','loan','contact','month','day_of_week','poutcome','y' (renamed to 'subscription')
        3. The columns default, education, housing, job, marital and loan contain the placeholder value 'unknown'.
        4. The numeric pdays column contains the value 999, which is an invalid placeholder.

#### Checking the percentage of the placeholder value 999 in each column

In [None]:
round((data.isin([999]).sum()/data.shape[0])*100,2)

    The pdays column holds the value 999 in about 96% of the rows, hence the column is removed from the analysis

## Step 2: Data Cleaning

In [None]:
data=data.drop('pdays',axis=1)

In [None]:
round((data.isin(['unknown']).sum()/data.shape[0])*100,2)

#### Handling invalid values

In [None]:
def removeInvalidValues(dataset,invalidValue):
    """Replace placeholder values with NaN in place and report what remains.

    Every cell of ``dataset`` that equals one of ``invalidValue`` is turned
    into a missing value. The frame passed in is modified; callers do not
    need to use the return value to see the change.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to clean (mutated).
    invalidValue : list
        Values that should be treated as missing, e.g. ``['unknown']``.

    Returns
    -------
    pandas.Series
        Per-column percentage (rounded to 2 decimals) of cells in ``dataset``
        that still hold one of the invalid values after cleaning.
    """
    for val in invalidValue:
        for column in dataset.columns:
            matches = dataset[column] == val
            # Leave columns without a match untouched so their dtype is kept.
            if matches.any():
                dataset[column] = dataset[column].mask(matches)
    # Report on the frame that was actually cleaned, not on a notebook global.
    return round((dataset.isin(invalidValue).sum()/dataset.shape[0])*100,2)

In [None]:
removeInvalidValues(data,['unknown'])

#### Checking values in each categorical columns

In [None]:
def dataView(data,col):
    """Print the relative frequency of every value in each requested column.

    For each column a separator line, a running number and the normalised
    value counts (rounded to 3 decimals, shown as a one-column frame) are
    printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are summarised.
    col : sequence of str
        Labels of the columns to summarise, in print order.

    Returns
    -------
    None
    """
    # All tables are built before printing so a bad label fails without output.
    shares = {
        name: data[name].value_counts(normalize=True).round(3).to_frame()
        for name in col
    }
    for position, table in enumerate(shares.values(), start=1):
        print('\n--------------------------------------------------------------\n', position, '.', table)

In [None]:
dataView(data,list(data.columns.difference(['euribor3m','age','duration'])))

#### Handling the missing/unknown data

#### Impute categorical data with mode and numerical data with median

#### Observation
* The following columns contain 'unknown' or otherwise imputed values, which should ideally be NaN values:
       'job', 'marital', 'education', 'default', 'housing', 'loan','poutcome'
* We need to replace these values with NaN and then fill them with the mode of the column.

In [None]:
# Fill missing values in place on `data`:
#   - numeric columns get their column median,
#   - object columns get their most frequent value (first row of .mode()).
data.fillna(data.select_dtypes(include='number').median(), inplace=True)
data.fillna(data.select_dtypes(include='object').mode().iloc[0], inplace=True)

In [None]:
(100*data.isnull().sum()/len(data)).round(2)

## Step 3: Data Visualization

In [None]:
def categorical_analysis(cat_var,data):
    """Draw an annotated count plot for each column in ``cat_var``.

    Panels are placed one after another on a fixed 6x2 subplot grid (room for
    12 panels). The category is on the y axis, so each bar's width is its
    count; that count is written just past the end of the bar.

    Parameters
    ----------
    cat_var : sequence of str
        Column labels to plot, one panel each.
    data : pandas.DataFrame
        Frame holding the columns.

    Returns
    -------
    None
    """
    plt.figure(figsize=(16, 25))
    for panel, column in enumerate(cat_var, start=1):
        plt.subplot(6, 2, panel)
        ax = sns.countplot(y=column, data=data, orient='v', palette='RdYlGn')
        plt.xticks(rotation=15)
        for bar in ax.patches:
            bar_length = bar.get_width()
            # Label sits 5% beyond the bar end, vertically centred on the bar.
            label_x = 1.05 * bar.get_width()
            label_y = bar.get_y() + 0.5 * bar.get_height()
            plt.text(label_x, label_y, '%d' % int(bar_length), ha='center', va='center')
    plt.show()

In [None]:
categorical_analysis(data.select_dtypes(include='object').columns,data)

#### Observation
* All the columns contain less than 50% null values, hence they are kept and imputed with the median and the mode for numerical and categorical data respectively.

In [None]:
# Correlation heatmap of the numeric features.
# The numeric columns are selected explicitly: the frame still holds object
# columns at this point, and DataFrame.corr() raises on them in pandas >= 2.0
# instead of silently dropping them.
plt.figure(figsize=(16,10))
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True, cmap="RdYlGn")
plt.show()

In [None]:
def categorical_analysis_box(cat_var,tar_var,data):
    """Draw one box plot per column in ``cat_var`` against ``tar_var``.

    Each listed column goes on the x axis and ``tar_var`` on the y axis.
    Panels are placed on a fixed 6x2 subplot grid (room for 12 panels).

    Parameters
    ----------
    cat_var : sequence of str
        Column labels to plot on the x axis, one panel each.
    tar_var : str
        Column label plotted on the y axis of every panel.
    data : pandas.DataFrame
        Frame holding the columns.

    Returns
    -------
    None
    """
    plt.figure(figsize=(16, 30))
    for panel, column in enumerate(cat_var, start=1):
        sns.set_style("whitegrid")
        plt.subplot(6, 2, panel)
        sns.boxplot(x=column, y=tar_var, data=data)
        plt.xticks(rotation=45)
    plt.show()

In [None]:
categorical_analysis_box(data.select_dtypes(include='number').columns,'subscription',data)

In [None]:
data.info()

In [None]:
# Excluding values at or above the 99th percentile in 'age', 'duration' and
# 'cons.conf.idx'. The filters run one after another, so each percentile is
# computed on the rows that survived the previous filter.
data=data[data['age']<np.nanpercentile(data['age'], 99)]
data=data[data['duration']<np.nanpercentile(data['duration'], 99)]
data=data[data['cons.conf.idx']<np.nanpercentile(data['cons.conf.idx'], 99)]

In [None]:
categorical_analysis_box(data.select_dtypes(include='number').columns,'subscription',data)

## Step 4: Data Preparation

In [None]:
# Encode every "yes"/"no" cell of the frame as 1/0. The replacement is
# frame-wide, so it applies to any column holding these strings.
data = (
    data
    .replace(to_replace="yes", value=1)
    .replace(to_replace="no", value=0)
)

In [None]:
# Defining the one-hot encoding helper
def dummies(cols,data):
    """Replace each column in ``cols`` by its dummy (indicator) columns.

    For every listed column the indicators are created with the column name
    as prefix and the first level dropped, appended on the right of the
    frame, and the source column is removed.

    Parameters
    ----------
    cols : iterable of str
        Labels of the columns to encode, processed in order.
    data : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Frame with the encoded columns replaced by their indicators (the
        input object itself when ``cols`` is empty).
    """
    for column in cols:
        indicators = pd.get_dummies(data[column], drop_first = True, prefix=column)
        data = pd.concat([data, indicators], axis = 1).drop(columns=[column])
    return data
# Applying the function to every object-typed column still present in data
data=dummies(data.select_dtypes(include='object').columns,data)


In [None]:
y=data['subscription']

In [None]:
X= data.drop('subscription',axis=1)

### Splitting the Data into Training and Testing Sets

In [None]:
X_train,X_test,y_train,y_test=train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=100)

In [None]:
print('X_train :',X_train.shape,' y_train :',y_train.shape)
print('X_test :',X_test.shape, ' y_test :',y_test.shape)

### Rescaling the Features

In [None]:
# Fit the scaler on the training split and overwrite the numeric training
# columns with their scaled values. `scaler` and `cols` stay in the kernel
# namespace; the evaluation step uses `scaler` again for the test split.
scaler=StandardScaler()
cols=X_train.select_dtypes(include='number').columns
X_train[cols]=scaler.fit_transform(X_train[cols])

In [None]:
X_train .describe()

## Step 5: Building the model

In [None]:
logm1 = sm.GLM(y_train,(sm.add_constant(X_train)), family = sm.families.Binomial())
logm1.fit().summary()

In [None]:
#### Running RFE with the output number of variables equal to 15
# n_features_to_select is passed by keyword: current scikit-learn releases
# accept only the estimator positionally and raise a TypeError otherwise.
logreg = LogisticRegression()
rfe = RFE(logreg, n_features_to_select=15)
rfe = rfe.fit(X_train, y_train)

In [None]:
X_train_rfe = X_train[X_train.columns[rfe.support_]]

In [None]:
def build_model(X,y):
    """Fit a binomial GLM of ``y`` on ``X`` and print its summary.

    A constant column is added to ``X`` via ``sm.add_constant`` before
    fitting.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns.
    y : array-like
        Response passed to the GLM as the endogenous variable.

    Returns
    -------
    pandas.DataFrame
        The design matrix used for the fit (``X`` after ``sm.add_constant``),
        so the caller can inspect or trim it for the next iteration.
    """
    design = sm.add_constant(X)
    fitted = sm.GLM(y, design, family = sm.families.Binomial()).fit()
    print(fitted.summary())
    return design

def checkVIF(dataset):
    """Tabulate the variance inflation factor of every feature.

    A 'const' column, when present, is dropped before the factors are
    computed.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature matrix, optionally including a 'const' column.

    Returns
    -------
    pandas.DataFrame
        Columns 'Features' and 'VIF' (rounded to 2 decimals), sorted by VIF
        in descending order.
    """
    excluded = [name for name in ['const'] if name in dataset.columns]
    features = dataset.drop(columns=excluded)
    scores = [
        variance_inflation_factor(features.values, position)
        for position in range(features.shape[1])
    ]
    vif = pd.DataFrame({'Features': features.columns, 'VIF': scores})
    vif['VIF'] = round(vif['VIF'], 2)
    return vif.sort_values(by = "VIF", ascending = False)

In [None]:
X_train_model = build_model(X_train_rfe,y_train)

In [None]:
checkVIF(X_train_model)

In [None]:
X_train_model = X_train_model.drop(["emp.var.rate"], axis = 1)

In [None]:
X_train_model = build_model(X_train_model,y_train)

In [None]:
checkVIF(X_train_model)

In [None]:
X_train_model = X_train_model.drop(["month_aug"], axis = 1)

In [None]:
X_train_model = build_model(X_train_model,y_train)

In [None]:
checkVIF(X_train_model)

In [None]:
X_train_model = X_train_model.drop(["month_nov"], axis = 1)

In [None]:
X_train_model = build_model(X_train_model,y_train)

In [None]:
checkVIF(X_train_model)

### All variables now are within acceptable range in terms of p-value and VIF. So we go ahead and make our predictions using this model only.

## Step 6: Making Predictions Using the Final Model

In [None]:
logm2 = sm.GLM(y_train,X_train_model, family = sm.families.Binomial())
res = logm2.fit()

### Creating Predict on training dataset

In [None]:
y_train_pred = res.predict(sm.add_constant(X_train_model))
y_train_pred[:10]

In [None]:
y_train_pred = y_train_pred.values.reshape(-1)
y_train_pred[:10]

In [None]:
y_train_pred_final = pd.DataFrame({'Subscription':y_train.values, 'Subscription_Prob':y_train_pred})

### Creating 'predicted' column based on 'Subscription_Prob' value > 0.5

In [None]:
y_train_pred_final['predicted'] = y_train_pred_final['Subscription_Prob'].map(lambda x: 1 if x > 0.5 else 0)

In [None]:
y_train_pred_final

In [None]:
def confusionMatrix(x,y):
    """Print the 2x2 confusion matrix of ``x`` vs ``y`` and derived metrics.

    Parameters
    ----------
    x : array-like
        Actual labels (passed as the first argument to the sklearn metrics).
    y : array-like
        Predicted labels.

    Returns
    -------
    None
        Everything is written to stdout: the raw matrix, the TP/TN/FP/FN
        counts, then sensitivity, specificity, false positive rate, positive
        and negative predictive value, accuracy, precision and recall, each
        rounded to 3 decimals.
    """
    def ratio(part, whole):
        # Share of `part` in `whole`, rounded like every metric printed below.
        return round(part / float(whole), 3)

    cm = metrics.confusion_matrix(x, y)
    print(cm)
    print()
    # Rows are actual classes, columns are predicted classes.
    true_pos, true_neg = cm[1, 1], cm[0, 0]
    false_pos, false_neg = cm[0, 1], cm[1, 0]
    counts = {"TP": true_pos, "TN": true_neg, "FP": false_pos, "FN": false_neg}
    print()
    print(counts)
    print("Sensitivity is : ", ratio(true_pos, true_pos + false_neg))
    print("Specificity is : ", ratio(true_neg, true_neg + false_pos))
    print("False Postive Rate is :", ratio(false_pos, true_neg + false_pos))
    print("Positive predictive value is : ", ratio(true_pos, true_pos + false_pos))
    print("Negative predictive value is : ", ratio(true_neg, true_neg + false_neg))
    print("Accuracy Score is : ", round(metrics.accuracy_score(x, y), 3))
    print("Precision Score is : ", round(precision_score(x, y), 3))
    print("Recall Value is : ", round(recall_score(x, y), 3))

### Creating confusion matrix and related values using the y_train_pred_final

In [None]:
confusionMatrix(y_train_pred_final['Subscription'], y_train_pred_final['predicted'])

### Drawing ROC curve for the Training Dataset

In [None]:
def draw_roc(actual, probs):
    """Plot the ROC curve of ``probs`` against ``actual`` with its AUC.

    Parameters
    ----------
    actual : array-like
        True binary labels.
    probs : array-like
        Predicted scores/probabilities for the positive class.

    Returns
    -------
    None
        The figure is shown; nothing is returned.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(actual, probs, drop_intermediate = False)
    area = metrics.roc_auc_score(actual, probs)

    plt.figure(figsize=(10, 5))
    plt.plot(false_pos_rate, true_pos_rate, label='ROC curve (area = %0.2f)' % area)
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.grid()
    plt.show()

In [None]:
fpr, tpr, thresholds = metrics.roc_curve( y_train_pred_final['Subscription'], y_train_pred_final['Subscription_Prob'], drop_intermediate = False )

In [None]:
draw_roc(y_train_pred_final['Subscription'],y_train_pred_final['Subscription_Prob'])

### Creating columns with different probability cutoffs 

In [None]:
# Candidate cutoffs 0.0, 0.1, ..., 0.9.
numbers = [float(x)/10 for x in range(10)]
for i in numbers:
    # One 0/1 column per cutoff, keyed by the float cutoff itself:
    # 1 where the predicted probability is strictly above the cutoff.
    y_train_pred_final[i]= y_train_pred_final['Subscription_Prob'].map(lambda x: 1 if x > i else 0)
y_train_pred_final.head()

### Calculate accuracy sensitivity & specificity for various probability cutoffs

In [None]:
# Build a table with accuracy, sensitivity and specificity for each cutoff,
# using the per-cutoff 0/1 columns of y_train_pred_final (keyed by the float
# cutoff value).
cutoff_df = pd.DataFrame( columns = ['prob','accuracy','sensi','speci'])
num = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
for i in num:
    # cm1 layout: [0,0] true negatives, [0,1] false positives,
    #             [1,0] false negatives, [1,1] true positives.
    cm1 = metrics.confusion_matrix(y_train_pred_final['Subscription'], y_train_pred_final[i])
    total1=sum(sum(cm1))  # total number of observations in the matrix
    accuracy = (cm1[0,0]+cm1[1,1])/total1
    speci = cm1[0,0]/(cm1[0,0]+cm1[0,1])
    sensi = cm1[1,1]/(cm1[1,0]+cm1[1,1])
    # Row label and 'prob' column both hold the cutoff.
    cutoff_df.loc[i] =[ i ,accuracy,sensi,speci]
print(cutoff_df)

### Plot accuracy,sensitivity & specificity for various probabilities

In [None]:
# Let's plot accuracy, sensitivity and specificity for various probabilities.
# DataFrame.plot opens its own figure, so the size is handed to it directly;
# a separate plt.figure() call before it only leaves an empty figure behind.
cutoff_df.plot.line(x='prob', y=['accuracy','sensi','speci'], figsize=(10, 5))
plt.title('accuracy,sensitivity & specificity vs. probability')
plt.xlabel('Probability')
plt.ylabel('accuracy,sensitivity & specificity')
plt.legend()
plt.grid()
plt.show()

### Creating 'final_predicted' column based on 'Subscription_Prob' value > 0.15

In [None]:
y_train_pred_final['final_predicted'] = y_train_pred_final['Subscription_Prob'].map( lambda x: 1 if x > 0.15 else 0)
y_train_pred_final.head()

### Creating confusion matrix and related values using y_train_pred_final

In [None]:
confusionMatrix(y_train_pred_final['Subscription'], y_train_pred_final['final_predicted'])

In [None]:
# Precision and recall of the training predictions at every probability
# threshold. This rebinds `thresholds`, which the ROC cell also assigns.
p, r, thresholds = precision_recall_curve(y_train_pred_final['Subscription'], y_train_pred_final['Subscription_Prob'])

In [None]:
# Precision (green) and recall (red) against the probability threshold.
# p and r hold one more entry than thresholds, hence the [:-1] slices.
plt.plot(thresholds, p[:-1], "g-", label="Precision")
plt.plot(thresholds, r[:-1], "r-", label="Recall")
plt.title("Precision Recall Plot")
plt.xlabel("Probability threshold")
plt.ylabel("Precision / Recall")
plt.legend()
plt.grid()
plt.show()

## Step 7: Model Evaluation    

In [None]:
# Scale the test split with the scaler already fitted on the training split.
# Only transform() is used here: refitting on the test rows would put them on
# a different scale from the data the model was trained on.
cols=X_train.select_dtypes(include='number').columns
X_test[cols]=scaler.transform(X_test[cols])

In [None]:
final_col=X_train_model.drop('const',axis=1)

In [None]:
X_test=X_test[final_col.columns]

In [None]:
X_test_sm = sm.add_constant(X_test)
y_test_pred = res.predict(X_test_sm)
y_test_pred[:10]

In [None]:
y_pred_1 = pd.DataFrame(y_test_pred)
y_test_df = pd.DataFrame(y_test)

In [None]:
y_pred_1.reset_index(drop=True, inplace=True)
y_test_df.reset_index(drop=True, inplace=True)

In [None]:
y_test_pred_final = pd.concat([y_test_df, y_pred_1],axis=1)

In [None]:
y_test_pred_final= y_test_pred_final.rename(columns={ 0 : 'Subscription_Prob'})

In [None]:
y_test_pred_final.head()

In [None]:
y_test_pred_final['Converted_Score'] = y_test_pred_final.Subscription_Prob.map( lambda x: round(x*100))

In [None]:
y_test_pred_final['final_Predicted'] = y_test_pred_final.Subscription_Prob.map(lambda x: 1 if x > 0.15 else 0)

### Creating Confusion matrix for the test Data

In [None]:
confusionMatrix(y_test_pred_final['subscription'], y_test_pred_final['final_Predicted'])

# Recommendation

#### Top features which are recommended by model as factor :
    duration
    cons.price.idx
    cons.conf.idx
    month_mar
    month_jun
    poutcome_success
    poutcome_nonexistent
    
#### Features which are indicators of no subscription:
    campaign
    euribor3m
    job_blue-collar
    contact_telephone
    month_may


# Insights

1.) Leads whose previous campaign outcome is either poutcome_success or poutcome_nonexistent are good candidates for subscription.
    Marketing spend can be increased on these 2 groups to increase revenue for the company, as they are more likely to subscribe to the banking notification.
   
2.)  Sales professionals should not invest too much time in people from the 'blue-collar' job group, as they are the least likely to subscribe.
