# Import required packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import urllib.request
from sklearn.model_selection import train_test_split

import sweetviz as sv

import warnings
# Silence every warning category for the rest of the notebook.
# NOTE(review): this also hides library deprecation/future warnings, so
# behaviour changes in dependencies will not be visible in cell output.
warnings.filterwarnings('ignore')

%matplotlib inline

# Load Data

In [2]:
data_location = "bank-full.csv"
data_url = 'https://github.com/diwakarsachan/marketing-project-final/raw/main/bank-full.csv'

# Resolve the dataset path: if the configured path is missing, fall back to
# the bare file name in the working directory, and download the file from
# data_url only when it is still not found there.
if not os.path.exists(data_location):
    data_location = os.path.basename(data_location)
if not os.path.exists(data_location):
    print("Downloading : ", data_url)
    urllib.request.urlretrieve(data_url, data_location)
print('data_location:', data_location)

data_location: bank-full.csv


In [3]:
# Display settings, applied in one call: floats rendered with two decimals
# and thousands separators; no limit on displayed columns or cell width.
pd.set_option(
    'display.float_format', '{:,.2f}'.format,
    'display.max_columns', None,
    'display.max_colwidth', None,
)

# The CSV uses ';' as its field separator.
marketing_outcome = pd.read_csv(data_location, sep=';')
print('Orignal Shape of the data : ',marketing_outcome.shape)


Orignal Shape of the data :  (45211, 17)


# Clean Up

In [4]:
# Keep only the numeric-dtype columns and remember their names; numeric_col
# is reused further down to pick the columns that get scaled.
numeric_data = marketing_outcome.select_dtypes(include='number')
numeric_col = numeric_data.columns

print("Numeric Features:", numeric_col.values)
numeric_data.head()

Numeric Features: ['age' 'balance' 'day' 'duration' 'campaign' 'pdays' 'previous']


Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
0,58,2143,5,261,1,-1,0
1,44,29,5,151,1,-1,0
2,33,2,5,76,1,-1,0
3,47,1506,5,92,1,-1,0
4,33,1,5,198,1,-1,0


In [5]:
# Complement of the numeric selection: every column whose dtype is not numeric.
categorical_data = marketing_outcome.select_dtypes(exclude='number')
categorical_col = categorical_data.columns

print("Categorical Features:", categorical_col.values)
categorical_data.head()

Categorical Features: ['job' 'marital' 'education' 'default' 'housing' 'loan' 'contact' 'month'
 'poutcome' 'y']


Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome,y
0,management,married,tertiary,no,yes,no,unknown,may,unknown,no
1,technician,single,secondary,no,yes,no,unknown,may,unknown,no
2,entrepreneur,married,secondary,no,yes,yes,unknown,may,unknown,no
3,blue-collar,married,unknown,no,yes,no,unknown,may,unknown,no
4,unknown,single,unknown,no,no,no,unknown,may,unknown,no


#### Treating outliers

In [None]:
# from scipy.stats.mstats import winsorize
# numeric_col = dataframe.select_dtypes(include=np.number).columns

# for col in numeric_col:    
#     dataframe[col] = winsorize(dataframe[col], limits=[0.05, 0.1],inclusive=(True, True))

# Shape data 

In [8]:
# Encode the four yes/no columns as 1/0 integers.
# The explicit astype pins the result to int64 instead of relying on
# replace() to downcast the object columns on its own: pandas >= 2.2
# deprecates that implicit downcast, and columns left as object dtype would
# be picked up by the later select_dtypes(include='O') / get_dummies step and
# expanded into dummy columns. The cell stays safe to re-run: on already
# encoded integer columns both replace() and astype() are no-ops.
binary_columns = ['default', 'housing', 'loan', 'y']
yes_no_codes = {'yes': 1, 'no': 0}
marketing_outcome = (
    marketing_outcome
    .replace({column: yes_no_codes for column in binary_columns})
    .astype({column: 'int64' for column in binary_columns})
)

In [13]:
# One-hot encode every column that is still object-dtype.
# Note: from here on categorical_col holds a DataFrame slice (the object
# columns), not the column Index assigned to the same name earlier.
categorical_col = marketing_outcome.select_dtypes(include='O')
marketing_outcome = pd.get_dummies(
    marketing_outcome,
    columns=list(categorical_col.columns),
)

In [14]:
# Inspect column names, non-null counts and dtypes after encoding.
marketing_outcome.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 49 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   age                  45211 non-null  int64
 1   default              45211 non-null  int64
 2   balance              45211 non-null  int64
 3   housing              45211 non-null  int64
 4   loan                 45211 non-null  int64
 5   day                  45211 non-null  int64
 6   duration             45211 non-null  int64
 7   campaign             45211 non-null  int64
 8   pdays                45211 non-null  int64
 9   previous             45211 non-null  int64
 10  y                    45211 non-null  int64
 11  job_admin.           45211 non-null  uint8
 12  job_blue-collar      45211 non-null  uint8
 13  job_entrepreneur     45211 non-null  uint8
 14  job_housemaid        45211 non-null  uint8
 15  job_management       45211 non-null  uint8
 16  job_retired          4

#### Scaling

In [15]:
# Slice out the columns listed in numeric_col; these are the ones to be scaled.
df_numeric = marketing_outcome.loc[:, numeric_col]

In [16]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Learn per-column min/max from the numeric frame, then rescale it.
# The result is a NumPy array, so column labels are lost at this point.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# further down, so test-set ranges influence the scaling; consider fitting on
# the training rows only.
minmax_scalar = MinMaxScaler().fit(df_numeric)
nd_numeric_transform = minmax_scalar.transform(df_numeric)

In [17]:
# Wrap the scaled array back into a labelled frame.
# The source frame's index is carried over explicitly so that the later
# column-wise concat with marketing_outcome aligns row for row even if the
# data no longer has a default 0..n-1 index (e.g. after filtering rows).
# The scaler output is already an ndarray, so no extra np.array() is needed.
df_numeric_transform = pd.DataFrame(
    nd_numeric_transform,
    columns=numeric_col,
    index=df_numeric.index,
)
df_numeric_transform.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
0,0.52,0.09,0.13,0.05,0.0,0.0,0.0
1,0.34,0.07,0.13,0.03,0.0,0.0,0.0
2,0.19,0.07,0.13,0.02,0.0,0.0,0.0
3,0.38,0.09,0.13,0.02,0.0,0.0,0.0
4,0.19,0.07,0.13,0.04,0.0,0.0,0.0


In [18]:
# Columns that were not scaled, in their existing order.
non_numeric_columns = [
    column for column in marketing_outcome.columns if column not in numeric_col
]

# Final modelling frame: scaled numeric columns first, then the untouched rest.
marketing_outcome_final = pd.concat(
    [df_numeric_transform, marketing_outcome.loc[:, non_numeric_columns]],
    axis=1,
)
marketing_outcome_final.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,default,housing,loan,y,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,marital_divorced,marital_married,marital_single,education_primary,education_secondary,education_tertiary,education_unknown,contact_cellular,contact_telephone,contact_unknown,month_apr,month_aug,month_dec,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,0.52,0.09,0.13,0.05,0.0,0.0,0.0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
1,0.34,0.07,0.13,0.03,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
2,0.19,0.07,0.13,0.02,0.0,0.0,0.0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
3,0.38,0.09,0.13,0.02,0.0,0.0,0.0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
4,0.19,0.07,0.13,0.04,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1


# Feature Engineering

In [20]:
# Name of the target column; everything else is treated as a feature.
label_col = 'y'

# Derive the feature list from label_col rather than repeating the literal,
# so changing the target name in one place keeps both in sync.
feature_columns = marketing_outcome_final.columns.drop([label_col])
print (feature_columns)

Index(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous',
       'default', 'housing', 'loan', 'job_admin.', 'job_blue-collar',
       'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired',
       'job_self-employed', 'job_services', 'job_student', 'job_technician',
       'job_unemployed', 'job_unknown', 'marital_divorced', 'marital_married',
       'marital_single', 'education_primary', 'education_secondary',
       'education_tertiary', 'education_unknown', 'contact_cellular',
       'contact_telephone', 'contact_unknown', 'month_apr', 'month_aug',
       'month_dec', 'month_feb', 'month_jan', 'month_jul', 'month_jun',
       'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_sep',
       'poutcome_failure', 'poutcome_other', 'poutcome_success',
       'poutcome_unknown'],
      dtype='object')


In [21]:
# Feature matrix and target. The target is selected with a one-element list,
# so y is a single-column DataFrame rather than a Series.
X = marketing_outcome_final.loc[:, feature_columns]
y = marketing_outcome_final.loc[:, [label_col]]

print(X.shape)
print(y.shape)

(45211, 48)
(45211, 1)


# Test Train Split

In [22]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=42
)

# Report the shape of each partition.
for _split_label, _split in (
    ("x_train :", X_train),
    ("x_test :", X_test),
    ("y_train :", y_train),
    ("y_test :", y_test),
):
    print(_split_label, _split.shape)

x_train : (36168, 48)
x_test : (9043, 48)
y_train : (36168, 1)
y_test : (9043, 1)


# Oversample to fix the imbalance

In [23]:
from imblearn.over_sampling import SMOTE

# Seed the sampler so the synthetic minority rows are the same on every run;
# the value matches the random_state used for the train/test split.
smote = SMOTE(random_state=42)

# Resample the training partition only; the test partition is left untouched
# so evaluation is done on real, un-synthesised rows.
X_trainsm, y_trainsm = smote.fit_resample(X_train, y_train)

print('Original dataset shape', len(y_train))
print('Resample dataset shape', len(y_trainsm))

Original dataset shape 36168
Resample dataset shape 63940


In [24]:
# Check the class balance of the resampled training labels
# (normalize=True returns proportions instead of counts).
y_trainsm.value_counts(normalize=True)

y
0   0.50
1   0.50
dtype: float64

# Algorithm Starts