**Mounting Drive**

In [1]:
# Mount Google Drive into the Colab runtime at /content/drive; the dataset
# read later in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd

**reading the dataset**

In [3]:
# Location of the bank CSV inside the mounted Drive folder.
DATASET_PATH = '/content/drive/MyDrive/ML_TRAINING/Dataset/bank.csv'

# Load the raw data into a DataFrame.
bank_dataset = pd.read_csv(DATASET_PATH)

**checking null values**

In [4]:
bank_dataset.isnull().any() # per-column flag: True if the column contains at least one missing value

age          False
job          False
marital      False
education    False
default      False
balance      False
housing      False
loan         False
contact      False
day          False
month        False
duration     False
campaign     False
pdays        False
previous     False
poutcome     False
deposit      False
dtype: bool

In [5]:
bank_dataset.isnull().sum() # per-column count of missing values; all zeros means nothing to impute

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
deposit      0
dtype: int64

In [6]:
bank_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11162 entries, 0 to 11161
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        11162 non-null  int64 
 1   job        11162 non-null  object
 2   marital    11162 non-null  object
 3   education  11162 non-null  object
 4   default    11162 non-null  object
 5   balance    11162 non-null  int64 
 6   housing    11162 non-null  object
 7   loan       11162 non-null  object
 8   contact    11162 non-null  object
 9   day        11162 non-null  int64 
 10  month      11162 non-null  object
 11  duration   11162 non-null  int64 
 12  campaign   11162 non-null  int64 
 13  pdays      11162 non-null  int64 
 14  previous   11162 non-null  int64 
 15  poutcome   11162 non-null  object
 16  deposit    11162 non-null  object
dtypes: int64(7), object(10)
memory usage: 1.4+ MB


In [7]:
type(bank_dataset)

pandas.core.frame.DataFrame

In [8]:
bank_dataset.head(3)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes


In [9]:
# Remove the five columns that are left out of the feature matrix built below.
# The axis is given through the explicit `columns=` keyword: the positional
# form drop(label, 1) was deprecated in pandas 1.3 and raises TypeError in
# pandas 2.x. A single call replaces five separate in-place mutations.
bank_dataset = bank_dataset.drop(
    columns=["education", "default", "day", "month", "duration"]
)

In [10]:
bank_dataset

Unnamed: 0,age,job,marital,balance,housing,loan,contact,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,2343,yes,no,unknown,1,-1,0,unknown,yes
1,56,admin.,married,45,no,no,unknown,1,-1,0,unknown,yes
2,41,technician,married,1270,yes,no,unknown,1,-1,0,unknown,yes
3,55,services,married,2476,yes,no,unknown,1,-1,0,unknown,yes
4,54,admin.,married,184,no,no,unknown,2,-1,0,unknown,yes
...,...,...,...,...,...,...,...,...,...,...,...,...
11157,33,blue-collar,single,1,yes,no,cellular,1,-1,0,unknown,no
11158,39,services,married,733,no,no,unknown,4,-1,0,unknown,no
11159,32,technician,single,29,no,no,cellular,2,-1,0,unknown,no
11160,43,technician,married,0,no,yes,cellular,2,172,5,failure,no


In [11]:
bank_dataset.columns

Index(['age', 'job', 'marital', 'balance', 'housing', 'loan', 'contact',
       'campaign', 'pdays', 'previous', 'poutcome', 'deposit'],
      dtype='object')

**Splitting x and y**

In [12]:
# Target: the `loan` column, kept as a one-column DataFrame so y stays 2-D.
# Features: every other remaining column, in their existing order.
# Selecting by name replaces the positional slices (0:5, 6:12 and 5:6), which
# silently pick the wrong columns if the column order or the list of dropped
# columns ever changes.
# NOTE(review): `deposit` stays in x as a feature while `loan` is the target;
# confirm this is the intended prediction task.
TARGET_COLUMN = "loan"
x = bank_dataset.drop(columns=[TARGET_COLUMN])
y = bank_dataset[[TARGET_COLUMN]]

In [13]:
x

Unnamed: 0,age,job,marital,balance,housing,contact,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,2343,yes,unknown,1,-1,0,unknown,yes
1,56,admin.,married,45,no,unknown,1,-1,0,unknown,yes
2,41,technician,married,1270,yes,unknown,1,-1,0,unknown,yes
3,55,services,married,2476,yes,unknown,1,-1,0,unknown,yes
4,54,admin.,married,184,no,unknown,2,-1,0,unknown,yes
...,...,...,...,...,...,...,...,...,...,...,...
11157,33,blue-collar,single,1,yes,cellular,1,-1,0,unknown,no
11158,39,services,married,733,no,unknown,4,-1,0,unknown,no
11159,32,technician,single,29,no,cellular,2,-1,0,unknown,no
11160,43,technician,married,0,no,cellular,2,172,5,failure,no


In [14]:
y

Unnamed: 0,loan
0,no
1,no
2,no
3,no
4,no
...,...
11157,no
11158,no
11159,no
11160,yes


In [15]:
print(type(x),type(y))

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'>


**Converting the DataFrames to NumPy arrays**

In [16]:
# Swap both DataFrames for their underlying NumPy arrays, reusing the names.
x, y = x.values, y.values


In [17]:
x

array([[59, 'admin.', 'married', ..., 0, 'unknown', 'yes'],
       [56, 'admin.', 'married', ..., 0, 'unknown', 'yes'],
       [41, 'technician', 'married', ..., 0, 'unknown', 'yes'],
       ...,
       [32, 'technician', 'single', ..., 0, 'unknown', 'no'],
       [43, 'technician', 'married', ..., 5, 'failure', 'no'],
       [34, 'technician', 'married', ..., 0, 'unknown', 'no']],
      dtype=object)

In [18]:
y

array([['no'],
       ['no'],
       ['no'],
       ...,
       ['no'],
       ['yes'],
       ['no']], dtype=object)

In [19]:
x.shape

(11162, 11)

**identifying the categorical values for each column**

In [20]:
bank_dataset["job"].unique()

array(['admin.', 'technician', 'services', 'management', 'retired',
       'blue-collar', 'unemployed', 'entrepreneur', 'housemaid',
       'unknown', 'self-employed', 'student'], dtype=object)

In [21]:
bank_dataset["marital"].unique()


array(['married', 'single', 'divorced'], dtype=object)

In [22]:
bank_dataset["housing"].unique()


array(['yes', 'no'], dtype=object)

In [23]:
bank_dataset["contact"].unique()

array(['unknown', 'cellular', 'telephone'], dtype=object)

In [24]:
bank_dataset["poutcome"].unique()


array(['unknown', 'other', 'failure', 'success'], dtype=object)

In [25]:
x[0:5]

array([[59, 'admin.', 'married', 2343, 'yes', 'unknown', 1, -1, 0,
        'unknown', 'yes'],
       [56, 'admin.', 'married', 45, 'no', 'unknown', 1, -1, 0,
        'unknown', 'yes'],
       [41, 'technician', 'married', 1270, 'yes', 'unknown', 1, -1, 0,
        'unknown', 'yes'],
       [55, 'services', 'married', 2476, 'yes', 'unknown', 1, -1, 0,
        'unknown', 'yes'],
       [54, 'admin.', 'married', 184, 'no', 'unknown', 2, -1, 0,
        'unknown', 'yes']], dtype=object)

**Before using ColumnTransformer**

In [26]:
# Rebuild the feature frame (every column except position 5) directly from
# bank_dataset so the raw, un-encoded values can be displayed for comparison.
u, v = bank_dataset.iloc[:, 0:5], bank_dataset.iloc[:, 6:12]
a = pd.concat([u, v], axis=1, join="outer")
a.values

array([[59, 'admin.', 'married', ..., 0, 'unknown', 'yes'],
       [56, 'admin.', 'married', ..., 0, 'unknown', 'yes'],
       [41, 'technician', 'married', ..., 0, 'unknown', 'yes'],
       ...,
       [32, 'technician', 'single', ..., 0, 'unknown', 'no'],
       [43, 'technician', 'married', ..., 5, 'failure', 'no'],
       [34, 'technician', 'married', ..., 0, 'unknown', 'no']],
      dtype=object)

**Convert categorical data to one-hot (binary) columns**

In [27]:
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import OneHotEncoder

In [28]:
# ColumnTransformer takes a list of (step name, transformer, column indices).
# Columns 1, 2, 4, 5, 9 and 10 of x hold the string-valued features and are
# one-hot encoded; remainder="passthrough" keeps the other columns unchanged
# and places them after the encoded ones.
ct = ColumnTransformer(
    transformers=[("one", OneHotEncoder(), [1, 2, 4, 5, 9, 10])],
    remainder="passthrough",
)
x = ct.fit_transform(x)


In [29]:
x.shape

(11162, 31)

**After using Column Transformer**

In [30]:
x[0:5]

array([[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0,
        59, 2343, 1, -1, 0],
       [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0,
        56, 45, 1, -1, 0],
       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0,
        1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0,
        41, 1270, 1, -1, 0],
       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0,
        55, 2476, 1, -1, 0],
       [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0,
        54, 184, 2, -1, 0]], dtype=object)

**splitting data set into train and test set**

In [31]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

In [32]:
x_train.shape

(8929, 31)

In [33]:
y_train.shape

(8929, 1)

In [34]:
x_test.shape

(2233, 31)

In [35]:
y_test.shape

(2233, 1)

**feature scaling**

In [36]:
# StandardScaler standardises each column to zero mean and unit variance;
# the result is not clipped to a fixed range such as [-3, 3] or [0, 1].
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Fit the scaler on the training split and replace x_train with the scaled array.
x_train = sc.fit_transform(x_train)
x_train

array([[-0.37172192, -0.45760242, -0.17658264, ..., -0.56394809,
        -0.48428507, -0.35888264],
       [-0.37172192, -0.45760242, -0.17658264, ..., -0.56394809,
        -0.48428507, -0.35888264],
       [-0.37172192,  2.18530314, -0.17658264, ...,  0.92827075,
        -0.48428507, -0.35888264],
       ...,
       [-0.37172192, -0.45760242, -0.17658264, ...,  2.04743488,
        -0.48428507, -0.35888264],
       [ 2.69018306, -0.45760242, -0.17658264, ..., -0.56394809,
        -0.48428507, -0.35888264],
       [-0.37172192, -0.45760242, -0.17658264, ..., -0.56394809,
        -0.48428507, -0.35888264]])