In [24]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

In [25]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('Raw/Data.csv')
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


In [26]:
# Show the dtype of every column as loaded from the CSV.
df.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
deposit      object
dtype: object

In [27]:
# Remove the columns that the rest of this notebook does not use.
# One drop call replaces five separate in-place drops, so the whole step
# lives in a single cell and the list of removed columns is visible at a glance.
unused_cols = ['job', 'month', 'poutcome', 'education', 'pdays']
df = df.drop(columns=unused_cols)

In [32]:
# Re-check the remaining columns and their dtypes after the drops above.
df.dtypes

age          int64
marital     object
default     object
balance      int64
housing     object
loan        object
contact     object
day          int64
duration     int64
campaign     int64
previous     int64
deposit     object
dtype: object

In [33]:
# Count missing values per column.
df.isnull().sum()

age         0
marital     0
default     0
balance     0
housing     0
loan        0
contact     0
day         0
duration    0
campaign    0
previous    0
deposit     0
dtype: int64

In [34]:
# List the distinct values of every object-typed column so the categorical
# levels can be reviewed before encoding.
object_cols = [col for col in df if df[col].dtypes == 'object']
for col in object_cols:
    print(col, ': ', df[col].unique())

marital :  ['married' 'single' 'divorced']
default :  ['no' 'yes']
housing :  ['yes' 'no']
loan :  ['no' 'yes']
contact :  ['unknown' 'cellular' 'telephone']
deposit :  ['yes' 'no']


In [35]:
yes_no_cols = ['default', 'housing', 'loan', 'deposit']

# Encode the binary yes/no columns as 1/0.
# The result is assigned back to `df` rather than calling
# `df[col].replace(..., inplace=True)`: that form mutates a temporary Series
# selected from `df`, which pandas warns about and which leaves `df`
# unchanged when Copy-on-Write is enabled.
# The explicit cast fixes the resulting dtype to int64 regardless of how
# `replace` infers the dtype of its result.
df[yes_no_cols] = df[yes_no_cols].replace({'yes': 1, 'no': 0}).astype('int64')

In [36]:
# Print the distinct values of each column to verify the yes/no encoding.
for col, column_values in df.items():
    print(col, ': ', column_values.unique())

age :  [59 56 41 55 54 42 60 37 28 38 30 29 46 31 35 32 49 43 26 40 33 23 48 45
 36 52 53 39 57 51 44 24 50 27 34 47 25 58 61 68 75 22 69 66 85 72 90 67
 71 21 74 65 62 83 70 76 77 19 73 63 20 78 95 64 79 82 18 86 84 87 92 81
 80 93 88 89]
marital :  ['married' 'single' 'divorced']
default :  [0 1]
balance :  [2343   45 1270 ... 1594 2593 -134]
housing :  [1 0]
loan :  [0 1]
contact :  ['unknown' 'cellular' 'telephone']
day :  [ 5  6  7  8  9 12 13 14 15 16 19 20 21 23 26 27 28 29 30  2  3  4 11 17
 18 24  1 10 22 25 31]
duration :  [1042 1467 1389 ... 1504  818 1812]
campaign :  [ 1  2  3  4  6  5  8 11  9 10 15 12 14  7 24 13 17 29 21 20 16 32 19 25
 22 43 18 41 63 27 30 26 23 28 33 31]
previous :  [ 0  1  4  2  3 10  7  9  5 29  6 13 17  8 11 30 22 58 21 14 26 12 19 55
 23 15 20 27 16 41 40 28 18 37]
deposit :  [1 0]


In [37]:
# One-hot encode the two remaining categorical columns.
# The indicator dtype is pinned because the pandas default depends on the
# installed version (uint8 before 2.0, bool from 2.0 on); an explicit uint8
# keeps the encoded columns numeric on every version.
df2 = pd.get_dummies(data=df, columns=['marital', 'contact'], dtype='uint8')
df2.columns

Index(['age', 'default', 'balance', 'housing', 'loan', 'day', 'duration',
       'campaign', 'previous', 'deposit', 'marital_divorced',
       'marital_married', 'marital_single', 'contact_cellular',
       'contact_telephone', 'contact_unknown'],
      dtype='object')

In [39]:
# Show the column dtypes of the one-hot encoded frame.
df2.dtypes

age                  int64
default              int64
balance              int64
housing              int64
loan                 int64
day                  int64
duration             int64
campaign             int64
previous             int64
deposit              int64
marital_divorced     uint8
marital_married      uint8
marital_single       uint8
contact_cellular     uint8
contact_telephone    uint8
contact_unknown      uint8
dtype: object

In [41]:
# Numeric columns that are rescaled; the 0/1 and one-hot columns are left as they are.
cols_to_scale = ['age','balance','day','duration','campaign','previous']
# Work on a copy so df2 keeps the unscaled values.
df3= df2.copy(deep=True)


from sklearn.preprocessing import MinMaxScaler
# MinMaxScaler with default settings maps each listed column to the [0, 1] range.
scaler = MinMaxScaler()
# NOTE(review): the scaler is fit on the full dataset here, before the
# train/test split done later in this notebook, so the held-out rows
# influence the min/max used for scaling. Consider fitting on the training
# rows only and applying transform() to the test rows.
df3[cols_to_scale] = scaler.fit_transform(df3[cols_to_scale])

# Preview the first three scaled rows.
df3.head(3)

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,previous,deposit,marital_divorced,marital_married,marital_single,contact_cellular,contact_telephone,contact_unknown
0,0.532468,0,0.104371,1,0,0.133333,0.26811,0.0,0.0,1,0,1,0,0,0,1
1,0.493506,0,0.078273,0,0,0.133333,0.377675,0.0,0.0,1,0,1,0,0,0,1
2,0.298701,0,0.092185,1,0,0.133333,0.357566,0.0,0.0,1,0,1,0,0,0,1


In [42]:
# Separate the target column from the predictor columns.
featLabel = df3['deposit']
feat = df3.drop(columns=['deposit'])

In [43]:
# Draw a reproducible 80% random sample of rows for training; the rows that
# were not sampled form the test set. Labels are aligned through the row index.
train = feat.sample(frac=0.8, random_state=5)
test = feat.drop(index=train.index)
trainLabel = featLabel.loc[train.index]
testLabel = featLabel.drop(index=train.index)

In [44]:
# Report the dimensions of each split (features first, then labels).
for split in (train, test, trainLabel, testLabel):
    print(split.shape)

(8930, 15)
(2232, 15)
(8930,)
(2232,)


In [45]:
# Show the type of `train` before the NumPy conversion in the next cell.
type(train)

pandas.core.frame.DataFrame

In [46]:
# Convert the training features to a NumPy array.
# np.asarray yields the same array as DataFrame.to_numpy() and passes an
# existing ndarray through unchanged, so this cell can be re-run safely
# (an ndarray has no .to_numpy() method).
train = np.asarray(train)

In [47]:
# Show the type of `train` again after the conversion above.
type(train)

numpy.ndarray

In [48]:
# Write every split to disk as plain text. Labels are written with an integer
# format; features use np.savetxt's default float format.
# Note: np.savetxt separates values with a space by default, so these files
# are whitespace-delimited despite the .csv extension.
exports = [
    ('Raw/train.csv', train, {}),
    ('Raw/trainLabel.csv', trainLabel, {'fmt': '%i'}),
    ('Raw/test.csv', test, {}),
    ('Raw/testLabel.csv', testLabel, {'fmt': '%i'}),
]
for path, array, options in exports:
    np.savetxt(path, array, **options)

In [49]:
# Read the saved splits back from disk; label arrays are cast to integers.
train, test = (
    np.loadtxt(path) for path in ('Raw/train.csv', 'Raw/test.csv')
)
trainLabel, testLabel = (
    np.loadtxt(path).astype(int)
    for path in ('Raw/trainLabel.csv', 'Raw/testLabel.csv')
)

In [50]:
# Confirm the reloaded arrays have the same dimensions as the saved splits.
for loaded in (train, test, trainLabel, testLabel):
    print(loaded.shape)

(8930, 15)
(2232, 15)
(8930,)
(2232,)
