In [65]:
import numpy as np
import pandas as pd
from scipy.linalg import svd

data = pd.read_csv("bank.csv")


In [66]:
# drop useless columns
data_dropped = data.drop(columns=['poutcome', 'contact','default'])
print(data_dropped.shape)
data_dropped.head()

(11162, 14)


Unnamed: 0,age,job,marital,education,balance,housing,loan,day,month,duration,campaign,pdays,previous,deposit
0,59,admin.,married,secondary,2343,yes,no,5,may,1042,1,-1,0,yes
1,56,admin.,married,secondary,45,no,no,5,may,1467,1,-1,0,yes
2,41,technician,married,secondary,1270,yes,no,5,may,1389,1,-1,0,yes
3,55,services,married,secondary,2476,yes,no,5,may,579,1,-1,0,yes
4,54,admin.,married,tertiary,184,no,no,5,may,673,2,-1,0,yes


In [67]:
# analyzing type of data present in columns to filter it

for x in data_dropped.columns :
    print('--------------------',x, "-----------------------------")
    print(data_dropped[x].value_counts())


-------------------- age -----------------------------
31    496
32    477
34    466
33    464
35    461
     ... 
92      2
93      2
90      2
89      1
95      1
Name: age, Length: 76, dtype: int64
-------------------- job -----------------------------
management       2566
blue-collar      1944
technician       1823
admin.           1334
services          923
retired           778
self-employed     405
student           360
unemployed        357
entrepreneur      328
housemaid         274
unknown            70
Name: job, dtype: int64
-------------------- marital -----------------------------
married     6351
single      3518
divorced    1293
Name: marital, dtype: int64
-------------------- education -----------------------------
secondary    5476
tertiary     3689
primary      1500
unknown       497
Name: education, dtype: int64
-------------------- balance -----------------------------
0       774
1        39
3        35
2        34
4        29
       ... 
4641      1
2975      1


In [68]:
# replace unknown data to nan to use drpna() function in pandas
replaced_data = data_dropped.replace('unknown', np.nan )
# analyzing null data
replaced_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11162 entries, 0 to 11161
Data columns (total 14 columns):
age          11162 non-null int64
job          11092 non-null object
marital      11162 non-null object
education    10665 non-null object
balance      11162 non-null int64
housing      11162 non-null object
loan         11162 non-null object
day          11162 non-null int64
month        11162 non-null object
duration     11162 non-null int64
campaign     11162 non-null int64
pdays        11162 non-null int64
previous     11162 non-null int64
deposit      11162 non-null object
dtypes: int64(7), object(7)
memory usage: 1.2+ MB


In [69]:
# found no null rows in columns
pd.isnull(replaced_data).sum()

age            0
job           70
marital        0
education    497
balance        0
housing        0
loan           0
day            0
month          0
duration       0
campaign       0
pdays          0
previous       0
deposit        0
dtype: int64

In [70]:
# drop useless rows
replaced_data.dropna(inplace=True)

In [71]:
# checking there is not any null data
pd.isnull(replaced_data).sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
loan         0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
deposit      0
dtype: int64

In [72]:
replaced_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10634 entries, 0 to 11161
Data columns (total 14 columns):
age          10634 non-null int64
job          10634 non-null object
marital      10634 non-null object
education    10634 non-null object
balance      10634 non-null int64
housing      10634 non-null object
loan         10634 non-null object
day          10634 non-null int64
month        10634 non-null object
duration     10634 non-null int64
campaign     10634 non-null int64
pdays        10634 non-null int64
previous     10634 non-null int64
deposit      10634 non-null object
dtypes: int64(7), object(7)
memory usage: 1.2+ MB


In [73]:
# checking current dimention of data
replaced_data.columns

Index(['age', 'job', 'marital', 'education', 'balance', 'housing', 'loan',
       'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'deposit'],
      dtype='object')

In [74]:
# replace nominal data to ordinal data in columns
# for marital

d={'married':0,'single':1,'divorced':2}
for x in d.keys():
    replaced_data = replaced_data.replace(x,d[x])
    
    
#  for job   
    
d={
'management':0,
'blue-collar':1,
'technician':2,
'admin.':3,
'services':4,
'retired' :5,
'self-employed':6,
'unemployed': 7,
'entrepreneur':8,
'student': 9,
'housemaid':10}
for x in d.keys():
    replaced_data = replaced_data.replace(x,d[x])

    
#   for education  
d={
'secondary':0,
'tertiary':1,
'primary':2
}
for x in d.keys():
    replaced_data = replaced_data.replace(x,d[x])
    
#   for deposit,loan,housing  
d={
    'no':0,
    'yes':1
}
for x in d.keys():
    replaced_data = replaced_data.replace(x,d[x])

    
#    for months 
    
d={
    'jan':0,
    'feb':1,
    'mar':2,
    'apr':3,
    'may':4,
    'jun':5,
    'jul':6,
    'aug':7,
    'sep':8,
    'oct':9,
    'nov':10,
    'dec':11
}
for x in d.keys():
    replaced_data = replaced_data.replace(x,d[x])
    


In [75]:
# analyzing filtered data
for x in  replaced_data.columns :
    print('--------------------',x, "-----------------------------")
    print( replaced_data[x].value_counts())

-------------------- age -----------------------------
31    487
32    465
34    456
30    452
33    452
     ... 
18      3
88      2
90      2
89      1
95      1
Name: age, Length: 74, dtype: int64
-------------------- job -----------------------------
0     2482
1     1858
2     1771
3     1294
4      882
5      731
6      394
7      350
8      314
9      293
10     265
Name: job, dtype: int64
-------------------- marital -----------------------------
0    6041
1    3351
2    1242
Name: marital, dtype: int64
-------------------- education -----------------------------
0    5461
1    3680
2    1493
Name: education, dtype: int64
-------------------- balance -----------------------------
0       742
1        38
2        34
3        31
4        29
       ... 
5704      1
7606      1
1626      1
1634      1
3986      1
Name: balance, Length: 3702, dtype: int64
-------------------- housing -----------------------------
0    5530
1    5104
Name: housing, dtype: int64
-------------------- 

In [86]:
replaced_data.to_csv('intermediateData.csv')
data=replaced_data
print(data.shape)
data.head()
# data.tail()

(10634, 14)


Unnamed: 0,age,job,marital,education,balance,housing,loan,day,month,duration,campaign,pdays,previous,deposit
0,59,3,0,0,2343,1,0,5,4,1042,1,-1,0,1
1,56,3,0,0,45,0,0,5,4,1467,1,-1,0,1
2,41,2,0,0,1270,1,0,5,4,1389,1,-1,0,1
3,55,4,0,0,2476,1,0,5,4,579,1,-1,0,1
4,54,3,0,1,184,0,0,5,4,673,2,-1,0,1


In [77]:
# changing dataframe to 2d array for matrix operations
data=np.array(data)
no_of_example=data.shape[0]
# transpose of data
dataT=data.transpose()
print(data.shape)
print(dataT.shape)
# creating nxn dimention sigma matrix
sigma=(dataT.dot(data))/no_of_example
print(sigma.shape)
# use singular value decomposition function 
[u,s,vt]=svd(sigma)
# print(u)
print(u.shape)
print(s.shape)
print(vt.shape)


(10634, 14)
(14, 10634)
(14, 14)
(14, 14)
(14,)
(14, 14)


In [78]:
# reducing dimention to half of previous dimention
a=u[:,:u.shape[1]//2]
# a.shape
reduced_data=data.dot(a)
reduced_data.shape

(10634, 7)

In [79]:
print(reduced_data)

[[-2.38953867e+03 -9.28893836e+02  7.68916816e+01 ...  1.67241301e+01
   2.52711547e+00  2.06276338e-01]
 [-1.14084299e+02 -1.46086417e+03  9.70786881e+01 ...  1.62460108e+01
   2.47525393e+00  2.05439825e-01]
 [-1.33394825e+03 -1.32425942e+03  9.76476822e+01 ...  1.08632807e+01
   1.26908534e+00 -2.89951825e-01]
 ...
 [-3.64910804e+01 -1.55928925e+02  8.95651366e+00 ... -6.83719187e+00
  -1.92682812e+00 -2.60025868e-01]
 [-1.82648375e+00 -2.23784125e+01 -1.73844234e+02 ...  6.67714216e+00
   1.08212787e-01 -5.80086289e-01]
 [-2.96613091e+01 -6.26946309e+02  4.11381865e+01 ...  3.82717712e+00
  -1.14323488e+00 -1.06134650e-01]]


In [85]:
# converting 2d array to dataframe
reduced_data=pd.DataFrame(reduced_data)
print(reduced_data.shape)
reduced_data.head()

(10634, 7)


Unnamed: 0,0,1,2,3,4,5,6
0,-2389.538668,-928.893836,76.891682,3.05338,16.72413,2.527115,0.206276
1,-114.084299,-1460.86417,97.078688,21.626981,16.246011,2.475254,0.20544
2,-1333.948249,-1324.259422,97.647682,34.84521,10.863281,1.269085,-0.289952
3,-2500.641833,-461.581892,46.107842,-16.869896,14.81546,1.519574,1.331024
4,-215.660394,-663.953337,43.593816,-17.255421,14.622311,1.222577,0.024543


In [81]:
# save dataframe of reduced dimention data into csv formate
reduced_data.to_csv('ReducedBankingData.csv')