In [2]:
import numpy as np 

In [3]:
# Read the comma-separated, fully numeric lending dataset into a NumPy array.
lending_data_numreic = np.loadtxt(
    fname="datasets/Lending-company-Numeric.csv",
    delimiter=",",
)
lending_data_numreic

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [4]:
# Count NaN entries in the array loaded with np.loadtxt.
np.sum(np.isnan(lending_data_numreic))

0

In [5]:
# Semicolon-separated variant of the dataset; entries that genfromtxt cannot
# parse stay as NaN because no filling_values is given.
lending_data_numreic_nan = np.genfromtxt(
    fname="datasets/Lending-company-Numeric-NAN.csv",
    delimiter=";",
)
# First the element-wise NaN flags, then the total NaN count, one per line.
print(
    np.isnan(lending_data_numreic_nan),
    np.isnan(lending_data_numreic_nan).sum(),
    sep="\n",
)

[[False False False False False False]
 [False False False False False False]
 [False False False False False False]
 ...
 [ True False False False False False]
 [False False False False False False]
 [False False False False False False]]
260


In [6]:
# Reload the same file, this time substituting 0 for every missing entry.
lending_data_numreic_nan_imputed = np.genfromtxt(
    fname="datasets/Lending-company-Numeric-NAN.csv",
    delimiter=";",
    filling_values=0,
)
# Confirm that no NaN survives the fill.
print(np.sum(np.isnan(lending_data_numreic_nan_imputed)))


0


#### Substituting the missing values

In [8]:
# Sentinel for missing entries: the largest non-NaN value plus one, so it lies
# outside the range of the real data.
temp_fill = np.round(np.nanmax(lending_data_numreic_nan), 2) + 1
temp_fill

64002.0

In [9]:
# Reload the file, filling every missing entry with the sentinel temp_fill.
lending_data_numreic_nan_imputed = np.genfromtxt(
    fname="datasets/Lending-company-Numeric-NAN.csv",
    delimiter=";",
    filling_values=temp_fill,
)
# No NaN should remain once the sentinel has been substituted.
np.sum(np.isnan(lending_data_numreic_nan_imputed))


0

In [10]:
# If a value is equal to temp_fill (the maximum value of the array + 1), it is actually a missing value.
# Switching from this sentinel fill to mean imputation helps a lot, as it does not change the interpretation of the data.

In [11]:
# Inspect the array loaded without filling_values; missing entries are still NaN.
lending_data_numreic_nan

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [12]:
# Per-column mean that ignores NaN entries, rounded to two decimals.
temp_mean = np.round(np.nanmean(lending_data_numreic_nan, axis=0), 2)
temp_mean

array([ 2250.25,    46.11,   365.  ,  3895.99,  5160.75, 16571.44])

This is the actual mean of each column, computed while ignoring the missing values. Now, let's see the average of the imputed data. The actual mean of the first column was 2250.25, whereas the imputed data has a mean of 4263.25, which is misleading.

In [14]:
# The sentinel-filled array has no NaN left ...
print(np.sum(np.isnan(lending_data_numreic_nan_imputed)))
# ... but its first-column mean is computed with the sentinel values included.
np.round(np.mean(lending_data_numreic_nan_imputed, axis=0), 2)[0]

0


4263.25

In [15]:
# Overwrite, in place, every sentinel in the first column with that column's NaN-aware mean.
lending_data_numreic_nan_imputed[
    lending_data_numreic_nan_imputed[:, 0] == temp_fill, 0
] = temp_mean[0]
# First-column mean after the replacement.
np.round(np.mean(lending_data_numreic_nan_imputed, axis=0), 2)[0]


2250.25

Imputing the entire dataset with mean values. We could have done fillna_mean at the beginning, but we wanted to explore the difference. We will impute each column of the dataset wherever it holds the temporary fill value, i.e. max + 1.


In [16]:
# Replace every sentinel (temp_fill) with the mean of its own column in one
# vectorised pass instead of a Python loop over columns. temp_mean holds one
# value per column, so it broadcasts across the rows of the 2-D array.
# Assigning through [...] writes into the existing array rather than rebinding
# the name, exactly as the per-column assignment did.
lending_data_numreic_nan_imputed[...] = np.where(
    lending_data_numreic_nan_imputed == temp_fill,
    temp_mean,
    lending_data_numreic_nan_imputed,
)

lending_data_numreic_nan_imputed

array([[ 2000.  ,    40.  ,   365.  ,  3121.  ,  4241.  , 13621.  ],
       [ 2000.  ,    40.  ,   365.  ,  3061.  ,  4171.  , 15041.  ],
       [ 1000.  ,    40.  ,   365.  ,  2160.  ,  3280.  , 15340.  ],
       ...,
       [ 2250.25,    40.  ,   365.  ,  4201.  ,  5001.  , 16600.  ],
       [ 1000.  ,    40.  ,   365.  ,  2080.  ,  3320.  , 15600.  ],
       [ 2000.  ,    40.  ,   365.  ,  4601.  ,  4601.  , 16600.  ]])

##### Reshape (Efficiently)

Why it is useful: some operations require certain conditions on array shapes and sizes to be met.

In [17]:
# Display the NaN-free dataset before reshaping it.
lending_data_numreic

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [18]:
# (rows, columns) of the dataset.
np.shape(lending_data_numreic)

(1043, 6)

In [19]:
# Reshape to 6 x 1043. This re-chunks the same elements in their existing order,
# so the rows are NOT the original columns; compare with the transpose.
lending_data_numreic.reshape(6, 1043)

array([[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
       [12277.,  2000.,    40., ...,    50.,   365.,  5350.],
       [ 6850., 15150.,  1000., ...,  2000.,    40.,   365.],
       [ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.],
       [  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
       [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]])

In [20]:
# Transpose: each original column becomes a row.
np.transpose(lending_data_numreic)

array([[ 2000.,  2000.,  1000., ...,  2000.,  1000.,  2000.],
       [   40.,    40.,    40., ...,    40.,    40.,    40.],
       [  365.,   365.,   365., ...,   365.,   365.,   365.],
       [ 3121.,  3061.,  2160., ...,  4201.,  2080.,  4601.],
       [ 4241.,  4171.,  3280., ...,  5001.,  3320.,  4601.],
       [13621., 15041., 15340., ..., 16600., 15600., 16600.]])

##### Remove values using delete

In [21]:
# Display the dataset before demonstrating np.delete.
lending_data_numreic

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [22]:
# With no axis given (axis=None), np.delete works on the flattened array:
# only the very first element is dropped and the result is 1-D.
np.delete(lending_data_numreic, obj=0, axis=None)

array([   40.,   365.,  3121., ...,  4601.,  4601., 16600.])

In [24]:
# Shape of the flattened result: one element fewer than rows * columns.
np.shape(np.delete(lending_data_numreic, 0))

(6257,)

In [26]:
# Re-display the source array; the np.delete calls here never assign their result back to it.
lending_data_numreic

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [25]:
# Drop the columns at positions 0, 2 and 4 (axis=1 selects columns); the rest are kept in order.
np.delete(
    lending_data_numreic,
    obj=[0, 2, 4],
    axis=1,
)

array([[   40.,  3121., 13621.],
       [   40.,  3061., 15041.],
       [   40.,  2160., 15340.],
       ...,
       [   40.,  4201., 16600.],
       [   40.,  2080., 15600.],
       [   40.,  4601., 16600.]])

#### Sorting 

In [27]:
# Display the dataset at the start of the sorting section.
lending_data_numreic

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])