In [2]:
import numpy as np 

In [3]:
lending_data_numreic = np.loadtxt("datasets/Lending-company-Numeric.csv", delimiter=',')
lending_data_numreic

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [4]:
np.isnan(lending_data_numreic).sum()

0

In [5]:
lending_data_numreic_nan = np.genfromtxt("datasets/Lending-company-Numeric-NAN.csv", delimiter=';')
# lending_data_numreic_nan
print(np.isnan(lending_data_numreic_nan))
print(np.isnan(lending_data_numreic_nan).sum())

[[False False False False False False]
 [False False False False False False]
 [False False False False False False]
 ...
 [ True False False False False False]
 [False False False False False False]
 [False False False False False False]]
260


In [6]:
lending_data_numreic_nan_imputed = np.genfromtxt("datasets/Lending-company-Numeric-NAN.csv", delimiter=';', filling_values=0)
print(np.isnan(lending_data_numreic_nan_imputed).sum())


0


#### Substituting the missing values

In [8]:
# adding one to the value keeps the  null value out of range
temp_fill = np.nanmax(lending_data_numreic_nan).round(2) +1
temp_fill

64002.0

In [9]:
# now ill values 
lending_data_numreic_nan_imputed = np.genfromtxt("datasets/Lending-company-Numeric-NAN.csv", delimiter=';', filling_values=temp_fill)
np.isnan(lending_data_numreic_nan_imputed).sum()


0

In [10]:
# if a value is equal to maximum value of the array,  it is actually a missing values 
# changing the data imputation to mean imputation will help much, as it does not change the interpretation data.

In [11]:
lending_data_numreic_nan

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [12]:
temp_mean = np.nanmean(lending_data_numreic_nan, axis=0).round(2)
temp_mean

array([ 2250.25,    46.11,   365.  ,  3895.99,  5160.75, 16571.44])

This is actual mean from data where the missing values are present. Now, lets see the avergae of imputed data. The actual mean was 2250.25, and imputed data has 4263.25 mean that is misleading.

In [14]:
print(np.isnan(lending_data_numreic_nan_imputed).sum())
np.mean(lending_data_numreic_nan_imputed, axis=0).round(2)[0]

0


4263.25

In [15]:
lending_data_numreic_nan_imputed[:,0] = np.where(lending_data_numreic_nan_imputed[:,0] == temp_fill, temp_mean[0], lending_data_numreic_nan_imputed[:,0])
np.mean(lending_data_numreic_nan_imputed, axis=0).round(2)[0]


2250.25

imputing the entire dataset with mean valeus. We could have done fillna_mean at the begininig, but we explored the difference. we will impute each column of the dataset, where we have temp fill values i.e. max +1.


In [16]:
for col in range(lending_data_numreic_nan_imputed.shape[1]):
    lending_data_numreic_nan_imputed[:, col] = np.where(lending_data_numreic_nan_imputed[:,col] == temp_fill, temp_mean[col], lending_data_numreic_nan_imputed[:,col])

lending_data_numreic_nan_imputed

array([[ 2000.  ,    40.  ,   365.  ,  3121.  ,  4241.  , 13621.  ],
       [ 2000.  ,    40.  ,   365.  ,  3061.  ,  4171.  , 15041.  ],
       [ 1000.  ,    40.  ,   365.  ,  2160.  ,  3280.  , 15340.  ],
       ...,
       [ 2250.25,    40.  ,   365.  ,  4201.  ,  5001.  , 16600.  ],
       [ 1000.  ,    40.  ,   365.  ,  2080.  ,  3320.  , 15600.  ],
       [ 2000.  ,    40.  ,   365.  ,  4601.  ,  4601.  , 16600.  ]])

##### Reshape (Efficiently)

Why it is useful: certain conditions about shapes and sizes to be met.

In [17]:
lending_data_numreic

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [18]:
lending_data_numreic.shape

(1043, 6)

In [19]:
np.reshape(lending_data_numreic, (6,1043))

array([[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
       [12277.,  2000.,    40., ...,    50.,   365.,  5350.],
       [ 6850., 15150.,  1000., ...,  2000.,    40.,   365.],
       [ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.],
       [  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
       [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]])

In [20]:
lending_data_numreic.T

array([[ 2000.,  2000.,  1000., ...,  2000.,  1000.,  2000.],
       [   40.,    40.,    40., ...,    40.,    40.,    40.],
       [  365.,   365.,   365., ...,   365.,   365.,   365.],
       [ 3121.,  3061.,  2160., ...,  4201.,  2080.,  4601.],
       [ 4241.,  4171.,  3280., ...,  5001.,  3320.,  4601.],
       [13621., 15041., 15340., ..., 16600., 15600., 16600.]])

##### Remove valeus using delete

In [21]:
lending_data_numreic

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [22]:
np.delete(lending_data_numreic, 0)

array([   40.,   365.,  3121., ...,  4601.,  4601., 16600.])

In [24]:
np.delete(lending_data_numreic, 0).shape

(6257,)

In [26]:
lending_data_numreic

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [25]:
np.delete(lending_data_numreic, [0,2, 4], axis=1)

array([[   40.,  3121., 13621.],
       [   40.,  3061., 15041.],
       [   40.,  2160., 15340.],
       ...,
       [   40.,  4201., 16600.],
       [   40.,  2080., 15600.],
       [   40.,  4601., 16600.]])

#### Sorting 

In [27]:
lending_data_numreic

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [30]:
lending_data_numreic[:,3].sort()
lending_data_numreic[:, 3]

array([-2870., -2550., -2450., ..., 16751., 17650., 19001.])

In [33]:
lending_data_numreic.sort(axis=0)
lending_data_numreic

array([[-2.8700e+03, -3.5000e+02,  5.0000e+01,  3.6500e+02,  1.0000e+03,
         1.1600e+03],
       [-2.8700e+03,  4.0000e+01,  1.0000e+02,  3.6500e+02,  1.0000e+03,
         1.5000e+03],
       [-2.5500e+03,  4.0000e+01,  1.5000e+02,  3.6500e+02,  1.0000e+03,
         2.2000e+03],
       ...,
       [ 1.2500e+02,  3.6500e+02,  7.2510e+03,  9.2010e+03,  1.8751e+04,
         5.4625e+04],
       [ 1.2500e+02,  3.6500e+02,  9.0000e+03,  1.2126e+04,  2.0001e+04,
         5.4625e+04],
       [ 1.6500e+02,  3.6500e+02,  9.0000e+03,  1.3900e+04,  2.2001e+04,
         6.4001e+04]])

In [35]:
?np.argsort

[0;31mSignature:[0m [0mnp[0m[0;34m.[0m[0margsort[0m[0;34m([0m[0ma[0m[0;34m,[0m [0maxis[0m[0;34m=[0m[0;34m-[0m[0;36m1[0m[0;34m,[0m [0mkind[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0morder[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Returns the indices that would sort an array.

Perform an indirect sort along the given axis using the algorithm specified
by the `kind` keyword. It returns an array of indices of the same shape as
`a` that index data along the given axis in sorted order.

Parameters
----------
a : array_like
    Array to sort.
axis : int or None, optional
    Axis along which to sort.  The default is -1 (the last axis). If None,
    the flattened array is used.
kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional
    Sorting algorithm. The default is 'quicksort'. Note that both 'stable'
    and 'mergesort' use timsort under the covers and, in general, the
    actual implementation will v

In [34]:
# argsort
np.argsort(lending_data_numreic)

array([[0, 1, 2, 3, 4, 5],
       [0, 1, 2, 3, 4, 5],
       [0, 1, 2, 3, 4, 5],
       ...,
       [0, 1, 2, 3, 4, 5],
       [0, 1, 2, 3, 4, 5],
       [0, 1, 2, 3, 4, 5]])

In [36]:
lending_data_numreic = np.loadtxt("datasets/Lending-company-Numeric.csv", delimiter=',')
lending_data_numreic

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [37]:
np.argwhere(lending_data_numreic)

array([[   0,    0],
       [   0,    1],
       [   0,    2],
       ...,
       [1042,    3],
       [1042,    4],
       [1042,    5]])

In [38]:
np.argwhere(lending_data_numreic > 1000)

array([[   0,    0],
       [   0,    3],
       [   0,    4],
       ...,
       [1042,    3],
       [1042,    4],
       [1042,    5]])

In [40]:
# get the co-ordiates of missing values 
np.argwhere(np.isnan(lending_data_numreic_nan))

array([[  11,    3],
       [  15,    3],
       [  27,    3],
       [  58,    3],
       [  60,    4],
       [  85,    4],
       [ 117,    5],
       [ 152,    1],
       [ 152,    2],
       [ 152,    4],
       [ 172,    1],
       [ 175,    1],
       [ 175,    2],
       [ 176,    3],
       [ 177,    4],
       [ 178,    5],
       [ 211,    3],
       [ 229,    0],
       [ 230,    1],
       [ 237,    1],
       [ 247,    3],
       [ 251,    5],
       [ 252,    4],
       [ 258,    1],
       [ 260,    3],
       [ 262,    4],
       [ 271,    5],
       [ 272,    4],
       [ 284,    2],
       [ 284,    3],
       [ 297,    1],
       [ 297,    2],
       [ 300,    3],
       [ 315,    3],
       [ 315,    5],
       [ 327,    4],
       [ 336,    4],
       [ 343,    0],
       [ 344,    2],
       [ 346,    2],
       [ 363,    3],
       [ 375,    3],
       [ 377,    2],
       [ 398,    5],
       [ 416,    4],
       [ 428,    0],
       [ 432,    1],
       [ 433,

#### casting 

In [42]:
lending_data_numreic.astype(dtype=np.int32)

array([[ 2000,    40,   365,  3121,  4241, 13621],
       [ 2000,    40,   365,  3061,  4171, 15041],
       [ 1000,    40,   365,  2160,  3280, 15340],
       ...,
       [ 2000,    40,   365,  4201,  5001, 16600],
       [ 1000,    40,   365,  2080,  3320, 15600],
       [ 2000,    40,   365,  4601,  4601, 16600]], dtype=int32)

In [44]:
data_string = lending_data_numreic.astype(dtype=np.str)
data_string

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  data_string = lending_data_numreic.astype(dtype=np.str)


array([['2000.0', '40.0', '365.0', '3121.0', '4241.0', '13621.0'],
       ['2000.0', '40.0', '365.0', '3061.0', '4171.0', '15041.0'],
       ['1000.0', '40.0', '365.0', '2160.0', '3280.0', '15340.0'],
       ...,
       ['2000.0', '40.0', '365.0', '4201.0', '5001.0', '16600.0'],
       ['1000.0', '40.0', '365.0', '2080.0', '3320.0', '15600.0'],
       ['2000.0', '40.0', '365.0', '4601.0', '4601.0', '16600.0']],
      dtype='<U32')

#### Stacking 

In [46]:
lending_data_numreic

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [49]:
lending_data_numreic_nan_imputed

array([[ 2000.  ,    40.  ,   365.  ,  3121.  ,  4241.  , 13621.  ],
       [ 2000.  ,    40.  ,   365.  ,  3061.  ,  4171.  , 15041.  ],
       [ 1000.  ,    40.  ,   365.  ,  2160.  ,  3280.  , 15340.  ],
       ...,
       [ 2250.25,    40.  ,   365.  ,  4201.  ,  5001.  , 16600.  ],
       [ 1000.  ,    40.  ,   365.  ,  2080.  ,  3320.  , 15600.  ],
       [ 2000.  ,    40.  ,   365.  ,  4601.  ,  4601.  , 16600.  ]])

In [52]:
# example
np.stack((lending_data_numreic[:,1], lending_data_numreic[:,0]))

array([[  40.,   40.,   40., ...,   40.,   40.,   40.],
       [2000., 2000., 1000., ..., 2000., 1000., 2000.]])

In [53]:
np.stack((lending_data_numreic[:,1], lending_data_numreic[:,0]), axis=1)

array([[  40., 2000.],
       [  40., 2000.],
       [  40., 1000.],
       ...,
       [  40., 2000.],
       [  40., 1000.],
       [  40., 2000.]])

In [55]:
# conforming the shape
print(lending_data_numreic.shape, lending_data_numreic_nan.shape)

(1043, 6) (1043, 6)


In [57]:
np.vstack((lending_data_numreic, lending_data_numreic_nan)).shape

(2086, 6)

In [58]:
np.hstack((lending_data_numreic, lending_data_numreic_nan)).shape

(1043, 12)

In [60]:
np.dstack((lending_data_numreic, lending_data_numreic_nan))

array([[[ 2000.,  2000.],
        [   40.,    40.],
        [  365.,   365.],
        [ 3121.,  3121.],
        [ 4241.,  4241.],
        [13621., 13621.]],

       [[ 2000.,  2000.],
        [   40.,    40.],
        [  365.,   365.],
        [ 3061.,  3061.],
        [ 4171.,  4171.],
        [15041., 15041.]],

       [[ 1000.,  1000.],
        [   40.,    40.],
        [  365.,   365.],
        [ 2160.,  2160.],
        [ 3280.,  3280.],
        [15340., 15340.]],

       ...,

       [[ 2000.,    nan],
        [   40.,    40.],
        [  365.,   365.],
        [ 4201.,  4201.],
        [ 5001.,  5001.],
        [16600., 16600.]],

       [[ 1000.,  1000.],
        [   40.,    40.],
        [  365.,   365.],
        [ 2080.,  2080.],
        [ 3320.,  3320.],
        [15600., 15600.]],

       [[ 2000.,  2000.],
        [   40.,    40.],
        [  365.,   365.],
        [ 4601.,  4601.],
        [ 4601.,  4601.],
        [16600., 16600.]]])

In [61]:
# it adds the depth [row * cols * original array]
np.dstack((lending_data_numreic, lending_data_numreic_nan)).shape

(1043, 6, 2)

#### Concatenation
Linking together objects in chain. Creating a new (larger) array by meriging existing smaller arrays along a given axis.

In [62]:
np.concatenate((lending_data_numreic[0,:], lending_data_numreic[1,:]))

array([ 2000.,    40.,   365.,  3121.,  4241., 13621.,  2000.,    40.,
         365.,  3061.,  4171., 15041.])

In [64]:
# acts as hstack, Another array is concatenated at the bottom of one array
np.concatenate((lending_data_numreic, lending_data_numreic_nan)).shape

(2086, 6)

#### Unique

In [65]:
np.unique(lending_data_numreic[:,1], return_counts=True, return_index=True)

(array([ 35.,  40.,  50., 125., 165.]),
 array([327,   0,   4,  19,  27]),
 array([  4, 567, 451,  19,   2]))