# Preprocessing with NumPy

In [1]:
import numpy as np

## Checking for Missing Values

In [2]:
# Input CSV files used throughout the notebook.
file_num_nan = "Lending-company-Numeric-NAN.csv"  # read with delimiter=';' below; contains missing entries
file_num = "Lending-company-Numeric.csv"  # read with delimiter=',' below; no missing entries
file_total_Price = "Lending-Company-Total-Price.csv"  # same file name the stripping section reads with dtype=str

In [3]:
lending_co_data_numeric = np.loadtxt(file_num, delimiter=",")

In [4]:
np.isnan(lending_co_data_numeric)

array([[False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       ...,
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False]])

In [5]:
np.isnan(lending_co_data_numeric).sum()

0

In [7]:
# np.loadtxt cannot convert empty fields to float, so loading the file with
# missing values is expected to fail. Catch the error and print it so the
# failure is still shown but the notebook keeps running top to bottom.
try:
    lending_co_data_numeric_nan = np.loadtxt(file_num_nan, delimiter=';')
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: could not convert string to float: ''

In [10]:
# Switching from loadtxt to genfromtxt avoids the ValueError: the empty
# fields are read in as nan instead of stopping the load.
lending_co_data_numeric_nan = np.genfromtxt(file_num_nan, delimiter=';')

In [11]:
# find the nan values
np.isnan(lending_co_data_numeric_nan)

array([[False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       ...,
       [ True, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False]])

In [12]:
np.isnan(lending_co_data_numeric_nan).sum()

260

In [13]:
# Account for the missing values at load time, before examining the data:
# filling_values=0 makes genfromtxt write 0 wherever a field is empty.
lending_co_data_numeric_nan = np.genfromtxt(file_num_nan, 
                                            delimiter=';',
                                            filling_values=0)

In [14]:
np.isnan(lending_co_data_numeric_nan).sum()

0

In [15]:
# Filling missing values with a placeholder: pick a value that cannot occur
# in the dataset (a number greater than the highest value in it), so the
# filled positions can be found again later and substituted with something
# more appropriate. Reload without filling first so the maximum can be computed.
lending_co_data_numeric_nan = np.genfromtxt(file_num_nan, delimiter=';')

In [18]:
# Placeholder for missing entries: one more than the largest non-nan value,
# so it can never collide with a real data point.
temp_fill = np.round(np.nanmax(lending_co_data_numeric_nan), 2) + 1
temp_fill

64002.0

In [19]:
lending_co_data_numeric_nan = np.genfromtxt(file_num_nan, 
                                            delimiter=';',
                                            filling_values=temp_fill)

In [20]:
np.isnan(lending_co_data_numeric_nan).sum()

0

## Substituting Missing Values

In [22]:
lending_co_data_numeric_nan = np.genfromtxt(file_num_nan, delimiter=';')
lending_co_data_numeric_nan

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [26]:
temp_mean = np.nanmean(lending_co_data_numeric_nan, axis=0).round(2)

In [27]:
temp_mean[0]

2250.25

In [28]:
# Recompute the placeholder (largest non-nan value + 1) from the nan-bearing
# array, then reload the file with every empty field set to that placeholder.
temp_fill = np.round(np.nanmax(lending_co_data_numeric_nan), 2) + 1
lending_co_data_numeric_nan = np.genfromtxt(file_num_nan, delimiter=';', filling_values=temp_fill)

In [29]:
temp_fill

64002.0

In [31]:
# only get the values in the first column
np.mean(lending_co_data_numeric_nan[:,0]).round(2)

4263.25

In [32]:
temp_mean[0]

2250.25

In [34]:
# In the first column, overwrite every placeholder entry with that column's
# mean (computed earlier while the missing entries were still nan).
lending_co_data_numeric_nan[lending_co_data_numeric_nan[:,0] == temp_fill, 0] = temp_mean[0]
                                           

In [35]:
np.mean(lending_co_data_numeric_nan[:,0]).round(2)

2250.25

In [36]:
# temp_mean holds one mean per column (it was computed with axis=0), so it
# broadcasts across the rows: each placeholder is replaced by the mean of the
# column it sits in, and every other value is kept as it is.
lending_co_data_numeric_nan[:] = np.where(lending_co_data_numeric_nan == temp_fill,
                                          temp_mean,
                                          lending_co_data_numeric_nan)

In [37]:
# Set every negative entry to 0; all other entries are left untouched.
lending_co_data_numeric_nan[lending_co_data_numeric_nan < 0] = 0

## Reshaping

In [38]:
lending_co_data_numeric = np.loadtxt(file_num, delimiter=',')
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [39]:
# Reshaping: the new shape must hold exactly as many elements as the array
# has, so check the current shape first.
lending_co_data_numeric.shape

# (rows, columns) -> 1043 rows, 6 columns

(1043, 6)

In [42]:
np.reshape(lending_co_data_numeric, (6, 1043))

array([[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
       [12277.,  2000.,    40., ...,    50.,   365.,  5350.],
       [ 6850., 15150.,  1000., ...,  2000.,    40.,   365.],
       [ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.],
       [  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
       [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]])

In [43]:
np.transpose(lending_co_data_numeric)

array([[ 2000.,  2000.,  1000., ...,  2000.,  1000.,  2000.],
       [   40.,    40.,    40., ...,    40.,    40.,    40.],
       [  365.,   365.,   365., ...,   365.,   365.,   365.],
       [ 3121.,  3061.,  2160., ...,  4201.,  2080.,  4601.],
       [ 4241.,  4171.,  3280., ...,  5001.,  3320.,  4601.],
       [13621., 15041., 15340., ..., 16600., 15600., 16600.]])

In [44]:
np.reshape(lending_co_data_numeric, (3, 500))

ValueError: cannot reshape array of size 6258 into shape (3,500)

In [45]:
np.reshape(lending_co_data_numeric, (3, 2086))

array([[ 2000.,    40.,   365., ...,    50.,   365.,  5350.],
       [ 6850., 15150.,  1000., ..., 16600.,  2000.,    40.],
       [  365.,  3441.,  4661., ...,  4601.,  4601., 16600.]])

In [46]:
np.reshape(lending_co_data_numeric, (2, 3, 1043))

array([[[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
        [12277.,  2000.,    40., ...,    50.,   365.,  5350.],
        [ 6850., 15150.,  1000., ...,  2000.,    40.,   365.]],

       [[ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.],
        [  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
        [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]]])

In [47]:
np.reshape(lending_co_data_numeric, (1,1,2,3,1043))

array([[[[[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
          [12277.,  2000.,    40., ...,    50.,   365.,  5350.],
          [ 6850., 15150.,  1000., ...,  2000.,    40.,   365.]],

         [[ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.],
          [  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
          [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]]]]])

In [48]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [49]:
lending_co_data_numeric_2 = np.reshape(lending_co_data_numeric, (6, 1043))
lending_co_data_numeric_2

array([[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
       [12277.,  2000.,    40., ...,    50.,   365.,  5350.],
       [ 6850., 15150.,  1000., ...,  2000.,    40.,   365.],
       [ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.],
       [  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
       [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]])

In [50]:
lending_co_data_numeric.reshape(6, 1043)

array([[ 2000.,    40.,   365., ...,   365.,  1581.,  3041.],
       [12277.,  2000.,    40., ...,    50.,   365.,  5350.],
       [ 6850., 15150.,  1000., ...,  2000.,    40.,   365.],
       [ 3101.,  4351., 16600., ..., 16600.,  2000.,    40.],
       [  365.,  3441.,  4661., ...,  8450., 22250.,  2000.],
       [   40.,   365.,  3701., ...,  4601.,  4601., 16600.]])

## Removing Values

In [51]:
lending_co_data_numeric = np.loadtxt(file_num, delimiter=',')
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [54]:
lending_co_data_numeric.shape

(1043, 6)

In [55]:
lending_co_data_numeric.size

6258

In [56]:
np.delete(lending_co_data_numeric, 0).shape

(6257,)

In [57]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [59]:
# To remove an entire row or column, pass axis: axis=0 deletes the row at the
# given index (here index 1, the second row). np.delete returns a new array
# and leaves the original unchanged.
np.delete(lending_co_data_numeric, 1, axis=0)


array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       [ 2000.,    40.,   365.,  3041.,  4241., 15321.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [61]:
# remove the first, third and fifth column of the array
np.delete(lending_co_data_numeric, (0,2,4), axis=1)

array([[   40.,  3121., 13621.],
       [   40.,  3061., 15041.],
       [   40.,  2160., 15340.],
       ...,
       [   40.,  4201., 16600.],
       [   40.,  2080., 15600.],
       [   40.,  4601., 16600.]])

In [62]:
# Removing rows and columns together takes two nested calls: the inner call
# drops columns 0, 2 and 4 (axis=1), the outer call then drops rows 0, 2 and
# the last row (index -1) from that result (axis=0).
np.delete(np.delete(lending_co_data_numeric, [0,2,4], axis=1), [0,2,-1], axis=0)

array([[   40.,  3061., 15041.],
       [   40.,  3041., 15321.],
       [   50.,  3470., 13720.],
       ...,
       [   40.,  4240., 16600.],
       [   40.,  4201., 16600.],
       [   40.,  2080., 15600.]])

## Sorting Data

In [63]:
# import the dataset
lending_co_data_numeric = np.loadtxt(file_num, delimiter=',')

In [64]:
# display the dataset
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [65]:
# np.sort() takes an array and returns a sorted version in asc order
np.sort(lending_co_data_numeric)

array([[   40.,   365.,  2000.,  3121.,  4241., 13621.],
       [   40.,   365.,  2000.,  3061.,  4171., 15041.],
       [   40.,   365.,  1000.,  2160.,  3280., 15340.],
       ...,
       [   40.,   365.,  2000.,  4201.,  5001., 16600.],
       [   40.,   365.,  1000.,  2080.,  3320., 15600.],
       [   40.,   365.,  2000.,  4601.,  4601., 16600.]])

In [66]:
np.sort(lending_co_data_numeric).shape

(1043, 6)

In [67]:
lending_co_data_numeric.shape

(1043, 6)

In [80]:
np.sort(lending_co_data_numeric, axis=0)

array([[ 1000.,    35.,   365., -2870., -2870.,  -350.],
       [ 1000.,    35.,   365., -2550., -2100.,   150.],
       [ 1000.,    35.,   365., -2450., -2000.,  1100.],
       ...,
       [ 9000.,   125.,   365., 16751., 18751., 54625.],
       [ 9000.,   165.,   365., 17650., 20001., 54625.],
       [ 9000.,   165.,   365., 19001., 22001., 64001.]])

In [79]:
# Tell NumPy not to use scientific notation when printing arrays.
# Warning: this is a global setting and affects every array printed after
# this cell, not just the next output.
np.set_printoptions(suppress=True)

In [81]:
np.sort(lending_co_data_numeric, axis=None)

array([-2870., -2870., -2550., ..., 54625., 54625., 64001.])

In [70]:
np.sort(lending_co_data_numeric)

array([[   40.,   365.,  2000.,  3121.,  4241., 13621.],
       [   40.,   365.,  2000.,  3061.,  4171., 15041.],
       [   40.,   365.,  1000.,  2160.,  3280., 15340.],
       ...,
       [   40.,   365.,  2000.,  4201.,  5001., 16600.],
       [   40.,   365.,  1000.,  2080.,  3320., 15600.],
       [   40.,   365.,  2000.,  4601.,  4601., 16600.]])

In [88]:
np.sort(lending_co_data_numeric)

array([[-2870., -2870.,  -350.,    35.,   365.,  1000.],
       [-2550., -2100.,    35.,   150.,   365.,  1000.],
       [-2450., -2000.,    35.,   365.,  1000.,  1100.],
       ...,
       [  125.,   365.,  9000., 16751., 18751., 54625.],
       [  165.,   365.,  9000., 17650., 20001., 54625.],
       [  165.,   365.,  9000., 19001., 22001., 64001.]])

In [89]:
# Descending sort: the inner minus flips the sign of every element, np.sort
# orders the negated values ascending along the last axis, and the outer
# minus flips the signs back, leaving each row in descending order.
-np.sort(-lending_co_data_numeric)

array([[ 1000.,   365.,    35.,  -350., -2870., -2870.],
       [ 1000.,   365.,   150.,    35., -2100., -2550.],
       [ 1100.,  1000.,   365.,    35., -2000., -2450.],
       ...,
       [54625., 18751., 16751.,  9000.,   365.,   125.],
       [54625., 20001., 17650.,  9000.,   365.,   165.],
       [64001., 22001., 19001.,  9000.,   365.,   165.]])

In [90]:
lending_co_data_numeric

array([[ 1000.,    35.,   365., -2870., -2870.,  -350.],
       [ 1000.,    35.,   365., -2550., -2100.,   150.],
       [ 1000.,    35.,   365., -2450., -2000.,  1100.],
       ...,
       [ 9000.,   125.,   365., 16751., 18751., 54625.],
       [ 9000.,   165.,   365., 17650., 20001., 54625.],
       [ 9000.,   165.,   365., 19001., 22001., 64001.]])

In [96]:
# The ndarray.sort method sorts in place (the array itself is modified, unlike
# np.sort which returns a copy). axis=0 sorts each column independently, so
# the values of an original row no longer stay together.
lending_co_data_numeric.sort(axis=0)
lending_co_data_numeric

array([[ 1000.,    35.,   365., -2870., -2870.,  -350.],
       [ 1000.,    35.,   365., -2550., -2100.,   150.],
       [ 1000.,    35.,   365., -2450., -2000.,  1100.],
       ...,
       [ 9000.,   125.,   365., 16751., 18751., 54625.],
       [ 9000.,   165.,   365., 17650., 20001., 54625.],
       [ 9000.,   165.,   365., 19001., 22001., 64001.]])

In [97]:
lending_co_data_numeric[:,3]

array([-2870., -2550., -2450., ..., 16751., 17650., 19001.])

In [98]:
# The column slice is a view, so sorting it in place writes back into the
# original array. Only column 3 is reordered, which in general breaks the
# alignment between rows. Nothing visibly changes here because the array was
# already sorted column by column with sort(axis=0) above.
lending_co_data_numeric[:,3].sort()
lending_co_data_numeric

array([[ 1000.,    35.,   365., -2870., -2870.,  -350.],
       [ 1000.,    35.,   365., -2550., -2100.,   150.],
       [ 1000.,    35.,   365., -2450., -2000.,  1100.],
       ...,
       [ 9000.,   125.,   365., 16751., 18751., 54625.],
       [ 9000.,   165.,   365., 17650., 20001., 54625.],
       [ 9000.,   165.,   365., 19001., 22001., 64001.]])

## Argument Functions

### np.argsort()

In [110]:
lending_co_data_numeric = np.loadtxt(file_num, delimiter=',')
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [111]:
np.argsort(lending_co_data_numeric)
# the function returns the indices that would sort this array

array([[1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5],
       ...,
       [1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5],
       [1, 2, 0, 3, 4, 5]])

In [112]:
np.sort(lending_co_data_numeric)

array([[   40.,   365.,  2000.,  3121.,  4241., 13621.],
       [   40.,   365.,  2000.,  3061.,  4171., 15041.],
       [   40.,   365.,  1000.,  2160.,  3280., 15340.],
       ...,
       [   40.,   365.,  2000.,  4201.,  5001., 16600.],
       [   40.,   365.,  1000.,  2080.,  3320., 15600.],
       [   40.,   365.,  2000.,  4601.,  4601., 16600.]])

In [113]:
np.sort(lending_co_data_numeric, axis=0)

array([[ 1000.,    35.,   365., -2870., -2870.,  -350.],
       [ 1000.,    35.,   365., -2550., -2100.,   150.],
       [ 1000.,    35.,   365., -2450., -2000.,  1100.],
       ...,
       [ 9000.,   125.,   365., 16751., 18751., 54625.],
       [ 9000.,   165.,   365., 17650., 20001., 54625.],
       [ 9000.,   165.,   365., 19001., 22001., 64001.]])

In [114]:
np.argsort(lending_co_data_numeric, axis=0)

array([[ 537,  443,    0,   32,   32,  482],
       [ 639,  327,  687,  166,  166,  493],
       [ 849,  432,  688,   85,   85,  166],
       ...,
       [  27,  326,  355,  568, 1019,  568],
       [ 277,   27,  357,  718, 1033,  534],
       [ 420,  408, 1042,  912,  912,   27]])

In [115]:
lending_co_data_numeric[482,5]

-350.0

In [116]:
# np.argsort on the first column returns the row order that sorts that column.
# Using those indices inside the square brackets reorders whole rows, so the
# values within each row stay together (unlike sorting each column separately).
lending_co_data_numeric = lending_co_data_numeric[np.argsort(lending_co_data_numeric[:,0])]
lending_co_data_numeric
# the expression inside the square brackets is an array of row indices

array([[ 1000.,    40.,   365.,  2200.,  3400., 15600.],
       [ 1000.,    40.,   365.,  2200.,  3800., 15600.],
       [ 1000.,    40.,   365.,  2000.,  3950., 15600.],
       ...,
       [ 9000.,   165.,   365., 14501., 16846., 64001.],
       [ 9000.,   125.,   365., 12001., 15751., 38626.],
       [ 9000.,   125.,   365., 12251., 14251., 25626.]])

In [104]:
lending_co_data_numeric.argsort(axis=0)

array([[   0,   22,    0,  199,  199,  172],
       [ 155,   62,  687,   53,   53,  160],
       [ 156,   38,  688,  169,  169,   53],
       ...,
       [1022, 1042,  355, 1024, 1037, 1023],
       [1031, 1039,  357,  941, 1029, 1024],
       [1042, 1040, 1042, 1027, 1027, 1040]])

In [105]:
lending_co_data_numeric

array([[ 1000.,    40.,   365.,  2200.,  3400., 15600.],
       [ 1000.,    40.,   365.,  2200.,  3800., 15600.],
       [ 1000.,    40.,   365.,  2000.,  3950., 15600.],
       ...,
       [ 9000.,   165.,   365., 14501., 16846., 64001.],
       [ 9000.,   125.,   365., 12001., 15751., 38626.],
       [ 9000.,   125.,   365., 12251., 14251., 25626.]])

### np.argwhere()

In [117]:
# load dataset and then display it
lending_co_data_numeric = np.loadtxt(file_num, delimiter=',')
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [132]:
np.argwhere(lending_co_data_numeric)
# np.argwhere goes over the entire array and returns the indices of the elements that are non-zero
# no explicit condition is given here, so every element different from 0 counts as a match
# each row of the output is the [row, column] coordinate of one matching element of the original array

array([[   0,    0],
       [   0,    1],
       [   0,    2],
       ...,
       [1042,    3],
       [1042,    4],
       [1042,    5]])

In [119]:
# Coordinates of the elements that are exactly zero.
np.argwhere(lending_co_data_numeric == 0)

array([[116,   4],
       [430,   3]])

In [133]:
lending_co_data_numeric[116]

array([ 1000.,    50.,   365., -1450.,     0., 13850.])

In [121]:
lending_co_data_numeric[430]

array([1000.,   50.,  365.,    0.,  550., 5650.])

In [122]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [134]:
np.argwhere(lending_co_data_numeric > 1000)

array([[   0,    0],
       [   0,    3],
       [   0,    4],
       ...,
       [1042,    3],
       [1042,    4],
       [1042,    5]])

In [135]:
np.argwhere(lending_co_data_numeric < 1000)

array([[   0,    1],
       [   0,    2],
       [   1,    1],
       ...,
       [1041,    2],
       [1042,    1],
       [1042,    2]])

In [123]:
np.argwhere(lending_co_data_numeric % 2 == 0)

array([[   0,    0],
       [   0,    1],
       [   1,    0],
       ...,
       [1042,    0],
       [1042,    1],
       [1042,    5]])

In [124]:
np.isnan(lending_co_data_numeric).sum()
# there are zero missing values here

0

In [126]:
lending_co_data_numeric_nan = np.genfromtxt(file_num_nan, delimiter=';')
lending_co_data_numeric_nan

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [   nan,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [136]:
np.isnan(lending_co_data_numeric_nan)

array([[False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       ...,
       [False, False, False, False, False, False],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False]])

In [127]:
np.argwhere(np.isnan(lending_co_data_numeric_nan))

array([[  11,    3],
       [  15,    3],
       [  27,    3],
       [  58,    3],
       [  60,    4],
       [  85,    4],
       [ 117,    5],
       [ 152,    1],
       [ 152,    2],
       [ 152,    4],
       [ 172,    1],
       [ 175,    1],
       [ 175,    2],
       [ 176,    3],
       [ 177,    4],
       [ 178,    5],
       [ 211,    3],
       [ 229,    0],
       [ 230,    1],
       [ 237,    1],
       [ 247,    3],
       [ 251,    5],
       [ 252,    4],
       [ 258,    1],
       [ 260,    3],
       [ 262,    4],
       [ 271,    5],
       [ 272,    4],
       [ 284,    2],
       [ 284,    3],
       [ 297,    1],
       [ 297,    2],
       [ 300,    3],
       [ 315,    3],
       [ 315,    5],
       [ 327,    4],
       [ 336,    4],
       [ 343,    0],
       [ 344,    2],
       [ 346,    2],
       [ 363,    3],
       [ 375,    3],
       [ 377,    2],
       [ 398,    5],
       [ 416,    4],
       [ 428,    0],
       [ 432,    1],
       [ 433,

In [139]:
lending_co_data_numeric_nan[1028]

array([    0.,    40.,   365.,     0.,     0., 15600.])

In [129]:
# Replace every missing value with 0 in a single vectorized step: the boolean
# mask from np.isnan selects exactly the nan positions, so no Python-level
# loop over the coordinates is needed.
lending_co_data_numeric_nan[np.isnan(lending_co_data_numeric_nan)] = 0

In [130]:
lending_co_data_numeric_nan[175]

array([ 2000.,     0.,     0.,  1851.,  3051., 13561.])

In [131]:
np.isnan(lending_co_data_numeric_nan).sum()

0

## Shuffling Data

In [144]:
# load the file and keep only the first 8 rows (the slice discards the rest,
# it does not just limit what is displayed)
lending_co_data_numeric = np.loadtxt(file_num, delimiter=',')[:8]
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       [ 2000.,    40.,   365.,  3041.,  4241., 15321.],
       [ 2000.,    50.,   365.,  3470.,  4820., 13720.],
       [ 2000.,    40.,   365.,  3201.,  4141., 14141.],
       [ 2000.,    50.,   365.,  1851.,  3251., 17701.],
       [ 2000.,    40.,   365.,  3971.,  4131., 15351.]])

In [145]:
np.random.shuffle(lending_co_data_numeric)

In [146]:
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3041.,  4241., 15321.],
       [ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    50.,   365.,  3470.,  4820., 13720.],
       [ 2000.,    50.,   365.,  1851.,  3251., 17701.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 2000.,    40.,   365.,  3201.,  4141., 14141.],
       [ 2000.,    40.,   365.,  3971.,  4131., 15351.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.]])

In [147]:
np.random.shuffle(lending_co_data_numeric)
lending_co_data_numeric

array([[ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       [ 2000.,    40.,   365.,  3201.,  4141., 14141.],
       [ 2000.,    40.,   365.,  3041.,  4241., 15321.],
       [ 2000.,    50.,   365.,  3470.,  4820., 13720.],
       [ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    50.,   365.,  1851.,  3251., 17701.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 2000.,    40.,   365.,  3971.,  4131., 15351.]])

In [148]:
lending_co_data_numeric = np.loadtxt(file_num, delimiter=',')
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [149]:
from numpy.random import shuffle

In [151]:
shuffle(lending_co_data_numeric)
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  6581.,  8781., 15181.],
       [ 2000.,    40.,   365.,  3200.,  4620., 16600.],
       [ 1000.,    40.,   365.,  2360.,  3860., 15600.],
       ...,
       [ 4000.,    50.,   365.,  5500.,  7000., 22250.],
       [ 4000.,    50.,   365.,  5380.,  6770., 19005.],
       [ 4000.,    50.,   365., 14500., 14500., 21650.]])

In [152]:
from numpy.random import Generator as gen
from numpy.random import PCG64 as pcg 

In [153]:
arr_rg = gen(pcg(seed=365))
arr_rg.shuffle(lending_co_data_numeric)
lending_co_data_numeric

# NOTE: the seed does fix the permutation this generator applies. The rows
# still differ between runs because the array was already shuffled in place by
# the unseeded shuffle() call above, so the seeded shuffle starts from a
# different row order each time. Reload the data right before this cell to
# get the same shuffled result on every run.

array([[ 2500.,    50.,   365.,  2500.,  4100., 16650.],
       [ 2000.,    40.,   365.,  7200., 10200., 16050.],
       [ 2000.,    40.,   365.,  3401.,  5056.,  8996.],
       ...,
       [ 2000.,    40.,   365.,  3501.,  4506., 15166.],
       [ 4000.,    50.,   365.,  6000.,  7000., 22250.],
       [ 4000.,    50.,   365.,  5350.,  6850., 15150.]])

## Casting

In [154]:
# Reload the numeric dataset for the casting examples. The path comes from the
# file_num constant defined at the top of the notebook, instead of repeating
# the file name as a string literal.
lending_co_data_numeric = np.loadtxt(file_num, delimiter=",")
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [156]:
# astype = assign type / dtype = data type
lending_co_data_numeric.astype(dtype=np.int32)

array([[ 2000,    40,   365,  3121,  4241, 13621],
       [ 2000,    40,   365,  3061,  4171, 15041],
       [ 1000,    40,   365,  2160,  3280, 15340],
       ...,
       [ 2000,    40,   365,  4201,  5001, 16600],
       [ 1000,    40,   365,  2080,  3320, 15600],
       [ 2000,    40,   365,  4601,  4601, 16600]], dtype=int32)

In [157]:
lending_co_data_numeric = lending_co_data_numeric.astype(dtype=str)

In [158]:
lending_co_data_numeric

array([['2000.0', '40.0', '365.0', '3121.0', '4241.0', '13621.0'],
       ['2000.0', '40.0', '365.0', '3061.0', '4171.0', '15041.0'],
       ['1000.0', '40.0', '365.0', '2160.0', '3280.0', '15340.0'],
       ...,
       ['2000.0', '40.0', '365.0', '4201.0', '5001.0', '16600.0'],
       ['1000.0', '40.0', '365.0', '2080.0', '3320.0', '15600.0'],
       ['2000.0', '40.0', '365.0', '4601.0', '4601.0', '16600.0']],
      dtype='<U32')

In [159]:
type(lending_co_data_numeric)

numpy.ndarray

In [160]:
# Convert the string array back to float32 and keep that result, then display
# an int32 version of it (the int32 array is only shown, not stored).
lending_co_data_numeric = lending_co_data_numeric.astype(dtype=np.float32)
lending_co_data_numeric.astype(dtype=np.int32)

array([[ 2000,    40,   365,  3121,  4241, 13621],
       [ 2000,    40,   365,  3061,  4171, 15041],
       [ 1000,    40,   365,  2160,  3280, 15340],
       ...,
       [ 2000,    40,   365,  4201,  5001, 16600],
       [ 1000,    40,   365,  2080,  3320, 15600],
       [ 2000,    40,   365,  4601,  4601, 16600]], dtype=int32)

In [163]:
# Reload the numeric dataset (path taken from the file_num constant defined at
# the top of the notebook) and convert every value to its string form.
lending_co_data_numeric = np.loadtxt(file_num, delimiter=',')
lending_co_data_numeric = lending_co_data_numeric.astype(dtype=str)
lending_co_data_numeric

array([['2000.0', '40.0', '365.0', '3121.0', '4241.0', '13621.0'],
       ['2000.0', '40.0', '365.0', '3061.0', '4171.0', '15041.0'],
       ['1000.0', '40.0', '365.0', '2160.0', '3280.0', '15340.0'],
       ...,
       ['2000.0', '40.0', '365.0', '4201.0', '5001.0', '16600.0'],
       ['1000.0', '40.0', '365.0', '2080.0', '3320.0', '15600.0'],
       ['2000.0', '40.0', '365.0', '4601.0', '4601.0', '16600.0']],
      dtype='<U32')

In [164]:
# astype returns a new array and leaves the original untouched. The chained
# conversion below is not assigned to anything, so lending_co_data_numeric is
# still the string array when it is displayed on the next line.
lending_co_data_numeric.astype(dtype=np.float32).astype(dtype=np.int32)
lending_co_data_numeric

array([['2000.0', '40.0', '365.0', '3121.0', '4241.0', '13621.0'],
       ['2000.0', '40.0', '365.0', '3061.0', '4171.0', '15041.0'],
       ['1000.0', '40.0', '365.0', '2160.0', '3280.0', '15340.0'],
       ...,
       ['2000.0', '40.0', '365.0', '4201.0', '5001.0', '16600.0'],
       ['1000.0', '40.0', '365.0', '2080.0', '3320.0', '15600.0'],
       ['2000.0', '40.0', '365.0', '4601.0', '4601.0', '16600.0']],
      dtype='<U32')

## Stripping Data

In [208]:
import numpy as np

In [209]:
# Read three of the text columns (indices 1, 2 and 4) as strings, skipping the header row.
lending_co_total_price = np.genfromtxt(
    "Lending-Company-Total-Price.csv",
    delimiter=',',
    skip_header=1,
    usecols=(1, 2, 4),
    dtype=str,
)
lending_co_total_price

array([['id_1', 'Product B', 'Location 2'],
       ['id_2', 'Product B', 'Location 3'],
       ['id_3', 'Product C', 'Location 5'],
       ...,
       ['id_413', 'Product B', 'Location 135'],
       ['id_414', 'Product C', 'Location 200'],
       ['id_415', 'Product A', 'Location 8']], dtype='<U12')

In [210]:
# np.char.strip is NumPy's element-wise string strip; it is called directly here
# instead of going through the legacy np.chararray class.
# NOTE: strip() treats its second argument as a *set of characters* to trim from both
# ends, not as a literal prefix. That is safe for the values shown in this notebook
# (digits and the product letters), because none of them appear in the stripped sets.

# remove id from the first column
lending_co_total_price[:,0] = np.char.strip(lending_co_total_price[:,0], "id_")
# remove Product from the second
lending_co_total_price[:,1] = np.char.strip(lending_co_total_price[:,1], "Product ")
# and Location from the third one
lending_co_total_price[:,2] = np.char.strip(lending_co_total_price[:,2], "Location ")

lending_co_total_price

array([['1', 'B', '2'],
       ['2', 'B', '3'],
       ['3', 'C', '5'],
       ...,
       ['413', 'B', '135'],
       ['414', 'C', '200'],
       ['415', 'A', '8']], dtype='<U12')

In [211]:
# we can combine stripping with substituting to transform all the letters into numbers
# Each product letter is mapped to its numeric code. The codes are written as strings so
# the substitution stays inside the array's string dtype instead of relying on NumPy to
# convert an integer inside np.where; the values are cast to integers in a later cell.
product_codes = {'A': '1', 'B': '2', 'C': '3', 'D': '4', 'E': '5', 'F': '6'}
for letter, code in product_codes.items():
    lending_co_total_price[:,1] = np.where(lending_co_total_price[:,1] == letter,
                                           code,
                                           lending_co_total_price[:,1])

lending_co_total_price

array([['1', '2', '2'],
       ['2', '2', '3'],
       ['3', '3', '5'],
       ...,
       ['413', '2', '135'],
       ['414', '3', '200'],
       ['415', '1', '8']], dtype='<U12')

In [212]:
# Every column now holds digit-only strings, so the whole array can be cast to 32-bit integers.
lending_co_total_price = lending_co_total_price.astype(np.int32)
lending_co_total_price

array([[  1,   2,   2],
       [  2,   2,   3],
       [  3,   3,   5],
       ...,
       [413,   2, 135],
       [414,   3, 200],
       [415,   1,   8]], dtype=int32)

## Stacking

In [213]:
# Reload the complete numeric dataset so the stacking examples start from a fresh float array.
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter=",")
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [214]:
# Mean-impute the missing entries in three steps:
#   1. load once to get each column's mean (ignoring NaNs) and a sentinel above the largest real value,
#   2. load again, letting genfromtxt write that sentinel into every empty field,
#   3. replace the sentinel in each column with that column's mean.

lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter=';')

temp_mean = np.nanmean(lending_co_data_numeric_NAN, axis=0).round(2)
temp_filler = np.nanmax(lending_co_data_numeric_NAN).round(2) + 1

lending_co_data_numeric_NAN = np.genfromtxt(
    "Lending-company-Numeric-NAN.csv",
    delimiter=';',
    filling_values=temp_filler,
)

for col, col_mean in enumerate(temp_mean):
    is_filler = lending_co_data_numeric_NAN[:, col] == temp_filler
    lending_co_data_numeric_NAN[is_filler, col] = col_mean

lending_co_data_numeric_NAN

array([[ 2000.  ,    40.  ,   365.  ,  3121.  ,  4241.  , 13621.  ],
       [ 2000.  ,    40.  ,   365.  ,  3061.  ,  4171.  , 15041.  ],
       [ 1000.  ,    40.  ,   365.  ,  2160.  ,  3280.  , 15340.  ],
       ...,
       [ 2250.25,    40.  ,   365.  ,  4201.  ,  5001.  , 16600.  ],
       [ 1000.  ,    40.  ,   365.  ,  2080.  ,  3320.  , 15600.  ],
       [ 2000.  ,    40.  ,   365.  ,  4601.  ,  4601.  , 16600.  ]])

In [215]:
# stack the first two columns (column 1 first, then column 0); each column becomes a row of the result
np.stack((lending_co_data_numeric[:,1], lending_co_data_numeric[:,0]))

array([[  40.,   40.,   40., ...,   40.,   40.,   40.],
       [2000., 2000., 1000., ..., 2000., 1000., 2000.]])

In [216]:
# Transpose the first two columns so that each column becomes a row (column 0 first).
lending_co_data_numeric[:, :2].T

array([[2000., 2000., 1000., ..., 2000., 1000., 2000.],
       [  40.,   40.,   40., ...,   40.,   40.,   40.]])

In [224]:
# Stack the first three columns along axis 0: every column becomes one row of the result.
np.stack([lending_co_data_numeric[:, col] for col in range(3)], axis=0)

array([[2000., 2000., 1000., ..., 2000., 1000., 2000.],
       [  40.,   40.,   40., ...,   40.,   40.,   40.],
       [ 365.,  365.,  365., ...,  365.,  365.,  365.]])

In [217]:
# stack more than two arrays: with axis=1 the three columns are placed side by side,
# so the result has one row per record and three columns
np.stack([lending_co_data_numeric[:, col] for col in range(3)], axis=1)

array([[2000.,   40.,  365.],
       [2000.,   40.,  365.],
       [1000.,   40.,  365.],
       ...,
       [2000.,   40.,  365.],
       [1000.,   40.,  365.],
       [2000.,   40.,  365.]])

In [218]:
# (rows, columns) of the array loaded from Lending-company-Numeric.csv
lending_co_data_numeric.shape

(1043, 6)

In [225]:
# np.stack needs inputs of equal shape, so check that the mean-filled array matches
lending_co_data_numeric_NAN.shape

(1043, 6)

In [219]:
# stack 2-D arrays: the inputs are placed along a new leading axis, then [0, :, 0]
# selects the first column of the first input
np.stack([lending_co_data_numeric, lending_co_data_numeric_NAN], axis=0)[0, :, 0]

array([2000., 2000., 1000., ..., 2000., 1000., 2000.])

In [230]:
# vstack places the rows of the second array beneath the rows of the first; the column count is unchanged
np.vstack((lending_co_data_numeric, lending_co_data_numeric_NAN))

array([[ 2000.  ,    40.  ,   365.  ,  3121.  ,  4241.  , 13621.  ],
       [ 2000.  ,    40.  ,   365.  ,  3061.  ,  4171.  , 15041.  ],
       [ 1000.  ,    40.  ,   365.  ,  2160.  ,  3280.  , 15340.  ],
       ...,
       [ 2250.25,    40.  ,   365.  ,  4201.  ,  5001.  , 16600.  ],
       [ 1000.  ,    40.  ,   365.  ,  2080.  ,  3320.  , 15600.  ],
       [ 2000.  ,    40.  ,   365.  ,  4601.  ,  4601.  , 16600.  ]])

In [231]:
# the stacked result is an ordinary ndarray
type(np.vstack((lending_co_data_numeric, lending_co_data_numeric_NAN)))

numpy.ndarray

In [233]:
# 1043 + 1043 rows, still 6 columns
np.vstack((lending_co_data_numeric, lending_co_data_numeric_NAN)).shape

(2086, 6)

In [237]:
# dstack joins along a third axis: every element is paired with its counterpart from the other array
np.dstack((lending_co_data_numeric, lending_co_data_numeric_NAN))

array([[[ 2000.  ,  2000.  ],
        [   40.  ,    40.  ],
        [  365.  ,   365.  ],
        [ 3121.  ,  3121.  ],
        [ 4241.  ,  4241.  ],
        [13621.  , 13621.  ]],

       [[ 2000.  ,  2000.  ],
        [   40.  ,    40.  ],
        [  365.  ,   365.  ],
        [ 3061.  ,  3061.  ],
        [ 4171.  ,  4171.  ],
        [15041.  , 15041.  ]],

       [[ 1000.  ,  1000.  ],
        [   40.  ,    40.  ],
        [  365.  ,   365.  ],
        [ 2160.  ,  2160.  ],
        [ 3280.  ,  3280.  ],
        [15340.  , 15340.  ]],

       ...,

       [[ 2000.  ,  2250.25],
        [   40.  ,    40.  ],
        [  365.  ,   365.  ],
        [ 4201.  ,  4201.  ],
        [ 5001.  ,  5001.  ],
        [16600.  , 16600.  ]],

       [[ 1000.  ,  1000.  ],
        [   40.  ,    40.  ],
        [  365.  ,   365.  ],
        [ 2080.  ,  2080.  ],
        [ 3320.  ,  3320.  ],
        [15600.  , 15600.  ]],

       [[ 2000.  ,  2000.  ],
        [   40.  ,    40.  ],
        [  365.  

In [235]:
# stack along a given axis: axis=-1 puts the new axis last, so each element is paired with its
# counterpart from the other array, matching the dstack output shown for these 2-D inputs
np.stack((lending_co_data_numeric, lending_co_data_numeric_NAN), axis=-1)

array([[[ 2000.  ,  2000.  ],
        [   40.  ,    40.  ],
        [  365.  ,   365.  ],
        [ 3121.  ,  3121.  ],
        [ 4241.  ,  4241.  ],
        [13621.  , 13621.  ]],

       [[ 2000.  ,  2000.  ],
        [   40.  ,    40.  ],
        [  365.  ,   365.  ],
        [ 3061.  ,  3061.  ],
        [ 4171.  ,  4171.  ],
        [15041.  , 15041.  ]],

       [[ 1000.  ,  1000.  ],
        [   40.  ,    40.  ],
        [  365.  ,   365.  ],
        [ 2160.  ,  2160.  ],
        [ 3280.  ,  3280.  ],
        [15340.  , 15340.  ]],

       ...,

       [[ 2000.  ,  2250.25],
        [   40.  ,    40.  ],
        [  365.  ,   365.  ],
        [ 4201.  ,  4201.  ],
        [ 5001.  ,  5001.  ],
        [16600.  , 16600.  ]],

       [[ 1000.  ,  1000.  ],
        [   40.  ,    40.  ],
        [  365.  ,   365.  ],
        [ 2080.  ,  2080.  ],
        [ 3320.  ,  3320.  ],
        [15600.  , 15600.  ]],

       [[ 2000.  ,  2000.  ],
        [   40.  ,    40.  ],
        [  365.  

In [221]:
# create some 3D arrays to show how dstack works for higher dimensions
# array_example_one: two 3x4 blocks, counting up from 1 and from 21 respectively
array_example_one = np.stack([np.arange(1, 13).reshape(3, 4),
                              np.arange(21, 33).reshape(3, 4)])
# array_example_two: the same layout with every element doubled
array_example_two = 2 * array_example_one

In [222]:
# for 3-D inputs dstack joins along the existing third axis (4 + 4 = 8) instead of adding a new one
np.dstack((array_example_one, array_example_two)).shape

(2, 3, 8)

In [238]:
# each row holds the four values from array_example_one followed by the four from array_example_two
np.dstack((array_example_one, array_example_two))

array([[[ 1,  2,  3,  4,  2,  4,  6,  8],
        [ 5,  6,  7,  8, 10, 12, 14, 16],
        [ 9, 10, 11, 12, 18, 20, 22, 24]],

       [[21, 22, 23, 24, 42, 44, 46, 48],
        [25, 26, 27, 28, 50, 52, 54, 56],
        [29, 30, 31, 32, 58, 60, 62, 64]]])

In [223]:
# np.stack always inserts a new axis; axis=2 places it at index 2, giving a 4-D result
np.stack((array_example_one, array_example_two), axis=2).shape

(2, 3, 2, 4)

In [240]:
# with axis=1 the new axis comes right after the first one: each outer block holds one (3, 4) slice from each input
np.stack((array_example_one, array_example_two), axis=1)

array([[[[ 1,  2,  3,  4],
         [ 5,  6,  7,  8],
         [ 9, 10, 11, 12]],

        [[ 2,  4,  6,  8],
         [10, 12, 14, 16],
         [18, 20, 22, 24]]],


       [[[21, 22, 23, 24],
         [25, 26, 27, 28],
         [29, 30, 31, 32]],

        [[42, 44, 46, 48],
         [50, 52, 54, 56],
         [58, 60, 62, 64]]]])

## Concatenate

In [245]:
# Reload the complete numeric dataset so the concatenation examples start from a fresh copy.
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter=',')
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [246]:
# first row of the dataset (a 1-D array of 6 values)
lending_co_data_numeric[0,:]

array([ 2000.,    40.,   365.,  3121.,  4241., 13621.])

In [247]:
# second row of the dataset
lending_co_data_numeric[1,:]

array([ 2000.,    40.,   365.,  3061.,  4171., 15041.])

In [264]:
# Join the first two rows end to end. Both inputs are 1-D, so the result is 1-D as well:
# the concatenated array has the same number of dimensions as the inputs.
np.concatenate([lending_co_data_numeric[0], lending_co_data_numeric[1]])

array([ 2000.,    40.,   365.,  3121.,  4241., 13621.,  2000.,    40.,
         365.,  3061.,  4171., 15041.])

In [249]:
# First pass: load with NaNs in the empty fields to measure the data.
lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter=';')

# Column means (NaNs ignored) and a sentinel one above the largest real value.
temporary_mean = np.nanmean(lending_co_data_numeric_NAN, axis=0).round(2)
temporary_fill = np.nanmax(lending_co_data_numeric_NAN).round(2) + 1

# Second pass: let genfromtxt write the sentinel into every empty field.
lending_co_data_numeric_NAN = np.genfromtxt(
    "Lending-company-Numeric-NAN.csv",
    delimiter=';',
    filling_values=temporary_fill,
)

# Switch the sentinel for the appropriate mean in every column.
for col, col_mean in enumerate(temporary_mean):
    column = lending_co_data_numeric_NAN[:, col]  # a view, so the assignment below edits the array itself
    column[column == temporary_fill] = col_mean

lending_co_data_numeric_NAN

array([[ 2000.  ,    40.  ,   365.  ,  3121.  ,  4241.  , 13621.  ],
       [ 2000.  ,    40.  ,   365.  ,  3061.  ,  4171.  , 15041.  ],
       [ 1000.  ,    40.  ,   365.  ,  2160.  ,  3280.  , 15340.  ],
       ...,
       [ 2250.25,    40.  ,   365.  ,  4201.  ,  5001.  , 16600.  ],
       [ 1000.  ,    40.  ,   365.  ,  2080.  ,  3320.  , 15600.  ],
       [ 2000.  ,    40.  ,   365.  ,  4601.  ,  4601.  , 16600.  ]])

In [252]:
# (rows, columns) of the complete dataset
lending_co_data_numeric.shape

(1043, 6)

In [253]:
# the mean-filled array has the same shape, so the two can be joined along either axis
lending_co_data_numeric_NAN.shape

(1043, 6)

In [265]:
# default axis=0: the rows of the second array follow the rows of the first
np.concatenate((lending_co_data_numeric, lending_co_data_numeric_NAN))

array([[ 2000.  ,    40.  ,   365.  ,  3121.  ,  4241.  , 13621.  ],
       [ 2000.  ,    40.  ,   365.  ,  3061.  ,  4171.  , 15041.  ],
       [ 1000.  ,    40.  ,   365.  ,  2160.  ,  3280.  , 15340.  ],
       ...,
       [ 2250.25,    40.  ,   365.  ,  4201.  ,  5001.  , 16600.  ],
       [ 1000.  ,    40.  ,   365.  ,  2080.  ,  3320.  , 15600.  ],
       [ 2000.  ,    40.  ,   365.  ,  4601.  ,  4601.  , 16600.  ]])

In [266]:
# 2086 rows = 1043 + 1043; the number of columns stays 6
np.concatenate((lending_co_data_numeric, lending_co_data_numeric_NAN)).shape

(2086, 6)

In [267]:
# axis=1 joins side by side: every row is extended with the columns of the second array
np.concatenate((lending_co_data_numeric, lending_co_data_numeric_NAN), axis=1)

array([[ 2000.,    40.,   365., ...,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365., ...,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365., ...,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365., ...,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365., ...,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365., ...,  4601.,  4601., 16600.]])

In [268]:
# 12 columns = 6 + 6; the row count stays 1043
np.concatenate((lending_co_data_numeric, lending_co_data_numeric_NAN), axis=1).shape

(1043, 12)

In [256]:
# A (2, 3, 4) block holding the integers 1..24 in order, and a second array with each value doubled.
array_example_1 = np.arange(1, 25).reshape(2, 3, 4)
array_example_2 = array_example_1 + array_example_1

In [257]:
# display the first example array: two blocks of 3 rows x 4 columns
array_example_1

array([[[ 1,  2,  3,  4],
        [ 5,  6,  7,  8],
        [ 9, 10, 11, 12]],

       [[13, 14, 15, 16],
        [17, 18, 19, 20],
        [21, 22, 23, 24]]])

In [258]:
# display the second example array (every element of array_example_1 doubled)
array_example_2

array([[[ 2,  4,  6,  8],
        [10, 12, 14, 16],
        [18, 20, 22, 24]],

       [[26, 28, 30, 32],
        [34, 36, 38, 40],
        [42, 44, 46, 48]]])

In [269]:
# vstack joins the 3-D arrays along axis 0: two (2, 3, 4) inputs give four (3, 4) blocks
np.vstack((array_example_1, array_example_2))

array([[[ 1,  2,  3,  4],
        [ 5,  6,  7,  8],
        [ 9, 10, 11, 12]],

       [[13, 14, 15, 16],
        [17, 18, 19, 20],
        [21, 22, 23, 24]],

       [[ 2,  4,  6,  8],
        [10, 12, 14, 16],
        [18, 20, 22, 24]],

       [[26, 28, 30, 32],
        [34, 36, 38, 40],
        [42, 44, 46, 48]]])

In [270]:
# dstack joins along axis 2: every row grows from 4 to 8 values
np.dstack((array_example_1, array_example_2))

array([[[ 1,  2,  3,  4,  2,  4,  6,  8],
        [ 5,  6,  7,  8, 10, 12, 14, 16],
        [ 9, 10, 11, 12, 18, 20, 22, 24]],

       [[13, 14, 15, 16, 26, 28, 30, 32],
        [17, 18, 19, 20, 34, 36, 38, 40],
        [21, 22, 23, 24, 42, 44, 46, 48]]])

In [271]:
# hstack joins along axis 1: every block grows from 3 to 6 rows
np.hstack((array_example_1, array_example_2))

array([[[ 1,  2,  3,  4],
        [ 5,  6,  7,  8],
        [ 9, 10, 11, 12],
        [ 2,  4,  6,  8],
        [10, 12, 14, 16],
        [18, 20, 22, 24]],

       [[13, 14, 15, 16],
        [17, 18, 19, 20],
        [21, 22, 23, 24],
        [26, 28, 30, 32],
        [34, 36, 38, 40],
        [42, 44, 46, 48]]])

In [275]:
# 1-D inputs may differ in length: the first row (6 values) is followed by the first column (1043 values)
np.concatenate((lending_co_data_numeric[0,:], lending_co_data_numeric[:,0]))

array([2000.,   40.,  365., ..., 2000., 1000., 2000.])

In [277]:
# Intentionally raises ValueError: when joining along the default axis=0, the other dimension
# must match, but the inputs have 6 columns and 1 column ([:,:1] keeps the slice 2-D).
np.concatenate((lending_co_data_numeric, lending_co_data_numeric[:,:1]))

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 6 and the array at index 1 has size 1

In [278]:
# axis=1 works because both inputs have the same number of rows; the first column is appended as a seventh column
np.concatenate((lending_co_data_numeric, lending_co_data_numeric[:,:1]), axis=1)

array([[ 2000.,    40.,   365., ...,  4241., 13621.,  2000.],
       [ 2000.,    40.,   365., ...,  4171., 15041.,  2000.],
       [ 1000.,    40.,   365., ...,  3280., 15340.,  1000.],
       ...,
       [ 2000.,    40.,   365., ...,  5001., 16600.,  2000.],
       [ 1000.,    40.,   365., ...,  3320., 15600.,  1000.],
       [ 2000.,    40.,   365., ...,  4601., 16600.,  2000.]])

## Unique 

In [279]:
# Reload the complete numeric dataset so the np.unique examples start from a fresh copy.
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter=',')
lending_co_data_numeric

array([[ 2000.,    40.,   365.,  3121.,  4241., 13621.],
       [ 2000.,    40.,   365.,  3061.,  4171., 15041.],
       [ 1000.,    40.,   365.,  2160.,  3280., 15340.],
       ...,
       [ 2000.,    40.,   365.,  4201.,  5001., 16600.],
       [ 1000.,    40.,   365.,  2080.,  3320., 15600.],
       [ 2000.,    40.,   365.,  4601.,  4601., 16600.]])

In [280]:
# with no axis argument np.unique flattens the array and returns every distinct value, sorted
np.unique(lending_co_data_numeric)

array([-2870., -2550., -2450., ..., 52751., 54625., 64001.])

In [285]:
# print() shows the array's str() form (no "array(...)" wrapper); end=" " replaces the trailing newline with a space
print(np.unique(lending_co_data_numeric),end=" ")

[-2870. -2550. -2450. ... 52751. 54625. 64001.] 

In [286]:
# distinct values in the second column (index 1)
np.unique(lending_co_data_numeric[:,1])

array([ 35.,  40.,  50., 125., 165.])

In [282]:
# return_counts=True adds a second array holding how many times each distinct value occurs
np.unique(lending_co_data_numeric[:,1], return_counts=True)

(array([ 35.,  40.,  50., 125., 165.]), array([  4, 567, 451,  19,   2]))

In [281]:
# np.unique returns the distinct values of the array, sorted in increasing order
np.unique(lending_co_data_numeric[:,1])

array([ 35.,  40.,  50., 125., 165.])

In [283]:
# return_index=True adds the position of each value's first occurrence;
# the result is ordered (values, first indices, counts) regardless of keyword order
np.unique(lending_co_data_numeric[:,1], return_counts=True, return_index=True)

(array([ 35.,  40.,  50., 125., 165.]),
 array([327,   0,   4,  19,  27]),
 array([  4, 567, 451,  19,   2]))

In [287]:
# np.unique also works on strings: the distinct labels come back in sorted order,
# with uppercase labels ahead of lowercase ones (see the output below).
array_example = np.array([
    "a1", "a3", "A1", "A3", "A3", "AA1",
    "B1", "A2", "B1", "A2", "B2", "B2",
    "B3", "a2", "a3", "B3", "B3", "a3",
])
np.unique(array_example)

array(['A1', 'A2', 'A3', 'AA1', 'B1', 'B2', 'B3', 'a1', 'a2', 'a3'],
      dtype='<U3')