# Introduction to NumPy

## Installing & Updating the Package

NumPy comes pre-installed if you're using Anaconda. Otherwise, install it with: pip install numpy

To get the latest version: pip install numpy --upgrade

In [3]:
!pip install numpy




[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
# Check the version
import numpy as np
print(np.__version__)

2.0.1


In [5]:
### Importing NumPy

In [6]:
import numpy as np

## We import packages (libraries) with "import name_of_package".
## Using "as np" is not mandatory. It's just common practice.

### Using NumPy

In [7]:
array_a = np.array([1,2,3])

## "np." -> Signals this is a NumPy function, since we imported the package as "np".
## np.array() creates an array with the values we input.

In [8]:
array_a

## Displays the variable (unique to Jupyter).

array([1, 2, 3])

In [9]:
array_b = np.array([[1,2,3],[4,5,6]])
array_b

array([[1, 2, 3],
       [4, 5, 6]])

### NumPy Documentation
https://numpy.org/devdocs/ <- <i>A link to the NumPy documentation</i>

In [None]:
np.mean(array_b, axis = 0)

## The np.mean() function takes an array and returns its arithmetic mean.
## Axis 0 -> We're running this function over each column.

In [None]:
# Why NumPy?

In [None]:
import numpy as np

In [None]:
### Ndarrays

In [None]:
array_a = np.array([1,2,3])

In [None]:
print(array_a)

In [None]:
type(array_a)

In [None]:
array_a.shape

## "shape" is an attribute of the ndarray, rather than a method, so it's not callable (doesn't need "()" at the end)

In [None]:
array_b = np.array([[7,8,9],[10,11,12]])

In [None]:
type(array_b)

In [None]:
print(array_b)

In [None]:
array_b.shape
#array_b.shape[0]
#array_b.shape[1]

## We can use indexing to get a specific part of the output tuple. 

In [None]:
array_c = np.array(13)

In [None]:
type(array_c)

## array_c is a 0-D array

In [None]:
print(array_c)

In [None]:
array_c.shape

In [None]:
array_d = np.array([15])

## We can add square brackets to assign the right number of dimensions to the array we're creating.

In [None]:
type(array_d)

In [None]:
print(array_d)

In [None]:
array_d.shape

## Similar to the shape of array_a from the start of the lecture, "(3,)". Here the shape is "(1,)" because array_d is a vector of size 1, rather than a scalar.

In [None]:
array_e = np.array([[15]])
array_e

### List vs Array

In [None]:
#list_a = [[1,2,3,4,5,6]]
list_a = [[1,2,3],[4,5,6]]

## Syntax for creating lists is similar to syntax for creating arrays.

In [None]:
len(list_a)

## The length is 2 because we're only counting the sublists of list_a. 

In [None]:
array_a = np.array(list_a)

In [None]:
type(list_a)

In [None]:
type(array_a)

In [None]:
print(list_a)

In [None]:
print(array_a)

In [None]:
array_a.shape

In [None]:
# list_a.shape results in an error because lists don't have a shape.
#len(list_a)

len(list_a[1])

## We can use indexing to refer to a specific sublist of a list.

In [None]:
list_a

In [None]:
array_a

In [None]:
list_b = list_a[0] + list_a[1]
array_b = array_a[0] + array_a[1]

In [None]:
print(list_b)

## The primary purpose of lists is to store data, so list_b concatenates the rows of list_a.

In [None]:
print(array_b)

## The primary purpose of arrays is computational, so array_b adds the values elementwise.

In [None]:
import math
# array_e = math.sqrt(list_a)       # We can't provide an entire list as the input.
# array_e = math.sqrt(list_a[1,0])  # Lists don't accept multi-dimensional indices like [1,0], so this is an error (list_a[1][0] would work).
array_e = math.sqrt(array_a[1,0])
print(array_e)

## The "e" in "array_e" stands for "example".

In [10]:
np.sqrt(array_a)

## The square root function computes the square root for every element of the array. 

array([1.        , 1.41421356, 1.73205081])

In [None]:
# NumPy Fundamentals

In [None]:
import numpy as np

In [None]:
## Indexing

In [None]:
array_a = np.array([[1,2,3],[4,5,6]])
array_a

In [None]:
### Specific Values

In [None]:
array_a[1]

## By adding numbers between square brackets, we can reference specific values of the array. 
## Python uses 0-indexing, so the first position has an index 0, the second position has index 1, and so on.

In [None]:
array_a[1][0]

## We can index every dimension of the array separately. 

In [None]:
array_a[1,0]

## [1,0] is equivalent to [1][0] 

In [None]:
array_a[:,0]

## The ":" is equivalent to "from start to end" in this context. 

In [None]:
### Negative Indices

In [None]:
array_b = np.array([1,2,3])
array_b[-1]

## Negative indices mean traversing from the back. 
## No such thing as  -0 , so the first negative index is -1.

In [None]:
array_a

In [None]:
array_a[-1]

In [None]:
# array_a[-3] 
# Goes out of bounds: -3 refers to the third row from the end, but array_a only has 2 rows.

In [None]:
## Assigning Values

In [None]:
array_a = np.array([[1,2,3],[4,5,6]])
array_a

In [None]:
array_a[0,2] = 9
array_a 

## Assign a value to an individual element.

In [None]:
array_a[0] = 9
array_a

## Assign a value to an entire row.

In [None]:
array_a[:,0] = 9
array_a

## Assign a value to an entire column.

In [None]:
list_a = [8,7,8]

array_a[0] = list_a
array_a

## Assign different values to an entire row via a list. 

In [None]:
type(array_a[0])

In [None]:
array_a[:] = 9
array_a

## Assign the same value to all the individual elements in the array.

In [None]:
array_a = 9
array_a

## Type assignment in Python is dynamic. Hence, a variable's type can change based on what values we assign to it. 
## Here, array_a changes from an ndarray to an integer.

In [None]:
type(array_a)

In [None]:
array_a = np.array([[1,2,3],[4,5,6]])
array_a

In [None]:
## Elementwise Properties

In [None]:
array_a = np.array([7,8,9])
array_a

In [None]:
array_b = np.array([[1,2,3],[4,5,6]])
array_b

In [None]:
array_b * 2

## Multiplying each element of array_b by 2

In [None]:
list_a = [1,2,3]
list_a + [2]

## Since lists don't work elementwise, we're concatenating [2] to list_a.

In [None]:
array_a + 2

## Elementwise addition adds 2 to each element of array_a.

In [None]:
array_a * array_b[1]

## Elementwise multiplication. 
## We multiply each individual element of array_a by its corresponding element in the second row of array_b.

In [None]:
array_b - array_a

## The order of the elements matters for elementwise subtraction, division, as well as other operations. 

In [None]:
## Types of Data Supported by NumPy

In [None]:
array_a = np.array([[1,2,3],[4,5,6]])
array_a

In [None]:
array_a = np.array([[1,2,3],[4,5,6]], dtype = np.float16)
array_a

# Defining all the values as floats (decimals).

In [None]:
array_a = np.array([[1,2,3],[4,5,6]], dtype = np.complex64)
array_a

# Defining all the values as complex numbers.

In [None]:
array_a = np.array([[1,2,0],[4,5,6]], dtype = np.bool)
array_a

# Defining all the values as Booleans.

In [None]:
# np.str_ is NumPy's string type. The old "np.str" alias was removed in NumPy 1.24.
array_a = np.array([[10,2,3],[4,5,6]], dtype = np.str_)
array_a

# Defining all the values as text.

In [None]:
# https://numpy.org/devdocs/reference/generated/numpy.dtype.kind.html <- A link to the documentation explaining the unicode abbreviation.

In [None]:
## Characteristics of NumPy Functions

In [None]:
### Universal Functions

In [None]:
# https://numpy.org/devdocs/reference/ufuncs.html <- A link to the documentation page on Universal Functions

In [None]:
### Broadcasting

In [None]:
array_a = np.array([1,2,3])
array_a

In [None]:
array_b = np.array([[1],[2]])
array_b

In [None]:
matrix_C = np.array([[1,2,3],[4,5,6]])
matrix_C

In [None]:
np.add(array_b, matrix_C)

## Adding up values, even though the arrays don't have matching shapes. 

In [None]:
### Type Casting

In [None]:
np.add(array_b, matrix_C, dtype = np.float64)

## We can define the datatype of the output.

In [None]:
### Running over an Axis

In [None]:
np.mean(matrix_C, axis = 1)

## Axis = 0 runs the function over every column. 
## Axis = 1 runs the function over every row. 

In [None]:
matrix_C 

In [None]:
# Working with Arrays

In [None]:
## Slicing

In [None]:
matrix_A = np.array([[1,2,3],[4,5,6]])
matrix_A

In [None]:
### Basic Slicing

In [None]:
matrix_A[:]

## The default start and stop for slicing are the origin and the end of the array. 
## Hence, [:] includes the entire array. 

In [None]:
type(matrix_A[:,:])

## [:,:] -> All rows, and all columns.

In [None]:
matrix_A[:2]

# [:2] -> All the rows up to the 3rd one (excluding the third one).

In [None]:
matrix_A[1]

In [None]:
matrix_A[:-1]

# [:-1] -> All the rows up to the last one.

In [None]:
matrix_A[:,1:]

# All the rows, but only the columns from the one with index 1 (second column) onwards.

In [None]:
matrix_A

In [None]:
matrix_A[1:,1:]

# All the rows after the first one and all the columns after the first one.

In [None]:
### Stepwise Slicing

In [None]:
matrix_B = np.array([[1,1,1,2,0], [3,6,6,7,4], [4,5,3,8,0]])
matrix_B

In [None]:
matrix_B[-1::-1,::2]

# The syntax for each dimension is "[start : stop : step]". 
# A negative step means we're going through the array in reverse.

In [None]:
### Conditional Slicing

In [None]:
matrix_C = np.array([[1,1,1,2,0], [3,6,6,7,4], [4,5,3,8,0]])
matrix_C

In [None]:
matrix_C[:,0]

In [None]:
matrix_C[:,0] > 2

# Returns True/False based on whether the individual element satisfies the condition. 

In [None]:
matrix_C[:,:] > 2

# Returns True/False based on whether the individual element satisfies the condition. 

In [None]:
matrix_C[matrix_C[:,:] % 2 == 0]

# Returns the actual values which satisfy the condition, not simply True or False.

In [None]:
matrix_C[(matrix_C[:,:] % 2 == 0) | (matrix_C[:,:] <= 4)]

# We can have more complex conditions, which are comprised of several smaller conditions. 
# & -> Both conditions must be met. 
# | -> Either condition can be met. 

In [None]:
## Dimensions and the Squeeze Function

In [None]:
matrix_D = np.array([[1,1,1,2,0], [3,6,6,7,4], [4,5,3,8,0]])
matrix_D

In [None]:
type(matrix_D[0,0])

# Fixing both indices. 
# Returns a single element (a NumPy scalar with shape (), i.e. it behaves like a 0-D array)

In [None]:
print(matrix_D[0,0])

In [None]:
type(matrix_D[0,0:1])

# 1 index is fixed, the second one is a slice
# 1-D array

In [None]:
print(matrix_D[0,0:1])

In [None]:
type(matrix_D[0:1,0:1])

# Both indices are ranges (slices)
# 2-D array

In [None]:
print(matrix_D[0:1,0:1])

In [None]:
print(matrix_D[0,0].shape)
print(matrix_D[0,0:1].shape)
print(matrix_D[0:1,0:1].shape)

# Same value stored in 3 different ways -> 0-D, 1-D and 2-D array

In [None]:
print(matrix_D[0:1,0:1].squeeze())

## Removes excess dimensions

In [None]:
np.squeeze(matrix_D[0:1,0:1])

## The function is equivalent to the method. 

In [None]:
print(matrix_D[0,0].squeeze().shape)
print(matrix_D[0,0:1].squeeze().shape)
print(matrix_D[0:1,0:1].squeeze().shape)

## All excess dimensions are lost and our outputs are aligned. 

In [None]:
# Generating Data w/ Numpy

In [None]:
### np.empty(), np.zeros(), np.ones(), np.full()

In [None]:
array_empty = np.empty(shape = (2,3))
array_empty

In [None]:
# zeros
array_0s = np.zeros(shape  = (2,3))
array_0s

In [None]:
array_0s = np.zeros(shape = (2,3), dtype = np.int8) 
array_0s

In [None]:
# ones
array_1s = np.ones(shape  = (2,3))
array_1s

In [None]:
# full
array_full = np.full(shape = (2,3), fill_value = 2) # One additional mandatory argument - fill_value -> scalar
array_full

In [None]:
array_full = np.full(shape = (2,3), fill_value = 'Three-Six-Five')
array_full

In [None]:
### "_like" functions

In [None]:
matrix_A = np.array([[1,0,9,2,2],[3,23,4,5,1],[0,2,3,4,1]])
matrix_A

In [None]:
array_empty_like = np.empty_like(matrix_A)    

# Shape and type are like the prototype. 
# If we want to override this, we can define dtype and shape and pass different values (but why even use empty_like then). 

array_empty_like

In [None]:
array_0s_like = np.zeros_like(matrix_A)    
array_0s_like

# We have corresponding functions for 1s and full as well. 

In [None]:
#range(30)
list(range(30))

# range(30) results in a range object.
# list(range(30)) creates a list with all the values in this range.

In [None]:
array_rng = np.arange(30)
array_rng

## Creates an ndarray with the values in this range.

In [None]:
# array_rng = np.arange(30)
array_rng = np.arange(stop = 30)
array_rng

# "stop" is the only value np.arange() needs. "start" defaults to the origin (0) and "step" defaults to 1.
# If we pass a single positional value, e.g. np.arange(30), the function treats it as the "stop".
# Passing only the keyword "start", i.e. np.arange(start = 30), raises a TypeError because no "stop" is specified.

In [None]:
array_rng = np.arange(start = 0, stop =  30)
array_rng

In [None]:
array_rng = np.arange(start = 0, stop =  30, step = 2.5)
array_rng

# "Step" doesn't have to be the same type as the values of the array. 

In [None]:
# array_rng = np.arange(start = 0, stop =  30, step = 2.5, dtype = np.float32)
array_rng = np.arange(start = 0, stop =  30, step = 2.5, dtype = np.int32)
array_rng

# With dtype = np.float32 we get the expected 0, 2.5, 5, ..., 27.5.
# With an integer dtype the values are NOT computed first and truncated afterwards:
# NumPy casts the first two values (0 and 2.5 -> 0 and 2) and reuses their difference as the step,
# so the result is 0, 2, 4, ..., 22 (still 12 elements). Avoid mixing a non-integer step with an integer dtype.

In [None]:
## Random Generators

In [None]:
### Defining Random Generators

In [None]:
from numpy.random import Generator as gen
from numpy.random import PCG64 as pcg


## We load two classes from the numpy.random module.

In [None]:
array_RG = gen(pcg())

#array_RG.normal()
#array_RG.normal(size = 5)
array_RG.normal(size = (5,5))

# RG is short for Random Generator.

In [None]:
array_RG = gen(pcg(seed = 365)) 
array_RG.normal(size = (5,5))

# Re-running this cell provides a consistent output, since the seed (with fixed starting values) is set. 

In [None]:
array_RG.normal(size = (5,5))

# The seed only fixes the generator's starting state. Each call advances it, so this second draw differs from the first one.

In [None]:
### Generating Integers, Probabilities and Random Choices

In [None]:
array_RG = gen(pcg(seed = 365)) 
#array_RG.integers(10, size = (5,5))
array_RG.integers(low = 10, high = 100, size = (5,5))

# Generates integers within a range.

In [None]:
array_RG = gen(pcg(seed = 365)) 
array_RG.random(size = (5,5))

In [None]:
#array_RG.choice(matrix_A[0], size = (5,5))
array_RG = gen(pcg(seed = 365)) 
#array_RG.choice([1,2,3,4,5], size = (5,5))
array_RG.choice((1,2,3,4,5), p = [0.1,0.1,0.1,0.1,0.6],size = (5,5))

# Chooses among a given set (with possible weighted probabilities).

In [None]:
### Generating Arrays From Known Distributions

In [None]:
array_RG = gen(pcg(seed = 365)) 
array_RG.poisson(size = (5,5))

# The default Poisson distribution.

In [None]:
array_RG = gen(pcg(seed = 365)) 
array_RG.poisson(lam = 10,size = (5,5))

# Specifying lambda. 

In [None]:
array_RG = gen(pcg(seed = 365)) 
array_RG.binomial(n = 100, p = 0.4, size = (5,5))

# A binomial distribution with p = 0.4 and 100 trials. 

In [None]:
array_RG = gen(pcg(seed = 365)) 
array_RG.logistic(loc = 9, scale = 1.2, size = (5,5))

# A logistic distribution with a location = 9 and scale = 1.2.

In [None]:
### Applications of Random Generators

In [None]:
#### Creating Tests

In [None]:
# Re-seed the generator so the columns below are reproducible.
array_RG = gen(pcg(seed = 365))

# Each column holds 1000 draws from a different distribution.
# The draws are made in this order from the same generator.
array_column_1 = array_RG.normal(loc = 2, scale = 3, size = 1000)
array_column_2 = array_RG.normal(loc = 7, scale = 2, size = 1000)
array_column_3 = array_RG.logistic(loc = 11, scale = 3, size = 1000)
array_column_4 = array_RG.exponential(scale = 4, size = 1000)
array_column_5 = array_RG.geometric(p = 0.7, size = 1000)

# These are the individual columns of the dataset we're creating.

In [None]:
#random_test_data = np.array([array_column_1, array_column_2, array_column_3, array_column_4, array_column_5]).transpose()
random_test_data = np.array([array_column_1, array_column_2, array_column_3, array_column_4, array_column_5]).transpose()
random_test_data

# Use np.array to generate a new array with the 5 arrays we created earlier. 
# Use the transpose method to make sure our dataset isn't flipped. 

In [None]:
random_test_data.shape

In [None]:
np.savetxt("Random-Test-from-NumPy.csv", random_test_data, fmt = '%s', delimiter = ',')


# Saving the arrays to an external file we're creating. 

# file name -> "Random-Test-from-NumPy.csv"
# random_test_data -> data we're exporting (saving to an external file)
# format -> strings
# delimiter ","

# We'll talk more about these in just a bit. 

In [None]:
np.genfromtxt("Random-Test-from-NumPy.csv", delimiter = ',')

# Importing the data from the file we just created. 

In [None]:
rand_test_data = np.genfromtxt("Random-Test-from-NumPy.csv", delimiter = ',')
print(rand_test_data)

In [None]:
## Importing Data with NumPy

In [None]:
### np.loadtxt() vs np.genfromtxt()

In [None]:
lending_co_data_numeric_1 = np.loadtxt("Lending-Company-Numeric-Data.csv", delimiter = ',')
lending_co_data_numeric_1

# We can use Notepad++ to determine delimiters

In [None]:
lending_co_data_numeric_2 = np.genfromtxt("Lending-Company-Numeric-Data.csv", delimiter = ',')
lending_co_data_numeric_2

In [None]:
np.array_equal(lending_co_data_numeric_1, lending_co_data_numeric_2)

## np.array_equal() compares two arrays and tells us whether they're identical (same shape and same elements)

In [None]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-Company-Numeric-Data-NAN.csv", delimiter = ';')
lending_co_data_numeric_NAN

# np.loadtxt() fails to import incomplete datasets by default

In [None]:
# np.str_ is NumPy's string type. The old "np.str" alias was removed in NumPy 1.24.
lending_co_data_numeric_NAN = np.loadtxt("Lending-Company-Numeric-Data-NAN.csv", 
                                         delimiter = ';',
                                         dtype = np.str_)
lending_co_data_numeric_NAN

# If we import all the values as text, then we don't get a type inconsistency, so we can use np.loadtxt()

In [None]:
lending_co_data_numeric_NAN[0,0] + lending_co_data_numeric_NAN[0,1]

# Adding '2000' and '40' results in a concatenated '200040' rather than 2040.

In [None]:
### Partial Cleaning While Importing

In [None]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-Company-Numeric-Data-NAN.csv", delimiter = ';') 
lending_co_data_numeric_NAN

In [None]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-Company-Numeric-Data-NAN.csv", 
                                            delimiter = ';',
                                            skip_header = 2) 
lending_co_data_numeric_NAN

# skip_header omits lines from the top of the text file

In [None]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-Company-Numeric-Data-NAN.csv", 
                                            delimiter = ';',
                                            skip_footer = 2) 
lending_co_data_numeric_NAN

# skip_footer omits lines from the bottom of the text file

In [None]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-Company-Numeric-Data-NAN.csv", 
                                            delimiter = ';',
                                            usecols = (5,0,1)) 
lending_co_data_numeric_NAN

# usecols tells the function to only take the following columns based on their indices.

In [None]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-Company-Numeric-Data-NAN.csv", 
                                            delimiter = ';',
                                            usecols = (5,0,1), 
                                            skip_header = 2, 
                                            skip_footer = 2) 
lending_co_data_numeric_NAN

# We can define all these arguments (and many more) together to only import what we want. 

In [None]:
lending_co_data_5, lending_co_data_0, lending_co_data_1 = np.genfromtxt("Lending-Company-Numeric-Data-NAN.csv", 
                                                                        delimiter = ';',
                                                                        usecols = (5,0,1), 
                                                                        skip_header = 2, 
                                                                        skip_footer = 2, 
                                                                        unpack = True)
print(lending_co_data_5)
print(lending_co_data_0)
print(lending_co_data_1)

# Unpacking allows us to split the output array into smaller 1-D arrays.

In [None]:
### String vs Object vs Numbers

In [None]:
# Import the same CSV file with an explicit dtype. Swap the active "dtype" line
# for one of the commented alternatives to see how the imported values change.
# (np.str_ and object are used below because the "np.str" and "np.object" aliases
# were removed in NumPy 1.24.)
lending_co_lt = np.genfromtxt("lending-co-LT.csv", 
                              delimiter = ',',
                              dtype = np.int32
                              #dtype = np.float16
                              #dtype = np.str_
                              #dtype = object
                              #dtype = (np.int32, np.str_, np.str_, np.str_, np.str_, np.str_, np.int32)
                             )
print(lending_co_lt)

# The same dataset is imported differently based on the datatype we define. 

In [None]:
lending_co_lt[0,0] + lending_co_lt[0,1]

In [None]:
## Saving Files with NumPy

In [None]:
### np.save()

In [None]:
# np.str_ is NumPy's string type. The old "np.str" alias was removed in NumPy 1.24.
lending_co = np.genfromtxt("Lending-Company-Saving.csv", 
                           delimiter = ',', 
                           dtype = np.str_)

print(lending_co)

## We're just importing a dataset, so we can save it later. 
## Usually, we will be working with an array already, so we could skip this. 

In [None]:
np.save("Lending-Company-Saving", lending_co)

## Create an .npy file with the data from the lending_co array. 

In [None]:
lending_data_save = np.load("Lending-Company-Saving.npy")

## Load the NPY file we just created. (Load =/= Import in this case)

In [None]:
print(lending_data_save)

In [None]:
np.array_equal(lending_data_save, lending_co)

# The original array is identical to the one we saved and then loaded back into Python. 

In [None]:
### np.savez()

In [None]:
# np.str_ is NumPy's string type. The old "np.str" alias was removed in NumPy 1.24.
lending_co = np.genfromtxt("Lending-Company-Saving.csv", 
                           delimiter = ',',
                           dtype = np.str_) 

lending_data_save = np.load('Lending-Company-Saving.npy') 

# Just getting two arrays we want to store (we import one, and load the other)

In [None]:
np.savez("Lending-Company-Saving", lending_co, lending_data_save)

# Creates the .npz file, which is an archive of .npy files. 

In [None]:
lending_data_savez = np.load('Lending-Company-Saving.npz')

# We also load .npz files.

In [None]:
print(lending_data_savez["arr_1"])

# np.savez() assigns default names to each .npy inside the archive.

In [None]:
np.savez("Lending-Company-Saving", company = lending_co, data_save = lending_data_save) 

# Assign custom recognizable names to the individual .npy files in the .npz

In [None]:
lending_data_savez = np.load("Lending-Company-Saving.npz")

In [None]:
lending_data_savez.files

# Shows the names of all the .npy files stored in the .npz

In [None]:
print(lending_data_savez["data_save"])

In [None]:
np.array_equal(lending_data_savez["company"],lending_data_savez["data_save"])

# Even after saving and loading the datasets back into Python, they are still identical.

In [None]:
# Import the dataset as text so we can export it with np.savetxt() afterwards.
# np.str_ is NumPy's string type. The old "np.str" alias was removed in NumPy 1.24.
lending_co = np.genfromtxt("Lending-Company-Saving.csv",
                           delimiter = ',',
                           dtype = np.str_) 

In [None]:
np.savetxt("Lending-Company-Saving.txt", 
           lending_co, 
           fmt = '%s', 
           delimiter = ',')

# We must specify the file extension (txt or csv).
# We must specify the format (strings in this case).
# We must set a delimiter (comma in this case).

In [None]:
# np.str_ is NumPy's string type. The old "np.str" alias was removed in NumPy 1.24.
lending_data_savetxt = np.genfromtxt("Lending-Company-Saving.txt", 
                                     delimiter = ',', 
                                     dtype = np.str_)

print(lending_data_savetxt)

# We're importing the .txt file we just created.

In [None]:
lending_data_save = np.load("Lending-Company-Saving.npy")

In [None]:
np.array_equal(lending_data_savetxt, lending_data_save)

In [None]:
# Statistics w/ NumPy

In [None]:
### np.mean()

In [None]:
matrix_A = np.array([[1,0,0,3,1],[3,6,6,2,9],[4,5,3,8,0]])
matrix_A

In [None]:
np.mean(matrix_A, axis = 1)

# We can call the function over a given axis (e.g for every row or column when it comes to 2-D arrays)

In [None]:
matrix_A.mean()

# An equivalent method exists.

In [None]:
# matrix_A.sqrt() 

# Results in an attribute error, since not every function has an equivalent method. 

In [None]:
np.mean(matrix_A, axis = 1, dtype = np.int64)

## We can cast the values to a specific type as well. 

In [None]:
### Min & Max Values

In [None]:
matrix_A = np.array([[1,0,0,3,1],[3,6,6,2,9],[4,5,3,8,0]])
matrix_A

In [None]:
np.min(matrix_A)

# Returns the minimum value. 

In [None]:
np.amin(matrix_A)

# Equivalent function (literally just a different alias for np.min())

In [None]:
np.minimum(matrix_A[1], matrix_A[2])

# Elementwise minimum. Returns the lowest value out of a given set.
# In this case, np.minimum() returns the lower value at every position across the two arrays. (e.g. lowest value in 1st position, 2nd, etc)

In [None]:
np.minimum.reduce(matrix_A)

## A way to make minimum() equivalent to min() over axis 0, i.e. np.min(matrix_A, axis = 0)

In [None]:
np.min(matrix_A, axis = 0)

In [None]:
np.max(matrix_A)

## Corresponding max, amax and maximum functions exist. 

In [None]:
np.amax(matrix_A)

In [None]:
np.maximum.reduce(matrix_A)

In [None]:
### Statistical Order Functions 

In [None]:
matrix_A = np.array([[1,0,0,3,1],[3,6,6,2,9],[4,5,3,8,0]])
matrix_A

In [None]:
np.ptp(matrix_A)

## Returns difference between max and min (peak-to-peak) over the flattened array. 

In [None]:
np.ptp(matrix_A, axis = 0)

## Returns difference between max and min (peak-to-peak) for every column. 

In [None]:
np.ptp(matrix_A, axis = 1)

## Returns difference between max and min (peak-to-peak) for every row. 

In [None]:
np.sort(matrix_A, axis = None)

# A sorted version of the flattened matrix_A

In [None]:
type(np.percentile(matrix_A, 70))
# N-th Percentile = A value which is greater than n% of the dataset. 

In [None]:
np.percentile(matrix_A, 100)

# 100-th percentile = max
# 50-th percentile = median
# 0-th percentile = min

In [None]:
# "method" replaces the deprecated "interpolation" argument (available since NumPy 1.22).
np.quantile(matrix_A, 0.70, method = "nearest")

# Quantile -> Similar to percentile, but works with parts of the dataset, rather than percentages. 
# Hence, the N-th Quantile = 100*N-th Percentile of the same dataset. 

In [None]:
### Averages and Variances

In [None]:
matrix_A = np.array([[1,0,0,3,1],[3,6,6,2,9],[4,5,3,8,0]])
matrix_A

In [None]:
np.median(matrix_A)

# Returns the median for the flattened array. 
# Median -> The middle value of a sorted version of the dataset. 

In [None]:
np.sort(matrix_A, axis = None)

In [None]:
np.mean(matrix_A)

# The arithmetic average of the flattened array. 

In [None]:
np.average(matrix_A)

# The average of the flattened array. 

In [None]:
from numpy.random import Generator as gen
from numpy.random import PCG64 as pcg

array_RG = gen(pcg(365))

array_weights = array_RG.random(size = (3,5))
array_weights

# Generating some random weights for each entry of matrix_A (for the sake of the example)

In [None]:
np.average(matrix_A, weights = array_weights)

# The weighted average of the flattened array. 

In [None]:
np.var(matrix_A)

# The variance of the array. 

In [None]:
np.std(matrix_A)

# The standard deviation of the array. 

In [None]:
2.8**2

In [None]:
### Correlation

In [None]:
matrix_A = np.array([[1,0,0,3,1],[3,6,6,2,9],[4,5,3,8,0]])
matrix_A

In [None]:
np.cov(matrix_A)

# The covariance of every row (array) of matrix_A and every other row of the variable. 

In [None]:
np.corrcoef(matrix_A)

# The correlation coefficient of every row (array) of matrix_A and every other row of the variable. 

In [None]:
### Histograms

In [None]:
matrix_A = np.array([[1,0,0,3,1],[3,6,6,2,9],[4,5,3,8,0]])
matrix_A

In [None]:
np.sort(matrix_A, axis = None)

In [None]:
np.histogram(matrix_A, bins = 4, range = (1,7))

# Computes the bin edges and how many points fall in each bin. 
# The 1-st array contains the number of points. The 2-nd array contains the bin edges. 

In [None]:
import matplotlib.pyplot as plt
plt.hist(matrix_A.flat, bins = np.histogram(matrix_A)[1])
plt.show()

# NumPy has no plotting capabilities, so we're using matplotlib's help. 
# .flat -> The flattened version of the array

In [None]:
matrix_A

In [None]:
np.histogram2d(matrix_A[0], matrix_A[1], bins = 4)

# We pass two datasets for the 2-D histogram. 

In [None]:
np.histogramdd(matrix_A.transpose(), bins = 4)

In [None]:
### NaN-Equivalents

In [None]:
matrix_A = np.array([[1,0,0,3,1],[3,6,6,2,9],[4,5,3,8,0]])
matrix_A

In [None]:
np.nanmean(matrix_A)

In [None]:
np.mean(matrix_A)

# NAN functions work the same way as non-NAN functions for non-NAN datasets

In [None]:
matrix_B = np.array([[1,0,0,3,1],[3,6,np.nan,2,9],[4,5,3,8,0]])
matrix_B

In [None]:
np.nanmean(matrix_B)

# NAN functions ignore "nan" values and compute the mean. 

In [None]:
np.mean(matrix_B)

# Non-NAN functions return "nan" when missing values are present.

In [None]:
np.nanquantile(matrix_B, 0.7)

In [None]:
np.nanvar(matrix_B)

In [None]:
# Preprocessing with NumPy

In [None]:
## Checking for Missing Values

In [None]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',')

## If np.loadtxt() runs without raising an error, the dataset consists of only numeric values and has no missing data. 

In [None]:
np.isnan(lending_co_data_numeric).sum()

## isnan() checks every individual element of the array for missing data (True -> Missing, False -> Not missing)
## By adding .sum(), we get the total number of missing elements in the data. 

In [None]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter = ';')

In [None]:
np.isnan(lending_co_data_numeric_NAN).sum()

In [None]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", 
                                            delimiter = ';',
                                            filling_values = 0)

## Filling_values substitutes every nan with the value we're passing (0 in this case)

In [None]:
np.isnan(lending_co_data_numeric_NAN).sum()

## All the previously missing values are now 0s.

In [None]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", 
                                            delimiter = ';') 

# We need to reimport the dataset since all the missing values are filled up. 

In [None]:
temporary_fill = np.nanmax(lending_co_data_numeric_NAN).round(2) + 1

# We use nanmax(), since max() returns nan. 
# We want a value greater than the max, since we have to be certain it's unique to the dataset.

In [None]:
temporary_fill

In [None]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", 
                                            delimiter = ';',
                                            filling_values = temporary_fill) 

# Filling up all the missing values with the temporary filler. 

In [None]:
np.isnan(lending_co_data_numeric_NAN)

In [None]:
np.isnan(lending_co_data_numeric_NAN).sum()

In [None]:
## Substituting Missing Values

In [None]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter = ';')
lending_co_data_numeric_NAN

In [None]:
temporary_mean = np.nanmean(lending_co_data_numeric_NAN, axis = 0).round(2)

## Storing the means of every column. 

In [None]:
temporary_mean[0]

In [None]:
temporary_fill = np.nanmax(lending_co_data_numeric_NAN).round(2) + 1

lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv",
                                            delimiter = ';',
                                            filling_values = temporary_fill)

## Creating a unique filler and using it to take care of all the missing values.

In [None]:
temporary_fill

In [None]:
np.mean(lending_co_data_numeric_NAN[:,0]).round(2) 

# Supposed mean (w/ fillers)

In [None]:
temporary_mean[0]

# Actual mean (w/o fillers)

In [None]:
lending_co_data_numeric_NAN[:,0] = np.where(lending_co_data_numeric_NAN[:,0] == temporary_fill,
                                            temporary_mean[0], 
                                            lending_co_data_numeric_NAN[:,0])

# Going through the first column and substituting any temporary fillers (previously missing) with the mean for that column.

In [None]:
np.mean(lending_co_data_numeric_NAN[:,0]).round(2)

# New mean equals old mean. 

In [None]:
# Same substitution as for the first column, now repeated for every column of the dataset.
for i in range(lending_co_data_numeric_NAN.shape[1]):
    current_column = lending_co_data_numeric_NAN[:,i]
    is_filler = current_column == temporary_fill
    # Wherever the temporary filler sits, write that column's mean; keep every other value.
    lending_co_data_numeric_NAN[:,i] = np.where(is_filler, temporary_mean[i], current_column)

In [None]:
for i in range(lending_co_data_numeric_NAN.shape[1]):        
    lending_co_data_numeric_NAN[:,i] = np.where(lending_co_data_numeric_NAN[:, i] < 0,
                                                0, 
                                                lending_co_data_numeric_NAN[:,i])
    
# We can use this approach for other applications as well (e.g. remove all negative values and set them to 0)

In [None]:
## Reshaping

In [None]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',')

In [None]:
lending_co_data_numeric

In [None]:
lending_co_data_numeric.shape

In [None]:
np.reshape(lending_co_data_numeric, (6,1043))

# Reshaping (1043,6) to (6,1043) is not the same as transposing.

In [None]:
np.transpose(lending_co_data_numeric)

In [None]:
np.reshape(lending_co_data_numeric, (1,1,2,3,1043))

# We can choose whatever shape we wish as long as the product of the dimensions matches the total number of elements in the array.

In [None]:
lending_co_data_numeric

# Reshaping doesn't alter the original array. 

In [None]:
lending_co_data_numeric_2 = np.reshape(lending_co_data_numeric, (6,1043))
lending_co_data_numeric_2

In [None]:
lending_co_data_numeric.reshape(6,1043)

# Equivalent method. 

In [None]:
lending_co_data_numeric

In [None]:
## Removing Values

In [None]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',') 

In [None]:
lending_co_data_numeric

In [None]:
np.delete(lending_co_data_numeric, 0).shape

# Removes the first value of the flattened array. 

In [None]:
lending_co_data_numeric.size

In [None]:
lending_co_data_numeric

In [None]:
np.delete(lending_co_data_numeric, [0,2,4] , axis = 1)

# By setting an axis, we can simultaneously delete entire rows or columns. 

In [None]:
np.delete(np.delete(lending_co_data_numeric, [0,2,4] , axis = 1), [0,2,-1] , axis = 0)

# We can simultaneously delete rows AND columns. 

In [None]:
## Sorting Data

In [None]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',') 
lending_co_data_numeric

In [None]:
np.sort(lending_co_data_numeric).shape

In [None]:
lending_co_data_numeric.shape

In [None]:
np.sort(lending_co_data_numeric, axis = None)

In [None]:
np.set_printoptions(suppress = True)

# Suppresses scientific notation when printing. 

In [None]:
np.sort(lending_co_data_numeric)

In [None]:
-np.sort(-lending_co_data_numeric)

## Adding two minus signs sorts the array in descending order

In [None]:
lending_co_data_numeric

In [None]:
lending_co_data_numeric.sort(axis = 0)
lending_co_data_numeric

# The equivalent method sorts the values in place (the original array is overwritten). 

In [None]:
## Argument Functions

In [None]:
### np.argsort()

In [None]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',') 
lending_co_data_numeric

In [None]:
np.argsort(lending_co_data_numeric)

# Returns the order which will sort the array. 

In [None]:
np.sort(lending_co_data_numeric, axis = 0)

In [None]:
np.argsort(lending_co_data_numeric, axis = 0)

In [None]:
lending_co_data_numeric[482,5]

In [None]:
lending_co_data_numeric = lending_co_data_numeric[np.argsort(lending_co_data_numeric[:,0])]
lending_co_data_numeric

# Sorts the array based on the values in the 1st column. 

In [None]:
lending_co_data_numeric.argsort(axis = 0)

# The method doesn't sort in place. 

In [None]:
lending_co_data_numeric

In [None]:
### np.argwhere()

In [None]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',') 
lending_co_data_numeric

In [None]:
np.argwhere(lending_co_data_numeric == False)

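# Without a condition, np.argwhere() returns the indices of all non-zero elements. Here we pass "== False" instead, so we get the indices of the elements equal to 0. 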

In [None]:
lending_co_data_numeric[430]

In [None]:
lending_co_data_numeric

In [None]:
np.argwhere(lending_co_data_numeric %2 == 0)

# The condition can be more complex 

In [None]:
np.isnan(lending_co_data_numeric).sum()

In [None]:
lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter = ';') 
lending_co_data_numeric_NAN

In [None]:
np.argwhere(np.isnan(lending_co_data_numeric_NAN))

# Returns the coordinates of all the missing values within the array. 

In [None]:
lending_co_data_numeric_NAN[175]

In [None]:
for array_index in np.argwhere(np.isnan(lending_co_data_numeric_NAN)):
    lending_co_data_numeric_NAN[array_index[0], array_index[1]] = 0

## By going through the coordinates of all the missing values of the array, we can fill them up. 

In [None]:
lending_co_data_numeric_NAN[175]

In [None]:
np.isnan(lending_co_data_numeric_NAN).sum()

In [None]:
## Shuffling Data

In [None]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',')[:8]
lending_co_data_numeric

# We can directly index the output of the np.loadtxt() function to only take certain parts of the dataset. 

In [None]:
np.random.shuffle(lending_co_data_numeric)

# Shuffles the array (and automatically overwrites it).

In [None]:
lending_co_data_numeric

In [None]:
np.random.shuffle(lending_co_data_numeric)
lending_co_data_numeric

# We can shuffle the array as many times as we wish (although 1 usually suffices).

In [None]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',')
lending_co_data_numeric

# We can now use the entire dataset. 

In [None]:
from numpy.random import shuffle

# We can import functions we use multiple times for convenience. 

In [None]:
shuffle(lending_co_data_numeric)
lending_co_data_numeric

# We write shuffle() instead of numpy.random.shuffle() since we imported the function earlier. 

In [None]:
from numpy.random import Generator as gen
from numpy.random import PCG64 as pcg

# Random generators can be used for shuffling. 

In [None]:
array_RG = gen(pcg(seed = 365))
array_RG.shuffle(lending_co_data_numeric)
lending_co_data_numeric

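# The seed only makes the shuffle reproducible if the array starts in the same order. Here it was already shuffled by the unseeded shuffle() above, so the output still changes between runs.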

In [None]:
## Casting

In [None]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',') 
lending_co_data_numeric

In [None]:
lending_co_data_numeric.astype(dtype = np.int32)

# Creates an integer version of the array. 

In [None]:
lending_co_data_numeric = lending_co_data_numeric.astype(dtype = np.str_)

# We need to overwrite the variable in order to work with strings. 
# np.str_ is NumPy's string type; the old np.str alias no longer exists in current NumPy versions.

In [None]:
lending_co_data_numeric

In [None]:
type(lending_co_data_numeric)

In [None]:
lending_co_data_numeric = lending_co_data_numeric.astype(dtype = np.float32)
lending_co_data_numeric.astype(dtype = np.int32)

## We can't directly cast strings to integers. We can go through floats (string -> float -> integer).

In [None]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',')
lending_co_data_numeric = lending_co_data_numeric.astype(dtype = np.str_)
lending_co_data_numeric

# To showcase the other way to go from strings to integers, we need to get the strings version of the array once again. 
# (np.str_ is used because the old np.str alias no longer exists in current NumPy versions.)

In [None]:
lending_co_data_numeric.astype(dtype = np.float32).astype(dtype = np.int32)

## We can chain methods in NumPy (string -> float -> integer in a single line).
## The chained call is the last expression of the cell, so Jupyter displays the integer array.
## astype() returns a new array, so lending_co_data_numeric itself still holds strings.

In [None]:
## Stripping Data

In [None]:
lending_co_total_price = np.genfromtxt("Lending-Company-Total-Price.csv",
                                       delimiter = ',',
                                       dtype = np.str_,
                                       skip_header = 1, 
                                       usecols = [1,2,4])
lending_co_total_price

# We don't need the entire array. We only want a few columns to showcase how stripping data works.
# dtype = np.str_ imports everything as text (the old np.str alias no longer exists in current NumPy versions).

In [None]:
lending_co_total_price[:,0] = np.char.strip(lending_co_total_price[:,0], "id_")
lending_co_total_price[:,1] = np.char.strip(lending_co_total_price[:,1], "Product ")
lending_co_total_price[:,2] = np.char.strip(lending_co_total_price[:,2], "Location ")
lending_co_total_price

# Remove "id_" from the 1st column, as well as "Product " from the second and "Location " from the third one. 
# np.char.strip() is the function form of the (deprecated) np.chararray.strip().
# Careful: the 2nd argument is a set of characters removed from both ends of each string, not a literal prefix.

In [None]:
# Map the product letters onto numbers (A -> 1, B -> 2, ..., F -> 6), one letter per pass.
for product_number, product_letter in enumerate("ABCDEF", start = 1):
    lending_co_total_price[:,1] = np.where(lending_co_total_price[:,1] == product_letter,
                                           product_number,
                                           lending_co_total_price[:,1])

lending_co_total_price

# Stripping combined with substituting lets us turn all the letters into numbers. 

In [None]:
lending_co_total_price = lending_co_total_price.astype(dtype = np.int32)
lending_co_total_price

# Even though the values look like numbers, they're actually just text, so we need to cast them once again. 

In [None]:
## Stacking

In [None]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',') 
lending_co_data_numeric

In [None]:
# Recall

lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter = ';')

temporary_fill = np.nanmax(lending_co_data_numeric_NAN).round(2) + 1
temporary_mean = np.nanmean(lending_co_data_numeric_NAN, axis = 0).round(2)

lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", 
                                            delimiter = ';', 
                                            filling_values = temporary_fill)

for i in range(lending_co_data_numeric_NAN.shape[1]):
    lending_co_data_numeric_NAN[:,i] = np.where(lending_co_data_numeric_NAN[:,i] == temporary_fill,
                                                temporary_mean[i],
                                                lending_co_data_numeric_NAN[:,i])
lending_co_data_numeric_NAN


## We create a filler, reimport and fill all the nan-s, then substitute all the temporary fillers with more appropriate values

In [None]:
np.stack((lending_co_data_numeric[:,1],lending_co_data_numeric[:,0]))

# Stacking the first 2 columns. (We can stack them in any order we like)

In [None]:
np.transpose(lending_co_data_numeric[:,:2])

In [None]:
np.stack((lending_co_data_numeric[:,0],lending_co_data_numeric[:,1], lending_co_data_numeric[:,2]), axis = 1)

# We can stack more than 2 arrays. 

In [None]:
lending_co_data_numeric_NAN.shape

In [None]:
np.dstack((lending_co_data_numeric, lending_co_data_numeric_NAN))[0,:,0]

# We can stack 2-D arrays as well. 

In [None]:
np.stack((lending_co_data_numeric, lending_co_data_numeric_NAN), axis = -1)

# We can stack along a given axis (with np.stack())

In [None]:
# Two quick 3-D arrays of shape (2,3,4) to showcase how dstack works for higher dimensions.
array_example_1 = np.array([
    [[ 1,  2,  3,  4], [ 5,  6,  7,  8], [ 9, 10, 11, 12]],
    [[21, 22, 23, 24], [25, 26, 27, 28], [29, 30, 31, 32]],
])

# The second array holds the same values, doubled.
array_example_2 = 2 * array_example_1

In [None]:
np.dstack((array_example_1, array_example_2)).shape

In [None]:
np.stack((array_example_1, array_example_2), axis = 2).shape

# We can no longer replicate the output of dstack by simply specifying an axis. 

In [None]:
## Concatenate

In [None]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',') 
lending_co_data_numeric

In [None]:
np.concatenate((lending_co_data_numeric[0,:], lending_co_data_numeric[1,:]))

# The concatenated array has the same number of dimensions as the inputs. 

In [None]:
#Recall: 
    
lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv", delimiter = ';')

temporary_fill = np.nanmax(lending_co_data_numeric_NAN).round(2) + 1
temporary_mean = np.nanmean(lending_co_data_numeric_NAN, axis = 0).round(2)

lending_co_data_numeric_NAN = np.genfromtxt("Lending-company-Numeric-NAN.csv",
                                            delimiter = ';', 
                                            filling_values = temporary_fill)

for i in range(lending_co_data_numeric_NAN.shape[1]):        
    lending_co_data_numeric_NAN[:,i] = np.where(lending_co_data_numeric_NAN[:,i] == temporary_fill,
                                                temporary_mean[i],
                                                lending_co_data_numeric_NAN[:,i])
    
lending_co_data_numeric_NAN

In [None]:
np.concatenate((lending_co_data_numeric, lending_co_data_numeric_NAN), axis = 1).shape

In [None]:
array_example_1 = np.array([[[1,2,3,4],[5,6,7,8],[9,10,11,12]],[[21,22,23,24],[25,26,27,28],[29,30,31,32]]])
array_example_2 = array_example_1 * 2

# We create 3-D arrays to showcase concatenate vs stacking

In [None]:
np.concatenate((array_example_1, array_example_2), axis = 2)

In [None]:
np.dstack((array_example_1, array_example_2))

In [None]:
np.concatenate((lending_co_data_numeric, lending_co_data_numeric[:,:1]), axis = 1)

In [None]:
## Unique 

In [None]:
lending_co_data_numeric = np.loadtxt("Lending-company-Numeric.csv", delimiter = ',') 
lending_co_data_numeric

In [None]:
np.unique(lending_co_data_numeric[:,1], return_counts = True, return_index = True)

# Unique -> returns the unique values within the array in increasing order
# return_counts -> returns how many times each unique value appears in the array
# return_index -> returns the index of the first encounter with each unique value

In [None]:
# A text array with repeated entries in mixed upper and lower case.
example_labels = ["a1", "a3", "A1", "A3", "A3", "AA1",
                  "B1", "A2", "B1", "A2", "B2", "B2",
                  "B3", "a2", "a3", "B3", "B3", "a3"]
array_example = np.array(example_labels)
np.unique(array_example)

# For text, np.unique() sorts the values "alphabetically" by their ASCII codes (upper case before lower case). 

In [None]:
# A Practical Example with NumPy: Loan Data

In [None]:
np.set_printoptions(suppress = True, linewidth = 100, precision = 2)

In [None]:
## Importing the Data

In [None]:
raw_data_np = np.genfromtxt("loan-data.csv", delimiter = ';', skip_header = 1, autostrip = True)
raw_data_np

In [None]:
## Checking for Incomplete Data

In [None]:
np.isnan(raw_data_np).sum()

In [None]:
temporary_fill = np.nanmax(raw_data_np) + 1
temporary_mean = np.nanmean(raw_data_np, axis = 0)

In [None]:
temporary_mean

In [None]:
temporary_stats = np.array([np.nanmin(raw_data_np, axis = 0),
                           temporary_mean,
                           np.nanmax(raw_data_np, axis = 0)])

In [None]:
temporary_stats

In [None]:
## Splitting the Dataset

In [None]:
### Splitting the Columns

In [None]:
columns_strings = np.argwhere(np.isnan(temporary_mean)).squeeze()
columns_strings

In [None]:
# Columns whose mean is a number (not nan) are the numeric ones.
has_numeric_mean = ~np.isnan(temporary_mean)
columns_numeric = np.argwhere(has_numeric_mean).squeeze()
columns_numeric

In [None]:
### Re-importing the Dataset

In [None]:
# Re-import only the text columns, this time as strings.
# np.str_ is NumPy's string type (the old np.str alias no longer exists in current NumPy versions).
loan_data_strings = np.genfromtxt("loan-data.csv",
                                  delimiter = ';',
                                  skip_header = 1,
                                  autostrip = True, 
                                  usecols = columns_strings,
                                  dtype = np.str_)
loan_data_strings

In [None]:
loan_data_numeric = np.genfromtxt("loan-data.csv",
                                  delimiter = ';',
                                  autostrip = True,
                                  skip_header = 1,
                                  usecols = columns_numeric,
                                  filling_values = temporary_fill)
loan_data_numeric

In [None]:
### The Names of the Columns

In [None]:
# Skipping as many footer lines as there are data rows leaves only the first line of the file (the column names).
# np.str_ is NumPy's string type (the old np.str alias no longer exists in current NumPy versions).
header_full = np.genfromtxt("loan-data.csv",
                            delimiter = ';',
                            autostrip = True,
                            skip_footer = raw_data_np.shape[0],
                            dtype = np.str_)
header_full

In [None]:
header_strings, header_numeric = header_full[columns_strings], header_full[columns_numeric]

In [None]:
header_strings

In [None]:
header_numeric

In [None]:
## Creating Checkpoints:

In [None]:
def checkpoint(file_name, checkpoint_header, checkpoint_data):
    """Save a header/data pair to an .npz archive and load it straight back.

    Parameters
    ----------
    file_name : str
        Path of the archive, with or without the ".npz" extension.
    checkpoint_header : array_like
        Stored in the archive under the key "header".
    checkpoint_data : array_like
        Stored in the archive under the key "data".

    Returns
    -------
    The object returned by np.load() for the saved archive; index it with
    ["header"] or ["data"] to get the stored arrays.
    """
    # np.savez() appends ".npz" only when the name doesn't already end with it,
    # so normalise the path once and use the same one for saving and loading.
    npz_path = file_name if file_name.endswith(".npz") else file_name + ".npz"
    np.savez(npz_path, header = checkpoint_header, data = checkpoint_data)
    return np.load(npz_path)

In [None]:
checkpoint_test = checkpoint("checkpoint-test", header_strings, loan_data_strings)

In [None]:
checkpoint_test['data']

In [None]:
np.array_equal(checkpoint_test['data'], loan_data_strings)

In [None]:
## Manipulating String Columns

In [None]:
header_strings[0] = "issue_date"

In [None]:
loan_data_strings

In [None]:
### Issue Date

In [None]:
np.unique(loan_data_strings[:,0])

In [None]:
loan_data_strings[:,0] = np.char.strip(loan_data_strings[:,0], "-15")

# np.char.strip() (the function form of the deprecated np.chararray.strip) removes any of the characters "-", "1", "5" from both ends of each value.

In [None]:
np.unique(loan_data_strings[:,0])

In [None]:
months = np.array(['', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])

In [None]:
for i in range(13):
        loan_data_strings[:,0] = np.where(loan_data_strings[:,0] == months[i],
                                          i,
                                          loan_data_strings[:,0])

In [None]:
np.unique(loan_data_strings[:,0])

In [None]:
### Loan Status

In [None]:
header_strings

In [None]:
np.unique(loan_data_strings[:,1])

In [None]:
np.unique(loan_data_strings[:,1]).size

In [None]:
status_bad = np.array(['','Charged Off','Default','Late (31-120 days)'])

In [None]:
loan_data_strings[:,1] = np.where(np.isin(loan_data_strings[:,1], status_bad),0,1)

In [None]:
np.unique(loan_data_strings[:,1])

In [None]:
### Term

In [None]:
header_strings

In [None]:
np.unique(loan_data_strings[:,2])

In [None]:
loan_data_strings[:,2] = np.char.strip(loan_data_strings[:,2], " months")
loan_data_strings[:,2]

# np.char.strip() (the function form of the deprecated np.chararray.strip) removes any of the characters in " months" from both ends of each value.

In [None]:
header_strings[2] = "term_months"

In [None]:
loan_data_strings[:,2] = np.where(loan_data_strings[:,2] == '', 
                                  '60', 
                                  loan_data_strings[:,2])
loan_data_strings[:,2]

In [None]:
np.unique(loan_data_strings[:,2])

In [None]:
### Grade and Subgrade

In [None]:
header_strings

In [None]:
np.unique(loan_data_strings[:,3])

In [None]:
np.unique(loan_data_strings[:,4])

In [None]:
#### Filling Sub Grade

In [None]:
for i in np.unique(loan_data_strings[:,3])[1:]:
    loan_data_strings[:,4] = np.where((loan_data_strings[:,4] == '') & (loan_data_strings[:,3] == i),
                                      i + '5',
                                      loan_data_strings[:,4])

In [None]:
np.unique(loan_data_strings[:,4], return_counts = True)

In [None]:
loan_data_strings[:,4] = np.where(loan_data_strings[:,4] == '',
                                  'H1',
                                  loan_data_strings[:,4])

In [None]:
np.unique(loan_data_strings[:,4])

In [None]:
#### Removing Grade

In [None]:
loan_data_strings = np.delete(loan_data_strings, 3, axis = 1)

In [None]:
loan_data_strings[:,3]

In [None]:
header_strings = np.delete(header_strings, 3)

In [None]:
header_strings[3]

In [None]:
#### Converting Sub Grade

In [None]:
np.unique(loan_data_strings[:,3])

In [None]:
# Number the sorted unique sub grades 1, 2, 3, ... and keep the pairs in a lookup dictionary.
unique_sub_grades = np.unique(loan_data_strings[:,3])
keys = list(unique_sub_grades)
values = list(range(1, len(keys) + 1))
dict_sub_grade = {sub_grade: rank for sub_grade, rank in zip(keys, values)}

In [None]:
dict_sub_grade

In [None]:
for i in np.unique(loan_data_strings[:,3]):
        loan_data_strings[:,3] = np.where(loan_data_strings[:,3] == i, 
                                          dict_sub_grade[i],
                                          loan_data_strings[:,3])

In [None]:
np.unique(loan_data_strings[:,3])

In [None]:
### Verification Status

In [None]:
header_strings

In [None]:
np.unique(loan_data_strings[:,4])

In [None]:
loan_data_strings[:,4] = np.where((loan_data_strings[:,4] == '') | (loan_data_strings[:,4] == 'Not Verified'), 0, 1)

In [None]:
np.unique(loan_data_strings[:,4])

In [None]:
### URL

In [None]:
loan_data_strings[:,5]

In [None]:
np.char.strip(loan_data_strings[:,5], "https://www.lendingclub.com/browse/loanDetail.action?loan_id=")

# Preview only: nothing is assigned yet. np.char.strip() is the function form of the deprecated np.chararray.strip().
# The 2nd argument is a set of characters removed from both ends; it contains no digits, so digits at the ends of a value are kept.

In [None]:
loan_data_strings[:,5] = np.char.strip(loan_data_strings[:,5], "https://www.lendingclub.com/browse/loanDetail.action?loan_id=")

# np.char.strip() is the function form of the deprecated np.chararray.strip().
# The 2nd argument is a set of characters removed from both ends; it contains no digits, so digits at the ends of a value are kept.

In [None]:
header_full

In [None]:
loan_data_numeric[:,0].astype(dtype = np.int32)

In [None]:
loan_data_strings[:,5].astype(dtype = np.int32)

In [None]:
np.array_equal(loan_data_numeric[:,0].astype(dtype = np.int32), loan_data_strings[:,5].astype(dtype = np.int32))

In [None]:
loan_data_strings = np.delete(loan_data_strings, 5, axis = 1)
header_strings = np.delete(header_strings, 5)

In [None]:
loan_data_strings[:,5]

In [None]:
header_strings

In [None]:
loan_data_numeric[:,0]

In [None]:
header_numeric

In [None]:
### State Address

In [None]:
header_strings

In [None]:
header_strings[5] = "state_address"

In [None]:
states_names, states_count = np.unique(loan_data_strings[:,5], return_counts = True)
states_count_sorted = np.argsort(-states_count)
states_names[states_count_sorted], states_count[states_count_sorted]

In [None]:
loan_data_strings[:,5] = np.where(loan_data_strings[:,5] == '', 
                                  0, 
                                  loan_data_strings[:,5])

In [None]:
states_west = np.array(['WA', 'OR','CA','NV','ID','MT', 'WY','UT','CO', 'AZ','NM','HI','AK'])
states_south = np.array(['TX','OK','AR','LA','MS','AL','TN','KY','FL','GA','SC','NC','VA','WV','MD','DE','DC'])
states_midwest = np.array(['ND','SD','NE','KS','MN','IA','MO','WI','IL','IN','MI','OH'])
states_east = np.array(['PA','NY','NJ','CT','MA','VT','NH','ME','RI'])

In [None]:
# Reference for the regions used above (US Census Bureau regions and divisions map):
# https://www2.census.gov/geo/pdfs/maps-data/maps/reference/us_regdiv.pdf

In [None]:
# Replace every state with the code of its region: 1 = West, 2 = South, 3 = Midwest, 4 = East.
region_groups = (states_west, states_south, states_midwest, states_east)

for region_code, region_states in enumerate(region_groups, start = 1):
    loan_data_strings[:,5] = np.where(np.isin(loan_data_strings[:,5], region_states),
                                      region_code,
                                      loan_data_strings[:,5])

In [None]:
np.unique(loan_data_strings[:,5])

In [None]:
## Converting to Numbers

In [None]:
loan_data_strings

In [None]:
loan_data_strings = loan_data_strings.astype(int)

# np.int was only an alias for Python's built-in int and no longer exists in current NumPy versions, so we pass int directly.

In [None]:
loan_data_strings

In [None]:
### Checkpoint 1: Strings

In [None]:
checkpoint_strings = checkpoint("Checkpoint-Strings", header_strings, loan_data_strings)

In [None]:
checkpoint_strings["header"]

In [None]:
checkpoint_strings["data"]

In [None]:
np.array_equal(checkpoint_strings['data'], loan_data_strings)

In [None]:
## Manipulating Numeric Columns

In [None]:
loan_data_numeric

In [None]:
np.isnan(loan_data_numeric).sum()

In [None]:
### Substitute "Filler" Values

In [None]:
header_numeric

In [None]:
#### ID

In [None]:
temporary_fill

In [None]:
np.isin(loan_data_numeric[:,0], temporary_fill)

In [None]:
np.isin(loan_data_numeric[:,0], temporary_fill).sum()

In [None]:
header_numeric

In [None]:
#### Temporary Stats

In [None]:
temporary_stats[:, columns_numeric]

In [None]:
#### Funded Amount

In [None]:
loan_data_numeric[:,2]

In [None]:
loan_data_numeric[:,2] = np.where(loan_data_numeric[:,2] == temporary_fill, 
                                  temporary_stats[0, columns_numeric[2]],
                                  loan_data_numeric[:,2])
loan_data_numeric[:,2]

In [None]:
temporary_stats[0,columns_numeric[3]]

In [None]:
#### Loaned Amount, Interest Rate, Total Payment, Installment

In [None]:
header_numeric

In [None]:
for i in [1,3,4,5]:
    loan_data_numeric[:,i] = np.where(loan_data_numeric[:,i] == temporary_fill,
                                      temporary_stats[2, columns_numeric[i]],
                                      loan_data_numeric[:,i])

In [None]:
loan_data_numeric

In [None]:
### Currency Change

In [None]:
#### The Exchange Rate

In [None]:
EUR_USD = np.genfromtxt("EUR-USD.csv", delimiter = ',', autostrip = True, skip_header = 1, usecols = 3)
EUR_USD

In [None]:
loan_data_strings[:,0]

In [None]:
# Start from the first column of loan_data_strings (the issue-month codes, where 0 stands for a missing month).
exchange_rate = loan_data_strings[:,0]

# Swap every month code i for the i-th entry of EUR_USD (index i-1).
# NOTE(review): after the first pass the array mixes rates with not-yet-replaced codes, so this relies on
# no rate being equal to a later month code (or to 0) - presumably safe for EUR/USD rates; confirm against EUR-USD.csv.
for i in range(1,13):
    exchange_rate = np.where(exchange_rate == i,
                             EUR_USD[i-1],
                             exchange_rate)    

# Code 0 gets the mean of all the loaded rates instead.
exchange_rate = np.where(exchange_rate == 0,
                         np.mean(EUR_USD),
                         exchange_rate)

exchange_rate

In [None]:
exchange_rate.shape

In [None]:
loan_data_numeric.shape

In [None]:
exchange_rate = np.reshape(exchange_rate, (-1,1))

# -1 lets NumPy infer the number of rows, so the column vector matches the dataset without hard-coding its length.

In [None]:
loan_data_numeric = np.hstack((loan_data_numeric, exchange_rate))

In [None]:
header_numeric = np.concatenate((header_numeric, np.array(['exchange_rate'])))
header_numeric

In [None]:
#### From USD to EUR

In [None]:
header_numeric

In [None]:
columns_dollar = np.array([1,2,4,5])

In [None]:
loan_data_numeric[:,6]

In [None]:
# For every dollar column, append a new column with the amount divided by the exchange rate (column 6).
for i in columns_dollar:
    # (-1,1) turns the 1-D result into a column vector without hard-coding the number of rows.
    eur_column = np.reshape(loan_data_numeric[:,i] / loan_data_numeric[:,6], (-1,1))
    loan_data_numeric = np.hstack((loan_data_numeric, eur_column))

In [None]:
loan_data_numeric.shape

In [None]:
loan_data_numeric

In [None]:
#### Expanding the header

In [None]:
header_additional = np.array([column_name + '_EUR' for column_name in header_numeric[columns_dollar]])

In [None]:
header_additional

In [None]:
header_numeric = np.concatenate((header_numeric, header_additional))

In [None]:
header_numeric

In [None]:
header_numeric[columns_dollar] = np.array([column_name + '_USD' for column_name in header_numeric[columns_dollar]])

In [None]:
header_numeric

In [None]:
columns_index_order = [0,1,7,2,8,3,4,9,5,10,6]

In [None]:
header_numeric = header_numeric[columns_index_order]

In [None]:
loan_data_numeric

In [None]:
loan_data_numeric = loan_data_numeric[:,columns_index_order]

In [None]:
### Interest Rate

In [None]:
header_numeric

In [None]:
loan_data_numeric[:,5]

In [None]:
loan_data_numeric[:,5] = loan_data_numeric[:,5]/100

In [None]:
loan_data_numeric[:,5]

In [None]:
### Checkpoint 2: Numeric

In [None]:
checkpoint_numeric = checkpoint("Checkpoint-Numeric", header_numeric, loan_data_numeric)

In [None]:
checkpoint_numeric['header'], checkpoint_numeric['data']

In [None]:
## Creating the "Complete" Dataset

In [None]:
checkpoint_strings['data'].shape

In [None]:
checkpoint_numeric['data'].shape

In [None]:
loan_data = np.hstack((checkpoint_numeric['data'], checkpoint_strings['data']))

In [None]:
loan_data

In [None]:
np.isnan(loan_data).sum()

In [None]:
header_full = np.concatenate((checkpoint_numeric['header'], checkpoint_strings['header']))

In [None]:
## Sorting the New Dataset

In [None]:
loan_data = loan_data[np.argsort(loan_data[:,0])]

In [None]:
loan_data

In [None]:
np.argsort(loan_data[:,0])

In [None]:
## Storing the New Dataset

In [None]:
loan_data = np.vstack((header_full, loan_data))

In [None]:
np.savetxt("loan-data-preprocessed.csv", 
           loan_data, 
           fmt = '%s',
           delimiter = ',')