# 2. Python essentials for Data Science

In [1]:
# to make the .py script runnable
#!/usr/bin/env python

In [2]:
from sklearn import datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline
plt.style.use('ggplot')

In [3]:
import os

# 2.1 Numpy

## 2.1.1 Array

Arrays are used to work with vectors, matrices and tensors. All the data an array contains must be of the same type (unlike, e.g., Python lists, which can mix types).

In [None]:
import numpy as np

In [None]:
#?np.array # Delete the first '#' and run to see extra help

In [None]:
# Build a one-dimensional array from a plain Python sequence.
arr_1d = np.array([1, 2, 3])
# print() shows the plain-text form of the array (no "array(...)" wrapper).
print(arr_1d)

In [None]:
type(arr_1d)

In [None]:
#arr_1d. # Delete the first '#', press tab to explore options

In [None]:
arr_1d.shape

In [None]:
# A list of equally long rows becomes a two-dimensional array (a matrix).
arr_2d = np.array(
    [
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
    ]
)
arr_2d

In [None]:
arr_2d.shape

In [None]:
# Nesting two copies of the 2-D array adds a new leading axis, giving a
# three-dimensional array.
arr_3d = np.array((arr_2d, arr_2d))
arr_3d

In [None]:
arr_3d.shape

## 2.1.2 Array generation

### 2.1.2.1 Using generation functions

In [None]:
np.arange(16)

In [None]:
np.arange(16).reshape(4, 4)

In [None]:
np.linspace(0, 1, 10).reshape(5, 2)

In [None]:
arr_2d.T #This transposes the matrix

### 2.1.2.2 Using random numbers

In [None]:
#np.random. # Delete the first '#', press tab to explore options

In [None]:
# Seed NumPy's global random generator before the first random draw so that
# "Restart Kernel -> Run All" reproduces the same numbers every time.
# Any fixed integer works; 42 is arbitrary.
np.random.seed(42)

# 16 random integers in [1, 100): the upper bound is exclusive.
arr_1d = np.random.randint(1, 100, 16)

In [None]:
arr_1d

In [None]:
# 16 random integers in [1, 1000), arranged as a 4x4 matrix.
# The lower bound is 1 (not 0) so every entry is strictly positive: a 0 would
# make the later np.log(arr_2d) return -inf and emit a divide-by-zero
# RuntimeWarning.
arr_2d = np.random.randint(1, 1000, 16).reshape(4, 4)

In [None]:
arr_2d

In [None]:
# Draw 10 random numbers from the standard normal distribution and round
# them to 2 decimals.
np.random.randn(10).round(2)

In [None]:
np.random.randn(30).reshape(5, 6).round(2) #more on random generation later

## 2.1.3 Subsetting (get a number or many numbers from an array)

In [None]:
arr_1d

In [None]:
arr_1d[::2]

In [None]:
# Indexing with an integer array ("fancy indexing") returns the elements at
# those positions, in the order the positions are listed.
ind = np.array([0, 1, 2, 5, 4, 3])
arr_1d[ind]

In [None]:
arr_2d

In [None]:
# Get a single element: the first index is the row, the second the column.
# Counting starts at 0, so [1, 2] is the second row, third column.
arr_2d[1, 2]

In [None]:
arr_2d[0, :]

In [None]:
arr_2d[:, 0]

In [None]:
arr_2d[:2, :]

In [None]:
arr_2d[:, 2:]

In [None]:
arr_2d[2:4, 2:4]

### 2.1.3.1 Subsetting with Booleans

An ndarray can be subset with a boolean array of the same shape:
- the result contains the elements at the positions where the boolean array is `True`

In [None]:
# Pure-Python way of keeping the values above 50, element by element.
[value for value in arr_1d if value > 50]

In [None]:
# The comparison is applied elementwise (the scalar 50 is broadcast against
# every entry), producing a boolean array with one True/False per element.
arr_1d > 50

In [None]:
list(zip(arr_1d, arr_1d > 50)) #combine arrays elementwise into a list

In [None]:
arr_1d[arr_1d > 50]

In [None]:
arr_2d

In [None]:
arr_2d % 2 == 0

In [None]:
arr_2d[arr_2d % 2 == 0]

## 2.1.4 Mathematical Operations

In [None]:
print(arr_1d)
arr_1d + arr_1d

In [None]:
print(arr_2d)
arr_2d + arr_2d

In [None]:
#arr_1d + arr_2d # Delete the first '#' to run. This raises a ValueError: shapes (16,) and (4, 4) cannot be broadcast together

In [None]:
print(arr_2d[0,0] * arr_2d[0,0]) #multiplication is elementwise, not the matrix multiplication
print(arr_2d * arr_2d)

### 2.1.4.1 Math Functions

In [None]:
np.sqrt(arr_1d).round(2)

In [None]:
np.log(arr_2d).round(2)

### 2.1.4.2 Array Attributes and Methods

- Commonly used: `reshape`, `round`, ...

In [None]:
np.array([True, True, False, True]).all() #logical AND

In [None]:
np.array([True, True, False, True]).any() #logical OR

In [None]:
arr_1d

In [None]:
arr_1d.sum()

In [None]:
arr_1d.argmax()

In [None]:
arr_1d[arr_1d.argmax()]

In [None]:
arr_1d.argsort()

In [None]:
arr_1d[arr_1d.argsort()]

In [None]:
# Pair each value with its clipped counterpart (clip limits values to the
# interval [30, 80]); the zipped pairs become the rows of a 2-column array.
np.array(list(zip(arr_1d, arr_1d.clip(30, 80))))

In [None]:
type(zip(arr_1d, arr_1d.clip(30, 80)))

## Try!

Create an array in which you store the (randomly generated) grades of an exam (maximum grade 20). Suppose there are 100 students. Show the grades of students 10 to 15. Select the students who pass the exam and store them in a new array. Find the/a/all student(s) with the best grade.

## Solution

In [1]:
import numpy as np

In [2]:
# One random integer grade per student. randint's upper bound is exclusive,
# so high=21 makes the maximum grade of 20 possible.
# No seed is set in this cell, so the values differ from run to run.
grades = np.random.randint(low=0, high=21, size=100)
grades

array([ 9, 16,  9,  4,  7,  4, 19,  0, 19, 14, 16, 14, 19, 17, 12, 19, 18,
       19,  7,  0, 10, 20,  1,  8, 10, 10, 19, 13,  8, 15, 14, 15,  6,  3,
       16,  6,  2, 13,  3,  8,  6, 15,  6, 10, 16,  6,  7,  0,  6, 12, 14,
        0,  4,  8,  1, 18, 16, 11, 13,  0, 13,  3, 13,  7, 10,  6, 14,  3,
        1,  8,  5, 18,  9, 20, 18,  8,  7, 15, 16, 19,  4,  4,  7, 10,  4,
       14, 17,  4,  2,  1,  7,  3, 12, 17,  9,  8,  1, 12, 14,  9])

In [3]:
# Overwrite the grade at index 34 with the maximum grade, 20.
# This modifies `grades` in place.
# NOTE(review): presumably done so that several students share the best grade
# in the final step — confirm.
grades[34] = 20

In [4]:
grades

array([ 9, 16,  9,  4,  7,  4, 19,  0, 19, 14, 16, 14, 19, 17, 12, 19, 18,
       19,  7,  0, 10, 20,  1,  8, 10, 10, 19, 13,  8, 15, 14, 15,  6,  3,
       20,  6,  2, 13,  3,  8,  6, 15,  6, 10, 16,  6,  7,  0,  6, 12, 14,
        0,  4,  8,  1, 18, 16, 11, 13,  0, 13,  3, 13,  7, 10,  6, 14,  3,
        1,  8,  5, 18,  9, 20, 18,  8,  7, 15, 16, 19,  4,  4,  7, 10,  4,
       14, 17,  4,  2,  1,  7,  3, 12, 17,  9,  8,  1, 12, 14,  9])

In [5]:
# Students 10 to 15 (counting from 1) sit at indices 9 to 14; the slice end
# (15) is exclusive, so this returns six grades.
grades[9:15]

array([14, 16, 14, 19, 17, 12])

In [6]:
# Boolean mask of passing grades. The comparison is strict, so a grade of
# exactly 10 is NOT counted as a pass.
# NOTE(review): confirm the intended pass mark (use `>=` if 10/20 passes).
pass_mask = grades > 10
passed = grades[pass_mask]
passed

array([16, 19, 19, 14, 16, 14, 19, 17, 12, 19, 18, 19, 20, 19, 13, 15, 14,
       15, 20, 13, 15, 16, 12, 14, 18, 16, 11, 13, 13, 13, 14, 18, 20, 18,
       15, 16, 19, 14, 17, 12, 17, 12, 14])

In [7]:
passed.argmax()

12

In [8]:
# The best grade among the students who passed.
passed.max()

20

In [9]:
# Every entry equal to the best grade: one element per top-scoring student.
# Note that this returns the grades themselves; to get the students'
# positions instead, np.flatnonzero(grades == grades.max()) gives their
# indices in `grades`.
passed[passed == passed.max()]

array([20, 20, 20])