# Numpy

NumPy is a library for the Python programming language, adding support for large, multi-dimensional arrays and matrices, along with a large collection of high-level mathematical functions to operate on these arrays.

NumPy arrays come in two main flavours: vectors and matrices. Vectors are one-dimensional, whereas matrices are two-dimensional (arrays can also have more than two dimensions).

In [4]:
import numpy as np

## Arrays

In [3]:
# A plain Python list; the next cell turns it into a NumPy array.
numbers = [1, 2, 3, 4, 5]
numbers

[1, 2, 3, 4, 5]

In [4]:
# Convert the Python list into a one-dimensional NumPy array (a vector).
np.array(numbers)

array([1, 2, 3, 4, 5])

In [5]:
# A list of lists (three rows of three values each) becomes a 2-D array.
matrix = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
]

np.array(matrix)

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [6]:
# Integers from 0 up to (but not including) 10; the start defaults to 0.
np.arange(10)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [8]:
# The shape is passed as a single (rows, columns) tuple.

np.zeros(shape=(2, 3))

array([[0., 0., 0.],
       [0., 0., 0.]])

In [24]:
# Uniformly distributed random floats between 0 and 1.
# The arguments are the dimensions: 2 rows and 3 columns.

np.random.rand(2,3)

array([[0.26363933, 0.35718736, 0.83613998],
       [0.86246623, 0.58535647, 0.83344463]])

In [25]:
# Samples from the standard normal distribution (mean 0, standard deviation 1),
# so values are centred around 0 and can be negative.

np.random.randn(2,3)

array([[-0.59257945, -0.10390845, -0.12824884],
       [ 0.05958264,  1.28192348, -2.6631545 ]])

In [28]:
# Random integers from 1 (inclusive) to 10 (exclusive), arranged in 3 rows and 5 columns.
np.random.randint(low=1, high=10, size=(3, 5))

array([[3, 2, 2, 2, 8],
       [2, 5, 1, 3, 4],
       [8, 1, 2, 5, 2]])

## Manipulating Arrays

In [30]:
# A vector of 10 random floats; the cells below reshape and inspect it.
a = np.random.rand(10)
a

array([0.31745184, 0.33106686, 0.25041696, 0.53602353, 0.62297748,
       0.49884087, 0.50380835, 0.22150992, 0.83102773, 0.21008392])

In [35]:
# Rearrange the 10 values into 5 rows of 2 columns. The result is only displayed;
# `a` itself keeps its original shape.
a.reshape((5, 2))

array([[0.31745184, 0.33106686],
       [0.25041696, 0.53602353],
       [0.62297748, 0.49884087],
       [0.50380835, 0.22150992],
       [0.83102773, 0.21008392]])

In [36]:
# Largest value in the array.
a.max()

0.8310277340665524

In [37]:
# Smallest value in the array.
a.min()

0.2100839243666518

In [39]:
# Index (position) of the largest value, not the value itself.
a.argmax()

8

In [41]:
# shape is an attribute (no parentheses); a vector of 10 items reports (10,).
a.shape

(10,)

In [44]:
# The data type of the array's elements.
a.dtype

dtype('float64')

## Indexing

In [45]:
# Rebind `a` to the integers 0 through 10 (the stop value 11 is excluded).
a = np.arange(11)
a

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [46]:
# Everything from index 5 to the end of the array.
a[5:]

array([ 5,  6,  7,  8,  9, 10])

In [47]:
# The start index is included and the stop index is excluded: indices 1 and 2.
a[1:3]

array([1, 2])

## Broadcasting

In [48]:
a

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [49]:
# Broadcasting: the single value 100 is written to each of the first five elements.
# This modifies `a` in place.
a[:5] = 100

In [50]:
a

array([100, 100, 100, 100, 100,   5,   6,   7,   8,   9,  10])

In [54]:
# Restore the first five elements; the list must match the slice length (5).
a[:5] = [0, 1, 2, 3, 4]

In [55]:
a

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

## Indexing a Matrix

In [56]:
# Rebinds `matrix` (previously a nested list) to a 3x5 array of random floats.
n_rows, n_cols = 3, 5
matrix = np.random.rand(n_rows, n_cols)

In [57]:
matrix

array([[0.89068137, 0.88806237, 0.00617139, 0.16062384, 0.02630025],
       [0.33074294, 0.65834617, 0.81853415, 0.07600783, 0.91488609],
       [0.14085414, 0.97359263, 0.19070044, 0.0321894 , 0.42151866]])

In [63]:
# Row, column: both forms read the element in row 0, column 1.
# matrix[0][1] first selects row 0 and then indexes into that row;
# matrix[0,1] passes both indices in a single indexing operation.

print(matrix[0][1])

print(matrix[0,1])

0.8880623707991189
0.8880623707991189


In [61]:
# Just the row: a single index selects a whole row (here the second one).

matrix[1]

array([0.33074294, 0.65834617, 0.81853415, 0.07600783, 0.91488609])

In [64]:
# Slice both axes at once: rows 0-1 and columns 2 onwards give a 2x3 sub-matrix.
matrix[:2, 2:]

array([[0.00617139, 0.16062384, 0.02630025],
       [0.81853415, 0.07600783, 0.91488609]])

In [65]:
# The comparison is applied element-wise and yields a boolean array of the same shape.
mask = matrix > 0.5

In [66]:
mask

array([[ True,  True, False, False, False],
       [False,  True,  True, False,  True],
       [False,  True, False, False, False]])

In [67]:
# Indexing with the boolean mask returns a 1-D array of the values where the mask is True.
matrix[mask]

array([0.89068137, 0.88806237, 0.65834617, 0.81853415, 0.91488609,
       0.97359263])

In [68]:
# Same selection in one step, without storing the mask in a variable.
matrix[matrix > 0.5]

array([0.89068137, 0.88806237, 0.65834617, 0.81853415, 0.91488609,
       0.97359263])

# Exercises

1. Create a vector of 100 random numbers

2. Reshape your vector into a matrix of 10 rows and 10 columns

3. Select a sub matrix of 5x5 from anywhere you like in the grid

4. Check its shape is (5,5)

5. Use masking to return only the numbers less than 0.5

## Numpy operations

In [83]:
# Rebind `a` to a 3x3 array of random floats for the operations below.
a = np.random.rand(3,3)
a

array([[0.57544628, 0.24769945, 0.78122239],
       [0.09498786, 0.94737925, 0.32334789],
       [0.78319308, 0.51574378, 0.57487461]])

In [87]:
# Operations happen element wise: 100 is added to every entry.
# The result is a new array; `a` is not modified.

a + 100

array([[100.57544628, 100.24769945, 100.78122239],
       [100.09498786, 100.94737925, 100.32334789],
       [100.78319308, 100.51574378, 100.57487461]])

In [89]:
# NumPy's mathematical functions are also applied element-wise (natural logarithm here).
np.log(a)

array([[-0.55260941, -1.39553917, -0.24689542],
       [-2.35400614, -0.05405579, -1.12902647],
       [-0.24437603, -0.66214519, -0.55360333]])

# Pandas

Pandas is a software library written for the Python programming language for data manipulation and analysis. In particular, it offers data structures and operations for manipulating numerical tables and time series.

In [1]:
import pandas as pd

## Series

In [92]:
# Values and the index labels that will be attached to them in the next cell.
data = [1, 2, 3, 4, 5]
labels = list('abcde')

In [93]:
# A Series is a one-dimensional array of values with an index label per value.
pd.Series(data, index=labels)

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [95]:
# Building a Series from a dict: the keys become the index labels.
dictionary = dict(a=1, b=2, c=3)

pd.Series(dictionary)

a    1
b    2
c    3
dtype: int64

In [107]:
# Values and their country labels, passed by keyword so each argument's role is explicit.
ser1 = pd.Series(
    data=[1, 2, 3, 4],
    index=['UK', 'USA', 'Germany', 'France'],
)

In [108]:
ser1

UK         1
USA        2
Germany    3
France     4
dtype: int64

In [109]:
# Label-based lookup: the value stored under the index label 'USA'.
print(ser1['USA'])
# Position-based lookup: use .iloc for integer positions. Plain ser1[0] on a
# string-labelled Series relies on a deprecated fallback that treats the
# integer as a position.
print(ser1.iloc[0])

2
1


## Dataframes

In [6]:
# Data / Row labels / Col labels
#
# The values come from a seeded RandomState so the frame is identical on every
# run (and the outputs of later cells stay consistent with this one) without
# touching NumPy's global random state. Row and column labels are passed by
# keyword to make their roles explicit.

rng = np.random.RandomState(1)
df = pd.DataFrame(rng.randn(4, 3), index=['A', 'B', 'C', 'D'], columns=['X', 'Y', 'Z'])
df

Unnamed: 0,X,Y,Z
A,0.277573,-1.474903,-0.084544
B,-1.559626,-0.107117,-1.082202
C,0.752205,-0.067989,-1.474795
D,1.172789,1.308139,0.687043


In [118]:
# A single column label returns that column as a Series.
df['X']

A    0.143182
B    1.129345
C   -1.489626
D    1.368906
Name: X, dtype: float64

In [121]:
# A list of column labels returns a DataFrame containing just those columns.
df[['X', 'Y']]

Unnamed: 0,X,Y
A,0.143182,-1.058217
B,1.129345,0.527965
C,-1.489626,0.363348
D,1.368906,-1.854009


In [122]:
# Assigning to a new label adds a column. The values are a 1-D array with one
# entry per row (len(df)) rather than a hard-coded 4x1 column vector.
df['new'] = np.random.rand(len(df))

In [123]:
df

Unnamed: 0,X,Y,Z,new
A,0.143182,-1.058217,1.553355,0.052446
B,1.129345,0.527965,0.353895,0.377055
C,-1.489626,0.363348,-0.113067,0.034976
D,1.368906,-1.854009,-0.093783,0.91818


In [126]:
# Axis=1 means columns (axis=0 would drop rows).
# drop returns a new DataFrame; rebinding `df` to it avoids mutating the
# existing object with inplace=True.
df = df.drop('new', axis=1)

In [127]:
df

Unnamed: 0,X,Y,Z
A,0.143182,-1.058217,1.553355
B,1.129345,0.527965,0.353895
C,-1.489626,0.363348,-0.113067
D,1.368906,-1.854009,-0.093783


In [129]:
# Selecting rows...
# .loc selects by label: the row labelled 'A' comes back as a Series
# whose index is the column names.

df.loc['A']

X    0.143182
Y   -1.058217
Z    1.553355
Name: A, dtype: float64

In [134]:
# .iloc selects by integer position: position 0 is the same row as label 'A'.
df.iloc[0]

X    0.143182
Y   -1.058217
Z    1.553355
Name: A, dtype: float64

In [135]:
# Row label and column label together select a single value.
df.loc['B', 'Y']

0.5279645047142337

In [136]:
# Lists of row labels and column labels select a sub-DataFrame.
df.loc[['A', 'B'], ['X', 'Z']]

Unnamed: 0,X,Z
A,0.143182,1.553355
B,1.129345,0.353895


## Masking

Masking means building a structure of boolean values from a condition (for example `df > 0`), which we can then use to filter or select data.

In [7]:
# The comparison is applied to every cell and yields a DataFrame of booleans.
df > 0

Unnamed: 0,X,Y,Z
A,True,False,False
B,False,False,False
C,True,False,False
D,True,True,True


In [8]:
# Rebinds `mask` (earlier a NumPy boolean array) to a boolean DataFrame.
mask = df > 0

# Indexing with a boolean DataFrame keeps the shape; cells where the mask is
# False are shown as missing values.
df[mask]

Unnamed: 0,X,Y,Z
A,0.277573,,
B,,,
C,0.752205,,
D,1.172789,1.308139,0.687043


In [9]:
# Same selection in one step, without storing the mask in a variable.
df[df > 0]

Unnamed: 0,X,Y,Z
A,0.277573,,
B,,,
C,0.752205,,
D,1.172789,1.308139,0.687043


In [10]:
# Comparing a single column yields a boolean Series with one entry per row.
df['X'] < 0

A    False
B     True
C    False
D    False
Name: X, dtype: bool

In [11]:
df

Unnamed: 0,X,Y,Z
A,0.277573,-1.474903,-0.084544
B,-1.559626,-0.107117,-1.082202
C,0.752205,-0.067989,-1.474795
D,1.172789,1.308139,0.687043


In [12]:
# Indexing with a boolean Series filters rows: only rows where X is negative are kept.
df[df['X'] < 0]

Unnamed: 0,X,Y,Z
B,-1.559626,-0.107117,-1.082202


In [13]:
# Use the values of column 'Z' as the row labels. The result is only displayed
# here, not assigned back to `df`.
df.set_index('Z')

Unnamed: 0_level_0,X,Y
Z,Unnamed: 1_level_1,Unnamed: 2_level_1
-0.084544,0.277573,-1.474903
-1.082202,-1.559626,-0.107117
-1.474795,0.752205,-0.067989
0.687043,1.172789,1.308139


In [2]:
# Rebinds `df` (previously the random 4x3 frame) to the Bitcoin price data
# loaded from a CSV file given by a relative path.
df = pd.read_csv('../data/bitcoin.csv')

In [3]:
# Preview the first five rows to check the columns and values.
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Market Cap
0,"Sep 22, 2017",3628.02,3758.27,3553.53,3630.7,1194830000,60152300000
1,"Sep 21, 2017",3901.47,3916.42,3613.63,3631.04,1411480000,64677600000
2,"Sep 20, 2017",3916.36,4031.39,3857.73,3905.95,1213830000,64918500000
3,"Sep 19, 2017",4073.79,4094.07,3868.87,3924.97,1563980000,67520300000
4,"Sep 18, 2017",3591.09,4079.23,3591.09,4065.2,1943210000,59514100000


# Exercises

1. Working with the Bitcoin price data above, select the first 10 rows of the closing price (the `Close` column) and assign them to a new variable.

2. From that new variable, select all the prices where the closing price was over $4000.