<h1>Numpy Section From Book</h1>
<br>
<h3>Array Creation, Indexing and Operations</h3>

In [1]:
import numpy as np
# Element wise operations
data = np.arange(11)
print(data * data) # Each element is multiplied by 2, this is an element wise operation not cross product
data2 = np.arange(10)
# print(data + data2) this won't work because the arrays don't have the same shape

[  0   1   4   9  16  25  36  49  64  81 100]


In [2]:
# Its possible to create numpy arrays from any sequence, so what about a dictionary? 
data = np.array({
    1:[i for i in range(5)],
    2: [i for i in range(5, 10)],
    3: "Hello"
})
print(data.ndim) # 0D array? so it is just an element of an array
print(data.dtype)
print(data)
# You can't create array from a dict it will just create a single array element, maybe create from a list of dicts

0
object
{1: [0, 1, 2, 3, 4], 2: [5, 6, 7, 8, 9], 3: 'Hello'}


In [3]:
# Methods for creating arrays - not exhaustive but a good amount
arr1 = np.arange(1, 15, 2)
arr2 = np.ones_like(arr1)
arr3 = np.random.rand(5)
arr4 = np.identity(4)
arr5 = np.empty_like(arr4)
print(arr1, arr2, arr3, arr4, arr5, sep='\n')

[ 1  3  5  7  9 11 13]
[1 1 1 1 1 1 1]
[0.44084938 0.91225126 0.55482871 0.38763378 0.78959675]
[[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]
[[6.93608032e-310 1.55288864e-316 0.00000000e+000 0.00000000e+000]
 [0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000]
 [0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000]
 [0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000]]


In [4]:
# You can change types of numpy arrays
print(f"Type of arr1 before change {arr1.dtype}")
arr1 = arr1.astype(np.uint8)
print(f"Type of arr1 after changing is {arr1.dtype}")
# One very useful case is to convert a list of strings representing numbers into a list of numbers

Type of arr1 before change int64
Type of arr1 after changing is uint8


In [5]:
# Indexing arrays
arr2D = np.random.randint(10, size=(3,5))
print(arr2D)
print(arr2D[-1, 2])
# For multidimensional arrays a slice slices along the arrays in that dimension e.g
# Perform a slice on top dimension
print(arr2D[:-1])
# Perform a slice on bottom dimension
print(arr2D[0, :-1])

names = np.array(["Bob", "Joe", "Will", "Bob", "Will", "Joe", "Joe"])
data = np.array([[4, 7], [0, 2], [-5, 6], [0, 0], [1, 2],
                [-12, -4], [3, 4]])
# Suppose that each name in names is represented by a vector in data, if you want to get all the vectors that represent Bob
# You could use a condition to index data
bob_vectors = data[names == 'Bob'] # No need to do .copy() since it already creates a copy
data < 0
# If I wanted to get the values in the last diagonal I'd use lists to index
print(arr2D[[0, 1, 2], [2, 3, 4]])

[[2 5 5 4 9]
 [9 1 7 9 5]
 [3 1 9 2 9]]
9
[[2 5 5 4 9]
 [9 1 7 9 5]]
[2 5 5 4]
[5 9 9]


<h3>Numpy Random Module</h3>

In [6]:
# Can generate arrays of random values, distribution values e.t.c
rng = np.random.default_rng(seed=42) # Used to create a random number generator with a specific seed
arr1 = rng.standard_normal(size=(4, 4)) # Generates a 4 x 4 matrix of values following a normal distribution
print(arr1)
arr2 = rng.uniform(low=5, high=10, size=(4, 4))
print(arr2)

[[ 0.30471708 -1.03998411  0.7504512   0.94056472]
 [-1.95103519 -1.30217951  0.1278404  -0.31624259]
 [-0.01680116 -0.85304393  0.87939797  0.77779194]
 [ 0.0660307   1.12724121  0.46750934 -0.85929246]]
[[7.77292394 5.31908628 9.13815586 8.158322  ]
 [8.7904387  6.77262984 9.85349012 9.46560561]
 [8.89191749 5.97319354 7.33360502 5.21901883]
 [5.77144746 8.41524477 8.72381078 9.83754866]]


<h3>Universal Functions</h3>

In [7]:
# Seed random number generator
gen = np.random.default_rng(seed=42)
arr1 = gen.integers(0, high=10, size=(20))
print(np.isnan(arr1))
print(np.square(arr1))

[False False False False False False False False False False False False
 False False False False False False False False]
[ 0 49 36 16 16 64  0 36  4  0 25 81 49 49 49 49 25  1 64 16]


<h2>Array Oriented Programming</h2>

In [8]:
# Use np.meshgrid to create 2 matrices from lists
arr1 = np.arange(11)
A, B = np.meshgrid(arr1, arr1)
print(A)
print(B)

[[ 0  1  2  3  4  5  6  7  8  9 10]
 [ 0  1  2  3  4  5  6  7  8  9 10]
 [ 0  1  2  3  4  5  6  7  8  9 10]
 [ 0  1  2  3  4  5  6  7  8  9 10]
 [ 0  1  2  3  4  5  6  7  8  9 10]
 [ 0  1  2  3  4  5  6  7  8  9 10]
 [ 0  1  2  3  4  5  6  7  8  9 10]
 [ 0  1  2  3  4  5  6  7  8  9 10]
 [ 0  1  2  3  4  5  6  7  8  9 10]
 [ 0  1  2  3  4  5  6  7  8  9 10]
 [ 0  1  2  3  4  5  6  7  8  9 10]]
[[ 0  0  0  0  0  0  0  0  0  0  0]
 [ 1  1  1  1  1  1  1  1  1  1  1]
 [ 2  2  2  2  2  2  2  2  2  2  2]
 [ 3  3  3  3  3  3  3  3  3  3  3]
 [ 4  4  4  4  4  4  4  4  4  4  4]
 [ 5  5  5  5  5  5  5  5  5  5  5]
 [ 6  6  6  6  6  6  6  6  6  6  6]
 [ 7  7  7  7  7  7  7  7  7  7  7]
 [ 8  8  8  8  8  8  8  8  8  8  8]
 [ 9  9  9  9  9  9  9  9  9  9  9]
 [10 10 10 10 10 10 10 10 10 10 10]]


In [9]:
# Building arrays based on conditions
# Python Way (alower and hard to do for multidimensional arrays)
arr1 = [i for i in range(11)]
arr2 = [x * 2 if x % 2 == 0 else x for x in arr1]
# Numpy way
arr1 = np.array(arr1)
arr2 = np.where(arr1 % 2 == 0, arr1 * 2, arr1)
arr2

array([ 0,  1,  4,  3,  8,  5, 12,  7, 16,  9, 20])

In [10]:
# You can also do mathematical and statistical functions on all the data in the array or on an axis in array
arr1 = np.reshape(np.arange(1, 21), newshape=(2, 10))
print(arr1)
# Get mean of all values
print(f"Mean of all items is {arr1.mean()}")
# Get a list of the means of each column
print(f"Mean of each row is {np.mean(arr1, axis=0)}")
# Get a list of the means of each row
print(f"Mean of each row is {np.mean(arr1, axis=1)}")

[[ 1  2  3  4  5  6  7  8  9 10]
 [11 12 13 14 15 16 17 18 19 20]]
Mean of all items is 10.5
Mean of each row is [ 6.  7.  8.  9. 10. 11. 12. 13. 14. 15.]
Mean of each row is [ 5.5 15.5]


In [11]:
rng = np.random.default_rng(42)
# Sorting
arr1 = np.arange(20, 0, -1).reshape((10, 2))
rng.shuffle(arr1)
print(arr1)
# Sort the entire arr
arr1.sort(axis=1)
print(arr1)

[[10  9]
 [ 8  7]
 [20 19]
 [ 6  5]
 [14 13]
 [16 15]
 [12 11]
 [ 2  1]
 [18 17]
 [ 4  3]]
[[ 9 10]
 [ 7  8]
 [19 20]
 [ 5  6]
 [13 14]
 [15 16]
 [11 12]
 [ 1  2]
 [17 18]
 [ 3  4]]


In [12]:
# Simulate a random walk using gained knowledge
rng = np.random.default_rng(42) # Set seed

# We are standing in a building with 60 floors, we go up a floor if we get heads on a coin flip and go down if we flip tails
# Let's see the highest we can go with 100 coin flips
no_coin_flips = 100
# 0 will be tails and 1 will be head
coin_flips = rng.integers(0, 2, size=no_coin_flips)

initial_floor = 0
walk = np.where(coin_flips == 1, 1, -1)
walk = np.cumsum(walk) # Walk

highest_floor = walk.max()
print(highest_floor)
lowest_floor = walk.min()
print(lowest_floor)


9
-2


<h1>Pandas Section From Book</h1>
<h2>Introduction</h2>

In [13]:
from pandas import Series
# Pandas has 2 main data structures - Series and Dataframes
# Series - 1D array like object with the same datatype and an array of indexes
ser1 = Series([1, 2, 3, -4, 5])
print(ser1)
# How to create a series with a specific index
ser2 = Series([100, 100, 50, 75, 40], index=['Roman', 'Rico', 'Oktombo', 'Ankantele', 'Uvuvwevwevwe'])
print(ser2[['Roman', 'Rico']])

0    1
1    2
2    3
3   -4
4    5
dtype: int64
Roman    100
Rico     100
dtype: int64


In [14]:
# You can still use numpy functions with Series objects
import numpy as np
ser3 = Series(np.arange(1, 21)) # Different ways to create 1D arrays can be used to make Series
print(ser3)
# Element Wise Operations
ser4 = ser3 + 2 # Do element wise operations with scalars
# print(ser4)
ser5 = ser3 * ser3  # Do element wise operations with other arrays
# print(ser5)
ser6 = np.cumsum(ser3)
# print(ser6)  # Use universal functions with Series
ser7 = np.mean(ser1)
# print(ser7) # Do array operations that don't result in an array
# ser8 = Series(np.identity(4)) # Can't make series from multidimensional array
arr1 = np.unique(ser1)
# print(arr1)
ser8 = np.where(ser1 % 2 == 0, ser1 * 2, ser1)
print(ser8) # Returned an array

0      1
1      2
2      3
3      4
4      5
5      6
6      7
7      8
8      9
9     10
10    11
11    12
12    13
13    14
14    15
15    16
16    17
17    18
18    19
19    20
dtype: int64
[ 1  4  3 -8  5]


In [15]:
# Series objects can also be used like dictionaries
from re import I


print('Roman' in ser2)
# Create a series from a dictionary
ser9 = Series({'Luffy': 3_000_000_000, 'Zoro': 1_100_000_000, 'Sanji': 1_050_000_000, 'Jinbe': 1_110_000_000, 'Robin': 1_000_000_000})
print(ser9)
# If I wanted to make sure ser9 had values in a certain order I can pass the index parameter with the order of values I want
ser9 = Series({'Luffy': 3_000_000_000, 'Zoro': 1_100_000_000, 'Sanji': 1_050_000_000, 'Jinbe': 1_110_000_000, 'Robin': 1_000_000_000}, index=['Jinbe', 'Robin', 'Sanji', 'Luffy', 'Robin'])
ser9


True
Luffy    3000000000
Zoro     1100000000
Sanji    1050000000
Jinbe    1110000000
Robin    1000000000
dtype: int64


Jinbe    1110000000
Robin    1000000000
Sanji    1050000000
Luffy    3000000000
Robin    1000000000
dtype: int64

In [16]:
print(np.isnan(ser9))

Jinbe    False
Robin    False
Sanji    False
Luffy    False
Robin    False
dtype: bool


In [17]:
from pandas import DataFrame
df1 = DataFrame({
    'name': ['Luffy', 'Zoro', 'Jinbe', 'Robin', 'Sanji'],
    'bounties': [3_000_000_000, 1_100_000_000, 1_110_000_000, 1_000_000_000, 1_050_000_000]
})
df1.set_index('name' ,inplace=True)
df1 


Unnamed: 0_level_0,bounties
name,Unnamed: 1_level_1
Luffy,3000000000
Zoro,1100000000
Jinbe,1110000000
Robin,1000000000
Sanji,1050000000


In [18]:
print('Sanji' in df1.index)  # How to use indexes to see if value in dataframe
# How to rearrange values in a dataframe
df2 = df1.reindex(['Luffy', 'Zoro', 'Jinbe', 'Sanji', 'Robin', 'Brook'], fill_value=500_000_000)
print(df2) 

True
         bounties
name             
Luffy  3000000000
Zoro   1100000000
Jinbe  1110000000
Sanji  1050000000
Robin  1000000000
Brook   500000000


In [19]:
# A good way to illustrate when to use iloc
ser1 = Series(np.arange(5), index=[2, 1, 0, 4, 3])
# Let's say you want to get the 4th and 5th elements
print(ser1[[3, 4]]) # This doesn't give me what I want
# To actually do this I use iloc
print(ser1.iloc[[3, 4]])

3    4
4    3
dtype: int64
4    3
3    4
dtype: int64


In [20]:
# Reindexing - rearrange according to a new index
obj = Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
print(obj2) # Has a null value because obj1 did not have an item with the index e

# If index is made of numbers you can fill the NaNs created by reindexing

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64


In [21]:
# Reindexing dataframes
df = DataFrame(np.arange(1, 21).reshape((4, 5)),
                index=['a', 'b', 'c', 'd'],
                columns=['Nairobi', 'Naivasha', 'Nakuru', 'Eldoret', 'Machakos'])
print(df)
# Reindex method with only a sequence argument changes index
df2 = df.reindex(['b', 'c', 'd', 'a', 'e'])
print(df2)
df3 = df.reindex(columns=['Nairobi', 'Eldoret', 'Kikuyu']) # Allows you to only choose these columns
print(df3)
# You want to pick only Nakuru and Nairobi data for a, d, c
df4 = df.reindex(index=['a', 'd', 'c'], columns=['Nakuru', 'Nairobi'])
print(df4)
df.loc[:, ['Nairobi', 'Eldoret', 'Kikuyu']] # loc can also be used to reindex but if given a missing index it returns an error

   Nairobi  Naivasha  Nakuru  Eldoret  Machakos
a        1         2       3        4         5
b        6         7       8        9        10
c       11        12      13       14        15
d       16        17      18       19        20
   Nairobi  Naivasha  Nakuru  Eldoret  Machakos
b      6.0       7.0     8.0      9.0      10.0
c     11.0      12.0    13.0     14.0      15.0
d     16.0      17.0    18.0     19.0      20.0
a      1.0       2.0     3.0      4.0       5.0
e      NaN       NaN     NaN      NaN       NaN
   Nairobi  Eldoret  Kikuyu
a        1        4     NaN
b        6        9     NaN
c       11       14     NaN
d       16       19     NaN
   Nakuru  Nairobi
a       3        1
d      18       16
c      13       11


KeyError: "['Kikuyu'] not in index"

In [None]:
# Reindex with generator test
def generator(i):
    """
    A generator that returns numbers from 0 to i - 1
    """
    num = -5
    while num > i:
        yield num
        num += 1

gen = generator(df.Nairobi.__len__())
df2 = df.reindex(columns=['Nairobi', 'Kisumu', 'Naivasha'], fill_value=next(gen))
df2
# I don't know how to pass generator as fill_value

Unnamed: 0,Nairobi,Kisumu,Naivasha
a,1,10,2
b,6,10,7
c,11,10,12
d,16,10,17


In [None]:
# An easier way to drop values from a dataframe than using reindex is to use drop method
df2 = df.drop(index=['c'], columns=['Machakos', 'Eldoret'])
df2

Unnamed: 0,Nairobi,Naivasha,Nakuru
a,1,2,3
b,6,7,8
d,16,17,18


In [None]:
# See pitfalls of indexing with []
ex1 = Series(np.random.randint(50, size=4),
                index=np.arange(4, 0, -1))
print(ex1)
# I want to get first 2 items
# print(ex1[[0, 1]]) # Raises an error
print(ex1.iloc[[0, 1]])

4    26
3    35
2    35
1     3
dtype: int64
4    26
3    35
dtype: int64


In [None]:
# You want to multiply even results in the dataframe by 2 and do nothing to odd numbers
df3 = np.where(df % 2 == 0, df * 2, df) # Can use np.where on a dataframe just that it will return an array
df4 = df.copy()
df4[df4 % 2 == 0] = 0
df4
df3 = DataFrame(df3, index=df.index, columns=df.columns)
print(df3)
# You can use at method to get single scalar value at intersection between row and col label or iat to get scalar value
# at row and col position
print(df3.at['a', 'Nairobi'])
print(df3.iat[1, 2])

   Nairobi  Naivasha  Nakuru  Eldoret  Machakos
a        1         4       3        8         5
b       12         7      16        9        20
c       11        24      13       28        15
d       32        17      36       19        40
1
16


In [None]:
# Combining pandas objects with arithmetic operations
ser1 = Series([1, 2, 3], index=['a', 'c', 'd'])
ser2 = Series([2, 3 , 4], index=['a', 'e', 'f'])
print(ser1 + ser2)
print('Better Version: ')
print(ser1.add(ser2, fill_value=0))

df1 = DataFrame(np.ones(shape=(2, 2)),
                index=[0, 1],
                columns=['a', 'b'])
df2 = DataFrame(np.ones(shape=(2, 2)),
                index=[1, 2],
                columns=['a', 'c'])
print(df1 + df2)
print('Better Version: ')
print(df1.add(df2, fill_value=0))

a    3.0
c    NaN
d    NaN
e    NaN
f    NaN
dtype: float64
Better Version: 
a    3.0
c    2.0
d    3.0
e    3.0
f    4.0
dtype: float64
     a   b   c
0  NaN NaN NaN
1  2.0 NaN NaN
2  NaN NaN NaN
Better Version: 
     a    b    c
0  1.0  1.0  NaN
1  2.0  1.0  1.0
2  1.0  NaN  1.0


In [None]:
# Operations between a dataframe and series
df = DataFrame(np.arange(20).reshape(4, 5),
               columns=list('abcde'))
print(df)
# I want to add the value 5 to all items on column a, 4 to all on column b, 6 to c, 9 to d and 7 to e
ser = Series([5, 4, 6, 9, 7], index=list('abcde'))
print(df + ser) # By default it matches indexes of series to dataframe and broadcasts down the columns

# If I wanted to add the values along the rows I'd use arithmetic function and specify axis parameter
ser2 = Series([5, 4, 6, 9])
print(df.add(ser2, axis='index'))

    a   b   c   d   e
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19
    a   b   c   d   e
0   5   5   8  12  11
1  10  10  13  17  16
2  15  15  18  22  21
3  20  20  23  27  26
    a   b   c   d   e
0   5   6   7   8   9
1   9  10  11  12  13
2  16  17  18  19  20
3  24  25  26  27  28


In [None]:
# You want to get the mean of the bounties and ages of the straw hats
strawHats = DataFrame({
    'name': ['Luffy', 'Zoro', 'Jinbe', 'Robin', 'Sanji'],
    'bounties': [3_000_000_000, 1_100_000_000, 1_110_000_000, 1_000_000_000, 1_050_000_000],
    'age': [20, 22, 50, 30, 22]
})
strawHats.set_index('name', inplace=True)
strawHats.columns.name = 'Details'
print(strawHats.apply(np.mean))

Details
bounties    1.452000e+09
age         2.880000e+01
dtype: float64


In [None]:
# Sort strawHats by rows
sorted_strawHats = strawHats.sort_index(axis='index')
print(sorted_strawHats)
# Sort strawHats by bounties
sorted_strawHats2 = strawHats.sort_values('bounties')
print(sorted_strawHats2)

Details    bounties  age
name                    
Jinbe    1110000000   50
Luffy    3000000000   20
Robin    1000000000   30
Sanji    1050000000   22
Zoro     1100000000   22
Details    bounties  age
name                    
Robin    1000000000   30
Sanji    1050000000   22
Zoro     1100000000   22
Jinbe    1110000000   50
Luffy    3000000000   20


In [None]:
# Performing summary statistics on a DataFrame - return a series
null_df = DataFrame([[1.4, np.nan],
                     [7.1, -4.5],
                     [np.nan, np.nan],
                     [0.75, -1.3]], index=list('abcd'), 
                     columns=['one', 'two'])
print(null_df)
# Compute sum summary statistic and deal with the NaN values
print(null_df.sum()) # Returns Series of sum of column values
print(null_df.sum(axis=1)) # Returns a Series containg sum of values of each row

# If you want the result to be NaN when NaN is present set parameter skipna to False
print(null_df.sum(skipna=False))

# Other summary statistics
print(null_df.cumsum(skipna=False)) # The cumsum of an item is set to NaN when value before it is NaN

# Describe on a dataframe - get summary statistics of one and of a
one_summary_stats = null_df.one.describe()
print(one_summary_stats)
a_summary_stats = null_df.loc['a'].describe()
print(a_summary_stats)
print(null_df.count().sum()) # Get count of all non NaN values in the whole dataframe
print(null_df.isna().sum()) # Get how many NaN values are in each col

    one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3
one    9.25
two   -5.80
dtype: float64
a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64
one   NaN
two   NaN
dtype: float64
   one  two
a  1.4  NaN
b  8.5  NaN
c  NaN  NaN
d  NaN  NaN
count    3.000000
mean     3.083333
std      3.493685
min      0.750000
25%      1.075000
50%      1.400000
75%      4.250000
max      7.100000
Name: one, dtype: float64
count    1.0
mean     1.4
std      NaN
min      1.4
25%      1.4
50%      1.4
75%      1.4
max      1.4
Name: a, dtype: float64
5
one    1
two    2
dtype: int64


In [None]:
# Return a boolen Series showing which index has 1.4 in it
bool_ser = null_df.one.isin([1.4])
print(bool_ser)
null_df.loc[bool_ser] # You can index with a series of booleans

a     True
b    False
c    False
d    False
Name: one, dtype: bool


Unnamed: 0,one,two
a,1.4,


In [None]:
import pandas as pd
weather_data = DataFrame({
    'city': ['Nairobi', 'Nakuru', 'Mombasa'],
    'weather': ['Cool', 'Sunny', 'Windy'],
    'windspeed': [20, 10, 40]
})
weather_data.set_index('city', inplace=True)
print(weather_data)

# Get count of every item in dataframe
print(weather_data.apply(pd.value_counts).fillna(0))

# More useful to get the counts for a specific column
print(weather_data.weather.value_counts())

        weather  windspeed
city                      
Nairobi    Cool         20
Nakuru    Sunny         10
Mombasa   Windy         40
       weather  windspeed
10         0.0        1.0
20         0.0        1.0
40         0.0        1.0
Cool       1.0        0.0
Sunny      1.0        0.0
Windy      1.0        0.0
Cool     1
Sunny    1
Windy    1
Name: weather, dtype: int64


In [None]:
print(weather_data.at['Nairobi', 'weather'])

Cool
