In [1]:
from typing import List
import pandas as pd
import numpy as np

In [2]:
# Continuous data can be separated (discretized) into "bins" for analysis.
# The sample ages are plain integers, so the element type is int.
ages : List[int] = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [3]:
# Let's divide the ages into bins; each pair of consecutive values gives one bin's edges.
bins : List[int] = [18, 25, 35, 60, 100]

In [4]:
# pd.cut assigns each value to the bin it falls into and returns a pandas Categorical.
age_categories = pd.cut(ages, bins)
age_categories

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [5]:
#the object above is the pandas categorical type
age_categories.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [6]:
age_categories.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [7]:
age_categories.categories[0]

Interval(18, 25, closed='right')

In [8]:
# Count how many ages fall into each bin produced by pd.cut(ages, bins).
# The top-level pd.value_counts helper is deprecated in recent pandas; wrapping the
# Categorical in a Series and calling .value_counts() is the recommended replacement.
pd.Series(age_categories).value_counts()

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64

In [9]:
# The binning and the counting can also be combined into a single expression.
# Series.value_counts() is used because the top-level pd.value_counts helper is
# deprecated in recent pandas.
pd.Series(pd.cut(ages, bins)).value_counts()

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64

In [10]:
# In the string representation of an interval, a parenthesis marks an open (exclusive) side
# and a square bracket marks a closed (inclusive) side. Passing right=False to pd.cut makes
# the left side closed and the right side open instead.
pd.cut(ages, bins, right=False)

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64, left]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [11]:
# Override the interval-based bin labels by passing a list or array to the labels option.
# NOTE(review): 'Singer' looks like a typo for 'Senior' (label of the oldest bin) — confirm.
group_names : List[str] = ['Youth', 'YoungAdult', 'MiddleAged', 'Singer']

In [12]:
pd.cut(ages, bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Singer', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Singer']

In [13]:
# The number of labels must equal the number of bins (one fewer than the number of
# bin edges); otherwise pd.cut raises a ValueError, which is caught and printed here.
# The deliberately short list gets its own name so that group_names keeps its four
# labels and the cells that use it still work when re-run.
too_few_labels : List[str] = ['Youth', 'YoungAdult', 'MiddleAged']
try:
    pd.cut(ages, bins, labels=too_few_labels)
except ValueError as e:
    print(e)

Bin labels must be one fewer than the number of bin edges


In [14]:
# Passing an integer to pd.cut instead of explicit bin edges splits the range between
# the data's minimum and maximum into that many equal-width bins.
# NOTE(review): no random seed is set in the cells shown, so the result differs on every run.
data = np.random.uniform(size=20)
pd.cut(data, 4)

[(0.0514, 0.279], (0.0514, 0.279], (0.731, 0.957], (0.0514, 0.279], (0.731, 0.957], ..., (0.505, 0.731], (0.731, 0.957], (0.505, 0.731], (0.731, 0.957], (0.279, 0.505]]
Length: 20
Categories (4, interval[float64, right]): [(0.0514, 0.279] < (0.279, 0.505] < (0.505, 0.731] < (0.731, 0.957]]

In [15]:
# An integer asks pd.cut for that many equal-width bins.
# The precision argument controls the precision at which the bin edges are stored and displayed.
data = np.random.uniform(size=20)
pd.cut(data, 4, precision=2)

[(0.45, 0.65], (0.65, 0.85], (0.45, 0.65], (0.049, 0.25], (0.25, 0.45], ..., (0.65, 0.85], (0.25, 0.45], (0.45, 0.65], (0.45, 0.65], (0.049, 0.25]]
Length: 20
Categories (4, interval[float64, right]): [(0.049, 0.25] < (0.25, 0.45] < (0.45, 0.65] < (0.65, 0.85]]

In [16]:
# pd.qcut bins the data by sample quantiles, so each bin receives the same number of points.
# The sample used below is 1000 draws from the standard normal distribution.
data = np.random.standard_normal(1000)

In [17]:
quartiles = pd.qcut(data, 4, precision=2)
quartiles

[(-0.66, 0.023], (-0.66, 0.023], (0.023, 0.73], (-0.66, 0.023], (-3.3299999999999996, -0.66], ..., (-3.3299999999999996, -0.66], (0.023, 0.73], (-0.66, 0.023], (-0.66, 0.023], (0.73, 3.23]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.3299999999999996, -0.66] < (-0.66, 0.023] < (0.023, 0.73] < (0.73, 3.23]]

In [18]:
# Count the members of each quartile bin.
# Series.value_counts() replaces the top-level pd.value_counts helper, which is
# deprecated in recent pandas.
pd.Series(quartiles).value_counts()

(-3.3299999999999996, -0.66]    250
(-0.66, 0.023]                  250
(0.023, 0.73]                   250
(0.73, 3.23]                    250
dtype: int64

In [19]:
# Custom quantiles (numbers from 0 to 1 inclusive) can be passed to pd.qcut in place of
# a bin count, much like passing explicit bin edges to pd.cut.
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1])

[(-1.226, 0.023], (-1.226, 0.023], (0.023, 1.275], (-1.226, 0.023], (-3.3169999999999997, -1.226], ..., (-1.226, 0.023], (0.023, 1.275], (-1.226, 0.023], (-1.226, 0.023], (0.023, 1.275]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.3169999999999997, -1.226] < (-1.226, 0.023] < (0.023, 1.275] < (1.275, 3.233]]

In [20]:
# Counting the members of each custom-quantile bin: the 0.1 / 0.5 / 0.9 cut points put
# 10%, 40%, 40% and 10% of the samples into the four bins.
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1]).value_counts()

(-3.3169999999999997, -1.226]    100
(-1.226, 0.023]                  400
(0.023, 1.275]                   400
(1.275, 3.233]                   100
dtype: int64

In [21]:
# Same custom quantiles, with precision=2 to shorten how the bin edges are stored and displayed.
# NOTE(review): the labels are not strictly two decimal places (0.023 appears in the output).
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1], precision=2).value_counts()

(-3.3299999999999996, -1.23]    100
(-1.23, 0.023]                  400
(0.023, 1.27]                   400
(1.27, 3.23]                    100
dtype: int64

In [22]:
# Detecting and filtering outliers.
# Outlier detection and filtering are typically performed with array operations.
# data is rebound here to a DataFrame of 1000 rows x 4 columns of standard-normal draws.
data = pd.DataFrame(np.random.standard_normal((1000, 4)))

data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.001609,-0.012556,-0.024807,0.049539
std,1.009408,0.97634,1.034832,1.002372
min,-2.998553,-3.00275,-3.15783,-3.42085
25%,-0.718279,-0.655978,-0.71822,-0.628698
50%,0.009001,-0.017015,0.014305,0.037637
75%,0.643901,0.627429,0.668807,0.792616
max,3.362639,2.855371,3.167351,3.638226


In [23]:
# Pick one column (column 2) in which to look for values exceeding 3 in absolute value (pg. 218).
col = data[2]

In [24]:
col[col.abs() > 3]

205   -3.157830
641    3.167351
847    3.073285
991    3.017236
Name: 2, dtype: float64

In [25]:
# Step 1: build a boolean DataFrame that is True where a value is greater than 3 or
# less than -3. To select the rows containing at least one such value, this mask would
# be reduced across the columns with .any and the result used to index data.
data.abs() > 3
# The displayed result is the boolean DataFrame itself, not the selected rows.

Unnamed: 0,0,1,2,3
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
995,False,False,False,False
996,False,False,False,False
997,False,False,False,False
998,False,False,False,False


In [26]:
# Cap the outliers: values greater than 3 become 3 and values less than -3 become -3.
# np.sign(data) is 1 or -1 according to each value's sign, so multiplying it by 3 gives
# the cap with the matching sign. The assignment modifies data in place.
data[data.abs() > 3] = np.sign(data) * 3

In [27]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.001088,-0.012553,-0.024907,0.04927
std,1.007779,0.976331,1.033594,0.998759
min,-2.998553,-3.0,-3.0,-3.0
25%,-0.718279,-0.655978,-0.71822,-0.628698
50%,0.009001,-0.017015,0.014305,0.037637
75%,0.643901,0.627429,0.668807,0.792616
max,3.0,2.855371,3.0,3.0


In [28]:
# np.sign(data) produces a DataFrame of the same shape holding 1.0 where the value is
# positive and -1.0 where it is negative (a zero would map to 0).
np.sign(data)

Unnamed: 0,0,1,2,3
0,1.0,1.0,1.0,-1.0
1,1.0,1.0,-1.0,-1.0
2,1.0,1.0,-1.0,-1.0
3,-1.0,1.0,-1.0,1.0
4,-1.0,-1.0,1.0,1.0
...,...,...,...,...
995,1.0,1.0,-1.0,1.0
996,1.0,1.0,-1.0,1.0
997,1.0,-1.0,-1.0,1.0
998,-1.0,1.0,1.0,-1.0


In [29]:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,1.0,1.0,1.0,-1.0
1,1.0,1.0,-1.0,-1.0
2,1.0,1.0,-1.0,-1.0
3,-1.0,1.0,-1.0,1.0
4,-1.0,-1.0,1.0,1.0


In [32]:
# Permutation and random sampling.
# Permuting randomly reorders the rows of a Series or of a DataFrame.
# Calling np.random.permutation with the length of the axis to permute produces an array
# of integers giving the new ordering.
# Example frame: the integers 0..34 laid out as 5 rows x 7 columns.
df = pd.DataFrame(np.arange(5 * 7).reshape((5, 7)))
df

Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20
3,21,22,23,24,25,26,27
4,28,29,30,31,32,33,34


In [33]:
# Random ordering of the row positions of df. The length is taken from df itself
# rather than hard-coded, so the permutation always matches the frame's row count.
sampler = np.random.permutation(len(df))

In [37]:
def create_random_order(df: pd.DataFrame) -> np.ndarray:
    """Return a random ordering of the row positions of ``df``.

    Args:
        df: Object whose length gives the number of positions to shuffle.

    Returns:
        Array containing each integer from 0 to ``len(df) - 1`` exactly once,
        shuffled using NumPy's global random state.
    """
    row_count = len(df)
    shuffled_positions = np.random.permutation(row_count)
    return shuffled_positions

create_random_order(df=pd.DataFrame(np.arange(5 * 7).reshape((5, 7))))

array([0, 2, 4, 1, 3])

In [38]:
#then we can use the iloc-based indexing or the take function
df.take(sampler)

Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
3,21,22,23,24,25,26,27
4,28,29,30,31,32,33,34
2,14,15,16,17,18,19,20
1,7,8,9,10,11,12,13


In [39]:
df.iloc[sampler]

Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
3,21,22,23,24,25,26,27
4,28,29,30,31,32,33,34
2,14,15,16,17,18,19,20
1,7,8,9,10,11,12,13


In [40]:
# Invoking take with axis='columns' selects a permutation of the columns instead of the rows.
# The permutation length is taken from df's column count rather than hard-coded.
column_sampler = np.random.permutation(len(df.columns))
column_sampler

array([2, 4, 6, 0, 5, 3, 1])

In [41]:
df.take(column_sampler, axis='columns')

Unnamed: 0,2,4,6,0,5,3,1
0,2,4,6,0,5,3,1
1,9,11,13,7,12,10,8
2,16,18,20,14,19,17,15
3,23,25,27,21,26,24,22
4,30,32,34,28,33,31,29


In [42]:
#to select a random subset without replacement (same row cannot appear twice) use the sample method
df.sample(n=3)

Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
3,21,22,23,24,25,26,27
1,7,8,9,10,11,12,13


In [43]:
df.sample(n=4)

Unnamed: 0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13
0,0,1,2,3,4,5,6
2,14,15,16,17,18,19,20
4,28,29,30,31,32,33,34


In [44]:
#generate sample with replacement: pass replace=True to sample
choices = pd.Series([5, 7, -1, 6, 4])
choices.sample(n=10, replace=True)

4    4
4    4
0    5
4    4
4    4
1    7
2   -1
2   -1
4    4
3    6
dtype: int64

In [47]:
#converting categorical into dummy variables
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                    'data1': range(6)})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [48]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [50]:
#sometimes we want to add a prefix to the columns
dummies = pd.get_dummies(df['key'], prefix='key')
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [52]:
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0
