In [1]:
from typing import List, Dict
import pandas as pd
import numpy as np

In [2]:
# Continuous values are often discretized ("binned") before analysis.
# Sample data: a handful of ages that will be sorted into age buckets.
ages: List[int] = [
    20, 22, 25, 27, 21, 23,
    37, 31, 61, 45, 41, 32,
]

In [3]:
# Bin edges: consecutive pairs delimit the age buckets
# 18-25, 25-35, 35-60 and 60-100.
bins: List[int] = [18, 25, 35, 60, 100]

In [4]:
#pd.cut assigns every age to the bin it falls into and returns a Categorical object
age_categories = pd.cut(ages, bins)

In [5]:
age_categories #a categorical object

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [6]:
#.codes holds, for each age, the integer position of its bin within .categories
#(0 is the first bin). Each category is an Interval that contains the lower and
#upper limits of its bin.
#In the interval notation a parenthesis means that side is open (the endpoint is excluded),
#while a square bracket means that side is closed (the endpoint is included).
age_categories.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [7]:
age_categories.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [8]:
age_categories.categories[0]

Interval(18, 25, closed='right')

In [9]:
#bin counts for the result of pandas.cut, sorted by frequency.
#The top-level pd.value_counts function is deprecated (pandas 2.1+), so wrap the
#Categorical in a Series and call the value_counts method instead.
pd.Series(age_categories).value_counts()

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64

In [10]:
np.unique(age_categories.codes, return_counts=True)

(array([0, 1, 2, 3], dtype=int8), array([5, 3, 3, 1]))

In [11]:
#we can change which side of the intervals is closed with the "right=False" argument:
#the bins then include their left edge and exclude their right edge
pd.cut(ages, bins, right=False)

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64, left]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [12]:
pd.cut(ages, bins, right=True) #same as the original

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [13]:
# Replace the default interval labels with custom names by handing a list
# (or array) to the labels option of pd.cut -- one name per bin, in bin order.
group_names: List[str] = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']

In [14]:
pd.cut(ages, bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

In [15]:
#display the categories
pd.cut(ages, bins, labels=group_names).categories

Index(['Youth', 'YoungAdult', 'MiddleAged', 'Senior'], dtype='object')

In [16]:
pd.cut(ages, bins, labels=group_names).codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [17]:
#if we pass an integer number of bins, pandas will automatically compute equal length bins
#seed NumPy's global random state first so the random draws are identical on every
#Restart & Run All (without a seed every run produces different bins and counts)
np.random.seed(12345)
#make uniformly distributed data
data = np.random.uniform(size=20)

In [18]:
pd.cut(data, 4, precision=2)

[(0.011, 0.24], (0.47, 0.7], (0.24, 0.47], (0.24, 0.47], (0.47, 0.7], ..., (0.24, 0.47], (0.24, 0.47], (0.011, 0.24], (0.7, 0.93], (0.011, 0.24]]
Length: 20
Categories (4, interval[float64, right]): [(0.011, 0.24] < (0.24, 0.47] < (0.47, 0.7] < (0.7, 0.93]]

In [19]:
#pd.qcut: similar to pd.cut, but bins the data based on sample quantiles, so each bin
#holds (roughly) the same number of data points instead of spanning an equal width
data = np.random.standard_normal(1000)

In [20]:
quartiles = pd.qcut(data, 4, precision=2)
quartiles

[(-0.66, 0.016], (0.016, 0.63], (-0.66, 0.016], (0.63, 3.07], (0.016, 0.63], ..., (-0.66, 0.016], (0.63, 3.07], (0.63, 3.07], (0.016, 0.63], (-3.3899999999999997, -0.66]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.3899999999999997, -0.66] < (-0.66, 0.016] < (0.016, 0.63] < (0.63, 3.07]]

In [21]:
#number of data points per quartile bin.
#The top-level pd.value_counts function is deprecated (pandas 2.1+); the
#Series.value_counts method gives the same counts.
pd.Series(quartiles).value_counts()

(-3.3899999999999997, -0.66]    250
(-0.66, 0.016]                  250
(0.016, 0.63]                   250
(0.63, 3.07]                    250
dtype: int64

In [22]:
#We can also pass our own quantiles to pandas.qcut. This is similar to pandas.cut.
#first let's type the code
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

[(-1.269, 0.0158], (0.0158, 1.23], (-1.269, 0.0158], (0.0158, 1.23], (0.0158, 1.23], ..., (-1.269, 0.0158], (1.23, 3.074], (1.23, 3.074], (0.0158, 1.23], (-3.385, -1.269]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.385, -1.269] < (-1.269, 0.0158] < (0.0158, 1.23] < (1.23, 3.074]]

In [23]:
#We can also pass our own quantiles to pandas.qcut. This is similar to pandas.cut.
#type the code then get the value_counts
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.]).value_counts()

(-3.385, -1.269]    100
(-1.269, 0.0158]    400
(0.0158, 1.23]      400
(1.23, 3.074]       100
dtype: int64

In [24]:
#The same bin counts, this time sorted by frequency instead of by bin order:
#wrap the Categorical in a Series and call Series.value_counts.
#(The standalone pd.value_counts() function did this too, but it is deprecated since pandas 2.1.)
pd.Series(pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])).value_counts()

(-1.269, 0.0158]    400
(0.0158, 1.23]      400
(-3.385, -1.269]    100
(1.23, 3.074]       100
dtype: int64

Note: we will return to using pd.cut and pd.qcut later for aggregation and group operations.

In [25]:
data = pd.DataFrame(np.random.standard_normal((1000, 4)))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.026027,-0.010021,0.008695,-0.024887
std,0.993003,0.961783,1.001391,1.004265
min,-3.33147,-3.162111,-2.792254,-3.404943
25%,-0.655261,-0.649843,-0.683896,-0.695594
50%,0.030975,-0.019833,-0.00177,0.012172
75%,0.697027,0.645564,0.688124,0.628138
max,2.986102,3.06991,3.585247,3.298027


In [26]:
#Task: find values in one of the columns exceeding 3 in absolute value
col = data[2]
col

0      0.193585
1     -2.057431
2      0.293073
3      1.007873
4      0.058654
         ...   
995   -0.105766
996    0.699427
997   -1.138720
998   -1.421532
999    0.811973
Name: 2, Length: 1000, dtype: float64

In [27]:
#filter the column such that the absolute value is greater than three
col[col.abs() > 3]

97     3.370946
341    3.585247
766    3.442851
Name: 2, dtype: float64

In [28]:
#indexing a DataFrame with a boolean DataFrame masks it element-wise: no rows are dropped,
#and every entry where the condition is False is replaced with NaN.
#(to select whole rows instead, reduce the boolean frame with .any or .all first)
data[(data.abs() > 3)] #rows with no value beyond +/-3 come back as all NaN

Unnamed: 0,0,1,2,3
0,,,,
1,,,,
2,,,,
3,,,,
4,,,,
...,...,...,...,...
995,,,,
996,,,,
997,,,,
998,,,,


In [29]:
#.all(axis=1) keeps a row only if every one of its columns exceeds 3 in absolute value
data[(data.abs() > 3).all(axis=1)]

Unnamed: 0,0,1,2,3


In [30]:
data[(data.abs() > 3).all(axis='columns')]

Unnamed: 0,0,1,2,3


In [31]:
#parentheses around the 'data.abs() > 3' are necessary to call the .any method (or .all)
data[(data.abs() > 3).any(axis='columns')]

Unnamed: 0,0,1,2,3
81,0.974968,0.435373,0.297386,3.298027
97,-0.509042,0.388323,3.370946,0.75604
341,-0.730128,-0.205276,3.585247,-1.323286
684,1.135871,-3.162111,-1.212583,-0.765358
696,-0.62228,3.06991,-0.878283,-0.388416
704,-3.33147,-0.218759,0.277659,-1.248944
766,-1.060706,0.862505,3.442851,0.461092
898,1.109918,-0.501277,0.109586,-3.404943


In [32]:
#using axis=1 also works
data[(data.abs() > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
81,0.974968,0.435373,0.297386,3.298027
97,-0.509042,0.388323,3.370946,0.75604
341,-0.730128,-0.205276,3.585247,-1.323286
684,1.135871,-3.162111,-1.212583,-0.765358
696,-0.62228,3.06991,-0.878283,-0.388416
704,-3.33147,-0.218759,0.277659,-1.248944
766,-1.060706,0.862505,3.442851,0.461092
898,1.109918,-0.501277,0.109586,-3.404943


In [33]:
#we can set values based on these criteria.
#here's the code to cap values outside the [-3, 3] interval:
#the boolean mask picks the entries whose absolute value exceeds 3, and np.sign(data) * 3
#supplies 3 or -3 with the sign of the original value.
#Note: this assignment modifies data in place.
data[data.abs() > 3] = np.sign(data) * 3

In [34]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.026358,-0.009929,0.007296,-0.02478
std,0.991936,0.961043,0.996859,1.002038
min,-3.0,-3.0,-2.792254,-3.0
25%,-0.655261,-0.649843,-0.683896,-0.695594
50%,0.030975,-0.019833,-0.00177,0.012172
75%,0.697027,0.645564,0.688124,0.628138
max,2.986102,3.0,3.0,3.0


In [35]:
#np.sign: returns 1 for positive values and -1 for negative values (and 0 for zero)
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,1.0,1.0,1.0,-1.0
1,-1.0,-1.0,-1.0,-1.0
2,1.0,-1.0,1.0,-1.0
3,-1.0,1.0,1.0,1.0
4,-1.0,1.0,1.0,-1.0


In [36]:
#np.sign returns -1 for negative numbers, 0 zero, and 1 for positive numbers
np.sign(-5), np.sign(0), np.sign(1)

(-1, 0, 1)

In [37]:
# Permutation and random sampling.
# Permuting means randomly reordering a Series or the rows of a DataFrame;
# np.random.permutation yields an integer array that describes the new ordering.
df = pd.DataFrame(
    np.arange(35).reshape(5, 7)  # 5 rows x 7 columns holding 0..34
)
df

Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20
3,21,22,23,24,25,26,27
4,28,29,30,31,32,33,34


In [38]:
sampler = np.random.permutation(5)
sampler

array([0, 3, 2, 1, 4])

In [39]:
#we can then use that array with iloc-based indexing or with the take function
df.take(sampler)

Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
3,21,22,23,24,25,26,27
2,14,15,16,17,18,19,20
1,7,8,9,10,11,12,13
4,28,29,30,31,32,33,34


In [40]:
#iloc-based indexing on the column axis. sampler is a permutation of only 5 positions
#while df has 7 columns, so this returns columns 0-4 in shuffled order and leaves out
#columns 5 and 6 (a permutation of length 7 is needed to reorder all of the columns)
df.iloc[:,sampler]

Unnamed: 0,0,3,2,1,4
0,0,3,2,1,4
1,7,10,9,8,11
2,14,17,16,15,18
3,21,24,23,22,25
4,28,31,30,29,32


In [41]:
#select permutation of columns:
column_sampler = np.random.permutation(7)
column_sampler

array([3, 4, 2, 5, 1, 0, 6])

In [42]:
df.take(column_sampler, axis='columns')

Unnamed: 0,3,4,2,5,1,0,6
0,3,4,2,5,1,0,6
1,10,11,9,12,8,7,13
2,17,18,16,19,15,14,20
3,24,25,23,26,22,21,27
4,31,32,30,33,29,28,34


In [43]:
#same thing with axis=1
df.take(column_sampler, axis=1)

Unnamed: 0,3,4,2,5,1,0,6
0,3,4,2,5,1,0,6
1,10,11,9,12,8,7,13
2,17,18,16,19,15,14,20
3,24,25,23,26,22,21,27
4,31,32,30,33,29,28,34


In [44]:
#sample method: select a random subset without replacement
df.sample(n=3)

Unnamed: 0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13
0,0,1,2,3,4,5,6
2,14,15,16,17,18,19,20


In [45]:
#samples with replacement: allow repeat choices, pass replace=True
choices = pd.Series([5, 7, -1, 6, 4])

In [46]:
choices.sample(n=10, replace=True)

2   -1
3    6
2   -1
1    7
4    4
0    5
0    5
1    7
3    6
1    7
dtype: int64

In [47]:
# Computing indicator ("dummy") variables:
# a column holding k distinct values is expanded into k indicator columns,
# one per distinct value, each flagging the rows in which that value occurs.
df = pd.DataFrame(
    {
        'key': ['b', 'b', 'a', 'c', 'a', 'b'],
        'data1': range(6),
    }
)

In [48]:
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [51]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [52]:
#add a prefix to the labels
dummies = pd.get_dummies(df['key'], prefix='key')
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [55]:
df_with_dummy = df[['data1']].join(dummies)
#join method will be explained in more detail later

In [54]:
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [58]:
pd.concat([df['data1'], dummies], axis=1) #same result

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [62]:
pd.concat([df[['data1']], dummies], axis=1) #same result also

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [64]:
#row that belongs to multiple categories
mnames = ['movie_id', 'title', 'genres']

In [65]:
#load the MovieLens movies table: fields are separated by '::', header=None means the
#first line is read as data rather than as column names, so the names come from mnames;
#the python parsing engine is requested explicitly
movies = pd.read_table('datasets/movielens/movies.dat',
                      sep='::',
                      header=None,
                      names=mnames,
                      engine='python')

In [66]:
movies

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [70]:
#pd.Series.str.get_dummies(separator: str) addresses the multiple categories situation
dummies = movies['genres'].str.get_dummies('|')

In [73]:
dummies.iloc[:10, :16]

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller
0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
5,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
6,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
7,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0
8,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [74]:
#we can combine this with movies while adding a Genre_ prefix to the column names
#with the add-prefix method
