In [1]:
from typing import List, Dict
import pandas as pd
import numpy as np
np.set_printoptions(suppress=True)

In [2]:
#we can separate continuous data into bins for analysis
#here we have ages and we want to separate them into age buckets
ages : List[int] = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [3]:
#we will divide the ages into bins of 18-25, 26-35, 36-60, 60-100
bins : List[int] = [18, 25, 35, 60, 100]

In [4]:
age_categories = pd.cut(ages, bins)

In [5]:
age_categories #a categorical object

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [6]:
#each bin has a special value that contains the lower and upper limits of the bin. The output
#is information that depicts the Categorical object.
#The square bracket is a hard boundary, while the curly bracket is a soft boundary.
age_categories.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [7]:
age_categories.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [8]:
age_categories.categories[0]

Interval(18, 25, closed='right')

In [9]:
pd.value_counts(age_categories) #bin counts for the result of pandas.cut

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64

In [10]:
np.unique(age_categories.codes, return_counts=True)

(array([0, 1, 2, 3], dtype=int8), array([5, 3, 3, 1]))

In [11]:
#we can change which side of the intervals are exclusive with the "right=False" argument
pd.cut(ages, bins, right=False)

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64, left]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [12]:
pd.cut(ages, bins, right=True) #same as the original

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [13]:
#override interval-based labelling: pass a list or array of labels to the labels option
group_names : List[str] = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']

In [14]:
pd.cut(ages, bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

In [15]:
#display the categories
pd.cut(ages, bins, labels=group_names).categories

Index(['Youth', 'YoungAdult', 'MiddleAged', 'Senior'], dtype='object')

In [16]:
pd.cut(ages, bins, labels=group_names).codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [17]:
#if we pass an integer number of bins, pandas will automatically compute equal length bins
#make uniformly distributed data
data = np.random.uniform(size=20)

In [18]:
pd.cut(data, 4, precision=2)

[(0.25, 0.47], (0.69, 0.91], (0.25, 0.47], (0.69, 0.91], (0.69, 0.91], ..., (0.031, 0.25], (0.69, 0.91], (0.69, 0.91], (0.69, 0.91], (0.25, 0.47]]
Length: 20
Categories (4, interval[float64, right]): [(0.031, 0.25] < (0.25, 0.47] < (0.47, 0.69] < (0.69, 0.91]]

In [19]:
#pd.qcut: similar to pd.cut, but automatically calculates equally sized bins
data = np.random.standard_normal(1000)

In [20]:
quartiles = pd.qcut(data, 4, precision=2)
quartiles

[(0.044, 0.7], (0.044, 0.7], (-3.1599999999999997, -0.71], (0.044, 0.7], (0.044, 0.7], ..., (0.7, 2.98], (0.7, 2.98], (0.044, 0.7], (-0.71, 0.044], (-3.1599999999999997, -0.71]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.1599999999999997, -0.71] < (-0.71, 0.044] < (0.044, 0.7] < (0.7, 2.98]]

In [21]:
pd.value_counts(quartiles)

(-3.1599999999999997, -0.71]    250
(-0.71, 0.044]                  250
(0.044, 0.7]                    250
(0.7, 2.98]                     250
dtype: int64

In [22]:
#We can also pass our own quantiles to pandas.qcut. This is similar to pandas.cut.
#first let's type the code
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

[(0.044, 1.239], (0.044, 1.239], (-1.259, 0.044], (0.044, 1.239], (0.044, 1.239], ..., (0.044, 1.239], (0.044, 1.239], (0.044, 1.239], (-1.259, 0.044], (-3.155, -1.259]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.155, -1.259] < (-1.259, 0.044] < (0.044, 1.239] < (1.239, 2.985]]

In [23]:
#We can also pass our own quantiles to pandas.qcut. This is similar to pandas.cut.
#type the code then get the value_counts
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.]).value_counts()

(-3.155, -1.259]    100
(-1.259, 0.044]     400
(0.044, 1.239]      400
(1.239, 2.985]      100
dtype: int64

In [24]:
#We can also pass our own quantiles to pandas.qcut. This is similar to pandas.cut.
#we can also use the standalone pd.value_counts() method
pd.value_counts(pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.]))

(-1.259, 0.044]     400
(0.044, 1.239]      400
(-3.155, -1.259]    100
(1.239, 2.985]      100
dtype: int64

Note: we will return to using pd.cut and pd.qcut later for aggregation and group operations.

In [25]:
data = pd.DataFrame(np.random.standard_normal((1000, 4)))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.02069,0.005754,0.050483,0.004631
std,1.038708,1.034685,1.007291,0.977681
min,-3.140068,-2.710238,-3.176113,-2.958984
25%,-0.776334,-0.739367,-0.616876,-0.667209
50%,-0.030818,-0.009744,0.038793,0.035382
75%,0.699987,0.743789,0.713302,0.69384
max,3.393771,3.374004,3.214342,3.147762


In [26]:
#Task: find values in one of the columns exceeding 3 in absolute value
col = data[2]
col

0     -0.405839
1      1.355703
2      0.275712
3      0.168696
4      1.132208
         ...   
995   -0.370717
996    0.738879
997    0.239271
998    0.150163
999   -0.456955
Name: 2, Length: 1000, dtype: float64

In [27]:
#filter the column such that the absolute value is greater than three
col[col.abs() > 3]

559   -3.176113
723    3.214342
Name: 2, dtype: float64

In [28]:
#to select rows with a value greater than 3 or less than -3, use the any method on a boolean dataframe
data[(data.abs() > 3)] #we are using all so it is all np.Nan values

Unnamed: 0,0,1,2,3
0,,,,
1,,,,
2,,,,
3,,,,
4,,,,
...,...,...,...,...
995,,,,
996,,,,
997,,,,
998,,,,


In [29]:
data[(data.abs() > 3).all(axis=1)]

Unnamed: 0,0,1,2,3


In [30]:
data[(data.abs() > 3).all(axis='columns')]

Unnamed: 0,0,1,2,3


In [31]:
#parentheses around the 'data.abs() > 3' are necessary to call the .any method (or .all)
data[(data.abs() > 3).any(axis='columns')]

Unnamed: 0,0,1,2,3
57,3.091119,-0.061546,1.806194,1.09535
123,0.170193,3.038181,0.674601,-1.678654
383,1.619491,3.374004,-0.291718,-0.595568
543,3.393771,0.332027,-1.434212,0.648033
559,0.519975,1.580449,-3.176113,0.84447
636,0.933592,-1.559964,0.902434,3.147762
714,-0.810171,3.273787,-2.253118,-0.5602
723,-0.997123,-1.184866,3.214342,0.657332
920,3.123525,0.072186,2.724482,0.980884
958,-3.140068,0.296958,-0.61459,0.028727


In [32]:
#using axis=1 also works
data[(data.abs() > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
57,3.091119,-0.061546,1.806194,1.09535
123,0.170193,3.038181,0.674601,-1.678654
383,1.619491,3.374004,-0.291718,-0.595568
543,3.393771,0.332027,-1.434212,0.648033
559,0.519975,1.580449,-3.176113,0.84447
636,0.933592,-1.559964,0.902434,3.147762
714,-0.810171,3.273787,-2.253118,-0.5602
723,-0.997123,-1.184866,3.214342,0.657332
920,3.123525,0.072186,2.724482,0.980884
958,-3.140068,0.296958,-0.61459,0.028727


In [33]:
#we can set values based on these criteria.
#here's the code to cap values outside the [-3, 3] interval
data[data.abs() > 3] = np.sign(data) * 3

In [34]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.021159,0.005068,0.050445,0.004483
std,1.036437,1.032591,1.00609,0.977216
min,-3.0,-2.710238,-3.0,-2.958984
25%,-0.776334,-0.739367,-0.616876,-0.667209
50%,-0.030818,-0.009744,0.038793,0.035382
75%,0.699987,0.743789,0.713302,0.69384
max,3.0,3.0,3.0,3.0


In [35]:
#np.sign: returns 1 or -1 depending on whether or not the value is positive or negative
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,-1.0,1.0,-1.0,1.0
1,-1.0,-1.0,1.0,1.0
2,-1.0,-1.0,1.0,-1.0
3,1.0,-1.0,1.0,-1.0
4,-1.0,-1.0,1.0,1.0


In [36]:
#np.sign returns -1 for negative numbers, 0 zero, and 1 for positive numbers
np.sign(-5), np.sign(0), np.sign(1)

(-1, 0, 1)

In [37]:
#permutation and random sampling
#permutation: reorder a Series or DataFrame
#calling permutation returns an array of integers indicating the new ordering
df = pd.DataFrame(np.arange(5 * 7).reshape((5, 7)))
df

Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13
2,14,15,16,17,18,19,20
3,21,22,23,24,25,26,27
4,28,29,30,31,32,33,34


In [38]:
sampler = np.random.permutation(5)
sampler

array([1, 4, 3, 2, 0])

In [39]:
#we can then used the iloc-based index or the take function
df.take(sampler)

Unnamed: 0,0,1,2,3,4,5,6
1,7,8,9,10,11,12,13
4,28,29,30,31,32,33,34
3,21,22,23,24,25,26,27
2,14,15,16,17,18,19,20
0,0,1,2,3,4,5,6


In [40]:
df.iloc[:,sampler] #select permutation of columns with iloc based indexing

Unnamed: 0,1,4,3,2,0
0,1,4,3,2,0
1,8,11,10,9,7
2,15,18,17,16,14
3,22,25,24,23,21
4,29,32,31,30,28


In [41]:
#select permutation of columns:
column_sampler = np.random.permutation(7)
column_sampler

array([0, 5, 3, 1, 4, 2, 6])

In [42]:
df.take(column_sampler, axis='columns')

Unnamed: 0,0,5,3,1,4,2,6
0,0,5,3,1,4,2,6
1,7,12,10,8,11,9,13
2,14,19,17,15,18,16,20
3,21,26,24,22,25,23,27
4,28,33,31,29,32,30,34


In [43]:
#same thing with axis=1
df.take(column_sampler, axis=1)

Unnamed: 0,0,5,3,1,4,2,6
0,0,5,3,1,4,2,6
1,7,12,10,8,11,9,13
2,14,19,17,15,18,16,20
3,21,26,24,22,25,23,27
4,28,33,31,29,32,30,34


In [44]:
#sample method: select a random subset without replacement
df.sample(n=3)

Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
4,28,29,30,31,32,33,34
3,21,22,23,24,25,26,27


In [45]:
#samples with replacement: allow repeat choices, pass replace=True
choices = pd.Series([5, 7, -1, 6, 4])

In [46]:
choices.sample(n=10, replace=True)

1    7
1    7
0    5
2   -1
3    6
1    7
3    6
2   -1
0    5
1    7
dtype: int64

In [47]:
#computing indicator/dummy variables
#sometimes we need to turn a column with k distinct values into k columns
#each of which contains boolean values
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                  'data1': range(6)})

In [48]:
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [49]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [50]:
#add a prefix to the labels
dummies = pd.get_dummies(df['key'], prefix='key')
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [51]:
df_with_dummy = df[['data1']].join(dummies)
#join method will be explained in more detail later

In [52]:
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [53]:
pd.concat([df['data1'], dummies], axis=1) #same result

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [54]:
pd.concat([df[['data1']], dummies], axis=1) #same result also

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [55]:
#row that belongs to multiple categories
mnames = ['movie_id', 'title', 'genres']

In [56]:
movies = pd.read_table('datasets/movielens/movies.dat',
                      sep='::',
                      header=None,
                      names=mnames,
                      engine='python')

In [57]:
movies

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [58]:
#pd.Series.str.get_dummies(separator: str) addresses the multiple categories situation
dummies = movies['genres'].str.get_dummies('|')

In [59]:
dummies.iloc[:10, :16]

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller
0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
5,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
6,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
7,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0
8,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [60]:
#we can combine this with movies while adding a Genre_ prefix to the column names
#with the add-prefix method
movies_windic = movies.join(dummies.add_prefix('Genre_'))

In [61]:
#note this method is not particularly fast in larger datasets
movies_windic.iloc[0]

movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Action                                   0
Genre_Adventure                                0
Genre_Animation                                1
Genre_Children's                               1
Genre_Comedy                                   1
Genre_Crime                                    0
Genre_Documentary                              0
Genre_Drama                                    0
Genre_Fantasy                                  0
Genre_Film-Noir                                0
Genre_Horror                                   0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Romance                                  0
Genre_Sci-Fi                                   0
Genre_Thriller                                 0
Genre_War                                      0
Genre_Western       

In [62]:
#statistical applications often combine something such as pd.get_dummies with pd.cut
np.random.seed(12345) #make result repeatable

In [63]:
values = np.random.uniform(size=10)

In [64]:
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])