In [1]:
import numpy as np
import pandas as pd

data=pd.Series(['apple','orange','apple','aple']*2)
data

0     apple
1    orange
2     apple
3      aple
4     apple
5    orange
6     apple
7      aple
dtype: object

In [2]:
data.value_counts( )

apple     4
aple      2
orange    2
dtype: int64

In [3]:
data.unique()

array(['apple', 'orange', 'aple'], dtype=object)

In [4]:
## dimension table

values=pd.Series([0,1,0,0]*2)
dim=pd.Series(['apple','orange'])
dim.take(values)

0     apple
1    orange
0     apple
0     apple
0     apple
1    orange
0     apple
0     apple
dtype: object

In [5]:
fruits=['apple','orange','apple','apple']*2
n=len(fruits)
df=pd.DataFrame({'fruit':fruits,
                'basket_id':np.arange(n),
                'count':np.random.randint(3,15,size=n),
                'weight':np.random.uniform(0,4,size=n)})
df

Unnamed: 0,fruit,basket_id,count,weight
0,apple,0,9,1.073354
1,orange,1,3,2.638381
2,apple,2,13,2.885758
3,apple,3,9,2.419791
4,apple,4,4,1.602073
5,orange,5,4,1.915113
6,apple,6,4,0.063492
7,apple,7,7,1.175026


In [6]:
fruit_cat=df['fruit'].astype('category')
fruit_cat


0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): [apple, orange]

In [7]:
fruit_cat.values

[apple, orange, apple, apple, apple, orange, apple, apple]
Categories (2, object): [apple, orange]

In [8]:
fruit_cat.values.categories

Index(['apple', 'orange'], dtype='object')

In [9]:
fruit_cat.values.codes

array([0, 1, 0, 0, 0, 1, 0, 0], dtype=int8)

In [10]:
df['fruit']=df['fruit'].astype('category')

In [11]:
df.fruit

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): [apple, orange]

In [12]:
my_cat=pd.Categorical(['foo','bar','baz','baz','baz','bar','foo','foo'])
my_cat

[foo, bar, baz, baz, baz, bar, foo, foo]
Categories (3, object): [bar, baz, foo]

In [13]:
df['fruit']=my_cat

In [14]:
df.fruit

0    foo
1    bar
2    baz
3    baz
4    baz
5    bar
6    foo
7    foo
Name: fruit, dtype: category
Categories (3, object): [bar, baz, foo]

In [15]:
categories=['foo','bar','baz']
codes=[0,2,1,0,0,1]
my_cat2=pd.Categorical.from_codes(codes,categories)
my_cat2

[foo, baz, bar, foo, foo, bar]
Categories (3, object): [foo, bar, baz]

In [16]:
my_cat2.as_ordered()

[foo, baz, bar, foo, foo, bar]
Categories (3, object): [foo < bar < baz]

In [17]:
draws=np.random.randn(1000)
bins=pd.qcut(draws,4)
bins

[(-0.0253, 0.665], (-0.0253, 0.665], (0.665, 3.055], (-3.23, -0.688], (-0.688, -0.0253], ..., (-0.0253, 0.665], (0.665, 3.055], (-0.0253, 0.665], (-3.23, -0.688], (-0.688, -0.0253]]
Length: 1000
Categories (4, interval[float64]): [(-3.23, -0.688] < (-0.688, -0.0253] < (-0.0253, 0.665] < (0.665, 3.055]]

In [18]:
bins=pd.qcut(draws,4,labels=['q1','q2','q3','q4'])
bins

[q3, q3, q4, q1, q2, ..., q3, q4, q3, q1, q2]
Length: 1000
Categories (4, object): [q1 < q2 < q3 < q4]

In [19]:
bins.value_counts()

q1    250
q2    250
q3    250
q4    250
dtype: int64

In [20]:
bins.codes[:10]

array([2, 2, 3, 0, 1, 2, 1, 2, 1, 2], dtype=int8)

In [21]:
bins.categories

Index(['q1', 'q2', 'q3', 'q4'], dtype='object')

In [22]:
results=pd.Series(draws).groupby(bins).agg(['count','min','max','mean'])
results

Unnamed: 0,count,min,max,mean
q1,250,-3.229343,-0.6897,-1.411316
q2,250,-0.687769,-0.027656,-0.334062
q3,250,-0.023023,0.664814,0.29476
q4,250,0.665368,3.055349,1.245718


In [23]:
n=10000000
labels=pd.Series(['foo','bar','baz','qux']*(n//4))
cat=labels.astype('category')

In [24]:
labels.memory_usage()

80000128

In [25]:
cat.memory_usage()

10000320

In [26]:
## categorical methods


In [27]:
s=pd.Series(['a','b','c','d']*2)
cat_s=s.astype('category')

In [28]:
cat_s

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): [a, b, c, d]

In [29]:
cat_s.cat.codes

0    0
1    1
2    2
3    3
4    0
5    1
6    2
7    3
dtype: int8

In [30]:
cat_s.cat.categories

Index(['a', 'b', 'c', 'd'], dtype='object')

In [31]:
cat_s.value_counts()

d    2
c    2
b    2
a    2
dtype: int64

In [32]:
actual_categories=['a','b','c','d','e','f']
cat_ss=cat_s.cat.set_categories(actual_categories)

In [33]:
cat_ss.value_counts()

d    2
c    2
b    2
a    2
f    0
e    0
dtype: int64

In [34]:
cat_ss=cat_ss.cat.remove_unused_categories()
cat_ss

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): [a, b, c, d]

In [35]:
cat_ss.value_counts()

d    2
c    2
b    2
a    2
dtype: int64

In [36]:
pd.get_dummies(cat_ss)

Unnamed: 0,a,b,c,d
0,1,0,0,0
1,0,1,0,0
2,0,0,1,0
3,0,0,0,1
4,1,0,0,0
5,0,1,0,0
6,0,0,1,0
7,0,0,0,1


In [37]:
## transform function

In [38]:
data=pd.DataFrame({'key':['a','b','c']*4,'value':np.arange(12.)})
data

Unnamed: 0,key,value
0,a,0.0
1,b,1.0
2,c,2.0
3,a,3.0
4,b,4.0
5,c,5.0
6,a,6.0
7,b,7.0
8,c,8.0
9,a,9.0


In [39]:
group=data.groupby('key')
group.mean()

Unnamed: 0_level_0,value
key,Unnamed: 1_level_1
a,4.5
b,5.5
c,6.5


In [40]:
group.transform(lambda x:x.mean())

Unnamed: 0,value
0,4.5
1,5.5
2,6.5
3,4.5
4,5.5
5,6.5
6,4.5
7,5.5
8,6.5
9,4.5


In [41]:
group.apply(lambda x:x.mean())

Unnamed: 0_level_0,value
key,Unnamed: 1_level_1
a,4.5
b,5.5
c,6.5


In [42]:
group.transform('mean')

Unnamed: 0,value
0,4.5
1,5.5
2,6.5
3,4.5
4,5.5
5,6.5
6,4.5
7,5.5
8,6.5
9,4.5


In [43]:
group.transform(lambda x:x.rank(ascending=False))

Unnamed: 0,value
0,4.0
1,4.0
2,4.0
3,3.0
4,3.0
5,3.0
6,2.0
7,2.0
8,2.0
9,1.0


In [44]:
group.transform('mean')

Unnamed: 0,value
0,4.5
1,5.5
2,6.5
3,4.5
4,5.5
5,6.5
6,4.5
7,5.5
8,6.5
9,4.5


In [46]:
def normalize(x):
   return  (x-x.mean())/x.std()

group.transform(normalize)

Unnamed: 0,value
0,-1.161895
1,-1.161895
2,-1.161895
3,-0.387298
4,-0.387298
5,-0.387298
6,0.387298
7,0.387298
8,0.387298
9,1.161895


In [47]:
group.apply(normalize)

Unnamed: 0,value
0,-1.161895
1,-1.161895
2,-1.161895
3,-0.387298
4,-0.387298
5,-0.387298
6,0.387298
7,0.387298
8,0.387298
9,1.161895


In [49]:
normalized=(data['value']-group.transform('mean'))
normalized

Unnamed: 0,value,0,1,2,3,4,5,6,7,8,9,10,11
0,,,,,,,,,,,,,
1,,,,,,,,,,,,,
2,,,,,,,,,,,,,
3,,,,,,,,,,,,,
4,,,,,,,,,,,,,
5,,,,,,,,,,,,,
6,,,,,,,,,,,,,
7,,,,,,,,,,,,,
8,,,,,,,,,,,,,
9,,,,,,,,,,,,,
