In [1]:
import pandas as pd

In [2]:
# Load the per-country drinks dataset from the local data repo and preview the first rows.
drinks = pd.read_csv('../data-repo/drinks.csv')
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [3]:
# Column dtypes and non-null counts. The reported memory usage carries a "+"
# because object (string) columns are not inspected deeply by default.
drinks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
country                         193 non-null object
beer_servings                   193 non-null int64
spirit_servings                 193 non-null int64
wine_servings                   193 non-null int64
total_litres_of_pure_alcohol    193 non-null float64
continent                       193 non-null object
dtypes: float64(1), int64(3), object(2)
memory usage: 9.1+ KB


In [4]:
# Get the exact memory footprint (object columns are inspected deeply)
drinks.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
country                         193 non-null object
beer_servings                   193 non-null int64
spirit_servings                 193 non-null int64
wine_servings                   193 non-null int64
total_litres_of_pure_alcohol    193 non-null float64
continent                       193 non-null object
dtypes: float64(1), int64(3), object(2)
memory usage: 30.4 KB


In [6]:
# Per-column memory usage in bytes. deep=True measures the actual string objects
# held by object columns instead of only the pointer array.
# Only the last expression of a cell is displayed, so a single call is kept here.
drinks.memory_usage(deep=True)

Index                              80
country                         12588
beer_servings                    1544
spirit_servings                  1544
wine_servings                    1544
total_litres_of_pure_alcohol     1544
continent                       12332
dtype: int64

In [7]:
# Total deep memory usage of the frame in bytes (index included).
drinks.memory_usage(deep=True).sum()

31176

In [10]:
# Distinct continent labels, alphabetically ordered.
sorted(drinks['continent'].unique())

['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America']

In [11]:
# Peek at the continent column as loaded from the CSV (plain strings, object dtype).
drinks.continent.head()

0      Asia
1    Europe
2    Africa
3    Europe
4    Africa
Name: continent, dtype: object

In [19]:
# Convert the string column to the category dtype: the strings are represented
# by integer codes, which saves memory (see the cells below).
# This only pays off when the column contains many repeated strings.
# If all or most values in a column are distinct (e.g. the country column),
# the converted column may actually take more space than before.
drinks['continent'] = drinks['continent'].astype('category')
drinks['continent'].head()

0      Asia
1    Europe
2    Africa
3    Europe
4    Africa
Name: continent, dtype: category
Categories (6, object): [Africa, Asia, Europe, North America, Oceania, South America]

In [18]:
# What is actually stored are integer codes
drinks.continent.cat.codes.head()

0    1
1    2
2    0
3    2
4    0
dtype: int8

In [14]:
# The total memory footprint is now smaller
drinks.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
country                         193 non-null object
beer_servings                   193 non-null int64
spirit_servings                 193 non-null int64
wine_servings                   193 non-null int64
total_litres_of_pure_alcohol    193 non-null float64
continent                       193 non-null category
dtypes: category(1), float64(1), int64(3), object(1)
memory usage: 19.1 KB


In [16]:
# Filtering by comparing against a string still works on a category column
drinks.loc[drinks['continent'] == 'Europe'].head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
1,Albania,89,132,54,4.9,Europe
3,Andorra,245,138,312,12.4,Europe
7,Armenia,21,179,11,3.8,Europe
9,Austria,279,75,191,9.7,Europe
10,Azerbaijan,21,46,5,1.3,Europe


In [6]:
# Ordered categorical: the categories carry a meaningful ranking.
df = pd.DataFrame({'id':[1,2,3,4], 'score': ['normal', 'excellent', 'very good', 'good']})

# `categories` and `ordered` are properties of the categorical dtype.
# Passing them as keyword arguments to `astype('category', ...)` is no longer
# supported by pandas, so describe the dtype explicitly with CategoricalDtype.
score_dtype = pd.api.types.CategoricalDtype(
    categories=['normal', 'good', 'very good', 'excellent'],
    ordered=True,
)
df['score'] = df['score'].astype(score_dtype)

# Sorting follows the declared category order rather than alphabetical order.
df.sort_values('score')

Unnamed: 0,id,score
0,1,normal
3,4,good
2,3,very good
1,2,excellent


In [10]:
# Inspect the categories of the score column; they are listed in the declared order.
df.score.cat.categories

Index(['normal', 'good', 'very good', 'excellent'], dtype='object')