# 카테고리 타입 데이터(Categorical Data)

In [1]:
import numpy as np 
import pandas as pd 

In [3]:
# Build a small Series of gender codes. Plain Python strings are stored by
# pandas with the generic dtype shown in the output below (dtype: object).
gender_codes = ['M', 'F', 'F', 'M', 'F']
gender = pd.Series(data=gender_codes)
gender

0    M
1    F
2    F
3    M
4    F
dtype: object

In [6]:
# astype(new_type): returns a NEW Series whose values have been converted to
# new_type. Here the string data is converted to the 'category' dtype.
gender2 = gender.astype('category')
gender2

0    M
1    F
2    F
3    M
4    F
dtype: category
Categories (2, object): ['F', 'M']

In [5]:
# Display the original Series again: the output still reports dtype object,
# i.e. astype() did not modify `gender` itself.
gender

0    M
1    F
2    F
3    M
4    F
dtype: object

In [9]:
# Integer ages containing repeated values (only four distinct numbers).
age_values = [1, 18, 25, 35, 18, 18, 25, 25, 35]
age = pd.Series(data=age_values)
age

0     1
1    18
2    25
3    35
4    18
5    18
6    25
7    25
8    35
dtype: int64

In [10]:
# Numeric data can be converted to the category dtype as well; the output
# below lists the 4 distinct int64 values as the categories.
age2 = age.astype('category')
age2

0     1
1    18
2    25
3    35
4    18
5    18
6    25
7    25
8    35
dtype: category
Categories (4, int64): [1, 18, 25, 35]

In [12]:
# Create the Series directly with the category dtype at construction time,
# instead of converting it afterwards with astype().
letters = ['a', 'b', 'a', 'a']
s = pd.Series(letters, dtype='category')
s

0    a
1    b
2    a
3    a
dtype: category
Categories (2, object): ['a', 'b']

In [14]:
# Fix the seed so the random columns are reproducible.
np.random.seed(1)

# Draw the random columns in the same order as the DataFrame columns
# (count first, then weight) so the generated values stay reproducible.
n_rows = 6
fruit_names = ['apple', 'banana', 'apple'] * 2
counts = np.random.randint(low=5, high=15, size=n_rows)      # integers in [5, 15)
weights = np.random.uniform(low=1.0, high=5.0, size=n_rows)  # floats in [1.0, 5.0)

df = pd.DataFrame({'fruit': fruit_names, 'count': counts, 'weight': weights})
df

Unnamed: 0,fruit,count,weight
0,apple,10,2.586323
1,banana,13,2.551643
2,apple,14,3.678984
3,apple,10,4.742156
4,banana,5,4.385244
5,apple,5,2.253094


In [15]:
# Print a summary of df: index range, per-column non-null counts and dtypes,
# and the memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   fruit   6 non-null      object 
 1   count   6 non-null      int64  
 2   weight  6 non-null      float64
dtypes: float64(1), int64(1), object(1)
memory usage: 272.0+ bytes


In [None]:
# Replace the `fruit` column with a category-typed version of itself.
# NOTE(review): this cell is labelled In [None] and the later dtypes/info
# outputs still report `fruit` as object, so it was presumably not executed
# in the recorded session — confirm and re-run if the conversion is wanted.
df['fruit'] = df['fruit'].astype('category')

In [18]:
# Show the dtype of every column of df.
# NOTE(review): the recorded output lists `fruit` as object, not category;
# the preceding conversion cell appears not to have been run — confirm.
df.dtypes

fruit      object
count       int64
weight    float64
dtype: object

In [23]:
# Create the derived variable `weight_level` from the values of the `weight`
# column of DataFrame df:
#   0.0 <= weight < 1.0 : W1
#   1.0 <= weight < 2.0 : W2
#   ...
#   4.0 <= weight < 5.0 : W5
# Any value outside [0.0, 5.0) is labelled 'Other'.
def _weight_level(weight):
    """Return the W1..W5 label of the unit-wide band containing *weight*."""
    for lower, label in enumerate(['W1', 'W2', 'W3', 'W4', 'W5']):
        if lower <= weight < lower + 1:
            return label
    return 'Other'


wlevels = [_weight_level(w) for w in df['weight']]

# Only the labels that actually occur become categories of the new column.
df['weight_level'] = pd.Series(wlevels, dtype='category')
df

Unnamed: 0,fruit,count,weight,weight_level
0,apple,10,2.586323,W3
1,banana,13,2.551643,W3
2,apple,14,3.678984,W4
3,apple,10,4.742156,W5
4,banana,5,4.385244,W5
5,apple,5,2.253094,W3


In [24]:
# Summarise df again; the output now includes the category-typed
# `weight_level` column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   fruit         6 non-null      object  
 1   count         6 non-null      int64   
 2   weight        6 non-null      float64 
 3   weight_level  6 non-null      category
dtypes: category(1), float64(1), int64(1), object(1)
memory usage: 382.0+ bytes


In [32]:
# Same banding as `weight_level`, but delegated to pd.cut.
# right=False makes every bin closed on the left and open on the right,
# i.e. [0, 1), [1, 2), ..., [4, 5).
weight_edges = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
weight_labels = ['W1', 'W2', 'W3', 'W4', 'W5']
df['w_l'] = pd.cut(df['weight'], weight_edges, right=False, labels=weight_labels)
df

Unnamed: 0,fruit,count,weight,weight_level,w_l
0,apple,10,2.586323,W3,W3
1,banana,13,2.551643,W3,W3
2,apple,14,3.678984,W4,W4
3,apple,10,4.742156,W5,W5
4,banana,5,4.385244,W5,W5
5,apple,5,2.253094,W3,W3


In [28]:
# Summarise df once more; the output lists `w_l` as a second category column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   fruit         6 non-null      object  
 1   count         6 non-null      int64   
 2   weight        6 non-null      float64 
 3   weight_level  6 non-null      category
 4   w_l           6 non-null      category
dtypes: category(2), float64(1), int64(1), object(1)
memory usage: 588.0+ bytes


In [35]:
# Reseed so the generated ages are reproducible.
np.random.seed(1)

# 20 random integer ages drawn from [0, 100).
random_ages = np.random.randint(low=0, high=100, size=20)
df2 = pd.DataFrame(data={'age': random_ages})
df2

Unnamed: 0,age
0,37
1,12
2,72
3,9
4,75
5,5
6,79
7,64
8,16
9,1


In [42]:
# Add an `age_range` column to df2 whose categories are the decade labels
# ['age_0', 'age_10', 'age_20', ..., 'age_90'].
ar_bins = np.arange(0, 101, 10)                              # edges 0, 10, ..., 100
ar_labels = ['age_' + str(lower) for lower in ar_bins[:-1]]  # one label per bin's lower edge

# right=False -> bins are [0, 10), [10, 20), ..., [90, 100).
age_range = pd.cut(x=df2['age'], bins=ar_bins, labels=ar_labels, right=False)
df2['age_range'] = age_range
df2

Unnamed: 0,age,age_range
0,37,age_30
1,12,age_10
2,72,age_70
3,9,age_0
4,75,age_70
5,5,age_0
6,79,age_70
7,64,age_60
8,16,age_10
9,1,age_0


In [43]:
# Add an `age_group` column to df2 with categories ['young', 'middle', 'old']:
#   young  :  0 <= age < 20
#   middle : 20 <= age < 60
#   old    : 60 <= age
# The last edge is +inf rather than 100 so that 'old' really has no upper
# bound: with right=False a finite last edge of 100 would exclude age == 100
# (and anything above), turning those rows into NaN instead of 'old'.
df2['age_group'] = pd.cut(df2['age'],
                          bins=[0, 20, 60, np.inf],
                          right=False,
                          labels=['young', 'middle', 'old'])
df2

Unnamed: 0,age,age_range,age_group
0,37,age_30,middle
1,12,age_10,young
2,72,age_70,old
3,9,age_0,young
4,75,age_70,old
5,5,age_0,young
6,79,age_70,old
7,64,age_60,old
8,16,age_10,young
9,1,age_0,young


In [44]:
# Summarise df2: the output lists `age_range` and `age_group` as category
# columns alongside the integer `age` column.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   age        20 non-null     int64   
 1   age_range  20 non-null     category
 2   age_group  20 non-null     category
dtypes: category(2), int64(1)
memory usage: 832.0 bytes
