# Categorical Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
gender=pd.Series(['M', 'F', 'F', 'M', 'F'])
gender
#> dtype: object --- It means String type

0    M
1    F
2    F
3    M
4    F
dtype: object

# astype() method
* **`astype(newType)` method** : return new pd.Series changed datatype to *newType*.

In [3]:
gender2=gender.astype('category')
gender2

0    M
1    F
2    F
3    M
4    F
dtype: category
Categories (2, object): ['F', 'M']

In [4]:
gender

0    M
1    F
2    F
3    M
4    F
dtype: object

In [5]:
age=pd.Series([1,19,25,45,23,23,15,25,25,38])
age
#> dtype:int 64 --- it means, 'integer'.

0     1
1    19
2    25
3    45
4    23
5    23
6    15
7    25
8    25
9    38
dtype: int64

In [6]:
age2=age.astype('category')
age2

0     1
1    19
2    25
3    45
4    23
5    23
6    15
7    25
8    25
9    38
dtype: category
Categories (7, int64): [1, 15, 19, 23, 25, 38, 45]

# How to Generate Categorical type Series

In [7]:
s=pd.Series(data=['a','b','a','a'], dtype='category')
s

0    a
1    b
2    a
3    a
dtype: category
Categories (2, object): ['a', 'b']

# example 1 : fruit

In [8]:
df=pd.DataFrame({'fruit':['apple','banana','apple']*2,
                'count': np.random.randint(5,15,6),
                'weight': np.random.uniform(1.0, 5.0, 6)})
df

Unnamed: 0,fruit,count,weight
0,apple,9,1.926657
1,banana,10,4.809335
2,apple,13,2.712139
3,apple,8,1.275425
4,banana,13,4.515694
5,apple,12,2.701515


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   fruit   6 non-null      object 
 1   count   6 non-null      int32  
 2   weight  6 non-null      float64
dtypes: float64(1), int32(1), object(1)
memory usage: 248.0+ bytes


## .astype('category')

In [10]:
# Change fruit column's data type to category type.
df['fruit']=df['fruit'].astype('category')

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   fruit   6 non-null      category
 1   count   6 non-null      int32   
 2   weight  6 non-null      float64 
dtypes: category(1), float64(1), int32(1)
memory usage: 330.0 bytes


## Generate derived variable, 'weight_level'

### by using for loop

In [12]:
# Generate derived variable named 'weight_level'. Use values of weight column in df.
# 0.0<=weight<1.0 : w1, ... 4.0<=weight<5.0 : w5
wlevels=[]
for w in df['weight']:
    if 0.0<=w<1.0:
        l='W1'
    elif 1.0<=w<2.0:
        l='W2'
    elif 2.0<=w<3.0:
        l='W3'
    elif 3.0<=w<4.0:
        l='W4'
    else:
        l='W5'
    wlevels.append(l)
    
df['weight_level']=pd.Series(wlevels, dtype='category')
df

Unnamed: 0,fruit,count,weight,weight_level
0,apple,9,1.926657,W2
1,banana,10,4.809335,W5
2,apple,13,2.712139,W3
3,apple,8,1.275425,W2
4,banana,13,4.515694,W5
5,apple,12,2.701515,W3


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   fruit         6 non-null      category
 1   count         6 non-null      int32   
 2   weight        6 non-null      float64 
 3   weight_level  6 non-null      category
dtypes: category(2), float64(1), int32(1)
memory usage: 468.0 bytes


### by using pd.cut() function

In [15]:
df['w_1']=pd.cut(x=df['weight'],
                bins=np.arange(0.0, 5.1, 1.0),
                right=False,
                labels=['W1','W2','W3','W4','W5'])
df

Unnamed: 0,fruit,count,weight,weight_level,w_1
0,apple,9,1.926657,W2,W2
1,banana,10,4.809335,W5,W5
2,apple,13,2.712139,W3,W3
3,apple,8,1.275425,W2,W2
4,banana,13,4.515694,W5,W5
5,apple,12,2.701515,W3,W3


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   fruit         6 non-null      category
 1   count         6 non-null      int32   
 2   weight        6 non-null      float64 
 3   weight_level  6 non-null      category
 4   w_1           6 non-null      category
dtypes: category(3), float64(1), int32(1)
memory usage: 686.0 bytes


# example 2 : age

In [17]:
df2=pd.DataFrame({'age': np.random.randint(0,100,20)})
df2

Unnamed: 0,age
0,66
1,83
2,50
3,1
4,28
5,69
6,9
7,64
8,50
9,37


In [18]:
# generate derived variable(categorical) : age_range
# ['age_0', 'age_10', ..., 'age_90']
ar_bins=np.arange(0,101,10)
ar_labels=[f'age_{i}' for i in range(0,100,10)]
df2['age_range']=pd.cut(x=df2['age'],
                       bins=ar_bins,
                       labels=ar_labels,
                       right=False)
df2

Unnamed: 0,age,age_range
0,66,age_60
1,83,age_80
2,50,age_50
3,1,age_0
4,28,age_20
5,69,age_60
6,9,age_0
7,64,age_60
8,50,age_50
9,37,age_30


In [19]:
# generate derived variable(categorical) : age_group
# ['young', 'middle', 'old']
# young : 0 <= age <20 / middle : 20 <= age < 60 / old : 60 <= age < 100
df2['age_group']=pd.cut(x=df2['age'],
                       bins=[0,20,60,100],
                       right=False,
                       labels=['young', 'middle', 'old'])
df2

Unnamed: 0,age,age_range,age_group
0,66,age_60,old
1,83,age_80,old
2,50,age_50,middle
3,1,age_0,young
4,28,age_20,middle
5,69,age_60,old
6,9,age_0,young
7,64,age_60,old
8,50,age_50,middle
9,37,age_30,middle


In [20]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   age        20 non-null     int32   
 1   age_range  20 non-null     category
 2   age_group  20 non-null     category
dtypes: category(2), int32(1)
memory usage: 760.0 bytes
