# 범주형 데이터 다루기 - 원핫인코딩(One Hot Encoding)

데이터에는 수치형 데이터와 텍스트 데이터나 범주형 데이터가 있다. 머신러닝이나 딥러닝 알고리즘은 수치로 된 데이터만 이해할 수 있다. 그래서 기계가 이해할 수 있는 형태로 데이터를 변환해 주어야 하는데 범주형 데이터는 원핫인코딩 형태로 변환해 주어야 한다. 원핫인코딩은 해당하는 범주의 값만 1로 표기하고, 나머지는 0으로 채워주는 것을 뜻한다.

예를 들어 과일이라는 컬럼에 사과, 배, 감이 들어있다고 하자. 이 때, 각각의 과일인 사과, 배, 감으로 컬럼을 만들어주고 해당되는 과일에만 1로 표기해주고 나머지 과일은 0으로 표기해주는 것이다.

### 원핫인코딩 전

|과일|
|---|
|사과|
|배|
|감|
|사과|


### 원핫인코딩 후

|과일|과일_사과|과일_배|과일_감|
|---|---|---|---|
|사과|1|0|0|
|배|0|1|0|
|감|0|0|1|
|사과|1|0|0|


원핫인코딩은 직접 구현할 수도 있고, 판다스나 사이킷런으로 변환해 줄 수도 있다.

Data : https://www.kaggle.com/c/titanic/data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the Titanic train/test CSV files into pandas DataFrames.
train, test = pd.read_csv('data/train.csv'), pd.read_csv('data/test.csv')

# Print the shape of each frame; the "\n" argument puts the test shape on
# its own line.
print(
    'train.shape : ', train.shape,
    "\n",
    'test.shape : ', test.shape,
)

train.shape :  (891, 12) 
 test.shape :  (418, 11)


In [3]:
# Print the dtype of every training column, then preview the first rows.
print(train.dtypes)
train.head()

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Print the dtype of every test column, then preview the first rows.
print(test.dtypes)
test.head()

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
# pandas can summarize the numeric columns for us.
# The summary shows the count, mean, std, min, quartiles (the 50% row is the
# median), and max of each numeric column.
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


Age에 누락된 정보들이 있다.

In [6]:
# Same numeric summary for the test set.
test.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [7]:
# Extract only the object-dtype columns into a separate frame.
# Look at which of these columns are categorical and one-hot encode those.
# Besides one-hot encoding, encodings from NLP such as TF or TF-IDF are also
# possible, so consider which encoding suits each column.

obj_df = train.select_dtypes(include=['object']).copy()
obj_df.head()

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S
4,"Allen, Mr. William Henry",male,373450,,S


성별은 원핫인코딩을 하기에 적합하다. Cabin 데이터는 영문자와 숫자를 분리시켜줘야 사용하기 좋다.

In [8]:
# Show the first five rows that have a missing value in at least one column.
obj_df[obj_df.isnull().any(axis=1)].head(5)

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S
4,"Allen, Mr. William Henry",male,373450,,S
5,"Moran, Mr. James",male,330877,,Q
7,"Palsson, Master. Gosta Leonard",male,349909,,S


In [9]:
# Check whether Cabin is usable as a categorical column by listing its
# 20 most frequent values.
obj_df["Cabin"].value_counts().head(20)

B96 B98        4
C23 C25 C27    4
G6             4
F33            3
E101           3
F2             3
C22 C26        3
D              3
C126           2
C83            2
D26            2
D17            2
C92            2
C93            2
C2             2
F G73          2
C65            2
E67            2
E25            2
B28            2
Name: Cabin, dtype: int64

값의 종류가 너무 많고 분산되어 있어, 전처리를 해줘야 범주형 데이터로 사용하기 적합하다.

In [10]:
# Copy the frames so results can be compared with the data before processing.

train_c_df = train.copy()
test_c_df = test.copy()

# 성별 데이터 인코딩

In [12]:
# pandas approach: overwrite the string labels in place through boolean-mask
# selection with .loc (male -> 0, female -> 1).
train.loc[train["Sex"]=="male", "Sex"] = 0
train.loc[train["Sex"]=="female", "Sex"] = 1

test.loc[test["Sex"]=="male", "Sex"] = 0
test.loc[test["Sex"]=="female", "Sex"] = 1

# The masked assignments keep the column's object dtype, so cast it to a real
# integer column once both labels have been replaced. Re-running the cell is
# harmless: the masks then match nothing and the cast is a no-op.
train["Sex"] = train["Sex"].astype("int64")
test["Sex"] = test["Sex"].astype("int64")

# Alternative lambda approach (run on the original string column instead of
# the .loc assignments above); each frame is encoded from its own column:
# train['Sex'] = train['Sex'].apply(lambda s : 1 if s == 'female' else 0)
# test['Sex'] = test['Sex'].apply(lambda s : 1 if s == 'female' else 0)

In [13]:
# Preview up to the first 20 rows to check the encoded Sex column.
train.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",0,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",0,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",0,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",1,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",1,14.0,1,0,237736,30.0708,,C


사이킷런의 Label Encoder를 사용해보자.

In [11]:
from sklearn.preprocessing import LabelEncoder

def gender_to_int(data) :
    """Replace the string labels in ``data["Sex"]`` with integer codes.

    A LabelEncoder is fitted on the two known labels and used to transform
    the column, giving female = 0 and male = 1. The order of the labels
    passed to ``fit`` does not affect which code each label receives.

    The frame is modified in place and the same object is returned.
    """
    encoder = LabelEncoder().fit(["female", "male"])
    encoded_sex = encoder.transform(data["Sex"])
    data["Sex"] = encoded_sex
    return data

# Encode the Sex column of the copied frames with the LabelEncoder helper,
# then preview the result.
train_c_df = gender_to_int(train_c_df)
test_c_df = gender_to_int(test_c_df)
train_c_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S
