# 데이터 표준화

- 실무에서 접하는 데이터들은 다양한 곳에서 수집되어 여러가지 원인에 의해 다양한 형태로 표현될 수 있음
    - 단위 선택
    - 대소문자 구분
    - 약칭 활용 등
    
- 이처럼 동일한 대상을 표현하는 방법에 차이가 있으면 분석의 정확도는 현저하게 낮아짐
    - 데이터 형식을 일관성 있게 표준화하는 작업이 필요
        - 데이터 표준화

## 단위 환산

- 같은 데이터셋 안에서는 측정 단위를 동일하게 맞춰줘야 함
    - 특히 외국 데이터를 가져오면 국내에서 사용하지 않는 도량형 단위를 사용하는 경우가 많아서 주의가 필요
        - 마일, 야드, 온스 등

In [1]:
import pandas as pd
import numpy as np

In [3]:
# The CSV has no header row, so read it raw and attach the column names afterwards.
auto_mpg_columns = [
    "mpg",
    "cylinders",
    "displacement",
    "horsepower",
    "weight",
    "acceleration",
    "model_year",
    "origin",
    "name",
]
df = pd.read_csv("./data/auto-mpg.csv", header=None)
df.columns = auto_mpg_columns

In [4]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [6]:
# Conversion factor from mpg (miles per gallon) to kpl (kilometers per liter).
# 1 mile = 1.60934 km and 1 gallon = 3.7854 L, so the factor is roughly 0.425.
KM_PER_MILE = 1.60934
LITERS_PER_GALLON = 3.7854
mpg_to_kpl = KM_PER_MILE / LITERS_PER_GALLON
mpg_to_kpl

0.4251439742167274

In [8]:
# Store fuel economy in km/L as a new column: mpg scaled by the conversion factor.
df["kpl"] = df["mpg"].mul(mpg_to_kpl)

In [9]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name,kpl
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu,7.652592
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320,6.37716
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite,7.652592
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst,6.802304
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino,7.227448


In [10]:
# Round kpl to two decimal places (i.e. the third decimal digit is rounded off).
df["kpl"] = df["kpl"].round(decimals=2)

In [11]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name,kpl
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu,7.65
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320,6.38
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite,7.65
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst,6.8
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino,7.23


# 자료형 변환

- 숫자가 문자열로 저장된 경우에는 숫자형으로 변환해야 함
- dtypes 속성을 사용하여 데이터프레임을 구성하는 각 열의 자료형을 확인해야 함

In [12]:
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight          float64
acceleration    float64
model_year        int64
origin            int64
name             object
kpl             float64
dtype: object

In [13]:
# horsepower 열의 고유값 확인
df["horsepower"].unique()

array(['130.0', '165.0', '150.0', '140.0', '198.0', '220.0', '215.0',
       '225.0', '190.0', '170.0', '160.0', '95.00', '97.00', '85.00',
       '88.00', '46.00', '87.00', '90.00', '113.0', '200.0', '210.0',
       '193.0', '?', '100.0', '105.0', '175.0', '153.0', '180.0', '110.0',
       '72.00', '86.00', '70.00', '76.00', '65.00', '69.00', '60.00',
       '80.00', '54.00', '208.0', '155.0', '112.0', '92.00', '145.0',
       '137.0', '158.0', '167.0', '94.00', '107.0', '230.0', '49.00',
       '75.00', '91.00', '122.0', '67.00', '83.00', '78.00', '52.00',
       '61.00', '93.00', '148.0', '129.0', '96.00', '71.00', '98.00',
       '115.0', '53.00', '81.00', '79.00', '120.0', '152.0', '102.0',
       '108.0', '68.00', '58.00', '149.0', '89.00', '63.00', '48.00',
       '66.00', '139.0', '103.0', '125.0', '133.0', '138.0', '135.0',
       '142.0', '77.00', '62.00', '132.0', '84.00', '64.00', '74.00',
       '116.0', '82.00'], dtype=object)

In [16]:
# Remove rows whose horsepower is missing (recorded as "?") and convert the
# column from strings to floats.
#
# The replaced Series is assigned back to df["horsepower"] instead of calling
# replace(..., inplace=True) on the column selection: that chained in-place
# call is deprecated in pandas 2.x and, with Copy-on-Write enabled, modifies
# only a temporary object and never reaches df.
df["horsepower"] = df["horsepower"].replace("?", np.nan)  # "?" -> NaN
df.dropna(subset=["horsepower"], inplace=True)  # drop rows with missing horsepower
df["horsepower"] = df["horsepower"].astype("float")  # string -> float

In [18]:
# horsepower 열의 자료형 확인
df["horsepower"].dtypes

dtype('float64')

# 범주형 데이터 처리

## 구간 분할

- 데이터 분석 알고리즘에 따라서는 연속 데이터를 일정한 구간으로 나눠서 분석하는 것이 효율적인 경우가 있음
    - 가격, 비율, 효율 등 연속적인 값을 일정한 수준이나 정도를 나타내는 이산값으로 나타내어 구간별 차이를 드러냄
    
- 연속 변수를 일정한 구간으로 나누고 각 구간을 범주형 이산 변수로 변환하는 과정을 구간 분할(binning)이라고 함

- cut()

In [19]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name,kpl
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu,7.65
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320,6.38
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite,7.65
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst,6.8
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino,7.23


In [23]:
# Split the horsepower range into 3 bins. np.histogram returns a pair:
# the number of values per bin and the array of bin edge values.
hp_histogram = np.histogram(df["horsepower"], bins=3)
count, bin_dividers = hp_histogram

In [24]:
count

array([257, 103,  32], dtype=int64)

In [25]:
bin_dividers

array([ 46.        , 107.33333333, 168.66666667, 230.        ])

In [26]:
# Labels for the three horsepower bins, ordered from the lowest bin to the highest.
bin_names = [
    "저출력",
    "보통출력",
    "고출력",
]

In [27]:
# Label every horsepower value with the name of the bin it falls into.
# include_lowest=True makes the first interval include its left edge, so the
# row holding the minimum horsepower also receives a label.
hp_labels = pd.cut(
    df["horsepower"],
    bin_dividers,
    labels=bin_names,
    include_lowest=True,
)
df["hp_bin"] = hp_labels

In [28]:
df[["horsepower", "hp_bin"]].head(10)

Unnamed: 0,horsepower,hp_bin
0,130.0,보통출력
1,165.0,보통출력
2,150.0,보통출력
3,150.0,보통출력
4,140.0,보통출력
5,198.0,고출력
6,220.0,고출력
7,215.0,고출력
8,225.0,고출력
9,190.0,고출력


## 더미 변수

- 범주형 데이터를 머신러닝 알고리즘에 바로 사용할 수 없는 경우에는 컴퓨터가 인식 가능한 값으로 변환해야 함

- 이 때 숫자 0 또는 1로 표현되는 더미 변수(dummy variable)를 사용
    - 0과 1은 수의 크고 작음이 아니라 어떤 특성이 있는지 없는지 여부만을 표시
    - 해당 특성이 존재하면 1, 존재하지 않으면 0
    
- 범주형 데이터를 컴퓨터가 인식할 수 있도록 0과 1로만 구성되는 벡터로 변환하는 것을 원핫인코딩이라고 부름

In [36]:
# hp_bin 열의 범주형 데이터를 더미 변수로 변환
pd.get_dummies(df["hp_bin"])

Unnamed: 0,저출력,보통출력,고출력
0,False,True,False
1,False,True,False
2,False,True,False
3,False,True,False
4,False,True,False
...,...,...,...
393,True,False,False
394,True,False,False
395,True,False,False
396,True,False,False


# 정규화

- 각 변수의 상대적 크기 차이 때문에 머신러닝 분석의 결과가 달라질 수 있음
    - 예) 0 ~ 10000 범위의 값을 갖는 변수와 0 ~ 1 범위의 값을 갖는 변수 중 상대적으로 큰 숫자 값을 갖는 변수의 영향이 더 커짐
    
- 숫자 데이터의 상대적인 크기 차이를 제거할 필요가 있음

- 각 열에 속하는 데이터값을 동일한 기준으로 나눈 비율로 나타내는 것을 정규화 (normalization)라고 함
    - 일반적으로 데이터의 범위는 0 ~ 1 또는 -1 ~ 1로 정규화
    
- 가장 간단한 정규화 방법은 데이터를 해당 열의 최댓값으로 나누는 방법


In [37]:
df["horsepower"].describe()

count    392.000000
mean     104.469388
std       38.491160
min       46.000000
25%       75.000000
50%       93.500000
75%      126.000000
max      230.000000
Name: horsepower, dtype: float64

In [38]:
# Normalize horsepower by dividing every value by the largest absolute value
# found in the column.
hp_abs_max = df["horsepower"].abs().max()
df["scaled_hp"] = df["horsepower"] / hp_abs_max

In [40]:
df["scaled_hp"].head()

0    0.565217
1    0.717391
2    0.652174
3    0.652174
4    0.608696
Name: scaled_hp, dtype: float64

# 시계열 데이터

- 시계열 데이터란 일정 시간 간격으로 배치된 데이터들의 수열을 의미

- 시계열 데이터를 데이터프레임의 행 인덱스로 사용하면 시간으로 기록된 데이터를 분석하는 것이 편리함

## 문자열을 Timestamp로 변환

- to_datetime()

In [42]:
df = pd.read_csv("./data/scientists.csv")

In [43]:
df.head()

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist


In [44]:
df.dtypes

Name          object
Born          object
Died          object
Age            int64
Occupation    object
dtype: object

In [45]:
# Parse the "Born" strings (YYYY-MM-DD) into datetime values stored in a new column.
# NOTE(review): the column is named "born_df" here while a later cell creates
# "born_dt" from the same source — possibly a typo; confirm which name is intended.
df["born_df"] = pd.to_datetime(df["Born"], format = "%Y-%m-%d")

In [46]:
df.head()

Unnamed: 0,Name,Born,Died,Age,Occupation,born_df
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist,1920-07-25
1,William Gosset,1876-06-13,1937-10-16,61,Statistician,1876-06-13
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse,1820-05-12
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist,1867-11-07
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist,1907-05-27


In [47]:
df.dtypes

Name                  object
Born                  object
Died                  object
Age                    int64
Occupation            object
born_df       datetime64[ns]
dtype: object

In [53]:
# Parse the "Born" and "Died" strings (YYYY-MM-DD) into datetime columns.
for source_col, target_col in (("Born", "born_dt"), ("Died", "died_dt")):
    df[target_col] = pd.to_datetime(df[source_col], format="%Y-%m-%d")

In [54]:
df.head()

Unnamed: 0,Name,Born,Died,Age,Occupation,born_df,died_df,born_dt,died_dt
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist,1920-07-25,1958-04-16,1920-07-25,1958-04-16
1,William Gosset,1876-06-13,1937-10-16,61,Statistician,1876-06-13,1937-10-16,1876-06-13,1937-10-16
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse,1820-05-12,1910-08-13,1820-05-12,1910-08-13
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist,1867-11-07,1934-07-04,1867-11-07,1934-07-04
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist,1907-05-27,1964-04-14,1907-05-27,1964-04-14


In [50]:
df.dtypes

Name                  object
Born                  object
Died                  object
Age                    int64
Occupation            object
born_df       datetime64[ns]
died_df       datetime64[ns]
dtype: object

In [56]:
# Lifespan as the difference between the death and birth timestamps.
df["age_days"] = df["died_dt"].sub(df["born_dt"])

In [57]:
df.head()

Unnamed: 0,Name,Born,Died,Age,Occupation,born_df,died_df,born_dt,died_dt,age_days
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist,1920-07-25,1958-04-16,1920-07-25,1958-04-16,13779 days
1,William Gosset,1876-06-13,1937-10-16,61,Statistician,1876-06-13,1937-10-16,1876-06-13,1937-10-16,22404 days
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse,1820-05-12,1910-08-13,1820-05-12,1910-08-13,32964 days
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist,1867-11-07,1934-07-04,1867-11-07,1934-07-04,24345 days
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist,1907-05-27,1964-04-14,1907-05-27,1964-04-14,20777 days


## 시계열 데이터 활용

In [58]:
# 날짜 데이터 분리
df["born_dt"].dt.year

0    1920
1    1876
2    1820
3    1867
4    1907
5    1813
6    1912
7    1777
Name: born_dt, dtype: int32

In [59]:
df["born_dt"].dt.month

0     7
1     6
2     5
3    11
4     5
5     3
6     6
7     4
Name: born_dt, dtype: int32

In [61]:
df["born_dt"].dt.day

0    25
1    13
2    12
3     7
4    27
5    15
6    23
7    30
Name: born_dt, dtype: int32

In [63]:
# Use the birth date as the row index so rows can be selected by date.
df2 = df.set_index(keys="born_dt")

In [66]:
df2.head()

Unnamed: 0_level_0,Name,Born,Died,Age,Occupation,born_df,died_df,died_dt,age_days
born_dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1920-07-25,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist,1920-07-25,1958-04-16,1958-04-16,13779 days
1876-06-13,William Gosset,1876-06-13,1937-10-16,61,Statistician,1876-06-13,1937-10-16,1937-10-16,22404 days
1820-05-12,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse,1820-05-12,1910-08-13,1910-08-13,32964 days
1867-11-07,Marie Curie,1867-11-07,1934-07-04,66,Chemist,1867-11-07,1934-07-04,1934-07-04,24345 days
1907-05-27,Rachel Carson,1907-05-27,1964-04-14,56,Biologist,1907-05-27,1964-04-14,1964-04-14,20777 days


In [65]:
df2.loc[:"1900"]

Unnamed: 0_level_0,Name,Born,Died,Age,Occupation,born_df,died_df,died_dt,age_days
born_dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1876-06-13,William Gosset,1876-06-13,1937-10-16,61,Statistician,1876-06-13,1937-10-16,1937-10-16,22404 days
1820-05-12,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse,1820-05-12,1910-08-13,1910-08-13,32964 days
1867-11-07,Marie Curie,1867-11-07,1934-07-04,66,Chemist,1867-11-07,1934-07-04,1934-07-04,24345 days
1813-03-15,John Snow,1813-03-15,1858-06-16,45,Physician,1813-03-15,1858-06-16,1858-06-16,16529 days
1777-04-30,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician,1777-04-30,1855-02-23,1855-02-23,28422 days


In [67]:
df2.shape

(8, 9)

In [69]:
df2.loc[:"1900"]

Unnamed: 0_level_0,Name,Born,Died,Age,Occupation,born_df,died_df,died_dt,age_days
born_dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1876-06-13,William Gosset,1876-06-13,1937-10-16,61,Statistician,1876-06-13,1937-10-16,1937-10-16,22404 days
1820-05-12,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse,1820-05-12,1910-08-13,1910-08-13,32964 days
1867-11-07,Marie Curie,1867-11-07,1934-07-04,66,Chemist,1867-11-07,1934-07-04,1934-07-04,24345 days
1813-03-15,John Snow,1813-03-15,1858-06-16,45,Physician,1813-03-15,1858-06-16,1858-06-16,16529 days
1777-04-30,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician,1777-04-30,1855-02-23,1855-02-23,28422 days


In [71]:
# NOTE: this open-ended slice raises the KeyError shown in the output below.
# The born_dt index is not sorted (non-monotonic) and "1900" is not an exact
# key in it. Sorting first, e.g. df2.sort_index().loc["1900":], should make
# the slice work — TODO confirm.
df2.loc["1900":]

KeyError: 'Value based partial slicing on non-monotonic DatetimeIndexes with non-existing keys is not allowed.'

In [75]:
df2

Unnamed: 0_level_0,Name,Born,Died,Age,Occupation,born_df,died_df,died_dt,age_days
born_dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1920-07-25,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist,1920-07-25,1958-04-16,1958-04-16,13779 days
1876-06-13,William Gosset,1876-06-13,1937-10-16,61,Statistician,1876-06-13,1937-10-16,1937-10-16,22404 days
1820-05-12,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse,1820-05-12,1910-08-13,1910-08-13,32964 days
1867-11-07,Marie Curie,1867-11-07,1934-07-04,66,Chemist,1867-11-07,1934-07-04,1934-07-04,24345 days
1907-05-27,Rachel Carson,1907-05-27,1964-04-14,56,Biologist,1907-05-27,1964-04-14,1964-04-14,20777 days
1813-03-15,John Snow,1813-03-15,1858-06-16,45,Physician,1813-03-15,1858-06-16,1858-06-16,16529 days
1912-06-23,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist,1912-06-23,1954-06-07,1954-06-07,15324 days
1777-04-30,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician,1777-04-30,1855-02-23,1855-02-23,28422 days


In [79]:
# NOTE: raises the KeyError shown in the output below for the same reason as
# any start-bounded partial-string slice on this frame: the born_dt index is
# non-monotonic and "1800" is not an exact key. Sorting the index first
# (df2.sort_index().loc["1800":]) should avoid the error — TODO confirm.
df2.loc["1800":]

KeyError: 'Value based partial slicing on non-monotonic DatetimeIndexes with non-existing keys is not allowed.'

In [74]:
df2.loc[:"1813-03-15"]

Unnamed: 0_level_0,Name,Born,Died,Age,Occupation,born_df,died_df,died_dt,age_days
born_dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1813-03-15,John Snow,1813-03-15,1858-06-16,45,Physician,1813-03-15,1858-06-16,1858-06-16,16529 days
1777-04-30,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician,1777-04-30,1855-02-23,1855-02-23,28422 days
