## 1. 데이터 표준화

### 1-1. 단위 환산(소수점 자릿수 반올림) : round()

In [1]:
import seaborn as sns
import pandas as pd
import numpy as np

# Load the Titanic passenger table through seaborn's dataset loader.
df = sns.load_dataset('titanic')
# Last expression of the cell: displays the first rows of the table.
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [2]:
# Round every fare to 2 decimal places and store it back in the same column.
df["fare"] = df["fare"].round(2)
# Display the first rows to check the rounded values (e.g. 71.2833 -> 71.28).
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.28,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.92,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
# Built-in round(): with no digit count (None) it returns an integer,
# with a digit count it keeps that many decimal places.
for ndigits in (None, 2):
    print(round(7.2500, ndigits))

7
7.25


### 1-2. 자료형 변환 : replace(), astype()

In [4]:
# Print the dtype of every column in df.
print(df.dtypes)

survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object


In [6]:
import numpy as np

# Fill missing ages with 22 so the column can be cast to an integer dtype.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df["age"]: that chained in-place form acts on
# a temporary Series under pandas Copy-on-Write and leaves df unchanged.
df["age"] = df["age"].replace(np.nan, 22)
# NaN cannot be stored in an integer column, so cast only after filling.
df["age"] = df["age"].astype("int")
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38,1,0,71.28,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26,0,0,7.92,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35,0,0,8.05,S,Third,man,True,,Southampton,no,True


> 타입 변경 시 astype()에 다음과 같이 입력 <br>
> 정수형은 .astype("int") <br>
> 문자형은 .astype("str") <br>
> 범주형은 .astype("category")

In [7]:
# Show the distinct values stored in the pclass column.
df["pclass"].unique()

array([3, 1, 2], dtype=int64)

In [8]:
# Map the numeric passenger-class codes to text labels. Assign the result back
# to the column rather than using replace(..., inplace=True) on df["pclass"]:
# the chained in-place call modifies a temporary Series under pandas
# Copy-on-Write, so df would keep the integer codes.
df["pclass"] = df["pclass"].replace({1:"first",2:"second",3:"third"})
print(df["pclass"].unique())
print(df["pclass"].dtype)

['third' 'first' 'second']
object


In [9]:
# Convert the text labels to the pandas category dtype.
df["pclass"] = df["pclass"].astype("category")
# Confirm the new dtype.
print(df["pclass"].dtype)

category


## 2. 범주형 데이터 처리

### 2-1. 구간분할 : np.histogram(), pd.cut()

In [10]:
# Display all column dtypes again after the conversions in the earlier cells.
df.dtypes

survived          int64
pclass         category
sex              object
age               int32
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

- np.histogram() : np.histogram(나눌 데이터, bins = 구간 개수) → (구간별 데이터 개수, 구간 경계값 배열)을 반환

In [11]:
# Split the age values into 3 bins; np.histogram returns the per-bin counts
# and the array of bin edges.
count, bins_drivers = np.histogram(a=df["age"], bins=3)
# Print the counts, then the edges on the next line. The separator reproduces
# the space-newline-space layout of printing count, '\n', bins_drivers.
print(count, bins_drivers, sep=' \n ')

[496 345  50] 
 [ 0.         26.66666667 53.33333333 80.        ]


- pd.cut()은 경계값에 따라 데이터를 구간으로 나누고, 결과를 범주형(category) 변수로 자동 생성해줌
- include_lowest = True -> 첫 번째 구간이 왼쪽 경계값(최솟값)을 포함하도록 하는 옵션

In [13]:
# Labels for the three bins, ordered from the lowest interval to the highest.
# NOTE(review): the labels and the hp_bin column name describe power output,
# but the binned data is age — confirm whether they should be renamed.
bin_names = ['저출력','보통출력','고출력']
df['hp_bin'] = pd.cut(
    df["age"],            # values to bin
    bins_drivers,         # bin edges computed by np.histogram
    labels=bin_names,     # name assigned to each interval
    include_lowest=True,  # make the first interval include its lower edge
)
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,hp_bin
0,0,third,male,22,1,0,7.25,S,Third,man,True,,Southampton,no,False,저출력
1,1,first,female,38,1,0,71.28,C,First,woman,False,C,Cherbourg,yes,False,보통출력
2,1,third,female,26,0,0,7.92,S,Third,woman,False,,Southampton,yes,True,저출력
3,1,first,female,35,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,보통출력
4,0,third,male,35,0,0,8.05,S,Third,man,True,,Southampton,no,True,보통출력


- pd.cut()으로 만든 변수의 타입은 범주형(category)으로 지정됨

In [14]:
# Print the dtype of the binned column produced by pd.cut().
print(df["hp_bin"].dtypes)

category


## 3. 더미변수 : get_dummies()

- get_dummies()는 범주형 변수의 모든 고유값을 각각 새로운 더미변수로 변환

In [16]:
# Build one indicator column per category of hp_bin. dtype=int keeps the
# indicators as 0/1: pandas 2.0 changed the default dummy dtype to bool,
# which would print True/False instead.
dummies = pd.get_dummies(df["hp_bin"], dtype=int)
print(dummies.head(10))

   저출력  보통출력  고출력
0    1     0    0
1    0     1    0
2    1     0    0
3    0     1    0
4    0     1    0
5    1     0    0
6    0     0    1
7    1     0    0
8    0     1    0
9    1     0    0
