# 필요한 모듈 불러오기

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# 데이터 불러오기

In [None]:
# Load the competition training set and discard the `id` column in a single chained expression.
train = pd.read_csv('/content/drive/MyDrive/데이콘 : Basic/소득예측경진대회/train.csv').drop(columns='id')

In [None]:
# Show the dataset's (rows, columns) shape, then preview its first five rows.
print(train.shape)
train.head(n=5)

(17480, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,target
0,32,Private,309513,Assoc-acdm,12,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,0
1,33,Private,205469,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,1
2,46,Private,149949,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,0
3,23,Private,193090,Bachelors,13,Never-married,Adm-clerical,Own-child,White,Female,0,0,30,United-States,0
4,55,Private,60193,HS-grad,9,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States,0


# Feature Engineering

## 결측치 확인

In [None]:
# Either call works for spotting missing values:
# info() prints the non-null count per column, isna().sum() counts the nulls directly.
train.info()
train.isna().sum()

id                   0
age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
target               0
dtype: int64

## 결측치 제거

In [None]:
# Drop every row that has at least one missing value, then compare the row counts.
train_1 = train.dropna(axis=0, how='any')
print("결측치 제거 전 데이터 개수 : ", len(train))
print("결측치 제거 후 데이터 개수 : ", len(train_1))

결측치 제거 전 데이터 개수 :  17480
결측치 제거 후 데이터 개수 :  15081


In [None]:
# Confirm that no missing values remain after the rows were dropped.
train_1.isna().sum()

id                0
age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
target            0
dtype: int64

## 결측치 대체

In [None]:
# Strategy 1: replace every missing value with the constant 0.
# No rows are removed, so the row count is the same before and after.
train_2 = train.fillna(value=0)
print("결측치 대체 전 데이터 개수 : ", len(train))
print("결측치 대체 후 데이터 개수 : ", len(train_2))

결측치 대체 전 데이터 개수 :  17480
결측치 대체 후 데이터 개수 :  17480


- 결측치를 0으로 대체했기 때문에 행이 삭제되지 않아, 데이터 개수가 기존 데이터와 같은 것을 볼 수 있음

In [None]:
# Strategy 2: replace missing values with each column's mean or median.
# numeric_only=True restricts the statistic to numeric columns; without it the
# string columns make pandas emit a warning (older versions) or raise a
# TypeError (pandas >= 2.0).
# NOTE(review): fillna() only fills columns that appear in the computed
# statistics, so missing values in non-numeric columns are left untouched.
train_3 = train.fillna(train.mean(numeric_only=True))
train_4 = train.fillna(train.median(numeric_only=True))

  
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
# Strategy 3 (step 1): look up the value frequencies of each column that has
# missing entries, to find the most common value to impute with.
for column in ('workclass', 'occupation', 'native.country'):
    print(train[column].value_counts())

Private             11568
Self-emp-not-inc     1272
Local-gov            1053
State-gov             659
Self-emp-inc          594
Federal-gov           485
Never-worked            7
Without-pay             6
Name: workclass, dtype: int64
Exec-managerial      2113
Craft-repair         2101
Prof-specialty       2085
Adm-clerical         1893
Sales                1829
Other-service        1677
Machine-op-inspct    1040
Transport-moving      785
Handlers-cleaners     695
Farming-fishing       508
Tech-support          475
Protective-serv       350
Priv-house-serv        83
Armed-Forces            3
Name: occupation, dtype: int64
United-States                 15393
Mexico                          355
Philippines                     108
Germany                          75
Canada                           63
Puerto-Rico                      59
El-Salvador                      58
Cuba                             49
India                            47
England                          46
Jamaica

- 일단 결측치가 존재하는 Feature에서 어떤 unique값이 가장 빈도수가 높은지 찾아봄
    - workclass : Private
    - occupation : Exec-managerial
    - native.country : United-States

In [None]:
# Strategy 3 (step 2): fill each column's missing entries with its most frequent value.
# A per-column mapping passed to fillna() returns a new DataFrame (train is not
# modified) and replaces the obscure `col != col` NaN test.
train_5 = train.fillna({
    'workclass': 'Private',
    'occupation': 'Exec-managerial',
    'native.country': 'United-States',
})
# Look the count up by its label: value_counts() is indexed by category name,
# so ['Private'] reports exactly what the message says, whereas [0] relied on
# deprecated positional fallback and on 'Private' happening to be ranked first.
print("결측값 대체 전 workclass의 Private 값 개수 : ", train.workclass.value_counts()['Private'])
print("결측값 대체 후 workclass의 Private 값 개수 : ", train_5.workclass.value_counts()['Private'])

결측값 대체 전 workclass의 Private 값 개수 :  11568
결측값 대체 후 workclass의 Private 값 개수 :  13404


- 결측치를 빈도수가 가장 높았던 Private으로 대체해줌으로써, Private값의 개수가 늘었음을 확인할 수 있음

In [None]:
# Strategy 4: fill missing values from neighbouring rows.
# ffill() carries the previous row's value forward (what fillna(method='pad') did)
# and bfill() carries the next row's value backward. The dedicated methods replace
# the `method=` argument of fillna(), which pandas has deprecated.
# NOTE: a gap at the very start (ffill) or very end (bfill) has no neighbour to
# copy from and therefore stays missing.
train_6 = train.ffill()
train_7 = train.bfill()

- pad : 특정한 수치를 사용하지 않고, 각 결측치의 바로 앞에 있는 값(value)으로 채워 넣는 방식
- bfill : pad와는 반대로, 각 결측치의 바로 뒤에 있는 값으로 채워 넣는 방식

# 지표변수 (Indicator variables)

In [None]:
# Display the raw `age` column that the indicator variable below is derived from.
train['age']

0        32
1        33
2        46
3        23
4        55
         ..
17475    35
17476    30
17477    71
17478    41
17479    72
Name: age, Length: 17480, dtype: int64

In [None]:
# Bucket every age into a coarse label: 17-30 -> 'young', 31-50 -> 'middle',
# anything else (including values below 17) -> 'old'. Both bounds are inclusive.
ages = train['age']
oldness = np.select(
    [ages.between(17, 30), ages.between(31, 50)],
    ['young', 'middle'],
    default='old',
).tolist()

train['oldness'] = oldness

In [None]:
# Preview the first rows to check the newly added `oldness` column.
train.head()

## Feature Split

In [None]:
# Give every row the same constant date string so there is a column to
# demonstrate feature splitting on, then display the frame.
train['date'] = '2022-02-03'
train

In [None]:
# Split each 'YYYY-MM-DD' string on '-' and store the three pieces (still
# strings) as separate year / month / day columns.
date_parts = train['date'].str.split("-")
year = [parts[0] for parts in date_parts]
month = [parts[1] for parts in date_parts]
day = [parts[2] for parts in date_parts]
train['year'], train['month'], train['day'] = year, month, day

In [None]:
# Display the frame to inspect the new year / month / day columns.
train

## 스케일링 Scaling

In [None]:
# Create one scaler of each kind; they are fitted on the `age` column in the next cell.
from sklearn.preprocessing import MinMaxScaler, StandardScaler

mm_scaler, sd_scaler = MinMaxScaler(), StandardScaler()

In [None]:
# Min-max scaling: fit on `age` and rescale it (to the [0, 1] range with default settings).
# The scaler needs a 2-D (n_samples, 1) input, hence the reshape.
# The result is stored under its own name so `mm_scaler` remains the fitted scaler:
# overwriting it with the output array made the cell fail when re-run and lost
# the fitted scaler for later transform() / inverse_transform() calls.
mm_scaled_age = mm_scaler.fit_transform(train['age'].values.reshape(-1, 1))
train['mm_scaled_age'] = mm_scaled_age.ravel()

# Standard scaling: fit on `age` and standardise it to zero mean and unit variance.
# Same reasoning as above for keeping `sd_scaler` intact.
sd_scaled_age = sd_scaler.fit_transform(train['age'].values.reshape(-1, 1))
train['sd_scaled_age'] = sd_scaled_age.ravel()


In [None]:
# Display the frame to compare `age` with its two scaled versions.
train

Unnamed: 0,id,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,target,mm_scaled_age,sd_scaled_age
0,0,32,Private,309513,Assoc-acdm,12,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,0,0.205479,-0.477370
1,1,33,Private,205469,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,1,0.219178,-0.406343
2,2,46,Private,149949,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,0,0.397260,0.517004
3,3,23,Private,193090,Bachelors,13,Never-married,Adm-clerical,Own-child,White,Female,0,0,30,United-States,0,0.082192,-1.116610
4,4,55,Private,60193,HS-grad,9,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States,0,0.520548,1.156244
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17475,17475,35,,320084,Bachelors,13,Married-civ-spouse,,Wife,White,Female,0,0,55,United-States,1,0.246575,-0.264290
17476,17476,30,,33811,Bachelors,13,Never-married,,Not-in-family,Asian-Pac-Islander,Female,0,0,99,United-States,0,0.178082,-0.619423
17477,17477,71,,287372,Doctorate,16,Married-civ-spouse,,Husband,White,Male,0,0,10,United-States,1,0.739726,2.292671
17478,17478,41,,202822,HS-grad,9,Separated,,Not-in-family,Black,Female,0,0,32,United-States,0,0.328767,0.161870


## 라벨 인코딩 (Label Encoding)

In [None]:
# Create the label encoder that is fitted on the `education` column below.
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [None]:
# Learn the distinct `education` categories and map each row to its integer code.
# NOTE(review): the codes are identifiers only and presumably carry no ordinal
# meaning (education level is not ranked by them) — confirm before feeding them
# to a model that treats the column as numeric.
encoded = encoder.fit_transform(train['education'])

In [None]:
# Store the integer codes as a new column next to the original `education` text.
train['education_class'] = encoded

In [None]:
# Count how many rows fall into each encoded education class.
train.education_class.value_counts()

11    5566
15    3905
9     2842
12     921
8      724
1      664
7      588
0      510
5      378
14     301
6      294
2      249
10     214
4      197
3       96
13      31
Name: education_class, dtype: int64

## 원-핫 인코딩 (One-Hot Encoding)

In [None]:
# One-hot encode `education`: one indicator column per distinct category.
# The result is only displayed here; it is not stored back into `train`.
pd.get_dummies(train['education'])

Unnamed: 0,10th,11th,12th,1st-4th,5th-6th,7th-8th,9th,Assoc-acdm,Assoc-voc,Bachelors,Doctorate,HS-grad,Masters,Preschool,Prof-school,Some-college
0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17475,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
17476,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
17477,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
17478,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


# 과제

## Titanic 데이터를 가지고 5개의 FE 방법을 직접 사용해보기 (오늘 피피티에 소개되지 않은 FE 방법도 괜찮음)