<a href="https://colab.research.google.com/github/seungmin-son/ML_Practice/blob/main/%5BML_3%5DDataProcessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 누락된 데이터 다루기

## 테이블 형태 데이터에서 누락된 값 식별

In [9]:
import pandas as pd
from io import StringIO

# Small in-memory CSV table. The two empty fields (column C of the second data
# row, column D of the third) are parsed as NaN by read_csv.
csv_rows = [
    'A,B,C,D',
    '1.0,2.0,3.0,4.0',
    '5.0,6.0,,8.0',
    '10.0,11.0,12.0,',
]
csv_data = '\n'.join(csv_rows)

# Wrap the string in a file-like buffer so it can be read like a CSV file.
df = pd.read_csv(StringIO(csv_data))
df


Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [10]:
# Count the missing values in each column.
df.isna().sum()

A    0
B    0
C    1
D    1
dtype: int64

## 누락된 값이 있는 훈련 샘플 제외

In [16]:
# Drop every row that contains at least one missing value.
df.dropna(axis='index')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [12]:
# Drop every column that contains at least one NaN.
df.dropna(axis='columns')

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [18]:
# Drop only the rows whose value in column 'C' is NaN.
has_c_value = df['C'].notna()
df[has_c_value]

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


In [17]:
# Keep only the rows with at least 4 non-NaN values; rows with fewer are dropped.
non_null_per_row = df.notna().sum(axis=1)
df[non_null_per_row >= 4]

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


## 누락된 값 대체

In [19]:
from sklearn.impute import SimpleImputer
import numpy as np

# Mean imputation: each NaN is replaced by the mean of its column.
raw_values = df.values
imr = SimpleImputer(strategy='mean', missing_values=np.nan).fit(raw_values)
imputed_data = imr.transform(raw_values)
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

In [20]:
from sklearn.preprocessing import FunctionTransformer


def impute_row_means(X):
    """Replace each NaN in X with the mean of its row.

    SimpleImputer works column-wise, so the array is transposed before
    imputation and transposed back afterwards. A dedicated imputer is created
    here so that the shared, column-fitted `imr` is not refit on transposed
    data as a side effect of calling this function.
    """
    row_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    return row_imputer.fit_transform(X.T).T


ftr_imr = FunctionTransformer(impute_row_means, validate=False)
imputed_data = ftr_imr.fit_transform(df.values)
imputed_data

array([[ 1.        ,  2.        ,  3.        ,  4.        ],
       [ 5.        ,  6.        ,  6.33333333,  8.        ],
       [10.        , 11.        , 12.        , 11.        ]])

In [21]:
# add_indicator=True appends indicator columns that mark where values were
# missing, next to the imputed features.
imr = SimpleImputer(add_indicator=True)
imr.fit(df.values)
imputed_data = imr.transform(df.values)
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ,  0. ,  0. ],
       [ 5. ,  6. ,  7.5,  8. ,  1. ,  0. ],
       [10. , 11. , 12. ,  6. ,  0. ,  1. ]])

In [23]:
# Indices of the input columns the fitted missing-value indicator reports on.
missing_indicator = imr.indicator_
missing_indicator.features_

array([2, 3])

In [24]:
# Boolean mask of the missing entries, one column per feature that had NaNs.
missing_indicator = imr.indicator_
missing_indicator.fit_transform(df.values)

array([[False, False],
       [ True, False],
       [False,  True]])

In [25]:
# Undo the imputation: the indicator columns are used to put NaN back where
# values were originally missing.
restored_data = imr.inverse_transform(imputed_data)
restored_data

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [10., 11., 12., nan]])

In [26]:
# IterativeImputer is experimental; this import must run first to enable it.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

# Impute missing values with the default IterativeImputer settings.
iimr = IterativeImputer()
iterative_imputed = iimr.fit_transform(df.values)
iterative_imputed

array([[ 1.        ,  2.        ,  3.        ,  4.        ],
       [ 5.        ,  6.        ,  7.00047063,  8.        ],
       [10.        , 11.        , 12.        , 12.99964527]])

In [27]:
from sklearn.impute import KNNImputer

# Fill the missing values using k-nearest neighbours (default settings).
kimr = KNNImputer()
knn_imputed = kimr.fit_transform(df.values)
knn_imputed

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

In [28]:
# pandas alternative: fill each NaN with the mean of its column.
column_means = df.mean()
df.fillna(value=column_means)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.5,8.0
2,10.0,11.0,12.0,6.0


In [29]:
# Backward fill: each NaN takes the next valid value below it in the same
# column. DataFrame.bfill() is used because fillna(method='bfill') is
# deprecated in pandas.
df.bfill()

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,12.0,8.0
2,10.0,11.0,12.0,


In [30]:
# Forward fill: each NaN takes the last valid value above it in the same
# column. DataFrame.ffill() is used because fillna(method='ffill') is
# deprecated in pandas.
df.ffill()

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,3.0,8.0
2,10.0,11.0,12.0,8.0


In [31]:
# With axis=1 the forward fill runs across columns instead of down rows: each
# NaN takes the value of the preceding column in the same row.
# DataFrame.ffill() is used because fillna(method='ffill') is deprecated in pandas.
df.ffill(axis=1)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,6.0,8.0
2,10.0,11.0,12.0,12.0


## 사이킷런 추정기 API

In [32]:
from IPython.display import Image

# Diagram illustrating the transformer API.
transformer_diagram_url = 'https://git.io/Jtmwv'
Image(url=transformer_diagram_url, width=400)

In [33]:
# Diagram illustrating the estimator API.
estimator_diagram_url = 'https://git.io/JtYZW'
Image(url=estimator_diagram_url, width=300)

# 범주형 데이터 다루기

## 범주형 데이터 인코딩

In [76]:
import pandas as pd

# Toy dataset with a nominal feature (color), an ordinal feature (size),
# a numeric feature (price) and a class label.
rows = [
    ['green', 'M', 10.1, 'class2'],
    ['red', 'L', 13.5, 'class1'],
    ['blue', 'XL', 15.3, 'class2'],
]
df = pd.DataFrame(rows, columns=['color', 'size', 'price', 'classlabel'])

df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


## 순서가 있는 특성 mapping

In [60]:
# Ordinal encoding for 'size': a larger size maps to a larger integer.
size_mapping = dict(XL=3, L=2, M=1)

# Overwrites the 'size' column with its integer codes.
size_codes = df['size'].map(size_mapping)
df['size'] = size_codes
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [61]:
# Reverse lookup (integer code -> size label) to recover the original strings.
inv_size_mapping = dict(zip(size_mapping.values(), size_mapping.keys()))
df['size'].map(inv_size_mapping)

0     M
1     L
2    XL
Name: size, dtype: object

## 클래스 label 인코딩

In [71]:
# Assign consecutive integers, starting at 0, to the sorted unique class labels.
unique_labels = np.unique(df['classlabel'])
class_mapping = dict(zip(unique_labels, range(len(unique_labels))))
class_mapping

{'class1': 0, 'class2': 1}

In [77]:
# Replace the string class labels with their integer codes.
label_codes = df['classlabel'].map(class_mapping)
df['classlabel'] = label_codes
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,1
1,red,L,13.5,0
2,blue,XL,15.3,1


In [78]:
# Invert the mapping and turn the integer codes back into the original labels.
inv_class_mapping = dict(zip(class_mapping.values(), class_mapping.keys()))
label_names = df['classlabel'].map(inv_class_mapping)
df['classlabel'] = label_names
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [88]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder encodes the class labels as integers, like the manual mapping.
class_temp = LabelEncoder()
class_temp.fit(df['classlabel'])
y = class_temp.transform(df['classlabel'])
y

array([1, 0, 1])

In [89]:
# Map the integer codes back to the original string labels.
# Note: this rebinds y from codes to strings, so the cell expects y to hold
# the integer codes produced by the encoder when it runs.
y= class_temp.inverse_transform(y)
y

array(['class2', 'class1', 'class2'], dtype=object)

## 순서가 없는 특성에 원-핫 인코딩 적용

In [95]:
# Take the feature columns as a NumPy array and replace the color strings in
# column 0 with integer codes from a LabelEncoder.
X = df[['color', 'size', 'price']].values
color_le = LabelEncoder()
X[:, 0] = color_le.fit_transform(X[:, 0])
X

array([[1, 'M', 10.1],
       [2, 'L', 13.5],
       [0, 'XL', 15.3]], dtype=object)

In [97]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

# Integer-encode only the 'color' column via a ColumnTransformer.
ord_enc = OrdinalEncoder(dtype=int)
color_step = ('ord_enc', ord_enc, ['color'])
col_trans = ColumnTransformer([color_step])
X_trans = col_trans.fit_transform(df)
X_trans



array([[1],
       [2],
       [0]])

In [98]:
# Recover the original color strings with the fitted OrdinalEncoder.
fitted_ord_enc = col_trans.named_transformers_['ord_enc']
fitted_ord_enc.inverse_transform(X_trans)

array([['green'],
       ['red'],
       ['blue']], dtype=object)

In [99]:
from sklearn.preprocessing import OneHotEncoder

X = df[['color', 'size', 'price']].values
color_ohe = OneHotEncoder()
# The encoder needs 2-D input, so the color column is selected as an (n, 1) array.
color_column = X[:, [0]]
color_ohe.fit_transform(color_column).toarray()

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [105]:
from sklearn.compose import ColumnTransformer

X = df[['color', 'size', 'price']].values
# One-hot encode column 0 (color); 'passthrough' leaves columns 1 and 2
# (size, price) unchanged.
onehot_step = ('onehot', OneHotEncoder(dtype=int), [0])
keep_step = ('nothing', 'passthrough', [1, 2])
c_transf = ColumnTransformer([onehot_step, keep_step])
c_transf.fit_transform(X)

array([[0, 1, 0, 'M', 10.1],
       [0, 0, 1, 'L', 13.5],
       [1, 0, 0, 'XL', 15.3]], dtype=object)

In [106]:
# pandas offers get_dummies as a more convenient way to one-hot encode: string
# columns are expanded, numeric columns are left as they are.
# dtype is pinned so the dummies are 0/1 integers on every pandas version
# (pandas >= 2.0 returns booleans by default).
pd.get_dummies(df[['price', 'color', 'size']], dtype='uint8')

Unnamed: 0,price,color_blue,color_green,color_red,size_L,size_M,size_XL
0,10.1,0,1,0,0,1,0
1,13.5,0,0,1,1,0,0
2,15.3,1,0,0,0,0,1


In [107]:
# One-hot encode only the 'size' column; 'color' is left as strings.
# dtype is pinned so the dummies are 0/1 integers on every pandas version
# (pandas >= 2.0 returns booleans by default).
pd.get_dummies(df[['price', 'color', 'size']], columns=['size'], dtype='uint8')

Unnamed: 0,price,color,size_L,size_M,size_XL
0,10.1,green,0,1,0
1,13.5,red,1,0,0
2,15.3,blue,0,0,1


In [108]:
# Naming a column in `columns` one-hot encodes it even when it is numeric:
# here every distinct price becomes its own indicator column.
# dtype is pinned so the dummies are 0/1 integers on every pandas version
# (pandas >= 2.0 returns booleans by default).
pd.get_dummies(df[['price', 'color', 'size']], columns=['price'], dtype='uint8')

Unnamed: 0,color,size,price_10.1,price_13.5,price_15.3
0,green,M,1,0,0
1,red,L,0,1,0
2,blue,XL,0,0,1


In [109]:
# drop_first=True removes the first dummy column of every encoded feature.
# dtype is pinned so the dummies are 0/1 integers on every pandas version
# (pandas >= 2.0 returns booleans by default).
pd.get_dummies(df[['price', 'color', 'size']], drop_first=True, dtype='uint8')

Unnamed: 0,price,color_green,color_red,size_M,size_XL
0,10.1,1,0,1,0
1,13.5,0,1,0,0
2,15.3,0,0,0,1


In [110]:
# drop='first' omits the first category's column, which avoids the
# multicollinearity introduced by a full set of one-hot columns.
color_ohe = OneHotEncoder(drop='first', categories='auto')
onehot_step = ('onehot', color_ohe, [0])
keep_step = ('nothing', 'passthrough', [1, 2])
c_transf = ColumnTransformer([onehot_step, keep_step])
# NOTE(review): the saved output of this cell is an AttributeError; the cause
# is not visible here - re-run on a fresh kernel to confirm it still occurs.
c_transf.fit_transform(X)

AttributeError: ignored