<a href="https://colab.research.google.com/github/seungmin-son/ML_Practice/blob/main/%5BML_3%5DDataProcessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 누락된 데이터 다루기

## 테이블 형태 데이터에서 누락된 값 식별

In [3]:
import pandas as pd
from io import StringIO
# Small in-memory CSV with two empty cells (row 1 / column C and row 2 / column D).
csv_data = '''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,'''

# Wrap the text in a file-like buffer so read_csv can parse it; the empty
# fields show up as NaN in the resulting frame.
csv_buffer = StringIO(csv_data)
df = pd.read_csv(csv_buffer)
df


Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [4]:
# Count the missing values in each column.
df.isna().sum()

A    0
B    0
C    1
D    1
dtype: int64

## 누락된 값이 있는 훈련 샘플 제외

In [5]:
# Drop every row that contains at least one NaN.
df.dropna(axis='index')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [6]:
# Drop every column that contains at least one NaN.
df.dropna(axis='columns')

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [7]:
df.dropna(subset=['C'])  # drop only the rows that have NaN in column 'C'

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


In [8]:
df.dropna(thresh=4)  # drop rows that have fewer than 4 non-NaN values

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


## 누락된 값 대체

In [9]:
from sklearn.impute import SimpleImputer
import numpy as np

# Column-wise mean imputation: every NaN is replaced by the mean of its column.
# fit() returns the fitted imputer itself, so construction and fitting are chained.
imr = SimpleImputer(strategy='mean', missing_values=np.nan).fit(df.values)
imputed_data = imr.transform(df.values)
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

In [10]:
from sklearn.preprocessing import FunctionTransformer
def _impute_row_mean(X):
    """Replace each NaN in ``X`` with the mean of its own row.

    SimpleImputer works column-wise, so the array is transposed before
    imputation and transposed back afterwards. A dedicated imputer is used
    so that the column-wise ``imr`` fitted earlier is not silently refit on
    the transposed data.
    """
    row_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    return row_imputer.fit_transform(X.T).T


# Wrap the row-wise imputation so it can be used like any other transformer.
ftr_imr = FunctionTransformer(_impute_row_mean, validate=False)
imputed_data = ftr_imr.fit_transform(df.values)
imputed_data

array([[ 1.        ,  2.        ,  3.        ,  4.        ],
       [ 5.        ,  6.        ,  6.33333333,  8.        ],
       [10.        , 11.        , 12.        , 11.        ]])

In [11]:
# add_indicator=True appends indicator columns to the imputed output that
# flag which entries were missing.
imr = SimpleImputer(add_indicator=True)
imputed_data = imr.fit_transform(df.values)
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ,  0. ,  0. ],
       [ 5. ,  6. ,  7.5,  8. ,  1. ,  0. ],
       [10. , 11. , 12. ,  6. ,  0. ,  1. ]])

In [12]:
imr.indicator_.features_  # indices of the columns that contained missing values

array([2, 3])

In [13]:
# Fit the imputer's MissingIndicator directly on the raw values to get the
# boolean missing-value mask for the affected columns.
imr.indicator_.fit_transform(df.values)

array([[False, False],
       [ True, False],
       [False,  True]])

In [14]:
# Undo the imputation: imputed entries go back to NaN and the indicator columns are dropped.
imr.inverse_transform(imputed_data)

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [10., 11., 12., nan]])

In [15]:
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer

# Impute with IterativeImputer using its default settings; unlike the mean
# imputer, the filled-in values here follow the trend of the other columns.
iimr = IterativeImputer()
iimr.fit_transform(df.values)

array([[ 1.        ,  2.        ,  3.        ,  4.        ],
       [ 5.        ,  6.        ,  7.00047063,  8.        ],
       [10.        , 11.        , 12.        , 12.99964527]])

In [16]:
from sklearn.impute import KNNImputer  # fills missing values using k-nearest neighbours

kimr = KNNImputer()
kimr.fit_transform(df.values)

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

In [17]:
df.fillna(df.mean())  # pandas fillna: fill each NaN with the mean of its column

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.5,8.0
2,10.0,11.0,12.0,6.0


In [18]:
# Back-fill: replace each NaN with the next valid value further down the same
# column (a NaN in the last row has nothing below it and stays NaN).
# DataFrame.bfill() replaces fillna(method='bfill'), whose `method` argument
# is deprecated in pandas >= 2.1.
df.bfill()

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,12.0,8.0
2,10.0,11.0,12.0,


In [19]:
# Forward-fill: replace each NaN with the last valid value above it in the
# same column. DataFrame.ffill() replaces fillna(method='ffill'), whose
# `method` argument is deprecated in pandas >= 2.1.
df.ffill()

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,3.0,8.0
2,10.0,11.0,12.0,8.0


In [20]:
# axis=1 fills along each row (from the column to the left) instead of down
# each column. DataFrame.ffill() replaces fillna(method='ffill'), whose
# `method` argument is deprecated in pandas >= 2.1.
df.ffill(axis=1)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,6.0,8.0
2,10.0,11.0,12.0,12.0


## 사이킷런 추정기 API

In [21]:
from IPython.display import Image
Image(url='https://git.io/Jtmwv', width=400)  # diagram: transformer API

In [22]:
Image(url='https://git.io/JtYZW', width=300)  # diagram: estimator API

# 범주형 데이터 다루기

## 범주형 데이터 인코딩

In [23]:
import pandas as pd

# Toy dataset with string-valued color, size and class label columns plus a
# numeric price column.
df = pd.DataFrame(
    [
        ['green', 'M', 10.1, 'class2'],
        ['red', 'L', 13.5, 'class1'],
        ['blue', 'XL', 15.3, 'class2'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)

df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


## 순서가 있는 특성 mapping

In [24]:
# Ordinal encoding of the size labels: M < L < XL.
size_mapping = dict(XL=3, L=2, M=1)

# Replace the string sizes with their integer ranks.
df['size'] = df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [25]:
# Reverse lookup (integer rank -> original size label) to undo the encoding.
inv_size_mapping = dict(zip(size_mapping.values(), size_mapping.keys()))
df['size'].map(inv_size_mapping)

0     M
1     L
2    XL
Name: size, dtype: object

## 클래스 label 인코딩

In [26]:
# Number the distinct class labels 0, 1, ... in the order np.unique returns them.
unique_labels = np.unique(df['classlabel'])
class_mapping = dict(zip(unique_labels, range(len(unique_labels))))
class_mapping

{'class1': 0, 'class2': 1}

In [27]:
# Replace the string class labels with their integer codes.
df['classlabel'] = df['classlabel'].map(class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,1
1,red,2,13.5,0
2,blue,3,15.3,1


In [28]:
# Swap keys and values to get integer code -> label, then restore the
# original string labels in the frame.
inv_class_mapping = dict(zip(class_mapping.values(), class_mapping.keys()))
df['classlabel'] = df['classlabel'].map(inv_class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [29]:
from sklearn.preprocessing import LabelEncoder

# Encode the string class labels as integers with scikit-learn's LabelEncoder.
class_temp = LabelEncoder()
y = class_temp.fit_transform(df['classlabel'])
y

array([1, 0, 1])

In [30]:
# Map the integer codes back to the original string labels.
# NOTE(review): this rebinds y to strings, so re-running this cell on its own
# presumably fails — re-run the encoding cell first.
y= class_temp.inverse_transform(y)
y

array(['class2', 'class1', 'class2'], dtype=object)

## 순서가 없는 특성에 원-핫 인코딩 적용

In [31]:
# Take the feature columns as an object array and overwrite the 'color'
# column (index 0) with LabelEncoder's integer codes.
X = df[['color', 'size', 'price']].values
color_le = LabelEncoder()
X[:, 0] = color_le.fit_transform(X[:, 0])
X

array([[1, 1, 10.1],
       [2, 2, 13.5],
       [0, 3, 15.3]], dtype=object)

In [32]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

# Integer-encode only the 'color' column; the result contains just that
# encoded column.
ord_enc = OrdinalEncoder(dtype=int)
col_trans = ColumnTransformer([('ord_enc',ord_enc,['color'])])
X_trans = col_trans.fit_transform(df)
X_trans



array([[1],
       [2],
       [0]])

In [33]:
# Recover the original color strings via the fitted encoder stored inside the ColumnTransformer.
col_trans.named_transformers_['ord_enc'].inverse_transform(X_trans)

array([['green'],
       ['red'],
       ['blue']], dtype=object)

In [34]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the color column. Indexing with a list ([0]) keeps the
# column two-dimensional, which is the shape the encoder expects.
X = df[['color', 'size', 'price']].values
color_ohe = OneHotEncoder()
color_ohe.fit_transform(X[:, [0]]).toarray()

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [35]:
from sklearn.compose import ColumnTransformer
# One-hot encode column 0 (color) and keep columns 1 and 2 as they are.
X = df[['color','size', 'price']].values
c_transf = ColumnTransformer([
    ('onehot',OneHotEncoder(dtype=int),[0]),
    ('nothing','passthrough',[1,2])
    ]) # 'passthrough' leaves those two columns unchanged
c_transf.fit_transform(X)

array([[0, 1, 0, 1, 10.1],
       [0, 0, 1, 2, 13.5],
       [1, 0, 0, 3, 15.3]], dtype=object)

In [36]:
pd.get_dummies(df[['price', 'color', 'size']])  # pandas shortcut: get_dummies one-hot encodes the string column ('color') and leaves the numeric ones alone

Unnamed: 0,price,size,color_blue,color_green,color_red
0,10.1,1,0,1,0
1,13.5,2,0,0,1
2,15.3,3,1,0,0


In [37]:
# columns=['size'] one-hot encodes the listed column instead, even though it is numeric.
pd.get_dummies(df[['price', 'color', 'size']],columns=['size'])

Unnamed: 0,price,color,size_1,size_2,size_3
0,10.1,green,1,0,0
1,13.5,red,0,1,0
2,15.3,blue,0,0,1


In [38]:
# The same mechanism applied to 'price': every distinct value becomes its own column.
pd.get_dummies(df[['price', 'color', 'size']],columns=['price'])

Unnamed: 0,color,size,price_10.1,price_13.5,price_15.3
0,green,1,1,0,0
1,red,2,0,1,0
2,blue,3,0,0,1


In [39]:
pd.get_dummies(df[['price', 'color', 'size']],drop_first=True)  # drop_first=True drops the first dummy column of the encoded feature

Unnamed: 0,price,size,color_green,color_red
0,10.1,1,1,0
1,13.5,2,0,1
2,15.3,3,0,0


In [40]:
# drop='first' removes the first one-hot column to deal with multicollinearity.
color_ohe = OneHotEncoder(categories='auto', drop='first')
c_transf = ColumnTransformer([ ('onehot', color_ohe, [0]),
                               ('nothing', 'passthrough', [1, 2])])
c_transf.fit_transform(X)

array([[1.0, 0.0, 1, 10.1],
       [0.0, 1.0, 2, 13.5],
       [0.0, 0.0, 3, 15.3]], dtype=object)

## 순서가 있는 특성 인코딩

In [41]:
# Rebuild the toy dataset with the original string sizes.
df = pd.DataFrame(
    [
        ['green', 'M', 10.1, 'class2'],
        ['red', 'L', 13.5, 'class1'],
        ['blue', 'XL', 15.3, 'class2'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [42]:
# Threshold encoding of the ordinal 'size' feature: one 0/1 column per
# comparison (size > M, size > L).
df['x>M'] = df['size'].map(lambda size: int(size in ('L', 'XL')))
df['x>L'] = df['size'].map(lambda size: int(size == 'XL'))

df

Unnamed: 0,color,size,price,classlabel,x>M,x>L
0,green,M,10.1,class2,0,0
1,red,L,13.5,class1,1,0
2,blue,XL,15.3,class2,1,1


# 데이터셋을 훈련 데이터셋과 테스트 데이터셋으로 나누기

In [43]:
# Load the UCI Wine data. The file has no header row, so the column names
# are supplied directly to read_csv.
df_wine = pd.read_csv(
    'https://archive.ics.uci.edu/'
    'ml/machine-learning-databases/wine/wine.data',
    header=None,
    names=[
        'Class label', 'Alcohol', 'Malic acid', 'Ash',
        'Alcalinity of ash', 'Magnesium', 'Total phenols',
        'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
        'Color intensity', 'Hue', 'OD280/OD315 of diluted wines',
        'Proline',
    ],
)

# Show which class labels occur, then preview the first rows.
print('Class labels', np.unique(df_wine['Class label']))
df_wine.head()

Class labels [1 2 3]


Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [56]:
from sklearn.model_selection import train_test_split

# Features are every column after the first; the first column is the class label.
X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
# Hold out 30% as the test set; stratify=y keeps the class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state=0,stratify=y)

# 특성 스케일 맞추기

In [59]:
from sklearn.preprocessing import MinMaxScaler

mms = MinMaxScaler() # min-max scaling (normalization): rescales each feature to [0, 1]
# fit_transform on the training set learns the scaling parameters (per-feature
# min and max); the test set only gets transform so that it is scaled with
# those same training-set parameters.
X_train_norm = mms.fit_transform(X_train)
X_test_norm = mms.transform(X_test)


In [60]:
from sklearn.preprocessing import StandardScaler

stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train) # standardization: zero mean and unit standard deviation per feature
X_test_std = stdsc.transform(X_test)  # reuse the statistics learned from the training set

In [62]:
# Reproduce both scalings by hand with NumPy.
ex = np.array([0, 1, 2, 3, 4, 5])

# Standardization: centre on the mean, then divide by the standard deviation.
ex_centered = ex - ex.mean()
print('standarization', ex_centered / ex.std())

# Min-max normalization: shift by the minimum, then divide by the value range.
ex_range = ex.max() - ex.min()
print('normalization', (ex - ex.min()) / ex_range)



standarization [-1.46385011 -0.87831007 -0.29277002  0.29277002  0.87831007  1.46385011]
normalization [0.  0.2 0.4 0.6 0.8 1. ]


In [65]:
# RobustScaler centres each feature on its median and scales by a quantile
# range, which suits small datasets that contain many outliers (author's
# note: also preferable when overfitting is a concern).
from sklearn.preprocessing import RobustScaler

rbs = RobustScaler()
X_train_rbs = rbs.fit_transform(X_train)
X_test_rbs = rbs.transform(X_test)


In [64]:
from sklearn.preprocessing import MaxAbsScaler  # divides each feature by its maximum absolute value, so values fall in [-1, 1]
mas = MaxAbsScaler()
X_train_maxabs = mas.fit_transform(X_train)
# Only transform the test set: refitting here would scale the two splits with
# different factors and leak test-set statistics into the preprocessing.
X_test_maxabs = mas.transform(X_test)

In [66]:
from sklearn.preprocessing import scale, minmax_scale, robust_scale, maxabs_scale  # function counterparts of the scaler classes
print('StandardScaler:', scale(ex))
print('MinMaxScaler:', minmax_scale(ex))
print('RobustScaler:', robust_scale(ex))
print('MaxAbsScaler:', maxabs_scale(ex))

StandardScaler: [-1.46385011 -0.87831007 -0.29277002  0.29277002  0.87831007  1.46385011]
MinMaxScaler: [0.  0.2 0.4 0.6 0.8 1. ]
RobustScaler: [-1.  -0.6 -0.2  0.2  0.6  1. ]
MaxAbsScaler: [0.  0.2 0.4 0.6 0.8 1. ]


In [67]:
from sklearn.preprocessing import Normalizer  # normalizes each sample (row) rather than each feature

nrm = Normalizer()
X_train_l2 = nrm.fit_transform(X_train)

# 유용한 특성 선택