### 결측 처리

In [13]:
import os
import pandas as pd
df = pd.read_csv("../data/classification/bands.csv")

#### 결측 판단

In [14]:
display(df.isnull().head())

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19,y
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [5]:
print(df.loc[2, "x6"])

nan


In [15]:
display(df.isnull().sum(axis = 0))

x1     54
x2      5
x3     27
x4      2
x5      1
x6     30
x7     63
x8     55
x9     10
x10    55
x11    55
x12    56
x13    54
x14     6
x15     7
x16    54
x17     7
x18     7
x19     3
y       0
dtype: int64

#### 결측 제거

In [7]:
df.dropna(inplace = True)
print(df.isnull().sum().sum())

0


#### 대푯값으로 결측 대체

In [8]:
X = df.drop('y', axis = 1)
y = df['y']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 2022)

In [9]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy = "mean")

In [10]:
imputer.fit(X_train)
Z_train = pd.DataFrame(imputer.transform(X_train), columns = X_train.columns)
Z_test = pd.DataFrame(imputer.transform(X_test), columns = X_test.columns)

#### 이웃을 활용한 결측 대체

In [11]:
from sklearn.impute import KNNImputer
imputer = KNNImputer()
imputer.fit(X_train)
Z_train = pd.DataFrame(imputer.transform(X_train), columns = X_train.columns)
Z_test = pd.DataFrame(imputer.transform(X_test), columns = X_test.columns)

### 범주 및 서열형 변수 처리

In [12]:
df = pd.read_csv("classification/german.csv")
X = df.drop('y', axis = 1)
y = df['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 2022)

FileNotFoundError: [Errno 2] No such file or directory: 'classification/german.csv'

#### 더미화

In [None]:
from feature_engine.encoding import OneHotEncoder as OHE
dummy_model = OHE(drop_last = True).fit(X_train)
Z_train = dummy_model.transform(X_train)
Z_test = dummy_model.transform(X_test)

#### 라벨을 활용한 치환

In [None]:
S = df.groupby('x1')['y'].mean()
display(S)

In [None]:
display(df['x1'].replace(S.to_dict()))

In [None]:
train = pd.concat([X_train, y_train], axis = 1)

In [None]:
for col, dtype in zip(X_train.columns, X_train.dtypes):
    if dtype == object:
        S = train.groupby(col)['y'].mean().to_dict()
        X_train.loc[:, col] = X_train[col].replace(S)
        X_test.loc[:, col] = X_test[col].replace(S)

display(X_train['x1'].head())
display(X_test['x1'].head())

### 스케일링

In [None]:
df = pd.read_csv("classification/glass.csv")
X = df.drop('y', axis = 1)
y = df['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 2022)

In [None]:
display(X_train.describe())

#### 최소-최대 정규화

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler().fit(X_train)
Z_train = pd.DataFrame(scaler.transform(X_train), columns = X_train.columns)
Z_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)
display(Z_train.describe())

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
Z_train = pd.DataFrame(scaler.transform(X_train), columns = X_train.columns)
Z_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)
display(Z_train.describe())

### 재샘플링

In [None]:
df = pd.read_csv("classification/yeast-1_vs_7.csv")
X = df.drop('y', axis = 1)
y = df['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 2022)

In [None]:
display(y_train.value_counts())

#### SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
s_X_train, s_y_train = smote.fit_resample(X_train, y_train)
s_X_train = pd.DataFrame(s_X_train, columns = X_train.columns)
s_y_train = pd.Series(s_y_train)

In [None]:
display(s_y_train.value_counts())

#### NearMiss

In [None]:
from imblearn.under_sampling import NearMiss
nm = NearMiss()
s_X_train, s_y_train = nm.fit_resample(X_train, y_train)
s_X_train = pd.DataFrame(s_X_train, columns = X_train.columns)
s_y_train = pd.Series(s_y_train)

In [None]:
display(s_y_train.value_counts())

### 특징 선택

In [None]:
df = pd.read_csv("classification/wdbc.csv")
X = df.drop('y', axis = 1)
y = df['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 2022)

#### SelectKBest 클래스

In [None]:
from sklearn.feature_selection import *
selector = SelectKBest(f_classif, k = 10)
selector.fit(X_train, y_train)
selected_features = X_train.columns[selector.get_support()]
Z_train = X_train.loc[:, selected_features]
Z_test = X_test.loc[:, selected_features]

In [None]:
print(X_train.shape)
print(Z_train.shape)