In [1]:
# SMOTE+ENN: a combined resampler that applies SMOTE and then ENN.
# SMOTE: adds new minority-class points obtained by nudging existing
#        minority samples slightly towards their neighbours.
# ENN:   removes majority-class samples that sit around the minority class.
# NOTE(review): the class counts printed after resampling (8941 / 8645) are
#        both lower than the ~9,900 per class that balancing alone would
#        give, so the ENN step appears to drop samples from both classes
#        here - confirm against the imbalanced-learn documentation.
from sklearn.datasets import make_classification

# Synthetic, heavily imbalanced binary problem with two informative features.
X, y = make_classification(
    n_samples=10000,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.99],  # ~99% of the samples go to class 0
    flip_y=0,        # no random label flipping
    random_state=1,
)

In [2]:
from imblearn.combine import SMOTEENN
import pandas as pd

# Resample the full dataset with SMOTE followed by ENN cleaning.
smote_enn = SMOTEENN(random_state=0)
X_sample, y_sample = smote_enn.fit_resample(X, y)

# Wrap the resampled arrays in labelled DataFrames for the later cells.
X_samp = pd.DataFrame(X_sample, columns=["a", "b"])
y_samp = pd.DataFrame(y_sample, columns=["y"])

# Class balance after resampling.
y_samp["y"].value_counts()

1    8941
0    8645
Name: y, dtype: int64

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Hold out 20% of the resampled data, stratified on the label.
# NOTE(review): X_samp / y_samp were resampled before this split, so the
# held-out fold is drawn from resampled data rather than the original
# imbalanced distribution; the scores below may be optimistic. Consider
# splitting first and resampling only the training fold.
X_train, X_test, y_train, y_test = train_test_split(
    X_samp, y_samp, test_size=0.2, stratify=y_samp, random_state=10
)

model1 = LogisticRegression(random_state=42)
# y_train is a one-column DataFrame; flatten it to shape (n_samples,) so that
# fit() does not emit the column_or_1d DataConversionWarning.
model1.fit(X_train, y_train.to_numpy().ravel())
print("학습용:", model1.score(X_train, y_train))
print("검증용:", model1.score(X_test, y_test))

# Per-class precision / recall / F1 on the held-out fold.
pred1 = model1.predict(X_test)
print(classification_report(y_test, pred1))

학습용: 0.9530850156383281
검증용: 0.9536668561682774
              precision    recall  f1-score   support

           0       0.95      0.95      0.95      1729
           1       0.95      0.96      0.95      1789

    accuracy                           0.95      3518
   macro avg       0.95      0.95      0.95      3518
weighted avg       0.95      0.95      0.95      3518



  y = column_or_1d(y, warn=True)


In [4]:
# SMOTE+Tomek: a combined resampler that applies SMOTE and then Tomek-link
#              cleaning.
# SMOTE: adds new minority-class points obtained by nudging existing
#        minority samples slightly towards their neighbours.
# Tomek links: removes the majority-class members of Tomek links.
# NOTE(review): the counts printed after resampling (9653 / 9653) are equal
#        and below the ~9,900 per class that balancing alone would give, so
#        the cleaning step appears to drop samples from both classes here -
#        confirm against the imbalanced-learn documentation.
from imblearn.combine import SMOTETomek

# Resample the full dataset with SMOTE followed by Tomek-link cleaning.
smote_tomek = SMOTETomek(random_state=0)
X_sample, y_sample = smote_tomek.fit_resample(X, y)

# Wrap the resampled arrays in labelled DataFrames for the later cells.
X_samp = pd.DataFrame(X_sample, columns=["a", "b"])
y_samp = pd.DataFrame(y_sample, columns=["y"])

# Class balance after resampling.
y_samp["y"].value_counts()

0    9653
1    9653
Name: y, dtype: int64

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Hold out 20% of the resampled data, stratified on the label.
# NOTE(review): X_samp / y_samp were resampled before this split, so the
# held-out fold is drawn from resampled data rather than the original
# imbalanced distribution; the scores below may be optimistic. Consider
# splitting first and resampling only the training fold.
X_train, X_test, y_train, y_test = train_test_split(
    X_samp, y_samp, test_size=0.2, stratify=y_samp, random_state=10
)

model2 = LogisticRegression(random_state=42)
# y_train is a one-column DataFrame; flatten it to shape (n_samples,) so that
# fit() does not emit the column_or_1d DataConversionWarning.
model2.fit(X_train, y_train.to_numpy().ravel())
print("학습용:", model2.score(X_train, y_train))
print("검증용:", model2.score(X_test, y_test))

# Per-class precision / recall / F1 on the held-out fold.
pred2 = model2.predict(X_test)
print(classification_report(y_test, pred2))

학습용: 0.9197099197099197
검증용: 0.923873640600725
              precision    recall  f1-score   support

           0       0.91      0.94      0.92      1931
           1       0.94      0.91      0.92      1931

    accuracy                           0.92      3862
   macro avg       0.92      0.92      0.92      3862
weighted avg       0.92      0.92      0.92      3862



  y = column_or_1d(y, warn=True)
