In [1]:
import pandas as pd
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0, n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1)
dfX = pd.DataFrame(X, columns=['a','b'])
dfy = pd.DataFrame(y, columns=['y'])
df = pd.concat([dfX, dfy],axis=1)
df

Unnamed: 0,a,b,y
0,0.222014,0.540207,0
1,1.347439,1.412824,0
2,0.537238,0.372730,0
3,2.134462,1.404819,0
4,2.315827,1.356858,0
...,...,...,...
9995,2.440385,1.695643,0
9996,-0.790502,0.194243,0
9997,1.878130,0.829500,0
9998,2.585933,1.927995,0


In [2]:
X1 = df[['a','b']] # 독립변수
y1 = df["y"]

In [3]:
df["y"].value_counts() # 불균형 데이터셋

0    9900
1     100
Name: y, dtype: int64

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.2, stratify=y1, random_state=10)

In [5]:
# 불균형 데이터셋으로 만든 모형
model1 = LogisticRegression(random_state=0)
model1.fit(X_train, y_train)
print("학습용:",model1.score(X_train, y_train))
print("검증용:",model1.score(X_test, y_test))

학습용: 0.994125
검증용: 0.995


In [6]:
from sklearn.metrics import confusion_matrix

pred1 = model1.predict(X_test)
cm = confusion_matrix(y_test, pred1)
cm
# score는 0.9945로 높으나 recall의 경우 10/(10+10) = 0.5로 낮아지는 문제가 발생함

array([[1980,    0],
       [  10,   10]], dtype=int64)

In [7]:
from sklearn.metrics import classification_report

# 소수 클래스의 정확도와 precision, precision,recall,f1-score 확인
print(classification_report(y_test, pred1))
# 모형의 전반적인 정확도(accuracy)는 높지만 소수 클래스의 재현율(recall)이 0.5로 낮은 문제점

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1980
           1       1.00      0.50      0.67        20

    accuracy                           0.99      2000
   macro avg       1.00      0.75      0.83      2000
weighted avg       1.00      0.99      0.99      2000



In [8]:
# 균형 데이터
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0, n_clusters_per_class=1, flip_y=0, random_state=1)
dfX=pd.DataFrame(X,columns=['a','b'])
dfy=pd.DataFrame(y,columns=['y'])
df2=pd.concat([dfX,dfy],axis=1)
df2["y"].value_counts()

0    5000
1    5000
Name: y, dtype: int64

In [9]:
X2 = df2[['a','b']] # 독립변수
y2 = df2["y"]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size=0.2, stratify=y2, random_state=10)

In [11]:
model2 = LogisticRegression(random_state=42)
model2.fit(X_train, y_train)
print("학습용:",model2.score(X_train, y_train))
print("검증용:",model2.score(X_test, y_test))

학습용: 0.896125
검증용: 0.891


In [12]:
pred2 = model2.predict(X_test)
print(classification_report(y_test, pred2))
# 정확도와 재현율이 비슷하게 처리됨

              precision    recall  f1-score   support

           0       0.88      0.91      0.89      1000
           1       0.90      0.87      0.89      1000

    accuracy                           0.89      2000
   macro avg       0.89      0.89      0.89      2000
weighted avg       0.89      0.89      0.89      2000



In [13]:
len(X1), len(y1), len(pred1)

(10000, 10000, 2000)

In [None]:
# 비대칭 데이터는 언더샘플링, 오버샘플링, 복합샘플링 등의 방법으로 데이터 비율을 맞추면 정밀도가 향상된다.

In [14]:
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0, n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1)

In [None]:
# 언더샘플링: 데이터의 손실이 크고 중요한 특성을 가진 데이터를 잃을 수 있음

In [15]:
# 무작위로 다수 클래스의 데이터를 없애는 단순 샘플링
from imblearn.under_sampling import RandomUnderSampler

X_sample, y_sample = RandomUnderSampler(random_state=0).fit_resample(X, y)
X_samp = pd.DataFrame(data=X_sample, columns=['a','b'] )
y_samp = pd.DataFrame(data=y_sample, columns=['y'])
y_samp.y.value_counts()

0    100
1    100
Name: y, dtype: int64

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X_samp, y_samp, test_size=0.2, stratify=y_samp, random_state=10)
model3 = LogisticRegression(random_state=42)
model3.fit(X_train, y_train)
print("학습용:", model3.score(X_train, y_train))
print("검증용:", model3.score(X_test, y_test))

학습용: 0.8625
검증용: 0.925


  y = column_or_1d(y, warn=True)


In [17]:
pred3 = model3.predict(X_test)
print(classification_report(y_test, pred3))

              precision    recall  f1-score   support

           0       0.95      0.90      0.92        20
           1       0.90      0.95      0.93        20

    accuracy                           0.93        40
   macro avg       0.93      0.93      0.92        40
weighted avg       0.93      0.93      0.92        40



In [27]:
# 토멕링크(Tomek's link) : 서로 다른 클래스에 속하는 한쌍의 데이터
# 토멕링크 중에서 다수 클래스에 속한 샘플을 제거함으로써 데이터의 균형을 맞추는 방법
# 'majority': 다수 클래스의 샘플을 제거
# 'not minority': 소수 클래스를 제외하고 샘플링
# 'not majority': 다수 클래스를 제외하고 샘플링
# 'all': 모든 클래스를 샘플링
# 'auto': not minority와 같음(기본 옵션)
from imblearn.under_sampling import TomekLinks

X_sample,y_sample=TomekLinks(sampling_strategy='majority').fit_resample(X, y)
X_samp = pd.DataFrame(data=X_sample,columns=['a','b'] )
y_samp = pd.DataFrame(data=y_sample,columns=['y'])
y_samp.y.value_counts()

# 토멕링크 중에서 다수 클래스의 샘플들을 제거하는 방식, 1:1로 맞추는 방식은 아님

0    9874
1     100
Name: y, dtype: int64

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X_samp, y_samp, test_size=0.2, stratify=y_samp, random_state=10)
model4 = LogisticRegression(random_state=42)
model4.fit(X_train, y_train)
print("학습용:", model4.score(X_train, y_train))
print("검증용:", model4.score(X_test, y_test))

pred4 = model4.predict(X_test)
print(classification_report(y_test, pred4))

학습용: 0.9942348665246271
검증용: 0.9959899749373433
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1975
           1       1.00      0.60      0.75        20

    accuracy                           1.00      1995
   macro avg       1.00      0.80      0.87      1995
weighted avg       1.00      1.00      1.00      1995



In [29]:
# CNN(Condensed Nearest Neighbour) : 1-NN 모형으로 분류되지 않는 데이터만 남기는 방법
# 다수의 데이터 중에서 하나를 골라서 최근접 이웃이 다수 클래스이면 그 샘플을 빼는 방식
# 시간이 많이 걸림
from imblearn.under_sampling import CondensedNearestNeighbour
import warnings; warnings.filterwarnings(action='ignore')

X_sample, y_sample = CondensedNearestNeighbour(random_state=0).fit_resample(X, y)
X_samp = pd.DataFrame(data=X_sample,columns=['a','b'] )
y_samp = pd.DataFrame(data=y_sample,columns=['y'])
y_samp.y.value_counts()

# 1:1로 맞춰지지는 않음

0    187
1    100
Name: y, dtype: int64

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X_samp, y_samp, test_size=0.2, stratify=y_samp, random_state=10)
model5 = LogisticRegression(random_state=42)
model5.fit(X_train, y_train)
print("학습용:", model5.score(X_train, y_train))
print("검증용:", model5.score(X_test, y_test))

pred5 = model5.predict(X_test)
print(classification_report(y_test, pred5))

학습용: 0.8209606986899564
검증용: 0.8793103448275862
              precision    recall  f1-score   support

           0       0.90      0.92      0.91        38
           1       0.84      0.80      0.82        20

    accuracy                           0.88        58
   macro avg       0.87      0.86      0.86        58
weighted avg       0.88      0.88      0.88        58



In [32]:
# One Sided Selection
# 토맥링크 방법과 Condensed Nearest Neighbour 방법을 섞은 방식
# 토맥링크 중 다수 클래스의 샘플을 제거하고
# 나머지 데이터 중에서도 서로 붙어있는 다수 클래스 데이터는 1-NN 방법으로 제외하는 방식
from imblearn.under_sampling import OneSidedSelection

X_sample, y_sample = OneSidedSelection(random_state=0).fit_resample(X, y)
X_samp = pd.DataFrame(data=X_sample,columns=['a','b'] )
y_samp = pd.DataFrame(data=y_sample,columns=['y'])
y_samp.y.value_counts()

0    6593
1     100
Name: y, dtype: int64

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X_samp, y_samp, test_size=0.2, stratify=y_samp, random_state=10)
model6 = LogisticRegression(random_state=42)
model6.fit(X_train, y_train)
print("학습용:", model6.score(X_train, y_train))
print("검증용:", model6.score(X_test, y_test))

pred6 = model6.predict(X_test)
print(classification_report(y_test, pred6))

학습용: 0.9921553978333956
검증용: 0.9925317401045557
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1319
           1       1.00      0.50      0.67        20

    accuracy                           0.99      1339
   macro avg       1.00      0.75      0.83      1339
weighted avg       0.99      0.99      0.99      1339



In [34]:
# ENN(Edited Nearest Neighbours)
# 다수 클래스 데이터 중 소수 클래스와 가장 가까운 k(n_neighbors)개의 데이터가
# 모두 또는 다수 클래스가 아니면 삭제하는 방법
# 소수 클래스 주변의 다수 클래스 데이터는 삭제됨
from imblearn.under_sampling import EditedNearestNeighbours

# kind_sel='all' 모두, kind_sel='mode' 다수
# 5개의 이웃이 모두 같은 클래스가 아니면 그 샘플을 제거함
X_sample, y_sample = EditedNearestNeighbours(kind_sel="all", n_neighbors=5).fit_resample(X, y)
X_samp = pd.DataFrame(data=X_sample,columns=['a','b'] )
y_samp = pd.DataFrame(data=y_sample,columns=['y'])
y_samp.y.value_counts()

0    9747
1     100
Name: y, dtype: int64

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X_samp, y_samp, test_size=0.2, stratify=y_samp, random_state=10)
model7 = LogisticRegression(random_state=42)
model7.fit(X_train, y_train)
print("학습용:", model7.score(X_train, y_train))
print("검증용:", model7.score(X_test, y_test))

pred7 = model7.predict(X_test)
print(classification_report(y_test, pred7))

학습용: 0.9955566840167577
검증용: 0.9944162436548223
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1950
           1       1.00      0.45      0.62        20

    accuracy                           0.99      1970
   macro avg       1.00      0.72      0.81      1970
weighted avg       0.99      0.99      0.99      1970



In [36]:
# Neighbourhood Cleaning Rule
# CNN(Condensed Nearest Neighbour) 방법과 ENN(Edited Nearest Neighbours) 방법을 섞은 것
from imblearn.under_sampling import NeighbourhoodCleaningRule

# kind_sel='all' 모두, kind_sel='mode' 다수
X_sample,y_sample=NeighbourhoodCleaningRule(kind_sel="all", n_neighbors=5).fit_resample(X, y)
X_samp = pd.DataFrame(data=X_sample,columns=['a','b'] )
y_samp = pd.DataFrame(data=y_sample,columns=['y'])
y_samp.y.value_counts()

0    9721
1     100
Name: y, dtype: int64

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X_samp, y_samp, test_size=0.2, stratify=y_samp, random_state=10)
model8 = LogisticRegression(random_state=42)
model8.fit(X_train, y_train)
print("학습용:", model8.score(X_train, y_train))
print("검증용:", model8.score(X_test, y_test))

pred8 = model8.predict(X_test)
print(classification_report(y_test, pred8))

학습용: 0.9950356415478615
검증용: 0.9969465648854962
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1945
           1       1.00      0.70      0.82        20

    accuracy                           1.00      1965
   macro avg       1.00      0.85      0.91      1965
weighted avg       1.00      1.00      1.00      1965

