## 不均衡データへの対応

In [1]:
# Shell escape: clone the AIJobcolle repository into the current working
# directory. The data file read later in this notebook comes from this clone.
!git clone https://github.com/saiku122/AIJobcolle.git

Cloning into 'AIJobcolle'...
remote: Enumerating objects: 465, done.[K
remote: Counting objects: 100% (465/465), done.[K
remote: Compressing objects: 100% (342/342), done.[K
remote: Total 465 (delta 195), reused 221 (delta 70), pack-reused 0[K
Receiving objects: 100% (465/465), 12.08 MiB | 13.65 MiB/s, done.
Resolving deltas: 100% (195/195), done.


In [2]:
cd /content/AIJobcolle/MachineLearning/python

/content/AIJobcolle/MachineLearning/python


まず分類用のサンプルデータであるローン審査データを読み込みます。<br>データ前処理はone-hotエンコーディングと欠損値補完までを行っています。

In [10]:
import pandas as pd
from sklearn.impute import SimpleImputer

# Load the loan-approval sample data; the first CSV row holds the column names.
df = pd.read_csv('./data/av_loan_u6lujuX_CVtuZ9i.csv', header=0)

# Features: every column except the last one, minus Loan_ID, which is only an
# identifier and carries no predictive information.
X = df.iloc[:, :-1].drop(columns='Loan_ID')

# Target: the last column, recoded so that a rejected application ('N') is the
# positive class (1) and an approved one ('Y') is 0.
class_mapping = {'N': 1, 'Y': 0}
y = df.iloc[:, -1].map(class_mapping)

In [11]:
# One-hot encode the categorical columns. dummy_na=True adds an extra
# "<column>_nan" indicator column for every encoded column.
ohe_columns = [
    'Dependents',
    'Gender',
    'Married',
    'Education',
    'Self_Employed',
    'Property_Area',
]
X_ohe = pd.get_dummies(X, columns=ohe_columns, dummy_na=True)

# Fill the remaining missing values with SimpleImputer's default strategy.
# The imputer returns a plain array, so the column names are captured first
# and used to rebuild a DataFrame.
X_ohe_columns = X_ohe.columns.values
imp = SimpleImputer()
X_ohe = pd.DataFrame(imp.fit_transform(X_ohe), columns=X_ohe_columns)

display(X_ohe.head())

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Dependents_nan,Gender_Female,Gender_Male,Gender_nan,Married_No,Married_Yes,Married_nan,Education_Graduate,Education_Not Graduate,Education_nan,Self_Employed_No,Self_Employed_Yes,Self_Employed_nan,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Property_Area_nan
0,5849.0,0.0,146.412162,360.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,4583.0,1508.0,128.0,360.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,3000.0,0.0,66.0,360.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,2583.0,2358.0,120.0,360.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,6000.0,0.0,141.0,360.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


### アンダーサンプリング

アンダーサンプリングの実装例です。<br>負例（422件）が正例の件数（192件）まで削減されていることが確認できます。

In [12]:
# Random under-sampling: reduce the majority class until both classes have
# the same number of samples.
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=0)
# fit_resample is the supported API; the old fit_sample alias was deprecated
# and later removed from imbalanced-learn, so it fails on current versions.
X_under, y_under = rus.fit_resample(X_ohe, y)
Counter(y_under)



Counter({0: 192, 1: 192})

### オーバーサンプリング

以下、オーバーサンプリングの実装例です。<br>正例（192件）が負例の件数（422件）まで増加していることが確認できます。

In [13]:
# Random over-sampling and SMOTE: grow the minority class until both classes
# have the same number of samples.
from imblearn.over_sampling import RandomOverSampler,SMOTE

ros = RandomOverSampler(random_state=0)
smt = SMOTE(random_state=0)

# fit_resample is the supported API; the old fit_sample alias was deprecated
# and later removed from imbalanced-learn, so it fails on current versions.
X_over, y_over = ros.fit_resample(X_ohe, y)
X_smt, y_smt = smt.fit_resample(X_ohe, y)

print('Random Over Sampler',Counter(y_over))
print('SMOTE', Counter(y_smt))

Random Over Sampler Counter({0: 422, 1: 422})
SMOTE Counter({0: 422, 1: 422})




最後に、不均衡対応別のモデルの比較評価の実行例を確認します。<b><br>モデルの評価用データはリサンプリング前に確保されるべきである点に留意しましょう。</b>

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score,f1_score

# Hold out 20% of the data for evaluation BEFORE any resampling, so the test
# split is never touched by the samplers.
X_train, X_test, y_train, y_test = train_test_split(X_ohe,
                                                    y,
                                                    test_size=0.20,
                                                    random_state=0)

# Resample the training split only, reusing the samplers created in the
# earlier cells. fit_resample is the supported API; the old fit_sample alias
# was deprecated and later removed from imbalanced-learn.
X_train_under, y_train_under = rus.fit_resample(X_train, y_train)
X_train_over, y_train_over = ros.fit_resample(X_train, y_train)
X_train_smt, y_train_smt = smt.fit_resample(X_train, y_train)

# Model: standardisation followed by gradient boosting.
pipe_gb = Pipeline([('scl', StandardScaler()),
                    ('est', GradientBoostingClassifier(random_state=1))])

# Evaluation: refit the same pipeline on each training variant and report the
# F1 score on that training data and on the untouched test split. The order
# matches the original cell, so pipe_gb ends up fitted on the SMOTE data.
training_sets = [
    ('Original', X_train, y_train),
    ('Undersampling', X_train_under, y_train_under),
    ('Oversampling', X_train_over, y_train_over),
    ('SMOTE', X_train_smt, y_train_smt),
]
for label, X_fit, y_fit in training_sets:
    pipe_gb.fit(X_fit, y_fit)
    print(label + ' Train:',
          f1_score(y_fit, pipe_gb.predict(X_fit)))
    print(label + ' Test:',
          f1_score(y_test, pipe_gb.predict(X_test)))



Original Train: 0.7940074906367042
Original Test: 0.6181818181818182
Undersampling Train: 0.9411764705882353
Undersampling Test: 0.6024096385542169
Oversampling Train: 0.9079365079365079
Oversampling Test: 0.6086956521739131
SMOTE Train: 0.8990228013029316
SMOTE Test: 0.5555555555555556


上記結果から、本データについては不均衡を「完全に解消する」メリットは見られませんでした。不均衡を解消する前のモデルパフォーマンスを定量的に把握した上で、不均衡をどの程度解消するとパフォーマンスがどのように変化するかを記録し、エビデンスを持った対応を心掛けましょう。