參考資料: https://elitedatascience.com/imbalanced-classes

## Balance Scale Dataset

In [None]:
import pandas as pd
import numpy as np

# Dataset: UCI Balance Scale. Column names are supplied explicitly:
#   'balance'         - the single target variable
#   'var1' .. 'var4'  - the input features
data_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.data'
column_names = ['balance', 'var1', 'var2', 'var3', 'var4']

df = pd.read_csv(data_url, names=column_names)

# Preview the first few rows.
df.head()

Unnamed: 0,balance,var1,var2,var3,var4
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5


R 代表天秤右邊重, var3 * var4 > var1 * var2  
L 代表天秤左邊重, var3 * var4 < var1 * var2  
B 代表天秤處於平衡, var3 * var4 == var1 * var2  

In [None]:
# Number of rows per value of the target column.
df.balance.value_counts()

L    288
R    288
B     49
Name: balance, dtype: int64

In [None]:
# Turn the target into a binary label:
#   1 -> balanced ('B'), 0 -> every other value.
# The conversion is guarded so this cell is idempotent: without the guard, a
# second run would compare the already-converted 0/1 integers against 'B'
# and silently relabel every row as 0.
if not pd.api.types.is_numeric_dtype(df['balance']):
    df['balance'] = (df['balance'] == 'B').astype(int)

df['balance'].value_counts()

0    576
1     49
Name: balance, dtype: int64

## The Danger of Imbalanced Classes

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Target: the 'balance' column. Features: every remaining column of df.
y = df['balance']
x = df.drop(columns=['balance'])

# Fit a default logistic regression, then predict on the same rows it was
# trained on.
clf_0 = LogisticRegression()
clf_0.fit(x, y)
pred_y_0 = clf_0.predict(x)

In [None]:
# Accuracy of clf_0 on the rows it was trained on. Arguments follow the
# documented (y_true, y_pred) order, matching the other accuracy_score calls
# in this notebook; the score is the same either way.
print( accuracy_score(y, pred_y_0) )

0.9216


In [None]:
# Distinct labels that the model actually predicted.
print(np.unique(pred_y_0))

[0]


##  1.Up-Sampling 上採樣

In [None]:
from sklearn.utils import resample

In [None]:
# Split the data by class: majority (balance == 0) and minority (balance == 1).
df_majority = df[df.balance == 0]
df_minority = df[df.balance == 1]

# Up-sample the minority class by drawing rows with replacement until it is as
# large as the majority class. The target size is derived from the data rather
# than hard-coded, so the classes stay balanced if the dataset changes.
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=123,  # fixed seed so the drawn sample is reproducible
)

# Recombine both classes into one frame and check the class counts.
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

df_upsampled.balance.value_counts()



1    576
0    576
Name: balance, dtype: int64

In [None]:
# Target and features taken from the up-sampled frame.
y = df_upsampled['balance']
x = df_upsampled.drop(columns=['balance'])

# Fit on the up-sampled data and predict on those same rows.
clf_1 = LogisticRegression()
clf_1.fit(x, y)
pred_y_1 = clf_1.predict(x)

# Distinct predicted labels, followed by the accuracy on the training rows.
print(np.unique(pred_y_1))
print(accuracy_score(y, pred_y_1))

[0 1]
0.5147569444444444


## 2.Down-Sampling 下採樣

In [1]:
# Split the data by class: majority (balance == 0) and minority (balance == 1).
df_majority = df[df.balance == 0]
df_minority = df[df.balance == 1]

# Down-sample the majority class by drawing rows without replacement until it
# is as small as the minority class. The target size is derived from the data
# rather than hard-coded, so the classes stay balanced if the dataset changes.
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=len(df_minority),
    random_state=123,  # fixed seed so the drawn sample is reproducible
)

# Recombine both classes into one frame and check the class counts.
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

df_downsampled.balance.value_counts()



NameError: ignored

In [None]:
# Target and features taken from the down-sampled frame.
y = df_downsampled['balance']
x = df_downsampled.drop(columns=['balance'])

# Fit the model on the down-sampled data.
clf_2 = LogisticRegression()
clf_2.fit(x, y)

# Predictions for the very rows the model was trained on.
pred_y_2 = clf_2.predict(x)

# Check whether more than one class is being predicted.
print(np.unique(pred_y_2))

# Accuracy on the training rows.
print(accuracy_score(y, pred_y_2))

[0 1]
0.5612244897959183


## 3.Change Your Performance Metric

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# Ask the model for per-class probabilities and, for every row, keep only the
# entry at index 1 (the positive class).
prob_y_2 = [row[1] for row in clf_2.predict_proba(x)]

# Show the first five positive-class probabilities.
prob_y_2[:5]

[[0.54846803 0.45153197]
 [0.51273876 0.48726124]
 [0.52761039 0.47238961]
 [0.52985389 0.47014611]
 [0.4123397  0.5876603 ]
 [0.43978928 0.56021072]
 [0.58114217 0.41885783]
 [0.41401331 0.58598669]
 [0.51781009 0.48218991]
 [0.42704013 0.57295987]
 [0.54709136 0.45290864]
 [0.55297198 0.44702802]
 [0.42408792 0.57591208]
 [0.42479075 0.57520925]
 [0.45594573 0.54405427]
 [0.45894299 0.54105701]
 [0.56218764 0.43781236]
 [0.53638856 0.46361144]
 [0.42891506 0.57108494]
 [0.56807527 0.43192473]
 [0.57698511 0.42301489]
 [0.57721763 0.42278237]
 [0.47773863 0.52226137]
 [0.47487076 0.52512924]
 [0.45946479 0.54053521]
 [0.55197669 0.44802331]
 [0.48845512 0.51154488]
 [0.55968729 0.44031271]
 [0.57988222 0.42011778]
 [0.5180038  0.4819962 ]
 [0.42259225 0.57740775]
 [0.54509571 0.45490429]
 [0.53438589 0.46561411]
 [0.56515781 0.43484219]
 [0.44210555 0.55789445]
 [0.53338414 0.46661586]
 [0.5064647  0.4935353 ]
 [0.53381433 0.46618567]
 [0.42924457 0.57075543]
 [0.50972115 0.49027885]


[0.45153197257586,
 0.48726124480998256,
 0.47238960854127193,
 0.47014610622647623,
 0.587660295588417]

In [None]:
# AUROC computed from the positive-class probabilities of clf_2.
print(roc_auc_score(y, prob_y_2))

0.5651811745106206


In [None]:
# Score the first model (clf_0) on the down-sampled data. The evaluation set
# is rebuilt here explicitly instead of relying on whatever `x` / `y` the most
# recently executed cell happened to leave behind, so the result no longer
# depends on cell execution order.
y_eval = df_downsampled.balance
x_eval = df_downsampled.drop('balance', axis=1)

# Keep only the positive-class probability (index 1) for every row.
prob_y_0 = [p[1] for p in clf_0.predict_proba(x_eval)]

print( roc_auc_score(y_eval, prob_y_0) )
# Note: an AUROC below 0.5 means the scores rank the two classes worse than
# chance; inverting the predictions gives 1 - AUROC (e.g. 0.47 becomes 0.53).

0.5306476757369614
0.46935232426303863


## 4.Penalize Algorithms (Cost-Sensitive Training)

In [None]:
from sklearn.svm import SVC

In [None]:
# Target and input features taken from df (not from a resampled frame).
y = df.balance
x = df.drop('balance', axis=1)

# Linear-kernel SVM with class_weight='balanced' so the two classes are
# weighted by their frequencies instead of resampling the data.
# probability=True enables predict_proba; those probability estimates involve
# internal random shuffling, so random_state is fixed to keep the reported
# AUROC reproducible between runs.
clf_3 = SVC(kernel = 'linear',
            class_weight = 'balanced',
            probability = True,
            random_state = 123)

clf_3.fit(x, y)

# Predict on the same rows the model was trained on.
pred_y_3 = clf_3.predict(x)

# Check whether more than one class is being predicted.
print( np.unique( pred_y_3 ) )

# Accuracy on the training rows.
print( accuracy_score(y, pred_y_3) )

# AUROC from the positive-class probability (index 1) of each row.
prob_y_3 = clf_3.predict_proba(x)
prob_y_3 = [p[1] for p in prob_y_3]
print( roc_auc_score(y, prob_y_3) )

[0 1]
0.688
0.46947633219954643


## 5.Tree-Based Algorithms

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Separate input features (x) and target variable (y)
y = df.balance
x = df.drop('balance', axis=1)

# Train model. A random forest is a randomized estimator, so the seed is
# fixed to make the scores below reproducible between runs.
clf_4 = RandomForestClassifier(random_state = 123)
clf_4.fit(x, y)

# Predict on the training set. Every score below is therefore measured on
# rows the model has already seen, not on held-out data.
pred_y_4 = clf_4.predict(x)

# Is our model still predicting just one class?
print( np.unique( pred_y_4 ) )

# How's our accuracy?
print( accuracy_score(y, pred_y_4) )

# What about AUROC? Uses the positive-class probability (index 1) of each row.
prob_y_4 = clf_4.predict_proba(x)
prob_y_4 = [p[1] for p in prob_y_4]
print( roc_auc_score(y, prob_y_4) )
# 0.999078798186

[0 1]
1.0
1.0000000000000002
