In [1]:
# Tutorial from: https://elitedatascience.com/imbalanced-classes

import pandas as pd
import numpy as np

# Column names are supplied explicitly through `names=`; the first column holds
# the class label and the remaining four are the features.
column_names = ['balance', 'var1', 'var2', 'var3', 'var4']
df = pd.read_csv('./datasets/balance_scale/balance_scale_data.csv', names=column_names)

In [2]:
# Distinct class labels present in the raw target column.
df.balance.unique()

array(['B', 'R', 'L'], dtype=object)

In [3]:
# Peek at the first five rows to sanity-check the column layout.
df.head(n=5)

Unnamed: 0,balance,var1,var2,var3,var4
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5


In [4]:
# Rows per class label, to see how skewed the target is.
df.balance.value_counts()

R    288
L    288
B     49
Name: balance, dtype: int64

In [5]:
# Transform into a binary classification problem:
# 'B' becomes the positive class (1); every other label becomes 0.
#
# The dtype guard makes this cell safe to re-run: once the column already holds
# integers, comparing it to 'B' again would silently turn every label into 0.
if df['balance'].dtype.kind != 'i':
    # Vectorized comparison instead of a per-element Python loop.
    df['balance'] = (df['balance'] == 'B').astype('int64')

In [6]:
# Class balance of the target column after the binary re-encoding.
df.balance.value_counts()

0    576
1     49
Name: balance, dtype: int64

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Target vector: the binary label, copied so it is independent of `df`.
y = df['balance'].copy()

# Feature matrix: every column except the label (`drop` returns a new frame).
X = df.drop('balance', axis=1)

In [8]:
# Fit a logistic regression with default settings on the full, imbalanced data.
clf = LogisticRegression()
clf = clf.fit(X, y)  # fit() returns the estimator; assignment keeps the cell output empty

In [9]:
# Predict labels for the same rows the model was fitted on (no held-out split).
pred = clf.predict(X)

In [10]:
# In-sample accuracy on the imbalanced data.
# accuracy_score is documented as (y_true, y_pred); accuracy itself is symmetric,
# but keeping the documented order avoids mistakes with asymmetric metrics.
accuracy_score(y, pred)

0.92159999999999997

In [11]:
# The accuracy is high, but only one class is ever predicted: the set of distinct
# predicted labels printed below contains just the majority class, so the
# minority class is ignored entirely.

print(np.unique(pred))

[0]


### Up-sample minority class 

#### Randomly duplicate samples from the minority class in order to reinforce its signal

In [12]:
from sklearn.utils import resample

# Split the rows by binary label: 0 is the majority class, 1 the minority class.
df_majority = df.loc[df['balance'] == 0]
df_minority = df.loc[df['balance'] == 1]

In [13]:
# Row counts as (majority, minority).
(df_majority.shape[0], df_minority.shape[0])

(576, 49)

In [14]:
# Up-sample the minority class: draw rows with replacement until it has as many
# rows as the majority class. The target size is derived from the data rather
# than hard-coded, so the cell stays correct if the input file changes.
# random_state is fixed so the draw is reproducible.
df_minority_upsampled = resample(df_minority,
                                 replace=True,
                                 n_samples=len(df_majority),
                                 random_state=123)

In [15]:
# Stack the untouched majority rows on top of the up-sampled minority rows.
# The original row labels are kept, so the duplicated minority rows share index values.
df_upsampled = pd.concat([df_majority, df_minority_upsampled], axis=0)

In [16]:
# Target vector for the up-sampled data, copied so it is independent of the frame.
y_up = df_upsampled['balance'].copy()

# Feature matrix: every column except the label (`drop` returns a new frame).
X_up = df_upsampled.drop('balance', axis=1)

In [17]:
# Fit logistic regression on the up-sampled data and score it.
# NOTE: the model is scored on the same rows it was trained on, so this is an
# in-sample accuracy, not a held-out estimate.

clf_up = LogisticRegression().fit(X_up, y_up)
pred_up = clf_up.predict(X_up)
# accuracy_score is documented as (y_true, y_pred).
print(accuracy_score(y_up, pred_up))

0.513888888889


In [18]:
# Both classes are being predicted now: the distinct predicted labels printed
# below include the minority class as well as the majority class.

print(np.unique(pred_up))

[0 1]


### Down-sample Majority Class

#### Randomly remove observations from the majority class to prevent its signal from dominating that of the minority class

In [19]:
# Down-sample the majority class: draw rows without replacement until it has as
# many rows as the minority class. The target size is derived from the data
# rather than hard-coded; random_state is fixed so the draw is reproducible.
df_majority_downsampled = resample(df_majority,
                                   replace=False,
                                   n_samples=len(df_minority),
                                   random_state=123)


In [20]:
# Stack the down-sampled majority rows on top of all minority rows.
df_downsampled = pd.concat([df_majority_downsampled, df_minority], axis=0)

In [21]:
# Confirm that both classes now have the same number of rows.
df_downsampled['balance'].value_counts()

1    49
0    49
Name: balance, dtype: int64

In [22]:
# Target vector for the down-sampled data, copied so it is independent of the frame.
y_down = df_downsampled['balance'].copy()

# Feature matrix: every column except the label (`drop` returns a new frame).
X_down = df_downsampled.drop('balance', axis=1)

In [23]:
# Fit logistic regression on the down-sampled data and score it.
# NOTE: the model is scored on the same rows it was trained on, so this is an
# in-sample accuracy, not a held-out estimate.

clf_down = LogisticRegression().fit(X_down, y_down)
pred_down = clf_down.predict(X_down)
# accuracy_score is documented as (y_true, y_pred).
print(accuracy_score(y_down, pred_down))

0.581632653061


### Use the ROC Area Under Curve Metric

In [24]:
from sklearn.metrics import roc_auc_score

# Per-row class probabilities from the down-sampled model, computed on its own
# training rows. A later cell indexes element [1] of each row of this result.
prob_y_down = clf_down.predict_proba(X_down)

In [27]:
# Keep only the second probability column (index 1) for each row.
# It is recomputed from the classifier rather than from the previous value of
# `prob_y_down`, so re-running this cell gives the same result instead of
# failing on already-flattened values. The slice is vectorized.
prob_y_down = clf_down.predict_proba(X_down)[:, 1]

In [28]:
# ROC AUC of the down-sampled model, computed from the predicted probabilities
# (not the hard 0/1 labels) against the true labels.
roc_auc_score(y_down, prob_y_down)

0.56809662640566438

In [29]:
# ROC AUC for the original (imbalanced) data, using the model fitted on it.
# Only the second probability column (index 1) is scored; the column is taken
# with a vectorized slice rather than a Python loop over rows.

prob_y = clf.predict_proba(X)[:, 1]
roc_auc_score(y, prob_y)

0.530718537414966