In [59]:
# Tutorial from: https://elitedatascience.com/imbalanced-classes

import pandas as pd
import numpy as np

# `names` supplies the column labels, so the first line of the file is read as data.
column_names = ['balance', 'var1', 'var2', 'var3', 'var4']
df = pd.read_csv('./datasets/balance_scale/balance_scale_data.csv', names=column_names)

In [60]:
# Distinct class labels present in the target column.
df.balance.unique()

array(['B', 'R', 'L'], dtype=object)

In [61]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,balance,var1,var2,var3,var4
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5


In [62]:
# Row count per class label, to see how imbalanced the target is.
df.balance.value_counts()

L    288
R    288
B     49
Name: balance, dtype: int64

In [63]:
# Transform into a binary classification problem: 'B' -> 1, every other label -> 0.
#
# The column is overwritten in place, so the conversion only runs while it still
# holds the original letter labels. Without this guard, re-running the cell would
# compare the already-encoded integers against 'B' and reset every label to 0.
if not pd.api.types.is_numeric_dtype(df['balance']):
    df['balance'] = (df['balance'] == 'B').astype('int64')

In [64]:
# Row count per encoded class (0 / 1).
df.balance.value_counts()

0    576
1     49
Name: balance, dtype: int64

In [65]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Separate the target column from the predictors; df itself is left untouched.
y = df['balance'].copy()
X = df.drop('balance', axis=1)

In [66]:
# Baseline classifier: logistic regression with default settings, fitted on X / y.
baseline = LogisticRegression()
clf = baseline.fit(X, y)

In [67]:
# Predict on X itself, i.e. the same rows passed to fit (no held-out set is used here).
pred = clf.predict(X)

In [68]:
# accuracy_score expects (y_true, y_pred); accuracy is symmetric, so the value
# is the same, but the call now matches the documented argument order.
accuracy_score(y, pred)

0.92159999999999997

In [69]:
# Accuracy is high, yet the model only ever outputs a single class:
# the minority class is never predicted.
predicted_labels = np.unique(pred)
print(predicted_labels)

[0]


### Up-sample minority class 

#### Randomly duplicate samples from the minority class in order to reinforce its signal

In [70]:
from sklearn.utils import resample

# Partition the rows by the binary label: 0 is the majority class, 1 the minority class.
df_majority = df.loc[df['balance'] == 0]
df_minority = df.loc[df['balance'] == 1]

In [71]:
# Number of rows in each class before resampling.
df_majority.shape[0], df_minority.shape[0]

(576, 49)

In [72]:
# Up-sample the minority class: draw rows with replacement until it has as many
# rows as the majority class. The target size is taken from df_majority rather
# than hard-coded, so it stays correct if the data changes. The fixed
# random_state keeps the draw repeatable.
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=123,
)

In [73]:
# Stack the majority rows and the up-sampled minority rows into one frame.
frames = [df_majority, df_minority_upsampled]
df_upsampled = pd.concat(frames)

In [74]:
# Split the up-sampled frame into target and predictors without modifying df_upsampled.
y_up = df_upsampled['balance'].copy()
X_up = df_upsampled.drop('balance', axis=1)

In [78]:
# Fit logistic regression on the up-sampled data and predict on that same data.
clf_up = LogisticRegression().fit(X_up, y_up)
pred_up = clf_up.predict(X_up)

# accuracy_score expects (y_true, y_pred); accuracy is symmetric, so the value
# is the same, but the call now matches the documented argument order.
print(accuracy_score(y_up, pred_up))

0.513888888889


In [80]:
# After up-sampling, the classifier outputs both classes.
labels_after_upsampling = np.unique(pred_up)
print(labels_after_upsampling)

[0 1]
