Author: Pradeep Pujari

Generate an imbalanced dataset, then evaluate a majority-class classifier on a 1:100 imbalanced dataset.

In [47]:
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [78]:
# create an imbalanced dataset with a given ratio
def get_imbalanced_dataset(ratio, random_state=42, cluster_std=3):
    """Build a 2-feature blob dataset with a chosen number of samples per class.

    Parameters
    ----------
    ratio : dict
        Maps class label -> number of samples to keep for that class,
        e.g. ``{0: 10000, 1: 100}`` for a 1:100 imbalance. Labels are matched
        against the integer labels returned by ``make_blobs``, so they are
        expected to be ``0 .. len(ratio) - 1``.
    random_state : int, optional
        Seed forwarded to ``make_blobs`` (default 42, the original fixed value).
    cluster_std : float, optional
        Cluster standard deviation forwarded to ``make_blobs`` (default 3).

    Returns
    -------
    (X, y) : tuple of ndarray
        Features stacked row-wise and labels concatenated, grouped by class
        in the iteration order of ``ratio``.
    """
    n_classes = len(ratio)  # one blob centre per class
    # make_blobs divides n_samples equally among the centres, so ask for
    # enough points that every class has at least as many as the largest one.
    largest = max(ratio.values())
    n_samples = largest * n_classes
    X, y = make_blobs(n_samples=n_samples, centers=n_classes, n_features=2,
                      random_state=random_state, cluster_std=cluster_std,
                      shuffle=True)
    # keep only the first `count` rows of each class
    X_parts, y_parts = [], []
    for label, count in ratio.items():
        row_ix = np.where(y == label)[0]
        selected = row_ix[:count]
        X_parts.append(X[selected, :])
        y_parts.append(y[selected])
    return np.vstack(X_parts), np.hstack(y_parts)


In [99]:
# define the class distribution 1:100
ratio = {0:10000, 1:100}
# generate dataset
X, y = get_imbalanced_dataset(ratio)
# summarize class distribution: the mean of a boolean mask is the class fraction
major = np.mean(y == 0) * 100
minor = np.mean(y == 1) * 100
print('Class 0: %.3f%%, Class 1: %.3f%%' % (major, minor))

# hyper-parameter grid: 5 values of C x 2 penalties x 2 class weightings = 20 candidates
# 'balanced' is the supported class_weight spelling; the former 'auto' value is
# rejected by current scikit-learn, which made every weighted candidate fail.
# NOTE(review): newer scikit-learn releases spell the unpenalized option as
# `None` instead of 'none' -- confirm against the installed version.
grid = {'C': 10.0 ** np.arange(-2, 3),
        'penalty': ['none', 'l2'],
        'class_weight': [None, 'balanced']}
# hold out 20% of the data for the final accuracy check
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
# define model
clf_base = LogisticRegression()
cv = KFold(n_splits=3, shuffle=True, random_state=42)
clf = GridSearchCV(clf_base, grid, cv=cv, n_jobs=8, verbose=10)
clf = clf.fit(X_train, y_train)
# predict() on the fitted search delegates to the best estimator found
y_pred = clf.predict(X_test)

# print score
print('Accuracy: %.3f%%' % (accuracy_score(y_test, y_pred) * 100))

Class 0: 99.010%, Class 1: 0.990%
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Accuracy: 99.356%
