In [25]:
# example of random oversampling to balance the class distribution
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler

# define dataset
X, y = make_classification(n_samples=10000, weights=[0.99], flip_y=0)

# summarize class distribution
print("count of categories before over sampling:")
print(Counter(y))

count of categories before over sampling:
Counter({0: 9900, 1: 100})


In [26]:
# fitting dataset using SVM before oversampling
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn import svm
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler

# define dataset
X, y = make_classification(n_samples=10000, weights=[0.99], flip_y=0)

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X, y, random_state=42)

clf1 = svm.SVC()
clf1.fit(X_train, y_train)

pred1 = clf1.predict(X_test)

In [27]:
# performance metrics before over sampling
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

print('Accuracy score: {}'.format(accuracy_score(y_test, pred1)))
print('Precision score: {}'.format(precision_score(y_test, pred1)))
print('Recall score: {}'.format(recall_score(y_test, pred1)))
print('F1 score: {}'.format(f1_score(y_test, pred1)))

Accuracy score: 0.9916
Precision score: 1.0
Recall score: 0.045454545454545456
F1 score: 0.08695652173913045


In [28]:
# balancing dataset

# define oversampling strategy
oversample = RandomOverSampler(sampling_strategy='minority')

# fit and apply the transform
X_over, y_over = oversample.fit_resample(X, y)

# summarize class distr
print("count of categories after over sampling:")
print(Counter(y_over))



count of categories after over sampling:
Counter({0: 9900, 1: 9900})


In [29]:
# fitting balanced dataset using SVM

clf2 = svm.SVC()
clf2.fit(X_over, y_over)

pred2 = clf2.predict(X_test)

In [30]:
# performance metrics after over sampling

print('Accuracy score: {}'.format(accuracy_score(y_test, pred2)))
print('Precision score: {}'.format(precision_score(y_test, pred2)))
print('Recall score: {}'.format(recall_score(y_test, pred2)))
print('F1 score: {}'.format(f1_score(y_test, pred2)))

Accuracy score: 0.9952
Precision score: 0.6470588235294118
Recall score: 1.0
F1 score: 0.7857142857142858
