# Handling imbalanced dataset

### A. Handling imbalanced data using SMOTE
SMOTE uses interpolation technique to conduct over sampling of the minority class.

In [48]:
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE

In [49]:
x,y = make_classification(n_samples = 1000, n_features = 5, n_informative = 3, n_redundant = 1, n_clusters_per_class = 1, weights = [0.9,0.1], random_state = 47)

In [50]:
df = pd.DataFrame(x, columns = [f"feature_{i}" for i in range(1,6)])

In [51]:
df.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5
0,0.791687,1.142094,0.660421,-0.539649,1.15392
1,-0.969617,-2.655449,-0.570283,-0.759104,-0.314212
2,0.93095,0.335996,1.205048,-1.597056,-0.160702
3,0.636137,-0.294808,1.3706,-1.352939,-1.21612
4,0.227346,0.4182,0.557623,0.231708,0.331084


In [52]:
df["target"] = y

In [53]:
df["target"].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,899
1,101


In [54]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = 0.20, random_state = 47, stratify = y)

In [55]:
model = LogisticRegression()
model.fit(xtrain, ytrain)

In [56]:
ypred = model.predict(xtest)
print(accuracy_score(ytest, ypred))
print(classification_report(ytest, ypred))

0.965
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       180
           1       0.84      0.80      0.82        20

    accuracy                           0.96       200
   macro avg       0.91      0.89      0.90       200
weighted avg       0.96      0.96      0.96       200



In [57]:
sm = SMOTE(random_state = 47)
xtrains, ytrains = sm.fit_resample(xtrain, ytrain)

In [58]:
print(pd.Series(ytrain).value_counts(), "\n\n", pd.Series(ytrains).value_counts())

0    719
1     81
Name: count, dtype: int64 

 0    719
1    719
Name: count, dtype: int64


In [59]:
models = LogisticRegression()
models.fit(xtrains, ytrains)

In [60]:
ypreds = models.predict(xtest)
print(accuracy_score(ypreds, ytest))
print(classification_report(ypreds, ytest))

0.92
              precision    recall  f1-score   support

           0       0.91      1.00      0.95       164
           1       1.00      0.56      0.71        36

    accuracy                           0.92       200
   macro avg       0.96      0.78      0.83       200
weighted avg       0.93      0.92      0.91       200



### B. Random Oversampling
Random Oversampling will conduct over sampling by duplicating the minority class samples, hence it works quite differently than SMOTE.

In [61]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state = 47)

In [62]:
xtrainros, ytrainros = ros.fit_resample(xtrain, ytrain)

In [63]:
print(pd.Series(ytrainros).value_counts())

0    719
1    719
Name: count, dtype: int64


In [64]:
modelros = LogisticRegression()
modelros.fit(xtrainros, ytrainros)
ypredros = modelros.predict(xtest)
print(accuracy_score(ytest, ypredros))
print(classification_report(ytest, ypredros))

0.91
              precision    recall  f1-score   support

           0       1.00      0.90      0.95       180
           1       0.53      1.00      0.69        20

    accuracy                           0.91       200
   macro avg       0.76      0.95      0.82       200
weighted avg       0.95      0.91      0.92       200



### C. Random Undersampling
Under-sample the majority class(es) by randomly picking samples with or without replacement.

In [66]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state = 47)

In [67]:
xtrainrus, ytrainrus = rus.fit_resample(xtrain, ytrain)

In [68]:
print(pd.Series(ytrainrus).value_counts())

0    81
1    81
Name: count, dtype: int64


In [69]:
modelrus = LogisticRegression()
modelrus.fit(xtrainrus, ytrainrus)
ypredrus = modelrus.predict(xtest)
print(accuracy_score(ytest, ypredrus))
print(classification_report(ytest, ypredrus))

0.91
              precision    recall  f1-score   support

           0       1.00      0.90      0.95       180
           1       0.53      1.00      0.69        20

    accuracy                           0.91       200
   macro avg       0.76      0.95      0.82       200
weighted avg       0.95      0.91      0.92       200

