# Handle data imbalancing

### import required packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### load the data

In [3]:
# generate 200 normally distributed values
np.random.normal(size=200)

array([-1.58591164e+00,  6.65986415e-01,  6.95353777e-01,  2.12639061e+00,
       -2.55301287e-01,  1.16911441e+00, -1.67591556e+00, -1.07864604e+00,
        2.46903066e-01,  1.97359481e-03, -1.83370356e+00,  1.00575473e+00,
        9.77973585e-01,  2.75863107e-01, -2.09039216e-01, -1.83165793e-01,
        1.11917047e+00,  1.45146636e+00, -9.26005449e-01,  1.32243968e-01,
        9.42896365e-03, -4.19472532e-01, -6.82573328e-01,  1.16636258e+00,
        1.15274476e+00,  2.38081345e-02,  2.83399429e-01,  1.15688861e-03,
       -2.06758675e-01, -7.55358467e-01, -1.10614584e+00,  4.29043403e-01,
       -5.87977585e-01, -8.52811026e-01, -3.27272048e-01, -9.14964939e-01,
        9.16371974e-01, -3.82194075e-01, -4.33712337e-01,  3.58279142e+00,
        2.19447709e+00,  2.99794934e-01,  9.27994029e-01, -2.39673642e-01,
       -8.50873551e-01,  8.51917662e-01,  2.70663401e-01, -7.46685337e-01,
        1.72599215e+00,  2.93778052e-01,  2.29134025e+00,  4.47655397e-01,
        1.14289160e+00,  

In [5]:
# create a data frame to have target variable with 200 '0's 
df_1 = pd.DataFrame({
    "feature1": np.random.normal(size=200),
    "feature2": np.random.normal(size=200),
    "target": [0] * 200
})
df_1

Unnamed: 0,feature1,feature2,target
0,-1.758694,-0.321161,0
1,-0.056239,0.787099,0
2,1.012855,-1.201779,0
3,-2.154651,0.812135,0
4,0.711547,-0.674921,0
...,...,...,...
195,-0.585625,-0.615480,0
196,-0.649907,1.308315,0
197,-0.103414,-0.033746,0
198,-1.084530,0.255181,0


In [7]:
# create a data frame to have target variable with 800 '1's 
df_2 = pd.DataFrame({
    "feature1": np.random.normal(size=800),
    "feature2": np.random.normal(size=800),
    "target": [1] * 800
})
df_2

Unnamed: 0,feature1,feature2,target
0,0.997454,0.403240,1
1,0.989848,-0.410655,1
2,-0.873344,1.360711,1
3,-0.331820,0.466598,1
4,0.392631,-1.360056,1
...,...,...,...
795,0.912257,0.855369,1
796,-0.648639,0.521715,1
797,-0.416322,1.057739,1
798,0.590178,-0.148396,1


In [8]:
# create a new data frame by joining df_1 and df_2 data frames
df = pd.concat([df_1, df_2])
df

Unnamed: 0,feature1,feature2,target
0,-1.758694,-0.321161,0
1,-0.056239,0.787099,0
2,1.012855,-1.201779,0
3,-2.154651,0.812135,0
4,0.711547,-0.674921,0
...,...,...,...
795,0.912257,0.855369,1
796,-0.648639,0.521715,1
797,-0.416322,1.057739,1
798,0.590178,-0.148396,1


### EDA

In [10]:
df.head()

Unnamed: 0,feature1,feature2,target
0,-1.758694,-0.321161,0
1,-0.056239,0.787099,0
2,1.012855,-1.201779,0
3,-2.154651,0.812135,0
4,0.711547,-0.674921,0


In [11]:
df.tail()

Unnamed: 0,feature1,feature2,target
795,0.912257,0.855369,1
796,-0.648639,0.521715,1
797,-0.416322,1.057739,1
798,0.590178,-0.148396,1
799,-0.58097,-0.093073,1


In [12]:
df['target'].value_counts()

target
1    800
0    200
Name: count, dtype: int64

In [17]:
# divide the major and minor classes
# this is using filtering technique to separate minority and majority classes
df_minority = df[df['target'] == 0]
df_majority = df[df['target'] == 1]

## Down Sampling
- removing random records from majority class to match with minority class
- also known as under-sampling
- reduces the number of records

In [22]:
from sklearn.utils import resample

# remove 600 random records from majoriry class
df_down_sampled = resample(df_majority, n_samples=200, random_state=123456)
df_down_sampled

Unnamed: 0,feature1,feature2,target
65,-0.831293,-1.852752,1
746,0.845113,-0.527549,1
498,-0.810278,0.866749,1
49,-1.735119,0.656966,1
568,0.707991,0.199434,1
...,...,...,...
92,0.197652,-0.154294,1
756,-0.935261,1.624146,1
200,-0.605377,0.359582,1
463,-0.960480,0.363705,1


In [23]:
# now combine the reduced majority (df_down_sampled) and minority class to create final df
df_balanced = pd.concat([df_down_sampled, df_minority])
df_balanced

Unnamed: 0,feature1,feature2,target
65,-0.831293,-1.852752,1
746,0.845113,-0.527549,1
498,-0.810278,0.866749,1
49,-1.735119,0.656966,1
568,0.707991,0.199434,1
...,...,...,...
195,-0.585625,-0.615480,0
196,-0.649907,1.308315,0
197,-0.103414,-0.033746,0
198,-1.084530,0.255181,0


In [25]:
df_balanced['target'].value_counts()

target
1    200
0    200
Name: count, dtype: int64

## Up Sampling
- adding NEW random records from the existing ones to match the majority class
- also known as over-sampling
- increases the number of records


In [27]:
from sklearn.utils import resample

df_up_sampled = resample(df_minority, n_samples=800, random_state=123456)
df_up_sampled

Unnamed: 0,feature1,feature2,target
65,-1.392426,-0.726589,0
49,-0.108214,1.250181,0
56,0.701180,0.233874,0
171,-0.230691,-0.431821,0
43,-0.743719,-0.165717,0
...,...,...,...
197,-0.103414,-0.033746,0
84,1.263259,0.205753,0
103,0.735726,0.495328,0
133,0.728625,1.400459,0


In [28]:
# contact both the majority class and up sample data frame
df_balanced = pd.concat([df_majority, df_up_sampled])
df_balanced

Unnamed: 0,feature1,feature2,target
0,0.997454,0.403240,1
1,0.989848,-0.410655,1
2,-0.873344,1.360711,1
3,-0.331820,0.466598,1
4,0.392631,-1.360056,1
...,...,...,...
197,-0.103414,-0.033746,0
84,1.263259,0.205753,0
103,0.735726,0.495328,0
133,0.728625,1.400459,0


In [29]:
df_balanced['target'].value_counts()

target
1    800
0    800
Name: count, dtype: int64

## SMOTE
- Synthetic Minority Over sampling Technique
- the data generated for minority class will have relationship mantainned

In [30]:
# install package named imbalanced-learn 
%pip install imbalanced-learn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [33]:
from imblearn.over_sampling import SMOTE

# create a SMOTE object
smote = SMOTE(random_state=123456)

# seperate dependent and independent features

# x must be a data frame (2d collection)
x = df.drop('target', axis=1)

# y must be a series (1d collection)
y = df['target']

# oversample the data
x_balanced, y_balanced = smote.fit_resample(x, y)

In [35]:
y_balanced

0       0
1       0
2       0
3       0
4       0
       ..
1595    0
1596    0
1597    0
1598    0
1599    0
Name: target, Length: 1600, dtype: int64