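With an imbalanced dataset, a model can become biased toward the majority class. Two common remedies are upsampling (resampling the minority class with replacement until it is as large as the majority class) and downsampling (randomly sampling the majority class without replacement down to the size of the minority class).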

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Fix the seed of NumPy's global random number generator. Without a fixed
# seed each run would draw different random numbers and produce different
# results; with it, the same sequence is generated on every run.
np.random.seed(123)

In [3]:
# Total number of rows in the dataset and the fraction of them labelled class 0.
n_samples, class_0_ratio = 1000, 0.9

# Class 0 gets its share of the rows (truncated to an int by int());
# class 1 gets whatever remains.
n_class_0 = int(class_0_ratio * n_samples)
n_class_1 = n_samples - n_class_0


In [4]:
# Build the imbalanced dataset: two columns of standard-normal draws
# (feature_1 is drawn before feature_2) and a label column made of
# n_class_0 zeros followed by n_class_1 ones.
df = pd.DataFrame(
    data={
        "feature_1": np.random.randn(n_samples),
        "feature_2": np.random.randn(n_samples),
        "target": np.concatenate((np.zeros(n_class_0), np.ones(n_class_1))),
    }
)


In [5]:
# Preview the first rows of the generated frame.
df.head()

Unnamed: 0,feature_1,feature_2,target
0,-1.085631,-0.748827,0.0
1,0.997345,0.567595,0.0
2,0.282978,0.718151,0.0
3,-1.506295,-0.999381,0.0
4,-0.5786,0.474898,0.0


In [6]:
# (rows, columns) of the generated frame.
df.shape

(1000, 3)

In [7]:
# Number of rows per class label; this is where the imbalance shows up.
df['target'].value_counts()

target
0.0    900
1.0    100
Name: count, dtype: int64

Upsampling

In [8]:
# Split the frame by label so each class can be resampled on its own:
# label 1 is the minority class, label 0 the majority class.
df_minority = df.loc[df["target"].eq(1)]
df_majority = df.loc[df["target"].eq(0)]

In [9]:
from sklearn.utils import resample

# Upsample: draw minority rows with replacement until there are as many of
# them as there are majority rows.
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=df_majority.shape[0],
    random_state=123,  # fixed so the same rows are drawn on every run
)

In [10]:
# (rows, columns) of the upsampled minority class.
df_minority_upsampled.shape

(900, 3)

In [11]:
# Preview the resampled minority rows; the original row index is kept.
df_minority_upsampled.head()

Unnamed: 0,feature_1,feature_2,target
966,-0.341206,-0.629012,1.0
992,-1.318044,-0.602575,1.0
998,0.470264,-1.609695,1.0
917,0.895371,2.216788,1.0
983,0.609791,0.015572,1.0


In [12]:
# Stack the untouched majority rows on top of the upsampled minority rows.
df_upsampled = pd.concat((df_majority, df_minority_upsampled), axis=0)

In [13]:
# Number of rows per class label after upsampling.
df_upsampled['target'].value_counts()

target
0.0    900
1.0    900
Name: count, dtype: int64

Downsampling

In [14]:
# Re-seed NumPy's global random number generator before generating the
# second dataset, so its random draws are the same on every run.
np.random.seed(42)

In [17]:
# Total number of rows in the second dataset and the fraction labelled class 0.
n_samples2, class_0_ratio2 = 1000, 0.8

# Class 0 gets its share of the rows (truncated to an int by int());
# class 1 gets whatever remains.
class_0_samples = int(class_0_ratio2 * n_samples2)
class_1_samples = n_samples2 - class_0_samples

In [19]:
# Build the second imbalanced dataset: two columns of standard-normal draws
# (feature1 is drawn before feature2) and a label column made of
# class_0_samples zeros followed by class_1_samples ones.
df2 = pd.DataFrame(
    data={
        "feature1": np.random.randn(n_samples2),
        "feature2": np.random.randn(n_samples2),
        "target": np.concatenate((np.zeros(class_0_samples), np.ones(class_1_samples))),
    }
)

In [20]:
# Preview the first rows of the second dataset.
df2.head()

Unnamed: 0,feature1,feature2,target
0,-0.675178,-1.907808,0.0
1,-0.144519,-0.860385,0.0
2,-0.79242,-0.413606,0.0
3,-0.307962,1.887688,0.0
4,-1.893615,0.556553,0.0


In [22]:
# Split the second dataset by label: label 1 is the minority class and
# label 0 the majority class.
# The mask and the frame being filtered must both be df2. Applying a df2 mask
# to df (the first dataset) would pick rows of df by index alignment, so the
# "minority" frame would contain rows from the wrong dataset with mixed labels.
# Note: these names intentionally replace the df_minority / df_majority
# frames created earlier for the upsampling example.
df_minority = df2[df2['target'] == 1]
df_majority = df2[df2['target'] == 0]

In [23]:
from sklearn.utils import resample

# Downsample: draw majority rows without replacement until there are only as
# many of them as there are minority rows.
df_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=df_minority.shape[0],
    random_state=123,  # fixed so the same rows are drawn on every run
)

In [24]:
# (rows, columns) of the downsampled majority class.
df_downsampled.shape

(200, 3)

In [26]:
# Stack the minority rows on top of the downsampled majority rows.
# NOTE: this overwrites df_downsampled with a frame built from itself, so
# running the cell a second time appends the minority rows again.
df_downsampled = pd.concat((df_minority, df_downsampled), axis=0)

In [27]:
# Preview the combined frame; the minority rows come first.
df_downsampled.head()

Unnamed: 0,feature1,feature2,target
800,0.938284,1.901191,1.0
801,-0.516045,-0.060661,1.0
802,0.096121,-0.708407,1.0
803,-0.462275,-1.513714,1.0
804,-0.434496,-1.80314,1.0


In [28]:
# Number of rows per class label after downsampling.
df_downsampled['target'].value_counts()

target
1.0    200
0.0    200
Name: count, dtype: int64