Due to imbalanced dataset, model may get biased towards the majority datapoints. To fix this, we use upsampling and downsampling.

In [1]:
import numpy as np
import pandas as pd

In [2]:
np.random.seed(123)
# this is used to ensure that the random numbers generated are the same every time you run the code, since the seed is fixed.
# because whenever we run the program, new random numbers are generated, which can lead to different results.
# but by setting a seed, we ensure that the random numbers generated are the same every time we run the code.  

In [3]:
n_samples = 1000
# number of samples in the dataset
class_0_ratio = 0.9
# ratio of class 0 in the dataset
n_class_0 = int(n_samples * class_0_ratio)
n_class_1 = n_samples - n_class_0
# number of samples in class 1


In [4]:
# create dataframe
df = pd.DataFrame({
    'feature_1': np.random.randn(n_samples),
    'feature_2': np.random.randn(n_samples),
    'target': np.concatenate([np.zeros(n_class_0), np.ones(n_class_1)])
})


In [5]:
df.head()

Unnamed: 0,feature_1,feature_2,target
0,-1.085631,-0.748827,0.0
1,0.997345,0.567595,0.0
2,0.282978,0.718151,0.0
3,-1.506295,-0.999381,0.0
4,-0.5786,0.474898,0.0


In [6]:
df.sample(5)

Unnamed: 0,feature_1,feature_2,target
382,-1.305786,-0.526109,0.0
879,0.997957,-1.131957,0.0
37,0.688223,-1.7577,0.0
98,0.379401,0.69588,0.0
990,-0.453569,-1.080413,1.0


In [7]:
df.shape

(1000, 3)

In [26]:
df['target'].value_counts()

target
0.0    900
1.0    100
Name: count, dtype: int64

Upsampling

In [None]:
df_minority = df[df['target']==1]
df_majority = df[df['target']==0]

Unnamed: 0,feature_1,feature_2,target
0,-1.085631,-0.748827,0.0
1,0.997345,0.567595,0.0
2,0.282978,0.718151,0.0
3,-1.506295,-0.999381,0.0
4,-0.578600,0.474898,0.0
...,...,...,...
895,0.238761,-0.623629,0.0
896,-1.106386,0.239810,0.0
897,0.366732,-0.868240,0.0
898,1.023906,0.902006,0.0


In [29]:
df_minority.head()

Unnamed: 0,feature_1,feature_2,target
900,0.551302,0.139033,1.0
901,0.419589,0.025577,1.0
902,1.815652,-0.196443,1.0
903,-0.25275,1.312255,1.0
904,-0.292004,1.187417,1.0


In [10]:
from sklearn.utils import resample
df_minority_upsampled = resample(df_minority,replace=True,n_samples=len(df_majority),random_state=123)

In [11]:
df_minority_upsampled.shape

(900, 3)

In [28]:
df_minority_upsampled.head()

Unnamed: 0,feature_1,feature_2,target
966,-0.341206,-0.629012,1.0
992,-1.318044,-0.602575,1.0
998,0.470264,-1.609695,1.0
917,0.895371,2.216788,1.0
983,0.609791,0.015572,1.0


In [13]:
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

In [14]:
df_upsampled['target'].value_counts()

target
0.0    900
1.0    900
Name: count, dtype: int64

Downsampling

In [15]:
np.random.seed(42)

In [16]:
n_samples2 = 1000
class_0_ratio2 = 0.8
class_0_samples = int(n_samples2 * class_0_ratio2)
class_1_samples = n_samples2 - class_0_samples

In [17]:
df2 = pd.DataFrame({
    "feature1":np.random.randn(n_samples2),
    "feature2":np.random.randn(n_samples2),
    "target":np.concatenate([np.zeros(class_0_samples), np.ones(class_1_samples)])
})

In [18]:
df2.head()

Unnamed: 0,feature1,feature2,target
0,0.496714,1.399355,0.0
1,-0.138264,0.924634,0.0
2,0.647689,0.05963,0.0
3,1.52303,-0.646937,0.0
4,-0.234153,0.698223,0.0


In [19]:
df_minority = df[df2['target']==1]
df_majority = df[df2['target']==0]

In [20]:
from sklearn.utils import resample
df_downsampled = resample(df_majority,replace=False,n_samples=len(df_minority),random_state=123)

In [21]:
df_downsampled.shape

(200, 3)

In [22]:
df_downsampled=pd.concat([df_minority, df_downsampled])

In [23]:
df_downsampled.head()

Unnamed: 0,feature_1,feature_2,target
800,-0.121167,-0.300232,0.0
801,-2.008585,-0.632261,0.0
802,-0.920647,-0.204317,0.0
803,0.168234,0.213696,0.0
804,-1.319892,1.033878,0.0


In [24]:
df_downsampled['target'].value_counts()

target
0.0    300
1.0    100
Name: count, dtype: int64