## Handling Imbalanced Dataset

1. Upsampling

2. Downsampling

In [1]:
import numpy as np
import pandas as pd

# Seed NumPy's global random generator so every np.random draw below
# produces the same values on each fresh run.
np.random.seed(seed=123)

In [4]:
## Create a dataframe with two classes

n_samples = 1000       # total number of rows across both classes
class_0_ratio = 0.9    # fraction of rows assigned to class 0 (the majority)

# round() rather than int(): int() truncates toward zero, so a product that
# lands just below a whole number because of floating-point error
# (e.g. 100 * 0.29 == 28.999999999999996) would silently drop a sample.
n_class_0 = round(n_samples * class_0_ratio)
n_class_1 = n_samples - n_class_0   # class 1 takes whatever is left
n_class_0, n_class_1

(900, 100)

In [8]:
## Majority class (target 0): n_class_0 rows
# Both features are drawn from a normal distribution with mean 0 and
# standard deviation 1; feature_1 is drawn before feature_2.
class_0 = pd.DataFrame(
    {
        'feature_1': np.random.normal(0, 1, n_class_0),
        'feature_2': np.random.normal(0, 1, n_class_0),
        'target': [0] * n_class_0,
    }
)

In [10]:
# Minority class (target 1): n_class_1 rows.
# Both features are drawn from a normal distribution with mean 2 and
# standard deviation 1; feature_1 is drawn before feature_2.
class_1 = pd.DataFrame(
    {
        'feature_1': np.random.normal(2, 1, n_class_1),
        'feature_2': np.random.normal(2, 1, n_class_1),
        'target': [1] * n_class_1,
    }
)

In [14]:
# Stack class 0 on top of class 1 and renumber the rows 0..n-1.
df = pd.concat([class_0, class_1], ignore_index=True)

In [15]:
# Preview the first five rows of the combined frame.
df.head(5)

Unnamed: 0,feature_1,feature_2,target
0,-0.367419,-0.016102,0
1,0.575273,0.74701,0
2,0.439351,-0.257842,0
3,-0.728152,0.561622,0
4,-0.88853,0.295353,0


In [16]:
# Count rows per class label to show the imbalance.
df.loc[:, 'target'].value_counts()

target
0    900
1    100
Name: count, dtype: int64

## Upsampling

In [None]:

from sklearn.utils import resample

# Split the frame by class label.
df_minority = df.loc[df['target'] == 1]
df_majority = df.loc[df['target'] == 0]

# Draw minority rows with replacement until there are as many of them as
# there are majority rows; the fixed random_state makes the draw repeatable.
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=df_majority.shape[0],
    random_state=42,
)

In [19]:
# (rows, columns) of the upsampled minority frame.
df_minority_upsampled.shape

(900, 3)

In [20]:
# Preview the first five resampled minority rows (they keep their original
# row labels from df).
df_minority_upsampled.head(5)

Unnamed: 0,feature_1,feature_2,target
951,2.778823,1.338017,1
992,3.327434,1.472512,1
914,0.870036,2.28828,1
971,1.90979,2.912679,1
960,1.51235,2.116684,1


In [25]:
# Combine the upsampled minority rows with the untouched majority rows and
# renumber the rows 0..n-1.
df_upsampled = pd.concat([df_minority_upsampled, df_majority], ignore_index=True)

In [26]:
# Show the combined frame: upsampled minority rows first, then the majority rows.
df_upsampled

Unnamed: 0,feature_1,feature_2,target
0,2.778823,1.338017,1
1,3.327434,1.472512,1
2,0.870036,2.288280,1
3,1.909790,2.912679,1
4,1.512350,2.116684,1
...,...,...,...
1795,0.719991,0.787335,0
1796,-1.159531,0.732800,0
1797,0.609591,-1.995079,0
1798,-1.919582,-0.254572,0


In [27]:
# Count rows per class label after upsampling.
df_upsampled.loc[:, 'target'].value_counts()

target
1    900
0    900
Name: count, dtype: int64

## Downsampling

In [29]:
## Re-create the imbalanced dataset for the downsampling demo
# NOTE(review): this cell draws fresh samples from the global NumPy RNG and
# rebinds class_0, class_1 and df, so the data used from here on is a
# different draw from the frame used in the upsampling section.

# Majority class (target 0): features drawn with mean 0, std 1.
class_0 = pd.DataFrame(
    {
        'feature_1': np.random.normal(0, 1, n_class_0),
        'feature_2': np.random.normal(0, 1, n_class_0),
        'target': [0] * n_class_0,
    }
)

# Minority class (target 1): features drawn with mean 2, std 1.
class_1 = pd.DataFrame(
    {
        'feature_1': np.random.normal(2, 1, n_class_1),
        'feature_2': np.random.normal(2, 1, n_class_1),
        'target': [1] * n_class_1,
    }
)

# Stack both classes and renumber the rows 0..n-1, then preview five rows.
df = pd.concat([class_0, class_1], ignore_index=True)
df.head(5)

Unnamed: 0,feature_1,feature_2,target
0,-0.352019,-0.660989,0
1,-0.544779,-1.000782,0
2,-0.655072,0.072629,0
3,-0.312579,0.602853,0
4,0.194686,2.567502,0


In [31]:
## Downsampling
from sklearn.utils import resample

# Split the frame by class label.
df_minority = df.loc[df['target'] == 1]
df_majority = df.loc[df['target'] == 0]

# Draw majority rows without replacement, keeping only as many as there are
# minority rows; the fixed random_state makes the draw repeatable.
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=df_minority.shape[0],
    random_state=42,
)

In [32]:
# (rows, columns) of the downsampled majority frame.
df_majority_downsampled.shape

(100, 3)

In [33]:
# Combine the untouched minority rows with the downsampled majority rows and
# renumber the rows 0..n-1.
df_downsampled = pd.concat([df_minority, df_majority_downsampled], ignore_index=True)

In [34]:
# Show the combined frame: minority rows first, then the downsampled majority rows.
df_downsampled

Unnamed: 0,feature_1,feature_2,target
0,1.498372,2.187717,1
1,2.047969,1.354423,1
2,3.395306,0.318245,1
3,2.130621,1.547013,1
4,1.421185,1.942670,1
...,...,...,...
195,-0.542375,0.851079,0
196,-1.460387,-0.457429,0
197,0.658482,0.072417,0
198,1.741457,0.184276,0


In [35]:
# Count rows per class label after downsampling.
df_downsampled.loc[:, 'target'].value_counts()

target
1    100
0    100
Name: count, dtype: int64