## Handling Imbalanced Dataset
When you start working on an ML or DL project, you are first given a dataset.
-- Let's say it is a classification problem (a supervised machine learning problem); this means the output takes the form of categories.
If there are 2 categories, we call it binary classification.

Suppose the dataset has 1000 data points, of which 900 are "yes" and 100 are "no". The majority of the data points say "yes" and the minority say "no", so the ratio is 9:1. This is an imbalanced dataset, and a model trained on it will be biased towards the majority class.
We should balance the number of data points per class using one of these 2 methods:

1. Up sampling - increase the number of data points in the minority class
2. Down sampling - decrease the number of data points in the majority class

In [1]:
import numpy as np
import pandas as pd

# Seed NumPy's global random generator so every run produces the same data
np.random.seed(123)

# Sizes of the two classes for an imbalanced binary dataset
n_samples = 1000
class_0_ratio = 0.9  # fraction of rows that belong to class 0
# round() instead of int(): a float product can land just below a whole number
# (e.g. 0.29 * 100 == 28.999999999999996) and int() would silently truncate it
n_class_0 = round(n_samples * class_0_ratio)
n_class_1 = n_samples - n_class_0

In [2]:
# Show the two class sizes as a tuple: (class 0 count, class 1 count)
n_class_0,n_class_1

(900, 100)

In [3]:
## Build one DataFrame per class for the imbalanced dataset
def _make_class_frame(center, n_rows, label):
    """Return a frame of `n_rows` rows whose `target` column is `label`.

    Both features are drawn from a normal distribution with mean `center`
    and standard deviation 1; feature_1 is sampled before feature_2.
    """
    return pd.DataFrame({
        'feature_1': np.random.normal(loc=center, scale=1, size=n_rows),
        'feature_2': np.random.normal(loc=center, scale=1, size=n_rows),
        'target': [label] * n_rows
    })


# Class 0 (n_class_0 rows) is centred at 0, class 1 (n_class_1 rows) at 2
class_0 = _make_class_frame(0, n_class_0, 0)
class_1 = _make_class_frame(2, n_class_1, 1)

In [4]:
# Stack class_0 on top of class_1 and renumber the rows 0..N-1
df = pd.concat([class_0, class_1], ignore_index=True)

In [5]:
# Preview the first five rows; class_0 was concatenated first, so these are class 0 rows
df.head()

Unnamed: 0,feature_1,feature_2,target
0,-1.085631,0.551302,0
1,0.997345,0.419589,0
2,0.282978,1.815652,0
3,-1.506295,-0.25275,0
4,-0.5786,-0.292004,0


In [6]:
# Preview the last five rows; class_1 was concatenated last, so these are class 1 rows
df.tail()

Unnamed: 0,feature_1,feature_2,target
995,1.376371,2.845701,1
996,2.23981,0.880077,1
997,1.13176,1.640703,1
998,2.902006,0.390305,1
999,2.69749,2.01357,1


In [9]:
# Count the rows per class to confirm the imbalance
# (attribute access, df.target.value_counts(), gives the same result)
df['target'].value_counts()

target
0    900
1    100
Name: count, dtype: int64

In [None]:
# UPSAMPLING
# Split the rows by class so the minority class (target == 1) can be enlarged
df_minority = df.loc[df['target'].eq(1)]
df_majority = df.loc[df['target'].eq(0)]

In [15]:
from sklearn.utils import resample

# Draw minority rows with replacement until there are as many of them as
# there are majority rows; random_state fixes which rows are drawn
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=df_majority.shape[0],
    random_state=42,
)

In [None]:
# The upsampled minority frame should now have as many rows as the majority class
df_minority_upsampled.shape

(900, 3)

In [17]:
# Peek at the resampled minority rows; they keep their original row labels from df
df_minority_upsampled.head()

Unnamed: 0,feature_1,feature_2,target
951,1.125854,1.843917,1
992,2.19657,1.397425,1
914,1.93217,2.998053,1
971,2.272825,3.034197,1
960,2.870056,1.550485,1


In [18]:
# Stack the untouched majority rows on top of the upsampled minority rows.
# NOTE: concat keeps each frame's own row labels, and sampling with replacement
# repeats minority rows, so df_upsampled contains duplicate index labels.
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

In [19]:
# Total rows after combining the majority class with the upsampled minority class
df_upsampled.shape

(1800, 3)

In [20]:
# Both classes should now have the same number of rows
df_upsampled['target'].value_counts()

target
0    900
1    900
Name: count, dtype: int64

## DOWN SAMPLING

In [21]:
import numpy as np  # this cell calls np.random below, so import it alongside pandas
import pandas as pd

# Set the random seed for reproducibility (same seed as the up sampling
# section, so the same dataset is regenerated here)
np.random.seed(123)

# Create a dataframe with two classes
n_samples = 1000
class_0_ratio = 0.9  # fraction of rows that belong to class 0
# round() instead of int(): a float product can land just below a whole number
# (e.g. 0.29 * 100 == 28.999999999999996) and int() would silently truncate it
n_class_0 = round(n_samples * class_0_ratio)
n_class_1 = n_samples - n_class_0

# Class 0: both features drawn from a normal distribution centred at 0
class_0 = pd.DataFrame({
    'feature_1': np.random.normal(loc=0, scale=1, size=n_class_0),
    'feature_2': np.random.normal(loc=0, scale=1, size=n_class_0),
    'target': [0] * n_class_0
})

# Class 1: both features drawn from a normal distribution centred at 2
class_1 = pd.DataFrame({
    'feature_1': np.random.normal(loc=2, scale=1, size=n_class_1),
    'feature_2': np.random.normal(loc=2, scale=1, size=n_class_1),
    'target': [1] * n_class_1
})

# Stack the two classes and renumber the rows 0..N-1
df = pd.concat([class_0, class_1]).reset_index(drop=True)

# Check the class distribution
print(df['target'].value_counts())

target
0    900
1    100
Name: count, dtype: int64


In [None]:
# DOWN SAMPLING
# Split the rows by class so the majority class (target == 0) can be reduced
df_minority = df.loc[df['target'].eq(1)]
df_majority = df.loc[df['target'].eq(0)]

In [23]:
from sklearn.utils import resample

# Randomly keep only as many majority rows as there are minority rows.
# replace=False samples without replacement, so no majority row is picked twice.
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=len(df_minority),
    random_state=42,
)

# Combine the untouched minority rows with the reduced majority rows to get the
# balanced dataset (mirrors how df_upsampled is built in the up sampling section)
df_downsampled = pd.concat([df_minority, df_majority_downsampled])

In [None]:
# Row and column count of the downsampled majority frame
df_majority_downsampled.shape



(100, 3)

In [25]:
# This frame holds only the sampled majority rows, so only class 0 is counted here
df_majority_downsampled['target'].value_counts()

target
0    100
Name: count, dtype: int64

In [26]:
# Peek at the majority rows that were kept; they retain their original row labels from df
df_majority_downsampled.head()

Unnamed: 0,feature_1,feature_2,target
70,0.468439,1.72092,0
827,1.089165,-0.464899,0
231,0.753869,-0.969798,0
588,0.588686,-0.70472,0
39,0.283627,1.012868,0


#### Down sampling is usually the weaker choice because we lose a lot of data points from the majority class, so we should generally prefer up sampling techniques