#### Handling Imbalanced dataset
#### 1. Up sampling 
#### 2. Down Sampling

In [3]:
import numpy as np
import pandas as pd


np.random.seed(120)

n_samples = 1000

class_0_ratio = 0.9

n_class_0=int(n_samples*class_0_ratio)
n_class_1= n_samples - n_class_0


In [4]:
n_class_0,n_class_1

(900, 100)

In [5]:
#create the dataframe with imbalanced dataset

class_0 = pd.DataFrame({
    'feature1':np.random.normal(loc=0,scale=1,size=n_class_0),
    'feature2':np.random.normal(loc=0,scale=1,size=n_class_0),
    'target':[0]*n_class_0})



class_1 = pd.DataFrame({
    'feature1':np.random.normal(loc=2,scale=1,size=n_class_1),
    'feature2':np.random.normal(loc=2,scale=1,size=n_class_1),
    'target':[1]*n_class_1})


In [6]:
df=pd.concat([class_0,class_1]).reset_index(drop=True)

In [7]:
df.head()

Unnamed: 0,feature1,feature2,target
0,0.14745,0.460683,0
1,2.02484,0.347878,0
2,-0.42688,-1.806474,0
3,2.31029,0.99492,0
4,0.62866,-0.014727,0


In [9]:
df['target'].value_counts()

target
0    900
1    100
Name: count, dtype: int64

In [10]:
#now we use upsampling

df_minority= df[df['target']==1]
df_majority = df[df['target']==0]

In [12]:
df_majority

Unnamed: 0,feature1,feature2,target
0,0.147450,0.460683,0
1,2.024840,0.347878,0
2,-0.426880,-1.806474,0
3,2.310290,0.994920,0
4,0.628660,-0.014727,0
...,...,...,...
895,1.072119,-2.057989,0
896,0.234279,1.273417,0
897,0.380468,-0.281237,0
898,-0.488806,1.043399,0


In [13]:
from sklearn.utils import resample

#it will extrapolate the points


df_minority_upsampling = resample(df_minority, replace=True,
         n_samples=len(df_majority),
         random_state=130
         )

In [14]:
df_minority_upsampling

Unnamed: 0,feature1,feature2,target
925,2.816773,1.856024,1
929,3.358322,1.299378,1
910,2.915788,3.524829,1
958,1.404281,0.230894,1
981,1.981635,1.641152,1
...,...,...,...
905,1.500623,2.990866,1
922,2.005418,1.178659,1
963,0.195268,0.879252,1
919,3.069282,0.588713,1


In [15]:
df_upsampled=pd.concat([df_majority,df_minority_upsampling])

In [16]:
df_upsampled

Unnamed: 0,feature1,feature2,target
0,0.147450,0.460683,0
1,2.024840,0.347878,0
2,-0.426880,-1.806474,0
3,2.310290,0.994920,0
4,0.628660,-0.014727,0
...,...,...,...
905,1.500623,2.990866,1
922,2.005418,1.178659,1
963,0.195268,0.879252,1
919,3.069282,0.588713,1


In [17]:
df_upsampled['target'].value_counts()

target
0    900
1    900
Name: count, dtype: int64

In [18]:
# Downsampling 

#create the dataframe with imbalanced dataset

class_0 = pd.DataFrame({
    'feature1':np.random.normal(loc=0,scale=1,size=n_class_0),
    'feature2':np.random.normal(loc=0,scale=1,size=n_class_0),
    'target':[0]*n_class_0})



class_1 = pd.DataFrame({
    'feature1':np.random.normal(loc=2,scale=1,size=n_class_1),
    'feature2':np.random.normal(loc=2,scale=1,size=n_class_1),
    'target':[1]*n_class_1})


In [19]:
#downsampling
df_minority= df[df['target']==1]
df_majority = df[df['target']==0]

In [21]:
df_majority_downsampled=resample(df_majority,replace=False,
                                 n_samples=len(df_minority),
                                 random_state=32)

In [22]:
df_majority_downsampled

Unnamed: 0,feature1,feature2,target
390,-0.660955,-0.170864,0
267,0.316080,-3.554625,0
869,-0.241516,-0.547101,0
396,1.005169,-1.163713,0
416,0.347548,-0.400700,0
...,...,...,...
293,0.681560,-1.307766,0
812,1.202027,1.622992,0
894,-0.946051,2.091256,0
394,1.004728,0.062944,0
