In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for input_path in (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Handling Imbalanced Dataset

## 1. Up Sampling (increase the number of samples in the minority class)
## 2. Down Sampling (decrease the number of samples in the majority class)

In [2]:
# Seed NumPy's global random generator so the random draws repeat on every run.
np.random.seed(seed=123)

In [3]:
# Size and class balance of the synthetic dataset.
n_samples = 1000
class_0_ratio = 0.9  # fraction of rows assigned to class 0

# round() rather than int(): int() truncates, so a product that lands just below a
# whole number because of floating-point error (e.g. 100 * 0.29 -> 28.999...) would
# silently lose a row from class 0.
n_class_0 = round(n_samples * class_0_ratio)
# Class 1 takes whatever is left, so the two counts always sum to n_samples.
n_class_1 = n_samples - n_class_0

In [4]:
# Show the row count planned for each class as a (class 0, class 1) pair.
(n_class_0, n_class_1)

(900, 100)

In [5]:
# Build one frame per class. Both classes draw their two features from the same
# normal distribution (mean 0, std 1); only the `target` label and row count differ.
def _make_class_frame(label, n_rows):
    """Return a frame of `n_rows` rows with two normal features and a constant target."""
    columns = {}
    # feature_1 is drawn before feature_2 so the global NumPy RNG is consumed in a
    # fixed order.
    for feature_name in ("feature_1", "feature_2"):
        columns[feature_name] = np.random.normal(loc=0, scale=1, size=n_rows)
    columns["target"] = [label] * n_rows
    return pd.DataFrame(columns)


class_0 = _make_class_frame(label=0, n_rows=n_class_0)
class_1 = _make_class_frame(label=1, n_rows=n_class_1)

In [6]:
# Stack both classes into a single frame with a fresh 0..n-1 row index.
df = pd.concat([class_0, class_1], ignore_index=True)

In [7]:
# Peek at five randomly chosen rows of the combined frame.
df.sample(n=5)

Unnamed: 0,feature_1,feature_2,target
382,-1.305786,-0.366811,0
879,0.997957,1.482163,0
37,0.688223,0.670512,0
98,0.379401,0.470264,0
990,1.063905,-1.080413,1


In [8]:
# Count the rows per target label to show the class imbalance.
df["target"].value_counts()

target
0    900
1    100
Name: count, dtype: int64

### In up sampling, we increase the number of data points with `target=1` (the minority class)

### In down sampling, we decrease the number of data points with `target=0` (the majority class)

In [9]:
# Split the rows by target label so each class can be resampled on its own.
target_labels = df["target"]
df_minority = df.loc[target_labels == 1]
df_majority = df.loc[target_labels == 0]

In [10]:
from sklearn.utils import resample

In [12]:
# Draw minority rows with replacement until there are as many as majority rows.
majority_row_count = len(df_majority)
df_minority_upsampled = resample(
    df_minority,
    n_samples=majority_row_count,
    replace=True,
    random_state=42,  # fixed seed so the same rows are drawn on every run
)

In [13]:
# Compare the minority frame's shape before and after upsampling.
(df_minority.shape, df_minority_upsampled.shape)

((100, 3), (900, 3))

In [14]:
# Combine the majority rows with the upsampled minority rows. The minority rows were
# drawn with replacement, so their index labels repeat; ignore_index=True rebuilds a
# unique 0..n-1 index so label-based lookups on the result return a single row.
df_upsampled = pd.concat([df_majority, df_minority_upsampled], ignore_index=True)

In [15]:
# Count the rows per target label in the upsampled frame.
df_upsampled["target"].value_counts()

target
0    900
1    900
Name: count, dtype: int64