# Dataset Balancer Demonstration
Source code: `https://github.com/sarahndippity/sx-portfolio/mission-control/src/dataset_balancer.py` <br>
Dataset borrowed from `https://machinelearningmastery.com/standard-machine-learning-datasets-for-imbalanced-classification/`

In [1]:
import pandas as pd

from src.dataset_balancer import DatasetBalancer

In [2]:
# Location of the raw glass dataset CSV used throughout this demo.
DATA_URL = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/glass.csv"
# header=None treats the first row as data, so the column labels are
# supplied here: nine feature columns followed by the class label.
glass_df = pd.read_csv(
    DATA_URL,
    header=None,
    names=[
        "feature_1", "feature_2", "feature_3", "feature_4", "feature_5",
        "feature_6", "feature_7", "feature_8", "feature_9", "target",
    ],
)
# Sanity-check the dimensions, then preview the first two rows.
print(glass_df.shape)
glass_df.head(2)

(214, 10)


Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,target
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1


In [3]:
# Count rows per class label to show how unevenly the classes are represented.
glass_df.groupby("target").size()

target
1    70
2    76
3    17
5    13
6     9
7    29
dtype: int64

# Oversampling all minority classes
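With a validation set: `train_ratio` and `test_ratio` sum to 0.9, and the remaining 10% of the rows is held out as the validation set.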

In [4]:
# Configure an oversampling run on the glass data, stratified on the class label.
# train_ratio + test_ratio = 0.9 here; the logged split sizes for this run
# (128 / 21 / 65 rows) show the remainder going to a validation set.
db = DatasetBalancer(data=glass_df,
                     target_variable="target",
                     stratify_cols=["target"],
                     balance_strategy="oversample", 
                     train_ratio=0.6, 
                     test_ratio=0.3)
# Run the split + balancing pipeline; the result unpacks into three splits.
train, val, test = db.execute_data_balancer()

2024-08-17 13:33:42 INFO Splitting data into train, val, and test sets.
2024-08-17 13:33:42 INFO Starting dataset size: (214, 10)
2024-08-17 13:33:42 INFO Training data size - (128, 10), Validation data size - (21, 10), Test data size - (65, 10)
2024-08-17 13:33:42 INFO Oversampling the minority class.
2024-08-17 13:33:42 INFO Dataset balancing complete. Final dataset size for train set: (276, 10)
2024-08-17 13:33:42 INFO validation set: (42, 10)
2024-08-17 13:33:42 INFO Successfully generated balanced training, validation, and test datasets.


In [5]:
# Rows per class in the returned training split.
train.groupby("target").size()

target
1    46
2    46
3    46
5    46
6    46
7    46
dtype: int64

# Undersampling all majority classes
With no validation set: `train_ratio` and `test_ratio` sum to 1.0, so no rows are left over for validation.

In [6]:
# Configure an undersampling run on the glass data, stratified on the class label.
# train_ratio + test_ratio = 1.0 here; the logged split sizes for this run
# list only a training set and a test set.
db = DatasetBalancer(data=glass_df,
                     target_variable="target",
                     stratify_cols=["target"],
                     balance_strategy="undersample", 
                     train_ratio=0.6, 
                     test_ratio=0.4)
# Three values are still unpacked even though no validation split is logged.
# NOTE(review): `val` is presumably None or empty in this configuration - confirm.
train, val, test = db.execute_data_balancer()

2024-08-17 13:34:45 INFO Splitting data into train, val, and test sets.
2024-08-17 13:34:45 INFO Starting dataset size: (214, 10)
2024-08-17 13:34:45 INFO Training data size - (128, 10), Test data size - (86, 10)
2024-08-17 13:34:45 INFO Undersampling the majority class.
2024-08-17 13:34:45 INFO Dataset balancing complete. Final dataset size for train set: (30, 10)
2024-08-17 13:34:45 INFO Successfully generated balanced training, validation, and test datasets.


In [7]:
# Rows per class in the returned training split.
train.groupby("target").size()

target
1    5
2    5
3    5
5    5
6    5
7    5
dtype: int64

In [8]:
# Display the validation split returned by the run above; no output was
# captured for this cell.
val

# Both oversampling and undersampling
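Using class 7 as the anchor class: classes with more rows than class 7 are undersampled and classes with fewer rows are oversampled, so every class ends up with the same number of rows as class 7.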

In [3]:
# Configure a combined over/undersampling run, stratified on the class label.
# sampling_class=7 names the reference class: the log for this run reports
# undersampling classes "above 7" and oversampling classes "below 7".
# train_ratio + test_ratio = 0.9, and the logged split sizes include a
# validation set.
db = DatasetBalancer(data=glass_df,
                     target_variable="target",
                     stratify_cols=["target"],
                     balance_strategy="both", 
                     train_ratio=0.6, 
                     test_ratio=0.3, 
                     sampling_class=7)
# Run the split + balancing pipeline; the result unpacks into three splits.
train, val, test = db.execute_data_balancer()

2024-08-17 13:49:10 INFO Splitting data into train, val, and test sets.
2024-08-17 13:49:10 INFO Starting dataset size: (214, 10)
2024-08-17 13:49:10 INFO Training data size - (128, 10), Validation data size - (21, 10), Test data size - (65, 10)
2024-08-17 13:49:10 INFO Undersampling the classes with majority representation above 7
2024-08-17 13:49:10 INFO Oversampling the classes with minority representation below 7
2024-08-17 13:49:10 INFO Undersampling the classes with majority representation above 7
2024-08-17 13:49:10 INFO Oversampling the classes with minority representation below 7
2024-08-17 13:49:10 INFO Dataset balancing complete. Final dataset size for train set: (102, 10)
2024-08-17 13:49:10 INFO validation set: (18, 10)
2024-08-17 13:49:10 INFO Successfully generated balanced training, validation, and test datasets.


In [4]:
# Rows per class in the returned training split.
train.groupby("target").size()

target
1    17
2    17
3    17
5    17
6    17
7    17
dtype: int64

In [5]:
# Rows per class in the returned validation split.
val.groupby("target").size()

target
1    3
2    3
3    3
5    3
6    3
7    3
dtype: int64