# 02. Data Balancing

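Balance the training data with random undersampling (RUS), random oversampling (ROS), SMOTENC, and CTGAN, then save each balanced dataset as a CSV file.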

## 01. Imports and Settings

In [1]:
# Imports
import os
import warnings

import pandas as pd

from libs.balance_data import resampling_data  # balance_data.py
from libs.utils import split_X_y  # utils.py


# Silence every warning for the rest of the session to keep cell output compact
warnings.simplefilter("ignore")

# Pandas display options
pd.set_option("display.float_format", "{:.2f}".format)  # Two decimals for floats
pd.set_option("display.max_colwidth", 150)  # Increase column width
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 500)

# Base folder for all data files (the trailing slash is required, paths are
# built by plain string interpolation)
DATA_PATH = "data/"

## 02. Load Data

In [2]:
# Read the training split from disk
train_df = pd.read_csv(f"{DATA_PATH}train_data.csv")

# Categorical columns
categorical_cols = [
    "join_s",
    "sch_s",
    "sch_r",
]
# Numerical columns
# NOTE(review): "expaned_energy" is kept exactly as written; presumably it
# matches the CSV header - confirm before "correcting" the spelling.
numerical_cols = [
    "adv_r",
    "data_s",
    "dist_ch_to_bs",
    "dist_to_ch",
    "expaned_energy",
    "rank",
    "send_code",
    "who_ch",
]

In [3]:
# Split into X and y
# "is_target" is the label column; the same name is re-attached to the
# resampled frames before they are written to CSV further down.
# NOTE(review): the empty list is the third positional argument of the project
# helper `split_X_y`; presumably a list of extra columns to drop - confirm
# against libs/utils.py.
X_train, y_train = split_X_y(train_df, "is_target", [])

## 03. Resampling Data

In [4]:
# Resampling...
# A single call produces every balanced variant of the training set. The
# result is indexed by method name ("ONLY_RUS", "ROS", "SMOTENC", "CTGAN"),
# and each entry exposes the resampled features under "X" and labels under "y".
# NOTE(review): no random seed is passed here; check whether the helper fixes
# one internally, otherwise the outputs will differ between runs.
# NOTE(review): `object_to_save=True` presumably persists the result inside
# the helper - confirm against libs/balance_data.py.
resampled = resampling_data(
    X_train,
    y_train,
    numerical_cols,
    categorical_cols,
    object_to_save=True,
)


>> ONLY_RUS: ...
>> ONLY_RUS: Done!


>> ROS: ...
>> ROS: Done!


>> SMOTENC: ...
>> SMOTENC: Done!


>> CTGAN: ...

> Class 1:
Epoch: 0 | critic_loss: 1.4269341230392456 | generator_loss: 0.3303593397140503
Epoch: 1 | critic_loss: 0.12900638580322266 | generator_loss: -0.15554296970367432
Epoch: 2 | critic_loss: -0.03884728252887726 | generator_loss: 0.13909348845481873
Epoch: 3 | critic_loss: 0.18110786378383636 | generator_loss: -0.2946339249610901
Epoch: 4 | critic_loss: 0.2537495493888855 | generator_loss: -0.6818858981132507
Epoch: 5 | critic_loss: 0.060408174991607666 | generator_loss: -0.507952094078064
Epoch: 6 | critic_loss: 0.20891740918159485 | generator_loss: -0.6302518248558044
Epoch: 7 | critic_loss: 0.22924292087554932 | generator_loss: -0.5066438913345337
Epoch: 8 | critic_loss: 0.07944479584693909 | generator_loss: -0.4747595191001892
Epoch: 9 | critic_loss: 0.04966318607330322 | generator_loss: -0.642231822013855
Epoch: 10 | critic_loss: -0.05496141314506531 | gener

## 04. Save "Balanced" Data

### 04.1. Random Undersampling (RUS)

In [5]:
# Save data after RUS
# `to_csv` does not create missing folders, so make sure the target exists.
os.makedirs(f"{DATA_PATH}resampling", exist_ok=True)

# Attach the label to a copy (`assign` returns a new frame) so the feature
# matrix kept in `resampled` is not polluted with the target column.
rus_balanced = resampled["ONLY_RUS"]["X"].assign(is_target=resampled["ONLY_RUS"]["y"])
rus_balanced.to_csv(f"{DATA_PATH}resampling/train_data_ONLY_RUS.csv", index=False)

### 04.2. Random Oversampling (ROS)

In [6]:
# Save data after ROS
# `to_csv` does not create missing folders, so make sure the target exists.
os.makedirs(f"{DATA_PATH}resampling", exist_ok=True)

# Attach the label to a copy (`assign` returns a new frame) so the feature
# matrix kept in `resampled` is not polluted with the target column.
ros_balanced = resampled["ROS"]["X"].assign(is_target=resampled["ROS"]["y"])
ros_balanced.to_csv(f"{DATA_PATH}resampling/train_data_ROS.csv", index=False)

### 04.3. Synthetic Minority Over-sampling Technique for Nominal and Continuous (SMOTENC)

In [7]:
# Save data after SMOTENC
# `to_csv` does not create missing folders, so make sure the target exists.
os.makedirs(f"{DATA_PATH}resampling", exist_ok=True)

# Attach the label to a copy (`assign` returns a new frame) so the feature
# matrix kept in `resampled` is not polluted with the target column.
smotenc_balanced = resampled["SMOTENC"]["X"].assign(is_target=resampled["SMOTENC"]["y"])
smotenc_balanced.to_csv(f"{DATA_PATH}resampling/train_data_SMOTENC.csv", index=False)

### 04.4. Conditional Tabular Generative Adversarial Network (CTGAN)

In [8]:
# Save data after CTGAN
# `to_csv` does not create missing folders, so make sure the target exists.
os.makedirs(f"{DATA_PATH}resampling", exist_ok=True)

# Attach the label to a copy (`assign` returns a new frame) so the feature
# matrix kept in `resampled` is not polluted with the target column.
ctgan_balanced = resampled["CTGAN"]["X"].assign(is_target=resampled["CTGAN"]["y"])
ctgan_balanced.to_csv(f"{DATA_PATH}resampling/train_data_CTGAN.csv", index=False)