# 02. Data Balancing

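Balance the training data with four resampling methods — Random Undersampling (RUS), Random Oversampling (ROS), SMOTENC, and CTGAN — and save each balanced dataset to a CSV file.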

## 01. Imports and Settings

In [1]:
# Imports
import os
import warnings

import pandas as pd

from libs.balance_data import resampling_data  # balance_data.py
from libs.utils import split_X_y  # utils.py


# Suppress all warnings so they do not flood the notebook output
warnings.simplefilter("ignore")

# Pandas display options
pd.set_option("display.float_format", "{:.2f}".format)  # show floats with two decimals
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.max_colwidth = 150  # wider cells before the text gets truncated

# Root folder for the input and output CSV files. The trailing slash matters:
# file paths in this notebook are built by plain string concatenation.
DATA_PATH = "data/"

## 02. Load Data

In [2]:
# Read the training split from the data folder into a DataFrame
train_df = pd.read_csv(filepath_or_buffer=DATA_PATH + "train_data.csv")

# Categorical feature columns
categorical_cols = [
    "join_s",
    "sch_s",
    "sch_r",
]

# Numerical feature columns
# NOTE(review): "expaned_energy" is presumably the spelling used in the CSV
# header — confirm against the data before renaming it.
numerical_cols = [
    "adv_r",
    "adv_s",
    "data_r",
    "data_s",
    "dist_ch_to_bs",
    "dist_to_ch",
    "expaned_energy",
    "rank",
    "send_code",
    "who_ch",
]

In [3]:
# Separate the training frame into X and y, using "is_target" as the y column
X_train, y_train = split_X_y(
    train_df,
    "is_target",
    [],  # NOTE(review): third argument is left empty — confirm its meaning in libs/utils.py
)

## 03. Resampling Data

In [4]:
# Run the balancing methods on the training split. The result is indexed by
# method name ("ONLY_RUS", "ROS", "SMOTENC", "CTGAN"); each entry holds the
# resampled "X" and "y".
resampled = resampling_data(
    X_train,
    y_train,
    numerical_cols,
    categorical_cols,
    object_to_save=True,  # NOTE(review): presumably persists fitted objects — confirm in libs/balance_data.py
)


>> ONLY_RUS: ...
>> ONLY_RUS: Done!


>> ROS: ...
>> ROS: Done!


>> SMOTENC: ...
>> SMOTENC: Done!


>> CTGAN: ...

> Class 1:
Epoch: 0 | critic_loss: 1.2442028522491455 | generator_loss: 0.19312773644924164
Epoch: 1 | critic_loss: 0.3490235209465027 | generator_loss: -0.4613366425037384
Epoch: 2 | critic_loss: 0.21721212565898895 | generator_loss: -0.307803213596344
Epoch: 3 | critic_loss: 0.20947405695915222 | generator_loss: -0.15340140461921692
Epoch: 4 | critic_loss: 0.17708280682563782 | generator_loss: -0.1925140768289566
Epoch: 5 | critic_loss: 0.11903148144483566 | generator_loss: -0.5284206867218018
Epoch: 6 | critic_loss: 0.02284109592437744 | generator_loss: -0.9621726870536804
Epoch: 7 | critic_loss: 0.24115467071533203 | generator_loss: -0.6520392298698425
Epoch: 8 | critic_loss: 0.049259066581726074 | generator_loss: -0.5687962770462036
Epoch: 9 | critic_loss: 0.1628103256225586 | generator_loss: -0.6345369815826416
Epoch: 10 | critic_loss: 0.16365739703178406 | genera

## 04. Save "Balanced" Data

### 04.1. Random Undersampling (RUS)

In [5]:
# Save data after RUS
# Attach the target to a copy of the features (``assign`` returns a new
# DataFrame) so the "X" frame kept in ``resampled`` is not modified and never
# ends up carrying the label column.
rus_balanced = resampled["ONLY_RUS"]["X"].assign(is_target=resampled["ONLY_RUS"]["y"])

# ``to_csv`` does not create missing parent folders, so create the output
# folder first; ``exist_ok=True`` makes this a no-op when it is already there.
os.makedirs(f"{DATA_PATH}resampling", exist_ok=True)
rus_balanced.to_csv(f"{DATA_PATH}resampling/train_data_ONLY_RUS.csv", index=False)

### 04.2. Random Oversampling (ROS)

In [6]:
# Save data after ROS
# Attach the target to a copy of the features (``assign`` returns a new
# DataFrame) so the "X" frame kept in ``resampled`` is not modified and never
# ends up carrying the label column.
ros_balanced = resampled["ROS"]["X"].assign(is_target=resampled["ROS"]["y"])

# ``to_csv`` does not create missing parent folders, so create the output
# folder first; ``exist_ok=True`` makes this a no-op when it is already there.
os.makedirs(f"{DATA_PATH}resampling", exist_ok=True)
ros_balanced.to_csv(f"{DATA_PATH}resampling/train_data_ROS.csv", index=False)

### 04.3. Synthetic Minority Over-sampling Technique for Nominal and Continuous (SMOTENC)

In [7]:
# Save data after SMOTENC
# Attach the target to a copy of the features (``assign`` returns a new
# DataFrame) so the "X" frame kept in ``resampled`` is not modified and never
# ends up carrying the label column.
smotenc_balanced = resampled["SMOTENC"]["X"].assign(is_target=resampled["SMOTENC"]["y"])

# ``to_csv`` does not create missing parent folders, so create the output
# folder first; ``exist_ok=True`` makes this a no-op when it is already there.
os.makedirs(f"{DATA_PATH}resampling", exist_ok=True)
smotenc_balanced.to_csv(f"{DATA_PATH}resampling/train_data_SMOTENC.csv", index=False)

### 04.4. Conditional Tabular Generative Adversarial Network (CTGAN)

In [8]:
# Save data after CTGAN
# Attach the target to a copy of the features (``assign`` returns a new
# DataFrame) so the "X" frame kept in ``resampled`` is not modified and never
# ends up carrying the label column.
ctgan_balanced = resampled["CTGAN"]["X"].assign(is_target=resampled["CTGAN"]["y"])

# ``to_csv`` does not create missing parent folders, so create the output
# folder first; ``exist_ok=True`` makes this a no-op when it is already there.
os.makedirs(f"{DATA_PATH}resampling", exist_ok=True)
ctgan_balanced.to_csv(f"{DATA_PATH}resampling/train_data_CTGAN.csv", index=False)