In [6]:
import warnings
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.utils.class_weight import compute_class_weight


In [7]:
def calculate_class_weights(labels):
    """
    Compute 'balanced' class weights for a (possibly imbalanced) label set.

    Args:
        labels (np.array or list): The class label of every sample.
    Returns:
        dict: Maps each distinct label to the weight returned by
            ``compute_class_weight(class_weight='balanced', ...)``.
    """
    # Distinct labels, in the sorted order np.unique produces
    unique_labels = np.unique(labels)
    balanced_weights = compute_class_weight(
        class_weight='balanced',
        classes=unique_labels,
        y=labels,
    )
    # Pair every label with its weight (both sequences share the same order)
    return {label: weight for label, weight in zip(unique_labels, balanced_weights)}

In [8]:
def apply_bsmote_with_limit(X, y, scaling_factor=5000, max_class_limit_ratio=1):
    """
    Apply BorderlineSMOTE with a limit on how much minority classes can be upsampled.

    The target size of each class is the smaller of
    ``scaling_factor * original_size`` and
    ``int(largest_class_size * max_class_limit_ratio)``. Only classes whose
    target exceeds their current size are oversampled; all others are left
    untouched. If no class qualifies, the data is returned unchanged (with a
    fresh 0..n-1 index) and BorderlineSMOTE is not invoked.

    Args:
        X (pd.DataFrame or array-like): Features of the dataset. Column names
            are preserved when X provides a ``columns`` attribute.
        y (pd.Series or array-like): Labels of the dataset.
        scaling_factor (int): Maximum upscaling factor for minority classes relative to their original size.
        max_class_limit_ratio (float): Maximum oversampling limit as a fraction of the largest class size.

    Returns:
        tuple: Resampled X (pd.DataFrame) and y (pd.Series).

    Raises:
        ValueError: If ``y`` contains no labels.

    Warns:
        UserWarning: If a class that should be oversampled has only one sample.
            Such a class has no same-class neighbour to interpolate with, so it
            is skipped instead of making the whole call fail.
    """
    # Convert y to a pandas Series for easier manipulation
    y = pd.Series(y)

    class_counts = Counter(y)
    if not class_counts:
        raise ValueError("y must contain at least one label")

    largest_class_size = max(class_counts.values())
    upper_limit = int(largest_class_size * max_class_limit_ratio)

    # Build the per-class target sizes, keeping only classes that actually grow
    smote_strategy = {}
    skipped_singletons = []
    for class_label, original_size in class_counts.items():
        # int() keeps the target integral even when scaling_factor is a float
        target_size = min(int(scaling_factor * original_size), upper_limit)
        if target_size <= original_size:
            continue
        if original_size < 2:
            # A single sample would force k_neighbors down to 0
            skipped_singletons.append(class_label)
            continue
        smote_strategy[class_label] = target_size

    if skipped_singletons:
        warnings.warn(
            "Skipping classes with a single sample (cannot be oversampled by "
            f"BorderlineSMOTE): {skipped_singletons}",
            UserWarning,
            stacklevel=2,
        )

    # Column names are optional so that plain arrays are accepted as well
    feature_names = getattr(X, "columns", None)

    # Nothing to oversample: hand back the data unchanged
    if not smote_strategy:
        features = pd.DataFrame(X, columns=feature_names).reset_index(drop=True)
        return features, y.reset_index(drop=True)

    # Cap the neighbour counts by the smallest class that will be oversampled
    smallest_oversampled = min(class_counts[label] for label in smote_strategy)
    n_neighbors = min(5, smallest_oversampled - 1)
    j_neighbors = min(10, smallest_oversampled - 1)

    # Apply BorderlineSMOTE with the limited strategy
    smote = BorderlineSMOTE(
        sampling_strategy=smote_strategy,
        random_state=42,
        k_neighbors=n_neighbors,
        m_neighbors=j_neighbors,
    )
    X_resampled, y_resampled = smote.fit_resample(X, y)

    return pd.DataFrame(X_resampled, columns=feature_names), pd.Series(y_resampled)

# Shuttle

In [None]:
# Read the Shuttle training split (point dataset_path at your own copy if needed)
dataset_path = "/kaggle/input/ma-datasets/shuttle_train.csv"
data = pd.read_csv(dataset_path)

# Every column but the last is a feature; the last column is taken as the target
X, y = data.iloc[:, :-1], data.iloc[:, -1]

# Show the number of samples per class before resampling
print("Original Class Distribution:", y.value_counts(), sep="\n")

In [None]:
# Oversample the minority classes with the capped BorderlineSMOTE helper
X_resampled, y_resampled = apply_bsmote_with_limit(X, y)

# Show the number of samples per class after resampling
print("\nResampled Class Distribution:", pd.Series(y_resampled).value_counts(), sep="\n")

# Side-by-side bar charts: original labels on the left, resampled on the right
fig, ax = plt.subplots(1, 2, figsize=(14, 6))
for panel, panel_labels, panel_title in zip(
    ax,
    (y, pd.Series(y_resampled)),
    ("Original Distribution", "Resampled Distribution"),
):
    panel_labels.value_counts().sort_index().plot(kind='bar', ax=panel, title=panel_title)
    panel.set_xlabel("Class")
    panel.set_ylabel("Frequency")
plt.tight_layout()
plt.show()

# Class weights computed on the resampled labels (optional)
class_weights = calculate_class_weights(y_resampled)
print("\nClass Weights (After Resampling):", class_weights, sep="\n")

In [None]:
# Append the resampled target as a 'label' column to the right of the features
resampled_data = pd.concat(
    objs=[X_resampled, y_resampled.rename('label')],
    axis="columns",
)

# Write the combined table to CSV without the index column
resampled_data.to_csv("shuttle_bsmote.csv", index=False)

print("Resampled data has been saved to 'shuttle_bsmote.csv'")

# Covertype

In [None]:
# Read the Covertype training split (point dataset_path at your own copy if needed)
dataset_path = "/kaggle/input/ma-datasets/covtype_train.csv"
data = pd.read_csv(dataset_path)

# Every column but the last is a feature; the last column is taken as the target
X, y = data.iloc[:, :-1], data.iloc[:, -1]

# Show the number of samples per class before resampling
print("Original Class Distribution:", y.value_counts(), sep="\n")

In [None]:
# Oversample the minority classes with the capped BorderlineSMOTE helper
X_resampled, y_resampled = apply_bsmote_with_limit(X, y)

# Show the number of samples per class after resampling
print("\nResampled Class Distribution:", pd.Series(y_resampled).value_counts(), sep="\n")

# Side-by-side bar charts: original labels on the left, resampled on the right
fig, ax = plt.subplots(1, 2, figsize=(14, 6))
for panel, panel_labels, panel_title in zip(
    ax,
    (y, pd.Series(y_resampled)),
    ("Original Distribution", "Resampled Distribution"),
):
    panel_labels.value_counts().sort_index().plot(kind='bar', ax=panel, title=panel_title)
    panel.set_xlabel("Class")
    panel.set_ylabel("Frequency")
plt.tight_layout()
plt.show()

# Class weights computed on the resampled labels (optional)
class_weights = calculate_class_weights(y_resampled)
print("\nClass Weights (After Resampling):", class_weights, sep="\n")

In [None]:
# Append the resampled target as a 'label' column to the right of the features
resampled_data = pd.concat(
    objs=[X_resampled, y_resampled.rename('label')],
    axis="columns",
)

# Write the combined table to CSV without the index column
resampled_data.to_csv("covtype_bsmote.csv", index=False)

print("Resampled data has been saved to 'covtype_bsmote.csv'")

# Darknet

In [None]:
# Read the Darknet training split (point dataset_path at your own copy if needed)
dataset_path = "/kaggle/input/ma-datasets/darknet_train.csv"
data = pd.read_csv(dataset_path)

# Every column but the last is a feature; the last column is taken as the target
X, y = data.iloc[:, :-1], data.iloc[:, -1]

# Show the number of samples per class before resampling
print("Original Class Distribution:", y.value_counts(), sep="\n")

In [None]:
# Oversample the minority classes with the capped BorderlineSMOTE helper
X_resampled, y_resampled = apply_bsmote_with_limit(X, y)

# Show the number of samples per class after resampling
print("\nResampled Class Distribution:", pd.Series(y_resampled).value_counts(), sep="\n")

# Side-by-side bar charts: original labels on the left, resampled on the right
fig, ax = plt.subplots(1, 2, figsize=(14, 6))
for panel, panel_labels, panel_title in zip(
    ax,
    (y, pd.Series(y_resampled)),
    ("Original Distribution", "Resampled Distribution"),
):
    panel_labels.value_counts().sort_index().plot(kind='bar', ax=panel, title=panel_title)
    panel.set_xlabel("Class")
    panel.set_ylabel("Frequency")
plt.tight_layout()
plt.show()

# Class weights computed on the resampled labels (optional)
class_weights = calculate_class_weights(y_resampled)
print("\nClass Weights (After Resampling):", class_weights, sep="\n")

In [None]:
# Append the resampled target as a 'label' column to the right of the features
resampled_data = pd.concat(
    objs=[X_resampled, y_resampled.rename('label')],
    axis="columns",
)

# Write the combined table to CSV without the index column
resampled_data.to_csv("darknet_bsmote.csv", index=False)

print("Resampled data has been saved to 'darknet_bsmote.csv'")

# KDD

In [None]:
# Read the KDD training split (point dataset_path at your own copy if needed)
dataset_path = "/kaggle/input/ma-datasets/kdd_train.csv"
data = pd.read_csv(dataset_path)

# Every column but the last is a feature; the last column is taken as the target
X, y = data.iloc[:, :-1], data.iloc[:, -1]

# Show the number of samples per class before resampling
print("Original Class Distribution:", y.value_counts(), sep="\n")

In [None]:
# Oversample the minority classes with the capped BorderlineSMOTE helper
X_resampled, y_resampled = apply_bsmote_with_limit(X, y)

# Show the number of samples per class after resampling
print("\nResampled Class Distribution:", pd.Series(y_resampled).value_counts(), sep="\n")

# Side-by-side bar charts: original labels on the left, resampled on the right
fig, ax = plt.subplots(1, 2, figsize=(14, 6))
for panel, panel_labels, panel_title in zip(
    ax,
    (y, pd.Series(y_resampled)),
    ("Original Distribution", "Resampled Distribution"),
):
    panel_labels.value_counts().sort_index().plot(kind='bar', ax=panel, title=panel_title)
    panel.set_xlabel("Class")
    panel.set_ylabel("Frequency")
plt.tight_layout()
plt.show()

# Class weights computed on the resampled labels (optional)
class_weights = calculate_class_weights(y_resampled)
print("\nClass Weights (After Resampling):", class_weights, sep="\n")

In [None]:
# Append the resampled target as a 'label' column to the right of the features
resampled_data = pd.concat(
    objs=[X_resampled, y_resampled.rename('label')],
    axis="columns",
)

# Write the combined table to CSV without the index column
resampled_data.to_csv("kdd_bsmote.csv", index=False)

print("Resampled data has been saved to 'kdd_bsmote.csv'")