In [4]:
'''1. Random UnderSampling
Identify the smallest class: Determine the class with the fewest samples (min_samples).

Randomly sample from other classes: For each of the other classes, randomly select min_samples instances to match the size of the smallest class.

Combine the samples: Merge the sampled data from all classes into a balanced dataset.'''




import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv('./Dataset/all_data.csv')

# Count the number of samples for each class
class_counts = df['Label'].value_counts()

# Find the minimum number of samples among all classes
min_samples = class_counts.min()

# Create a balanced dataset by sampling equally from each class
balanced_data = pd.DataFrame()

for label in df['Label'].unique():
    class_data = df[df['Label'] == label]
    sampled_data = class_data.sample(min_samples, random_state=42)  # Random sampling
    balanced_data = pd.concat([balanced_data, sampled_data])

# Shuffle the balanced dataset
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the balanced dataset to a new CSV file
balanced_data.to_csv('./BalancedDataset/underSampledBalance.csv', index=False)




print("Balanced dataset created and saved as './BalancedDataset/underSampledBalance.csv'.")


Balanced dataset created and saved as './BalancedDataset/underSampledBalance.csv'.


In [5]:
'''1. Random Oversampling
How it works: Randomly duplicate samples from the minority classes to match the size of the majority class.

Advantages: No data is lost from the majority class.

Disadvantages: Can lead to overfitting because the same samples are repeated.'''

import pandas as pd
from sklearn.utils import resample

# Load the dataset in chunks
chunk_size = 100000  # Adjust based on your memory capacity
chunks = pd.read_csv('./Dataset/all_data.csv', chunksize=chunk_size)

# Initialize an empty DataFrame for the balanced dataset
balanced_data = pd.DataFrame()

# Process each chunk
for chunk in chunks:
    classes = chunk['Label'].unique()
    max_samples = chunk['Label'].value_counts().max()

    for label in classes:
        class_data = chunk[chunk['Label'] == label]
        if len(class_data) < max_samples:
            # Oversample the minority class
            class_data = resample(class_data, replace=True, n_samples=max_samples, random_state=42)
        balanced_data = pd.concat([balanced_data, class_data])

# Shuffle and save the balanced dataset
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)
balanced_data.to_csv('./BalancedDataset/overSampledBalanced.csv', index=False)

print('./BalancedDataset/overSampledBalanced.csv')


./BalancedDataset/overSampledBalanced.csv


In [8]:
'''2. SMOTE (Synthetic Minority Oversampling Technique)
How it works: Generates synthetic samples for the minority classes by interpolating between existing samples.

Advantages: Creates new, diverse samples instead of duplicating existing ones.

Disadvantages: Can create unrealistic samples if the dataset is very small.'''


from imblearn.over_sampling import SMOTE

# Separate features and target
X = df.drop('Label', axis=1)
y = df['Label']

#from imblearn.over_sampling import SMOTE
import pandas as pd

# Load the dataset in chunks
chunk_size = 100000  # Adjust based on your memory capacity
chunks = pd.read_csv('./Dataset/all_data.csv', chunksize=chunk_size)

# Initialize an empty DataFrame for the balanced dataset
balanced_data = pd.DataFrame()

# Process each chunk
for chunk in chunks:
    X = chunk.drop('Label', axis=1)
    y = chunk['Label']

    # Apply SMOTE
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    # Append to the balanced dataset
    balanced_chunk = pd.concat([pd.DataFrame(X_resampled), pd.Series(y_resampled, name='Label')], axis=1)
    balanced_data = pd.concat([balanced_data, balanced_chunk])

# Shuffle and save the balanced dataset
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)
balanced_data.to_csv('./BalancedDataset/smote_balanced.csv', index=False)

print('./BalancedDataset/SMOTEBalanced.csv')

ValueError: The target 'y' needs to have more than 1 class. Got 1 class instead

In [None]:
### Cannot use SMOTE with large datasets due to memory constraints. Use it with caution. ###

In [10]:
'''3. ADASYN (Adaptive Synthetic Sampling)
How it works: Similar to SMOTE but focuses on generating more samples for harder-to-learn minority classes.

Advantages: Better at handling complex datasets.

Disadvantages: Can also create unrealistic samples.'''


from imblearn.over_sampling import ADASYN

# Apply ADASYN
adasyn = ADASYN(random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(X, y)

balanced_data = pd.concat([pd.DataFrame(X_resampled), pd.Series(y_resampled, name='Label')], axis=1)
balanced_data.to_csv('./BalancedDataset/adasynBalanced.csv', index=False)

print('./BalancedDataset/adasynBalanced.csv')

ValueError: The target 'y' needs to have more than 1 class. Got 1 class instead

In [11]:
'''4. Hybrid Approach (SMOTE + Random Undersampling)
How it works: Combines SMOTE (to oversample minority classes) with random undersampling (to reduce the majority class).

Advantages: Balances the dataset without losing too much information or creating too many synthetic samples.'''

from imblearn.combine import SMOTEENN

# Apply SMOTE + ENN (Edited Nearest Neighbors)
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

balanced_data = pd.concat([pd.DataFrame(X_resampled), pd.Series(y_resampled, name='Label')], axis=1)
balanced_data.to_csv('./BalancedDataset/smote_enn_balanced_data.csv', index=False)

print('./BalancedDataset/smote_enn_balanced_data.csv')

ValueError: The target 'y' needs to have more than 1 class. Got 1 class instead