<a href="https://colab.research.google.com/github/steliosg23/PDS-A2/blob/main/Incidents_Augmentation_using_EleutherAI_gpt_neo_125M.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from transformers import pipeline
from sklearn.utils import resample

# Expose Google Drive to the Colab runtime under /content/drive
from google.colab import drive
drive.mount('/content/drive')

# Read the training incidents and discard the 'Unnamed: 0' column
# (presumably an index written out by an earlier to_csv -- confirm)
file_path = '/content/drive/MyDrive/Data/incidents_train.csv'
df = pd.read_csv(file_path).drop(columns=['Unnamed: 0'])

# Build the text-generation pipeline around a small, freely available model
generator = pipeline(
    'text-generation',
    model='EleutherAI/gpt-neo-125M',
    pad_token_id=50256,  # set explicitly so generation does not warn about a missing pad token
    device=-1,  # -1 selects the CPU
    max_length=200,
)

# Function to augment a specific class with synthetic title and text
def augment_class(data, column, class_label, n_samples):
    """Create synthetic rows for one under-represented class.

    Each synthetic row is a copy of the first existing row whose ``column``
    value equals ``class_label``, with its ``title`` and ``text`` fields
    replaced by text produced by the module-level ``generator`` pipeline.

    Args:
        data: DataFrame holding the original incidents; must contain
            ``column`` (and is expected to contain ``title`` and ``text``).
        column: Name of the target column the class belongs to.
        class_label: Class value to generate additional rows for.
        n_samples: Number of synthetic rows to create. Values <= 0 yield an
            empty DataFrame.

    Returns:
        A DataFrame with ``n_samples`` synthetic rows. All rows keep the
        index label of the template row, so callers should concatenate with
        ``ignore_index=True``.

    Raises:
        IndexError: If ``n_samples`` is positive and ``data`` has no row with
            ``class_label`` in ``column`` (there is nothing to use as a
            template).
    """
    if n_samples <= 0:
        return pd.DataFrame([])

    class_samples = data[data[column] == class_label]
    if class_samples.empty:
        raise IndexError(
            f"No rows with {column} == {class_label!r}; cannot build a template row."
        )

    # The template row and both prompts do not change between iterations.
    template = class_samples.iloc[0]
    title_prompt = f"Generate a title for an incident related to {class_label} in the {column} category."
    text_prompt = f"Generate an incident description related to {class_label} in the {column} category."

    def _generate(prompt):
        # return_full_text=False keeps only the continuation. By default the
        # pipeline echoes the prompt, which would put the instruction text --
        # including the class label -- into every synthetic title/description.
        output = generator(prompt, num_return_sequences=1, return_full_text=False)
        return output[0]['generated_text'].strip()

    generated_samples = []
    for _ in range(n_samples):
        new_row = template.copy()
        new_row['title'] = _generate(title_prompt)
        new_row['text'] = _generate(text_prompt)
        new_row[column] = class_label
        generated_samples.append(new_row)

    return pd.DataFrame(generated_samples)

# Label columns to balance, and the per-class row count to reach
target_columns = ['hazard-category', 'product-category', 'hazard', 'product']
min_samples = 100  # classes with fewer rows than this get topped up

# Collect one synthetic frame per under-represented class of every target
augmented_dataframes = []
for target in target_columns:
    class_counts = df[target].value_counts()
    for class_label, count in class_counts.items():
        if count >= min_samples:
            continue  # class already has enough rows
        n_to_generate = min_samples - count
        augmented_dataframes.append(
            augment_class(df, target, class_label, n_to_generate)
        )

# Stack the original rows followed by all synthetic rows, renumbering the index
augmented_df = pd.concat([df, *augmented_dataframes], ignore_index=True)

# Write the combined dataset back to Drive
output_path = '/content/drive/MyDrive/Data/augmented_incidents.csv'
augmented_df.to_csv(output_path, index=False)
