In [None]:
!pip install transformers
!pip install bitsandbytes

In [None]:
class DataPreprocessorLogisticReg:
    """Build four title-based datasets (one per label column) for classification.

    The constructor reads a CSV, keeps a random sample of it, loads label/index
    mappings from a JSON file and splits the sample into four two-column frames
    (``title`` plus one of ``product``, ``hazard``, ``product-category``,
    ``hazard-category``).  ``preprocess`` then augments rare classes with
    LLM-generated titles, cleans the titles and appends TF-IDF feature columns.

    The class relies on names that must already exist at module level when its
    methods run: ``pd``, ``re``, ``json``, ``BeautifulSoup``, ``stopwords``,
    ``WordNetLemmatizer``, ``TfidfVectorizer``, the ``fs`` filesystem object and
    the ``llm`` text-generation pipeline.
    """

    # Attribute names of the four per-label frames, in the order they are processed.
    _DATASET_ATTRS = (
        'product_data',
        'hazard_data',
        'product_category_data',
        'hazard_category_data',
    )

    def __init__(self, file_path, mappings_file, sample_frac=0.0005, random_state=42):
        """Load the CSV sample, the label mappings and the text-cleaning helpers.

        Args:
            file_path: Path/URL of the CSV passed to ``pd.read_csv``.
            mappings_file: Path of the JSON file (opened through ``fs.open``)
                holding the ``*_indices`` label-to-index dictionaries.
            sample_frac: Fraction of rows kept via ``DataFrame.sample``.  Pass
                ``None`` to keep every row.  Defaults to the previously
                hard-coded ``0.0005``.
            random_state: Seed for the row sampling (previously hard-coded 42).
        """
        self.file_path = file_path
        self.data = pd.read_csv(file_path)
        if sample_frac is not None:
            self.data = self.data.sample(frac=sample_frac, random_state=random_state)
        print("✔ Loaded dataset successfully.")

        # Load mappings from the provided JSON file
        with fs.open(mappings_file, 'r') as f:
            mappings = json.load(f)
        self.product_category_indices = mappings['product_category_indices']
        self.hazard_category_indices = mappings['hazard_category_indices']
        self.product_indices = mappings['product_indices']
        self.hazard_indices = mappings['hazard_indices']

        # Initialize reverse mappings (index -> label)
        self.product_category_labels = {v: k for k, v in self.product_category_indices.items()}
        self.hazard_category_labels = {v: k for k, v in self.hazard_category_indices.items()}
        self.product_labels = {v: k for k, v in self.product_indices.items()}
        self.hazard_labels = {v: k for k, v in self.hazard_indices.items()}

        # Initialize the Lemmatizer and Stop Words
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

        # Take explicit copies: these frames are modified later, and writing into
        # a slice of self.data would trigger pandas' SettingWithCopyWarning.
        self.product_data = self.data[['title', 'product']].copy()
        self.hazard_data = self.data[['title', 'hazard']].copy()
        self.product_category_data = self.data[['title', 'product-category']].copy()
        self.hazard_category_data = self.data[['title', 'hazard-category']].copy()

    def remove_html_tags(self, text):
        """Return ``text`` with HTML markup stripped; falsy input yields ``""``."""
        return BeautifulSoup(text, "html.parser").get_text() if text else ""

    def preprocess_text(self, text):
        """Normalise one title string.

        Steps, in order: strip HTML, lowercase, drop URLs, drop every character
        that is not a letter, digit or whitespace, remove English stop words,
        lemmatize the remaining words and collapse whitespace.

        Args:
            text: The raw title.  ``None`` and float NaN are treated as empty.

        Returns:
            The cleaned string (possibly empty).
        """
        # NaN is the only value that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = self.remove_html_tags(text)
        text = text.lower()
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
        text = ' '.join(
            [self.lemmatizer.lemmatize(word) for word in text.split() if
             word not in self.stop_words]
        )
        return re.sub(r'\s+', ' ', text).strip()

    def combine_date_columns(self):
        """Replace ``year``/``month``/``day`` in ``self.data`` with one ``date`` column.

        ``date`` holds the timestamp's int64 value floor-divided by ``1e9``
        (a float, because the divisor is a float).  When any of the three
        columns is missing nothing is changed; the status line is printed in
        either case.
        """
        if {'year', 'month', 'day'}.issubset(self.data.columns):
            self.data['date'] = pd.to_datetime(
                self.data[['year', 'month', 'day']])
            self.data['date'] = self.data['date'].astype('int64') // 1e9
            self.data = self.data.drop(['year', 'month', 'day'], axis=1)
        print("✔ Combined date columns successfully.")

    def encode_country_column(self):
        """Replace ``self.data['country']`` with integer codes.

        The code -> country lookup is stored in ``self.country_mapping``.  When
        there is no ``country`` column nothing is changed; the status line is
        printed in either case.
        """
        if 'country' in self.data.columns:
            # Factorize once and reuse both results (codes and unique values).
            codes, uniques = pd.factorize(self.data['country'])
            # Save the mapping of the numerical index to the country name
            self.country_mapping = dict(enumerate(uniques))
            self.data['country'] = codes
        print("✔ Encoded country column successfully.")

    def vectorize_data(self):
        """Append TF-IDF features of ``title`` to each of the four datasets.

        The vectorizer is re-fitted on every dataset, so the ``tfidf_<i>``
        columns of different datasets are independent of each other.  Each
        dataset's index is reset as part of the concatenation.
        """
        # Vectorizer instance
        vectorizer = TfidfVectorizer()

        # List to store the updated datasets with TF-IDF vectors
        updated_datasets = []

        for attr in self._DATASET_ATTRS:
            dataset = getattr(self, attr)

            # Fit and transform the title data
            tfidf_matrix = vectorizer.fit_transform(dataset['title'])

            # Dense frame with positional column names; it gets a fresh
            # RangeIndex, which lines up with the reset index used below.
            tfidf_df = pd.DataFrame(
                tfidf_matrix.toarray(),
                columns=[f'tfidf_{i}' for i in range(tfidf_matrix.shape[1])],
            )

            # Concatenate the original dataset with the TF-IDF DataFrame
            updated_datasets.append(
                pd.concat([dataset.reset_index(drop=True), tfidf_df], axis=1)
            )

        # Assign the updated datasets back to their respective variables
        self.product_data, self.hazard_data, self.product_category_data, self.hazard_category_data = updated_datasets
        print("✔ Vectorized data using TF-IDF and added columns successfully.")

    def generate_synthetic_data(self, min_samples=2):
        """Top up rare classes in each dataset with LLM-generated titles.

        For every dataset, each label value occurring fewer than ``min_samples``
        times gets ``min_samples - count`` synthetic rows.  A synthetic row is a
        copy of the class's first row whose ``title`` is replaced by the text
        generated by the module-level ``llm`` pipeline.

        Synthetic rows are collected per dataset, so rows generated for one
        label column are never appended to another dataset.

        Args:
            min_samples: Minimum number of rows wanted per label value.
        """
        # Get special tokens for later:
        bos_token_id = llm.tokenizer.convert_tokens_to_ids('<|begin_of_text|>')
        eos_token_id = llm.tokenizer.convert_tokens_to_ids('<|eot_id|>')
        # Padding reuses the end-of-turn token.
        pad_token_id = eos_token_id

        self.seperated_dataset = [self.product_data, self.hazard_data, self.product_category_data, self.hazard_category_data]

        updated_datasets = []  # To store the updated datasets

        for dataset_idx, spec_data in enumerate(self.seperated_dataset):
            print(f"Number {dataset_idx}")

            column = spec_data.columns[1]
            # Find rare classes
            class_counts = spec_data[column].value_counts()
            rare_classes = class_counts[class_counts < min_samples].index

            prompts = []
            representative_rows = []
            for rare_class in rare_classes:
                # Get data points belonging to the rare class
                rare_class_data = spec_data[spec_data[column] == rare_class]
                if len(rare_class_data) == 0:
                    continue

                # Number of synthetic samples this class still needs
                target_count = min_samples - len(rare_class_data)
                representative_row = rare_class_data.iloc[0]  # Use the first row as a representative example
                prompt = f"""
                      Task: Generate synthetic data for the given data point to balance the dataset.
                      Label Name: {column}
                      Label Value: {representative_row[column]}
                      Title: {representative_row['title']}
                      Instructions:
                      1. Use the title as a reference to create a synthetic data point.
                      3. Ensure the output aligns with the label value.
                      Output: Provide a single, concise title.
                      """
                # One prompt per missing sample keeps prompts and rows aligned 1:1.
                prompts.extend([prompt] * target_count)
                representative_rows.extend([representative_row] * target_count)

            # Synthetic rows for THIS dataset only.
            synthetic_rows = []

            if prompts:
                # Generate synthetic samples in one call
                synthetic_samples = llm(
                    prompts,
                    max_new_tokens=20,
                    num_return_sequences=1,
                    # Return only the newly generated text; by default the
                    # pipeline echoes the prompt inside 'generated_text'.
                    return_full_text=False,
                    pad_token_id=pad_token_id,
                    bos_token_id=bos_token_id,
                    eos_token_id=eos_token_id,
                )

                # Turn every generated sequence into a new row
                for representative_row, sequences in zip(representative_rows, synthetic_samples):
                    for sequence in sequences:
                        new_row = representative_row.copy()
                        new_row['title'] = sequence['generated_text'].strip()  # Update the text with the generated output
                        synthetic_rows.append(new_row)
                        print(new_row)

            # Add the synthetic data to the original dataset
            if synthetic_rows:
                spec_data = pd.concat([spec_data, pd.DataFrame(synthetic_rows)], ignore_index=True)

            updated_datasets.append(spec_data)

        self.product_data, self.hazard_data, self.product_category_data, self.hazard_category_data = updated_datasets
        print("✔ Balanced data using LLM-generated synthetic data.")

    def preprocess(self):
        """Run the full pipeline: augmentation, title cleaning, TF-IDF.

        Returns:
            ``self.data`` — the sampled input frame.  Note that the cleaned and
            vectorized results live in the four ``*_data`` attributes, not in
            the returned frame.
        """
        self.generate_synthetic_data()
        for attr in self._DATASET_ATTRS:
            frame = getattr(self, attr)
            # assign() builds a new frame instead of writing into the old one.
            cleaned_titles = frame['title'].fillna('').apply(self.preprocess_text)
            setattr(self, attr, frame.assign(title=cleaned_titles))
        print("✔ Preprocessed title columns successfully.")

        self.vectorize_data()

        print("✔ Preprocessing completed successfully.")
        return self.data

    def save_preprocessed_data(self, output_prefix='gs://<bucket_name>'):
        """Print the frame shapes and write the four datasets as CSV files.

        Args:
            output_prefix: Directory/bucket prefix the files are written under
                as ``<prefix>/final_preprocessed_<dataset>.csv``.  Defaults to
                the previously hard-coded location.
        """
        print("dimension data- ", self.data.shape)
        for attr in self._DATASET_ATTRS:
            print(f"dimension {attr}- ", getattr(self, attr).shape)

        for attr in self._DATASET_ATTRS:
            getattr(self, attr).to_csv(f'{output_prefix}/final_preprocessed_{attr}.csv', index=False)

        print("✔ Final preprocessed data saved to 'final_preprocessed_..._data.csv'.")

if __name__ == '__main__':
    # Driver: load the sampled training CSV, run the full preprocessing
    # pipeline and write the four resulting datasets to the bucket.
    # NOTE: the class uses module-level names (pd, json, fs, llm, ...) that are
    # created in the imports/setup cell, so that cell must be executed first.
    file_path = 'gs://<bucket_name>/incidents_train.csv'
    mappings_file = 'gs://<bucket_name>/label_mappings.json'
    preprocessor = DataPreprocessorLogisticReg(
        file_path=file_path,
        mappings_file=mappings_file,
    )
    preprocessed_data = preprocessor.preprocess()
    preprocessor.save_preprocessed_data()

In [None]:
import pandas as pd
import re
import numpy as np
from bs4 import BeautifulSoup
import nltk
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from typing import List
import transformers
import torch
from google.colab import drive
import getpass
from huggingface_hub import login
from transformers import pipeline
import bitsandbytes as bnb
from google.cloud import storage
import json
import gcsfs

# File-system style handle for Google Cloud Storage paths.
fs = gcsfs.GCSFileSystem()

# Storage client plus a handle on the project bucket.
client = storage.Client()
bucket_name = "<bucket_name>"
bucket = client.bucket(bucket_name)

# Authenticate with Hugging Face; the key is typed in, never stored in the notebook.
login(getpass.getpass('Enter your huggingface API-key:'))

# NLTK resources: the WordNet data and the stop-word lists.
nltk.download('wordnet')
nltk.download('stopwords')

# Text-generation pipeline shared by the notebook; weights are loaded in
# bfloat16 and placed on the available devices automatically.
llm = pipeline(
    task="text-generation",
    model="meta-llama/Llama-3.1-8B-Instruct",
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

In [None]:
# Spot-check: title of the fourth row of the processed product dataset.
preprocessor.product_data.iloc[3]["title"]
