In [None]:
#@title Download da amazon e filtro per rating
# Installare la libreria datasets se non è già installata
!pip install datasets

import pandas as pd
from datasets import load_dataset

# Load the "Grocery and Gourmet Food" raw-review configuration of the Amazon Reviews 2023 dataset.
# NOTE(review): trust_remote_code=True allows code shipped with the dataset repository to run locally — confirm this is acceptable.
dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_Grocery_and_Gourmet_Food", trust_remote_code=True)

# Star ratings kept by the first filtering stage (1, 3 and 5 stars)
ratings_to_include = [1, 3, 5]


def filter_ratings(example):
    """Return True when the example's 'rating' equals one of ``ratings_to_include``."""
    rating = example['rating']
    return any(rating == wanted for wanted in ratings_to_include)

# Keep only the examples accepted by filter_ratings; the unfiltered `dataset` stays available
filtered_dataset = dataset.filter(filter_ratings)




Filter:   0%|          | 0/14318520 [00:00<?, ? examples/s]

In [None]:
#@title Stampa di due esempi per esplorare la struttura
# Show the first two raw records of the 'full' split to inspect the schema
for position in (0, 1):
    print(dataset["full"][position])

{'rating': 5.0, 'title': 'Excellent!  Yummy!', 'text': 'Excellent!! Yummy!  Great with other foods and great alone.', 'images': [], 'asin': 'B00CM36GAQ', 'parent_asin': 'B00CM36GAQ', 'user_id': 'AFKZENTNBQ7A7V7UXW5JJI6UGRYQ', 'timestamp': 1587854482395, 'helpful_vote': 0, 'verified_purchase': True}
{'rating': 5.0, 'title': 'Delicious!!! Yum!', 'text': 'Excellent!  The best!  I use it with my beef and broccoli and to throw in chicken and rice soup. It’s also great by itself. Yum!', 'images': [], 'asin': 'B074J5WVYH', 'parent_asin': 'B0759B7KLH', 'user_id': 'AFKZENTNBQ7A7V7UXW5JJI6UGRYQ', 'timestamp': 1587854400380, 'helpful_vote': 0, 'verified_purchase': True}


In [None]:
#@title Rimozione dati sporchi e filtra lunghezza
!pip install langdetect
from langdetect import detect


def filter_shortest_reviews(example):
    """Accept a review whose text is 16 to 99 characters long and has no video marker.

    Returns False for any text containing the substring "VIDEOID" and for any
    text whose length is not strictly between 15 and 100.
    """
    review = example['text']
    if "VIDEOID" in review:
        return False
    return 15 < len(review) < 100

# Drop reviews rejected by filter_shortest_reviews (length out of range or video marker present)
filtered_shortest_dataset = filtered_dataset.filter(filter_shortest_reviews)




Filter:   0%|          | 0/12366296 [00:00<?, ? examples/s]

In [None]:
#@title Stampa numerosità in questo stage
from collections import Counter

# Split to inspect ('full' is the split used throughout this notebook); change it if needed
split_name = 'full'

# Fail early with a clear message if the split is missing
if split_name not in filtered_shortest_dataset:
    raise ValueError(f"Lo split '{split_name}' non esiste nel dataset.")

# Read the whole 'rating' column and tally the values
ratings = filtered_shortest_dataset[split_name]['rating']
counter = Counter(ratings)

# Ratings of interest; a rating that never occurs is reported as 0
desired_ratings = [1, 3, 5]
rating_counts = {rating: counter.get(rating, 0) for rating in desired_ratings}

# The header names the dataset that is actually counted here
# (it used to say 'processed_dataset', which does not exist yet at this stage).
print("Numero di recensioni per ciascun rating in 'filtered_shortest_dataset':")
for rating, count in rating_counts.items():
    print(f"Rating {rating}: {count}")


Numero di recensioni per ciascun rating in 'processed_dataset':
Rating 1: 704416
Rating 3: 317643
Rating 5: 4458593


In [None]:
# Show the first two records that survived the length filter
for position in (0, 1):
    print(filtered_shortest_dataset["full"][position])

{'rating': 5.0, 'title': 'Excellent!  Yummy!', 'text': 'Excellent!! Yummy!  Great with other foods and great alone.', 'images': [], 'asin': 'B00CM36GAQ', 'parent_asin': 'B00CM36GAQ', 'user_id': 'AFKZENTNBQ7A7V7UXW5JJI6UGRYQ', 'timestamp': 1587854482395, 'helpful_vote': 0, 'verified_purchase': True}
{'rating': 5.0, 'title': 'Great taste', 'text': 'Great for making brownies and crinkle cookies.', 'images': [], 'asin': 'B005CD4196', 'parent_asin': 'B005CD4196', 'user_id': 'AFKZENTNBQ7A7V7UXW5JJI6UGRYQ', 'timestamp': 1581313294965, 'helpful_vote': 7, 'verified_purchase': True}


In [None]:
#@title Monta drive colab
from google.colab import drive

# Mount Google Drive at /content/drive; later cells read and write files under /content/drive/MyDrive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#@title Primo preprocessing
!pip install contractions

import contractions
import re
import pandas as pd
from datasets import load_dataset
import html

def clean_text(text: str) -> str:
    """Normalise a raw review string.

    Steps, in order: decode HTML entities, expand English contractions,
    replace HTML tags with a space, remove URLs, and finally drop every
    non-ASCII character (which removes emoji, among other characters).

    Args:
        text: Raw review text.

    Returns:
        The cleaned text. Whitespace is not collapsed, so a removed tag or
        URL can leave consecutive spaces behind.
    """
    # Decode entities first: an encoded apostrophe (e.g. "don&#39;t") must
    # become a real one before the contraction expander can recognise it.
    text = html.unescape(text)

    # Expand contractions with the 'contractions' library
    text = contractions.fix(text)

    # Replace HTML tags such as <br /> or <p> with a space
    text = re.sub(r"<[^>]+>", " ", text)

    # Remove URLs. The dot after "www" is escaped: unescaped it matched any
    # character, so plain text such as "Awww so good" lost "www so".
    text = re.sub(r"http\S+|www\.\S+", "", text)

    # Drop everything outside ASCII (emoji and any other non-ASCII character)
    text = text.encode('ascii', 'ignore').decode('ascii')

    return text

def preprocess_example(example):
    """Overwrite the example's 'text' with its cleaned form and return the same example."""
    cleaned = clean_text(example['text'])
    example['text'] = cleaned
    return example

# Clean the text of every example (one example per call)
processed_dataset = filtered_shortest_dataset.map(preprocess_example, batched=False)

# Peek at the first five cleaned texts. Slice the rows first and pick the
# column afterwards, so only five rows are read instead of the whole column.
print(processed_dataset['full'][:5]['text'])


Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2

Map:   0%|          | 0/5480652 [00:00<?, ? examples/s]

['Excellent!! Yummy!  Great with other foods and great alone.', 'Great for making brownies and crinkle cookies.', 'Yummy for your tummy.', 'This is an excellent white tea and the jasmine is wonderful.', 'I was never a huge fan of Earl Grey tea until I found this one. It is exquisite.']


In [None]:
#@title Neutrality filter automatico
from textblob import TextBlob

def is_neutral_text(text, threshold=0.1):
    """Tell whether TextBlob scores ``text`` as (near-)neutral.

    Args:
        text: Sentence or review to score.
        threshold: Half-width of the neutral band around zero polarity.
            Defaults to 0.1, the value that used to be hard-coded.

    Returns:
        True when the polarity lies in [-threshold, +threshold], bounds included.
    """
    # TextBlob polarity ranges from -1 (negative) to +1 (positive)
    polarity = TextBlob(text).sentiment.polarity
    return -threshold <= polarity <= threshold

def filter_neutral_ratings(example):
    """Keep every example not rated 3; keep a 3-rated example only if its text is neutral."""
    if example['rating'] != 3:
        return True
    return is_neutral_text(example['text'])

# Apply the filter to the cleaned dataset; only examples rated 3 can be removed by it
neutral_filtered_dataset = processed_dataset.filter(filter_neutral_ratings)




Filter:   0%|          | 0/5480652 [00:00<?, ? examples/s]

In [None]:
#@title Riconto esempi
from collections import Counter

# Split to inspect; change it if the DatasetDict uses another split name
split_name = 'full'

# Check the split on the dataset that is actually counted below
# (the guard used to look at filtered_shortest_dataset, an earlier stage).
if split_name not in neutral_filtered_dataset:
    raise ValueError(f"Lo split '{split_name}' non esiste nel dataset.")

# Read the whole 'rating' column and tally the values
ratings = neutral_filtered_dataset[split_name]['rating']
counter = Counter(ratings)

# Ratings of interest; a rating that never occurs is reported as 0
desired_ratings = [1, 3, 5]
rating_counts = {rating: counter.get(rating, 0) for rating in desired_ratings}

# The header names the dataset that is actually counted here
print("Numero di recensioni per ciascun rating in 'neutral_filtered_dataset':")
for rating, count in rating_counts.items():
    print(f"Rating {rating}: {count}")


Numero di recensioni per ciascun rating in 'processed_dataset':
Rating 1: 704416
Rating 3: 105483
Rating 5: 4458593


In [None]:
#@title Prendo 24k esempi per classe
import pandas as pd
from sklearn.model_selection import train_test_split

def balance_classes(dataset, max_per_class):
    """Draw up to ``max_per_class`` shuffled examples for each of the ratings 1, 3 and 5.

    For every rating the dataset is filtered, shuffled with seed 42 and
    truncated; the selected examples are returned as one flat list, grouped
    by rating in the order 1, 3, 5.
    """
    collected = []
    for target in (1, 3, 5):
        subset = dataset.filter(lambda row: row['rating'] == target)
        keep = min(len(subset), max_per_class)
        shuffled = subset.shuffle(seed=42)
        collected += list(shuffled.select(range(keep)))
    return collected

# Work on one split of `neutral_filtered_dataset`:
# the 'full' split (despite the variable name, this is not a 'train' split)
neutral_filtered_train = neutral_filtered_dataset['full']

# Balance the classes, keeping at most 24000 reviews per class
max_reviews_per_class = 24000
balanced_records = balance_classes(neutral_filtered_train, max_reviews_per_class)





Filter:   0%|          | 0/5268492 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5268492 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5268492 [00:00<?, ? examples/s]

In [None]:
#@title Filtro solo frasi in inglese
#non applicabile a monte per ragioni prestazionali

from langdetect import detect, LangDetectException

def filter_english_sentences(records):
    """Keep only the records whose 'text' langdetect identifies as English.

    A record is dropped when it has no 'text' key, when its text has 5
    characters or fewer, or when detection raises LangDetectException.

    Args:
        records: Iterable of dict-like records.

    Returns:
        A new list with the English records, in their original order.
    """
    # langdetect is non-deterministic unless its seed is fixed, so short or
    # ambiguous texts could be classified differently from run to run.
    from langdetect import DetectorFactory
    DetectorFactory.seed = 0

    english_records = []
    for record in records:
        # Skip records that lack a text or whose text is too short to classify
        if 'text' not in record or len(record['text']) <= 5:
            continue
        try:
            if detect(record['text']) == 'en':
                english_records.append(record)
        except LangDetectException:
            # Detection failed for this text: drop the record
            continue
    return english_records

# Keep only the English records of the class-balanced sample
english_balanced_records = filter_english_sentences(balanced_records)


In [None]:
#@title Bilancio le classi sul numero minimo di record
from collections import Counter

def rebalance_to_min(records):
    """Downsample every rating class to the size of the smallest one.

    Args:
        records: Iterable of dict-like records carrying a 'rating' key.

    Returns:
        A new list holding, for each rating (in order of first appearance),
        its first ``min_count`` records, where ``min_count`` is the size of
        the least populated class. An empty input yields an empty list.
    """
    # Group the records by rating in a single pass, so iterators work too
    grouped_records = {}
    for record in records:
        grouped_records.setdefault(record['rating'], []).append(record)

    # min() raises ValueError on an empty sequence: nothing to balance
    if not grouped_records:
        return []

    min_count = min(len(group) for group in grouped_records.values())

    # Take the first min_count records of every class
    balanced_records = []
    for group in grouped_records.values():
        balanced_records.extend(group[:min_count])

    return balanced_records

# Re-balance the classes after the English filter, trimming each to the smallest class size
final_balanced_records = rebalance_to_min(english_balanced_records)


In [None]:
#@title Creo train test e eval split

# Turn the balanced records into a pandas DataFrame
balanced_df = pd.DataFrame(final_balanced_records)

# Encode the numeric ratings as sentiment labels
# (the integer keys also match the float ratings stored in the dataset, since 1 == 1.0)
rating_mapping = {1: 'negative', 3: 'neutral', 5: 'positive'}
balanced_df['rating'] = balanced_df['rating'].map(rating_mapping)

# Split into Train (90%), Eval (5%), Test (5%)
# Step 1: split into Train (90%) and Temp (10%)
train_set, temp = train_test_split(
    balanced_df,
    test_size=0.1,  # 10% reserved for eval + test
    stratify=balanced_df['rating'],  # keep the class proportions
    random_state=42
)

# Step 2: split Temp (10%) into Eval (5%) and Test (5%)
eval_set, test_set = train_test_split(
    temp,
    test_size=0.5,  # half of Temp becomes Test (i.e. 5% of the total)
    stratify=temp['rating'],  # keep the class proportions
    random_state=42
)

# Keep only the columns needed downstream
columns_to_keep = ['rating', 'text']
train = train_set[columns_to_keep]
eval_set = eval_set[columns_to_keep]
test_set = test_set[columns_to_keep]

# Destination folder for the CSV files (on the mounted Drive; must already exist)
path = '/content/drive/MyDrive/ProjectNLP/TST-CycleGAN_20241227/data/AMAZON/'

# Write the three splits as CSV files without the index column
train.to_csv(f"{path}Train.csv", index=False)
eval_set.to_csv(f"{path}Eval.csv", index=False)
test_set.to_csv(f"{path}Test.csv", index=False)

print("File CSV generati:")
print("- Train.csv (90%)")
print("- Eval.csv (5%)")
print("- Test.csv (5%)")

File CSV generati:
- Train.csv (90%)
- Eval.csv (5%)
- Test.csv (5%)


In [None]:
#@title Conto di nuovo

import pandas as pd

# Folder holding the CSV files written by the split cell
path = '/content/drive/MyDrive/ProjectNLP/TST-CycleGAN_20241227/data/AMAZON/'

# Reload the three splits from disk (this rebinds train, eval_set and test_set)
train = pd.read_csv(f"{path}Train.csv")
eval_set = pd.read_csv(f"{path}Eval.csv")
test_set = pd.read_csv(f"{path}Test.csv")

def count_classes(data, dataset_name):
    """Print the number of rows per 'rating' value of ``data``, most frequent first.

    ``dataset_name`` only appears in the printed header.
    """
    report = [f"\nConteggio delle classi nel dataset {dataset_name}:"]
    report.extend(
        f"{label}: {total}"
        for label, total in data['rating'].value_counts().items()
    )
    print("\n".join(report))

# Print the class counts of each reloaded split
count_classes(train, "Train")
count_classes(eval_set, "Eval")
count_classes(test_set, "Test")



Conteggio delle classi nel dataset Train:
negative: 19833
positive: 19832
neutral: 19832

Conteggio delle classi nel dataset Eval:
neutral: 1102
positive: 1102
negative: 1101

Conteggio delle classi nel dataset Test:
negative: 1102
positive: 1102
neutral: 1102


In [None]:
#@title Creo i file nel formato della GAN

import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import (
    AutoConfig,
    BertForSequenceClassification,
    AutoTokenizer,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset
import os
from tabulate import tabulate

def print_metrics_tabulate(total_metrics_eval):
    """
    Print the collected metrics as a grid table rendered by tabulate.

    Each key of ``total_metrics_eval`` becomes a row labelled in the
    "Style Pair" column; the remaining column names come from the metric
    names of the first entry. An empty dict prints a table with only the header.
    """
    rows = [[pair, *values.values()] for pair, values in total_metrics_eval.items()]

    column_names = ["Style Pair"]
    if total_metrics_eval:
        column_names += list(next(iter(total_metrics_eval.values())).keys())

    print(tabulate(rows, headers=column_names, tablefmt='grid', floatfmt=".4f"))

def process_csv_file(file_path, split_name, output_suffix="_split"):
    """
    Split a labelled CSV into one plain-text file per class.

    Reads ``file_path`` (columns 'rating' and 'text'), drops rows with a
    missing text or a blank rating, and writes the texts of every rating
    value to ``<file path without extension><output_suffix>/<rating>_<split_name>.txt``,
    one text per line.

    Args:
        file_path: Path of the CSV file to read.
        split_name: Name of the split, used in output file names and metric keys.
        output_suffix: Suffix appended to the CSV path (minus extension) to
            build the output directory.

    Returns:
        Dict mapping ``"<split_name>|<rating>"`` to ``{'num_samples': <row count>}``.

    Raises:
        ValueError: if the CSV has no 'text' or no 'rating' column.
    """
    df = pd.read_csv(file_path)

    # Validate the schema before touching the columns: this check used to
    # come after dropna(subset=['text']), which raised KeyError first.
    for column in ('text', 'rating'):
        if column not in df.columns:
            raise ValueError(f"Il file {file_path} non contiene una colonna '{column}'.")

    # Keep only rows with a text and a non-blank rating
    df = df.dropna(subset=['text'])
    df = df[df['rating'].astype(str).str.strip() != '']

    # Make sure every text is a string
    df = df.assign(text=df['text'].astype(str))

    # Output directory sits next to the CSV, named after it
    output_dir = f"{os.path.splitext(file_path)[0]}{output_suffix}"
    os.makedirs(output_dir, exist_ok=True)

    total_metrics_eval = {}

    for label in df['rating'].unique():
        # One review per output line: flatten every embedded line break to ';'
        texts = df.loc[df['rating'] == label, 'text'].str.replace(
            r'\r\n|\r|\n', ';', regex=True
        )

        # Write the lines verbatim. to_csv(sep='\n') applied CSV quoting, so a
        # text containing a double quote was wrapped in quotes with its inner
        # quotes doubled.
        output_file = os.path.join(output_dir, f"{label}_{split_name}.txt")
        with open(output_file, 'w', encoding='utf-8') as handle:
            handle.writelines(f"{text}\n" for text in texts)

        print(f"File creato: {output_file}")

        # Record the sample count of this class
        total_metrics_eval[f'{split_name}|{label}'] = {
            'num_samples': len(texts)
        }

    return total_metrics_eval

# Files to process, each paired with the split name used in output file names and metric keys
files_to_process = [
    {
        "file_path": "/content/drive/MyDrive/ProjectNLP/TST-CycleGAN_20241227/data/AMAZON/Train.csv",
        "split": "train"
    },
    {
        "file_path": "/content/drive/MyDrive/ProjectNLP/TST-CycleGAN_20241227/data/AMAZON/Eval.csv",
        "split": "eval"
    },
    {
        "file_path": "/content/drive/MyDrive/ProjectNLP/TST-CycleGAN_20241227/data/AMAZON/Test.csv",
        "split": "test"
    },
]

# Accumulates the per-class sample counts of every split
all_metrics = {}

# Process each file and merge its metrics into all_metrics
for file_info in files_to_process:
    file_path = file_info["file_path"]
    split = file_info["split"]

    print(f"\nProcessing {split} split from file: {file_path}")
    metrics = process_csv_file(file_path, split)
    all_metrics.update(metrics)

# Print the collected metrics as a formatted table
print("\n=== Metrics Summary ===")
print_metrics_tabulate(all_metrics)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]


Processing train split from file: /content/drive/MyDrive/ProjectNLP/TST-CycleGAN_20241227/data/AMAZON/Train.csv
File creato: /content/drive/MyDrive/ProjectNLP/TST-CycleGAN_20241227/data/AMAZON/Train_split/positive_train.txt
File creato: /content/drive/MyDrive/ProjectNLP/TST-CycleGAN_20241227/data/AMAZON/Train_split/negative_train.txt
File creato: /content/drive/MyDrive/ProjectNLP/TST-CycleGAN_20241227/data/AMAZON/Train_split/neutral_train.txt

Processing eval split from file: /content/drive/MyDrive/ProjectNLP/TST-CycleGAN_20241227/data/AMAZON/Eval.csv
File creato: /content/drive/MyDrive/ProjectNLP/TST-CycleGAN_20241227/data/AMAZON/Eval_split/neutral_eval.txt
File creato: /content/drive/MyDrive/ProjectNLP/TST-CycleGAN_20241227/data/AMAZON/Eval_split/negative_eval.txt
File creato: /content/drive/MyDrive/ProjectNLP/TST-CycleGAN_20241227/data/AMAZON/Eval_split/positive_eval.txt

Processing test split from file: /content/drive/MyDrive/ProjectNLP/TST-CycleGAN_20241227/data/AMAZON/Test.csv
F

In [None]:
#@title Aumenta Neutralità
import pandas as pd
import re

# Input files: the neutral-class text file of each split (nothing is loaded here, only paths are listed).
# file_paths and output_paths are paired by position: Train, Test, Eval.
# NOTE(review): the project root here (20250108_Project) differs from the folder the earlier
# cells write to (TST-CycleGAN_20241227) — presumably the folder was copied or renamed; confirm.
file_paths = ["/content/drive/MyDrive/ProjectNLP/20250108_Project/data/AMAZON/Train_split/neutral_train.txt",
                "/content/drive/MyDrive/ProjectNLP/20250108_Project/data/AMAZON/Test_split/neutral_test.txt",
              "/content/drive/MyDrive/ProjectNLP/20250108_Project/data/AMAZON/Eval_split/neutral_eval.txt"]

# Output files: one neutralised copy per input file, written next to it
output_paths = ["/content/drive/MyDrive/ProjectNLP/20250108_Project/data/AMAZON/Train_split/TrainNeutroVero.txt",
                "/content/drive/MyDrive/ProjectNLP/20250108_Project/data/AMAZON/Test_split/TestNeutroVero.txt",
                "/content/drive/MyDrive/ProjectNLP/20250108_Project/data/AMAZON/Eval_split/EvalNeutroVero.txt"]

# Strong sentiment words (matched case-insensitively as whole words) and the
# milder wording that replaces them; an empty replacement deletes the phrase.
adjective_replacements = {
    "love": "like",
    "loves":"likes",
    "loved": "liked",
    "lovely":"fine",
    "hates":"dislikes",
    "hate": "dislike",
    "disgust": "dislike",
    "disgusting":"not ok",
    "amazing": "good",
    "terrible": "mediocre",
    "horrible": "bad",
    "horribe": "bad",
    "outrageous": "not good",
    "Definitely":"probably",
    "awesome": "ok",
    "ridiculous":"funny",
    "fantastic": "good",
    "awful": "unpleasant",
    "super": "a little",
    "really": "quite",
    "at all":""
}


def neutralize_text(line):
    """Tone down the sentiment markers of a single review line.

    Args:
        line: One review, as a single line of text.

    Returns:
        The line with ALL-CAPS words lower-cased, exclamation marks turned
        into full stops, repeated '?' collapsed, elongated words shortened,
        strong words replaced per ``adjective_replacements``, "very" and
        "extremely" replaced by "quite", long dot runs cut to "..." and
        "great" replaced by "good" unless preceded by "not".
    """
    # 1. Lower-case words written entirely in capitals (two letters or more)
    line = re.sub(r'\b([A-Z]{2,})\b', lambda x: x.group(0).lower(), line)

    # 2. Any run of '!' becomes a single '.'; repeated '?' collapse to one
    line = re.sub(r'!+', '.', line)
    line = re.sub(r'\?{2,}', '?', line)

    # 3. Collapse letters repeated three or more times ("soooo" -> "so").
    #    Done before the word replacements so that elongated forms such as
    #    "loooove" are recognised, and limited to letters so that numbers
    #    such as "1000" are left untouched.
    line = re.sub(r'([A-Za-z])\1{2,}', r'\1', line)

    # 4. Replace strong words with milder alternatives
    for word, replacement in adjective_replacements.items():
        pattern = rf'\b{re.escape(word)}\b'
        if not replacement:
            # Deleting a phrase: also consume the whitespace before it, so
            # "good at all." becomes "good." rather than "good ."
            pattern = rf'\s*{pattern}'
        line = re.sub(pattern, replacement, line, flags=re.IGNORECASE)

    # 5. Replace strong intensifiers with "quite"
    line = re.sub(r'\b(very|extremely)\b', 'quite', line, flags=re.IGNORECASE)

    # 6. Cut runs of four or more dots down to "..."
    line = re.sub(r'\.{4,}', '...', line)

    # 7. "great" becomes "good", except in "not great", which is kept
    line = re.sub(r'\b(?<!not\s)great\b', 'good', line, flags=re.IGNORECASE)

    return line

for file_path, output_path in zip(file_paths, output_paths):
    # For each split: read the input file line by line, neutralise every line, write it out.
    # Both files are opened as UTF-8 explicitly: the default encoding is platform
    # dependent, and the cell that recombines the split files reads them as UTF-8.
    with open(file_path, 'r', encoding='utf-8') as infile, \
            open(output_path, 'w', encoding='utf-8') as outfile:
        for line in infile:
            neutralized_line = neutralize_text(line.strip())
            outfile.write(neutralized_line + '\n')

    print(f"Neutralized reviews saved to {output_path}")


Neutralized reviews saved to /content/drive/MyDrive/ProjectNLP/20250108_Project/data/AMAZON/Train_split/TrainNeutroVero.txt
Neutralized reviews saved to /content/drive/MyDrive/ProjectNLP/20250108_Project/data/AMAZON/Test_split/TestNeutroVero.txt
Neutralized reviews saved to /content/drive/MyDrive/ProjectNLP/20250108_Project/data/AMAZON/Eval_split/EvalNeutroVero.txt


## Creazione dei file CSV per il nuovo classifier

In [None]:
#@title riaccorpa per il nuovo classifier
import pandas as pd

# File paths
base_path = "/content/drive/MyDrive/ProjectNLP/00.Amazon_Project/data/AMAZON/"
subdirs = ["Train_split", "Eval_split", "Test_split"]
output_files = {
    "Train_split": "Train.csv",
    "Eval_split": "Eval.csv",
    "Test_split": "Test.csv"
}


def _read_text_lines(path):
    """Return the lines of a UTF-8 text file without their line terminators."""
    # readlines() kept the trailing '\n' inside every value, so each 'text'
    # field of the CSV ended with an embedded line break.
    with open(path, 'r', encoding='utf-8') as handle:
        return [line.rstrip('\r\n') for line in handle]


for subdir in subdirs:
    # "Train_split" -> "train": the per-class files are named <label>_<split>.txt
    split_tag = subdir.split('_')[0].lower()
    negative_file = f"{base_path}{subdir}/negative_{split_tag}.txt"
    positive_file = f"{base_path}{subdir}/positive_{split_tag}.txt"
    neutral_file = f"{base_path}{subdir}/neutral_{split_tag}.txt"
    # base_path already ends with '/', so no extra separator is added
    output_file = f"{base_path}{output_files[subdir]}"

    # Read the files
    negative_data = _read_text_lines(negative_file)
    positive_data = _read_text_lines(positive_file)
    neu_data = _read_text_lines(neutral_file)

    # Create dataframes, one per class, tagging each text with its label
    negative_df = pd.DataFrame({'text': negative_data, 'label': 'negative'})
    positive_df = pd.DataFrame({'text': positive_data, 'label': 'positive'})
    neu_df = pd.DataFrame({'text': neu_data, 'label': 'neutral'})

    # Concatenate the dataframes
    combined_df = pd.concat([negative_df, positive_df, neu_df], ignore_index=True)

    # Save to CSV
    combined_df.to_csv(output_file, index=False, encoding='utf-8')

    print(f"Combined data saved to {output_file}")

Combined data saved to /content/drive/MyDrive/ProjectNLP/00.Amazon_Project/data/AMAZON//Train.csv
Combined data saved to /content/drive/MyDrive/ProjectNLP/00.Amazon_Project/data/AMAZON//Eval.csv
Combined data saved to /content/drive/MyDrive/ProjectNLP/00.Amazon_Project/data/AMAZON//Test.csv
