<a href="https://colab.research.google.com/github/samvedrao/Dice-Game/blob/main/formanalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#
# INSTALLATION

!pip install transformers[torch] datasets



In [2]:
#
# CELL 2: THE FULL TRAINING SCRIPT (ALL FIXES INCLUDED)
#
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
import re
import torch
import warnings
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import (
    DistilBertTokenizer,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Suppress warnings
warnings.filterwarnings("ignore")

# --- 1. Define Helper Functions ---

def compute_metrics(p):
    """ Helper for calculating metrics during training """
    preds = np.argmax(p.predictions, axis=1)
    # This line is corrected
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='weighted', zero_division=0)
    acc = accuracy_score(p.label_ids, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# *** FIX: Define Tokenizer in Global Scope ***
try:
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
except Exception as e:
    print(f"Error loading tokenizer: {e}")
    print("Please check your internet connection.")
    exit()

def tokenize_function(batch):
    """ Tokenizer helper """
    return tokenizer(batch['text'], padding=True, truncation=True)


def train_model(dataset, model_name, num_labels, label_map):
    """
    A full function to fine-tune DistilBERT for a classification task.
    """
    print(f"\n--- Starting Training for: {model_name} ---")

    # 1. Tokenize Dataset
    print("Tokenizing data...")
    tokenized_dataset = dataset.map(tokenize_function, batched=True)

    # 2. Load Pre-trained Model
    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        num_labels=num_labels,
        id2label=label_map,
        label2id={v: k for k, v in label_map.items()}
    )

    # *** FIX: Corrected TrainingArguments ***
    # This new block fixes the 'evaluation_strategy' error by
    # matching all strategies to 'steps'.
    training_args = TrainingArguments(
        output_dir=f'./results/{model_name}',
        num_train_epochs=2,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',

        # This is the new, compatible configuration
        logging_strategy="steps",
        logging_steps=500,
        evaluation_strategy="steps",
        eval_steps=500,
        save_strategy="steps",
        save_steps=500,
        load_best_model_at_end=True,
        save_total_limit=2, # Only keep the 2 best checkpoints
    )

    # 4. Create Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset['train'],
        eval_dataset=tokenized_dataset['test'],
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
    )

    # 5. Train!
    print("Starting fine-tuning...")
    trainer.train()

    # 6. Save Model
    save_path = f'./models/{model_name}'
    trainer.save_model(save_path)
    tokenizer.save_pretrained(save_path)
    print(f"\nModel saved to {save_path}")

# --- 2. Prepare and Train Sarcasm Model ---
try:
    print("\nLoading Sarcasm Data (Reddit)...")
    # *** FIX: Added on_bad_lines='skip' ***
    df_sarcasm = pd.read_csv('train-balanced-sarcasm.csv', on_bad_lines='skip')

    df_sarcasm = df_sarcasm.dropna(subset=['comment', 'label'])
    df_sarcasm = df_sarcasm[['comment', 'label']].rename(columns={'comment': 'text'})
    df_sarcasm_sample = df_sarcasm.sample(n=50000, random_state=42)

    train_df, test_df = train_test_split(df_sarcasm_sample, test_size=0.2, random_state=42)
    ds_sarcasm = DatasetDict({
        'train': Dataset.from_pandas(train_df),
        'test': Dataset.from_pandas(test_df)
    })

    sarcasm_label_map = {0: 'Not Sarcastic', 1: 'Sarcastic'}
    train_model(ds_sarcasm, 'sarcasm-model', 2, sarcasm_label_map)

except FileNotFoundError:
    print("Error: 'train-balanced-sarcasm.csv' not found. Please upload it.")
except Exception as e:
    print(f"An error occurred with sarcasm data: {e}")


# --- 3. Prepare and Train Sentiment Model ---
try:
    print("\nLoading Sentiment Data (Amazon)...")
    # *** FIX: Added on_bad_lines='skip' ***
    df_sentiment = pd.read_csv('Reviews.csv', on_bad_lines='skip')
    df_sentiment = df_sentiment.dropna(subset=['Text', 'Score'])

    df_sentiment['label'] = df_sentiment['Score'].apply(lambda x: 0 if x < 3 else (1 if x == 3 else 2))
    df_sentiment = df_sentiment[['Text', 'label']].rename(columns={'Text': 'text'})
    df_sentiment_sample = df_sentiment.sample(n=20000, random_state=42)

    train_df, test_df = train_test_split(df_sentiment_sample, test_size=0.2, random_state=42)
    ds_sentiment = DatasetDict({
        'train': Dataset.from_pandas(train_df),
        'test': Dataset.from_pandas(test_df)
    })

    sentiment_label_map = {0: 'negative', 1: 'neutral', 2: 'positive'}
    train_model(ds_sentiment, 'sentiment-model', 3, sentiment_label_map)

except FileNotFoundError:
    print("Error: 'Reviews.csv' not found. Please upload it.")
except Exception as e:
    print(f"An error occurred with sentiment data: {e}")


# --- 4. Prepare and Train Aspect Model (Laptops) ---
try:
    print("\nLoading Aspect Data (Laptops)...")
    # *** FIX: Changed filename to 'Laptop_Train_v2' ***
    tree = ET.parse('Laptop_Train_v2')
    root = tree.getroot()
    data = []

    for sentence in root.findall('sentence'):
        text_node = sentence.find('text')
        if text_node is None: continue
        text = text_node.text
        if text is None: continue

        aspect_terms_node = sentence.find('aspectTerms')
        if aspect_terms_node is not None:
            for aspect_term in aspect_terms_node.findall('aspectTerm'):
                aspect = aspect_term.get('term')
                polarity = aspect_term.get('polarity')

                model_input = f"{text} [SEP] {aspect}"

                if polarity == 'positive':
                    data.append({'text': model_input, 'label': 2})
                elif polarity == 'neutral':
                    data.append({'text': model_input, 'label': 1})
                elif polarity == 'negative':
                    data.append({'text': model_input, 'label': 0})

    df_aspect = pd.DataFrame(data)

    train_df, test_df = train_test_split(df_aspect, test_size=0.2, random_state=42)
    ds_aspect = DatasetDict({
        'train': Dataset.from_pandas(train_df),
        'test': Dataset.from_pandas(test_df)
    })

    aspect_label_map = {0: 'negative', 1: 'neutral', 2: 'positive'}
    train_model(ds_aspect, 'aspect-model', 3, aspect_label_map)

except FileNotFoundError:
    print("Error: 'Laptop_Train_v2' not found. Please upload it.")
    print("Please upload the 672 KB file from your screenshot.")
except Exception as e:
    print(f"An error occurred with aspect data: {e}")

print("\n\n--- All Model Training Complete! ---")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]


Loading Sarcasm Data (Reddit)...
An error occurred with sarcasm data: Error tokenizing data. C error: EOF inside string starting at row 584666

Loading Sentiment Data (Amazon)...
An error occurred with sentiment data: Error tokenizing data. C error: EOF inside string starting at row 388254

Loading Aspect Data (Laptops)...
Error: 'Laptop_Train_v2' not found. Please upload it.
Please upload the 672 KB file from your screenshot.


--- All Model Training Complete! ---
