In [None]:
%pip install transformers
%pip install datasets
%pip install nltk scikit-learn --quiet




In [None]:
#  Import libraries
import os
import gc
import json
import numpy as np
import pandas as pd
import torch.nn as nn
import torch
import nltk
import sklearn
import string
import spacy



from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import Dataset
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from datasets import Dataset as Dataset
from sklearn.metrics import (
    precision_recall_fscore_support,
    accuracy_score,
    classification_report,
)
from transformers import EarlyStoppingCallback
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    RobertaConfig,
    Trainer,
    TrainingArguments,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)

from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# File Paths
train_file_path = "/content/drive/MyDrive/nlp_project/train.jsonlist"
validation_file_path = "/content/drive/MyDrive/nlp_project/validation.jsonlist"
test_file_path = "/content/drive/MyDrive/nlp_project/test.jsonlist"

# Helper function to process JSON into datasets
def json_to_dataset(path):
    dataset = []
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            try:
                l = json.loads(line.strip())
                if "verdict" in l and "body" in l:
                    newL = {
                        "labels": 1 if l["verdict"].lower() == "not the asshole" else 0,
                        "text": l["body"]
                    }
                    dataset.append(newL)
            except (json.JSONDecodeError, KeyError) as e:
                print(f"Skipping line due to error: {e}")
                continue
    return Dataset.from_pandas(pd.DataFrame(dataset))

# Load datasets
dataset_train = json_to_dataset(train_file_path)
dataset_validation = json_to_dataset(validation_file_path)
dataset_test = json_to_dataset(test_file_path)

print(dataset_train)
print(dataset_validation)
print(dataset_test)


Dataset({
    features: ['labels', 'text'],
    num_rows: 32000
})
Dataset({
    features: ['labels', 'text'],
    num_rows: 4000
})
Dataset({
    features: ['labels', 'text'],
    num_rows: 4000
})


In [None]:
# Load Tokenizer
print("building tokenizer...")
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")


def tokenize_function(examples):
    tokenized = tokenizer(
        examples["text"], padding="max_length", truncation=True, max_length=512
    )
    tokenized["labels"] = examples["labels"]
    return tokenized

tokenized_dataset_train = dataset_train.map(tokenize_function, batched=True)
tokenized_dataset_validation = dataset_validation.map(tokenize_function, batched=True)
tokenized_dataset_test = dataset_test.map(tokenize_function, batched=True)

print("First Tokenized Example:")
print(tokenized_dataset_train[0])

building tokenizer...


Map:   0%|          | 0/32000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

First Tokenized Example:
{'labels': 1, 'text': "[TITLE] [AITA] Hockey Fan [BODY] I went to a hockey game at my college and since I didn't have season tickets to sit in the student section, got the nosebleed seats around center ice about 6 rows from the top.  It was against our rivals, so the seats were packed.  Every time the action came around the boards on our side, I had to stand up to see what was happening.  I also stood when we got close to scoring a goal.  The people behind me were pissed that I kept standing up and told me to just watch what was happening on the jumbotron.  My thoughts were that if I wanted to sit down and watch the game on a screen, I would have stayed at home and watched from my couch.  What are your thoughts; am I the asshole?", 'input_ids': [0, 10975, 47217, 3850, 742, 646, 250, 2068, 250, 742, 8471, 11232, 646, 387, 37588, 742, 38, 439, 7, 10, 5006, 177, 23, 127, 1564, 8, 187, 38, 399, 75, 33, 191, 3308, 7, 2662, 11, 5, 1294, 2810, 6, 300, 5, 8658, 5225, 1

In [None]:
# Metrics computation
def compute_metrics(eval_pred):
    logits, labels = eval_pred

    #predictions = logits.argmax(axis=-1)

    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)

    print("Labels:", labels)
    print("Predictions:", predictions)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="binary", zero_division=0
    )
    accuracy = accuracy_score(labels, predictions)

    return {"accuracy": accuracy, "f1": f1, "precision": precision, "recall": recall}

In [None]:

# Model setup with roBERTa
model_config = RobertaConfig.from_pretrained("roberta-base", num_labels=2)
model = RobertaForSequenceClassification.from_pretrained("roberta-base", config=model_config)
model.config.use_cache = False


# Training arguments
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/nlp_project/model_output",
    num_train_epochs=3,              # Keep 3, starts overfitting around 4-5 epochs
    per_device_train_batch_size=4,  # Keep lower batch size = better metrics, slower
    per_device_eval_batch_size=4,
    learning_rate=1e-5,              # Keep low = better metrics
    gradient_checkpointing=True,     # Enable gradient checkpointing for mem optimization
    load_best_model_at_end=True,     # Load the best model at the end
    save_total_limit=1,              # Save only the best model
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=50,
    weight_decay=0.01,
    warmup_steps=500,                # Gradual learning rate warmup
    lr_scheduler_type="cosine",      # Keep cosine, slightly better than linear
    gradient_accumulation_steps=1,
)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
%env CUDA_LAUNCH_BLOCKING=1

# Define a custom Trainer to use Cross-Entropy Loss explicitly
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):

        labels = inputs.pop("labels")  # Extract labels from inputs
        outputs = model(**inputs)  # Forward pass
        logits = outputs.logits  # Extract logits from the outputs

        # Use Cross-Entropy Loss
        loss_fct = nn.CrossEntropyLoss(label_smoothing=0.1)
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_validation,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # Early stopping
)

# Train the model
trainer.train()


# Evaluate model on the test set
test_results = trainer.evaluate(tokenized_dataset_test)
print(f"Test set results: {test_results}")

# Save the model and tokenizer
model.save_pretrained("/content/drive/MyDrive/nlp_project/model_output/aita_model")
tokenizer.save_pretrained("/content/drive/MyDrive/nlp_project/model_output/aita_tokenizer")

env: CUDA_LAUNCH_BLOCKING=1


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6433,0.636515,0.652,0.635602,0.667033,0.607
2,0.5075,0.752844,0.66975,0.637586,0.706383,0.581
3,0.489,1.049175,0.672,0.660104,0.684946,0.637


Labels: [1 1 1 ... 0 0 0]
Predictions: [1 1 1 ... 0 0 0]
Labels: [1 1 1 ... 0 0 0]
Predictions: [1 1 1 ... 0 0 0]
Labels: [1 1 1 ... 0 0 0]
Predictions: [1 1 1 ... 0 0 0]


Labels: [1 1 1 ... 0 0 0]
Predictions: [1 0 0 ... 1 0 0]
Test set results: {'eval_loss': 0.6462544202804565, 'eval_accuracy': 0.641, 'eval_f1': 0.613978494623656, 'eval_precision': 0.663953488372093, 'eval_recall': 0.571, 'eval_runtime': 16.4466, 'eval_samples_per_second': 243.211, 'eval_steps_per_second': 60.803, 'epoch': 3.0}


('/content/drive/MyDrive/nlp_project/model_output/aita_tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/nlp_project/model_output/aita_tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/nlp_project/model_output/aita_tokenizer/vocab.json',
 '/content/drive/MyDrive/nlp_project/model_output/aita_tokenizer/merges.txt',
 '/content/drive/MyDrive/nlp_project/model_output/aita_tokenizer/added_tokens.json',
 '/content/drive/MyDrive/nlp_project/model_output/aita_tokenizer/tokenizer.json')

In [None]:
# EVALUATE MODEL

# Load the trained model and tokenizer
model_path = "/content/drive/MyDrive/nlp_project/model_output/aita_roBERTa_classifier_model"
tokenizer_path = "/content/drive/MyDrive/nlp_project/model_output/aita_roBERTa_classifier_tokenizer"

model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

# Load the test dataset
print("Tokenizing test dataset...")
test_dataset = tokenized_dataset_test  # Use previously tokenized test dataset

# Ensure test_dataset is compatible with DataLoader
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Evaluate the model on the test dataset
print("Evaluating the model...")
model.eval()
model.cuda()  # Ensure model is on GPU if available

true_labels = []
predicted_labels = []

# Perform evaluation
dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=4)  # Adjust batch size as needed

with torch.no_grad():
    for batch in dataloader:
        # Move inputs and labels to GPU
        input_ids = batch["input_ids"].cuda()
        attention_mask = batch["attention_mask"].cuda()
        labels = batch["labels"].cpu().numpy()

        # Forward pass through the model
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, axis=-1).cpu().numpy()

        true_labels.extend(labels)
        predicted_labels.extend(predictions)

# Calculate and display metrics
print("Generating evaluation metrics...")
report = classification_report(true_labels, predicted_labels, target_names=["Asshole", "Not the Asshole"])
accuracy = accuracy_score(true_labels, predicted_labels)

print("Classification Report:")
print(report)
print(f"Accuracy: {accuracy:.4f}")


Tokenizing test dataset...
Evaluating the model...
Generating evaluation metrics...
Classification Report:
                 precision    recall  f1-score   support

        Asshole       0.65      0.69      0.67      2000
Not the Asshole       0.67      0.62      0.65      2000

       accuracy                           0.66      4000
      macro avg       0.66      0.66      0.66      4000
   weighted avg       0.66      0.66      0.66      4000

Accuracy: 0.6597


In [None]:
# COMMON WORK EXTACTION

nlp = spacy.load("en_core_web_sm")

# Extend stop words to include overly common narrative terms
non_context_words = {"tell", "want", "say", "get", "go", "know", "think", "feel", "ask", "year","have","new","stay","place", "see", "happen","hour","see" }
for word in non_context_words:
    nlp.Defaults.stop_words.add(word)

# Function to extract refined context words
def extract_context_words(texts, top_n=12):
    context_words = []
    for doc in nlp.pipe(texts, batch_size=100, disable=["ner", "parser"]):
        # Filter tokens: exclude stop words, punctuation, and overly generic terms
        filtered_tokens = [
            token.lemma_.lower()
            for token in doc
            if not token.is_stop and
               token.is_alpha and
               token.pos_ in {"NOUN", "VERB", "ADJ"}
        ]
        context_words.extend(filtered_tokens)
    # Exclude extremely high-frequency terms common across both classes
    common_threshold = len(texts) * 0.3  # 30% occurrence threshold
    context_count = Counter(context_words)
    filtered_context = {k: v for k, v in context_count.items() if v < common_threshold}
    return Counter(filtered_context).most_common(top_n)

# Grouped data for analysis
def get_most_common_words_by_verdict(dataset, top_n=12):
    grouped_words = {"Not the Asshole": [], "Asshole": []}

    for example in dataset:
        verdict = "Not the Asshole" if example["labels"] == 1 else "Asshole"
        grouped_words[verdict].append(example["text"])

    # Process each group and extract top words
    results = {}
    for verdict, texts in grouped_words.items():
        common_words = extract_context_words(texts, top_n)
        results[verdict] = common_words

    return results

# Example usage
result = get_most_common_words_by_verdict(dataset_train, top_n=12)

# Display the results
for verdict, words in result.items():
    print(f"\n{verdict}:")
    for word, count in words:
        print(f"{word}: {count}")
