In [1]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=c631c986b1e3e9c576efc72e252dcd3f5ca90d0a35347a67b2d69317a1f67118
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [3]:
import pandas as pd
import networkx as nx
import numpy as np
import torch
from transformers import pipeline, GPT2LMHeadModel, GPT2Tokenizer
from sentence_transformers import SentenceTransformer, util
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

# --- Load Dataset ---
df = pd.read_csv("final_labels.csv")

# Remove rows with missing bodies
df = df.dropna(subset=['body'])

# Create a directed graph for threading
G = nx.DiGraph()
for _, row in df.iterrows():
    G.add_node(row['entry_id'], body=row['body'], parent=row['parent_id'])
    if pd.notna(row['parent_id']):
        G.add_edge(row['parent_id'], row['entry_id'])

# Function to reconstruct a discussion thread
def get_thread(root_id):
    thread = []
    for node in nx.dfs_preorder_nodes(G, source=root_id):
        thread.append(G.nodes[node]['body'])
    return " ".join(thread)

# Identify root comments (no parent_id or missing parent)
root_comments = df[df['parent_id'].isna()]['entry_id'].tolist()

df["reconstructed_thread"] = df["entry_id"].apply(lambda x: get_thread(x) if x in root_comments else None)

# --- Summarization ---
summarizer = pipeline("summarization", model="t5-base", device=0)

def generate_summary(text):
    if pd.isna(text) or len(text.split()) < 5:
        return text  # Skip short texts
    text = " ".join(text.split()[:512])  # Limit input size
    word_count = len(text.split())
    max_len = min(150, int(0.75 * word_count))
    min_len = min(5, int(0.3 * word_count))
    try:
        return summarizer(text, max_length=max_len, min_length=min_len, do_sample=False)[0]['summary_text']
    except:
        return text

df["summary"] = df["reconstructed_thread"].apply(generate_summary)

df.to_csv("thread_summaries.csv", index=False)

# --- Context Mismatch Detection ---
similarity_model = SentenceTransformer("all-MiniLM-L6-v2")

def check_context_mismatch(comment, parent_comment):
    if not parent_comment:
        return "Missing Parent"
    embeddings = similarity_model.encode([comment, parent_comment], convert_to_tensor=True)
    similarity = util.pytorch_cos_sim(embeddings[0], embeddings[1]).item()
    return similarity < 0.5  # Mark as mismatch if similarity is low

def detect_context_mismatch(row):
    if pd.notna(row['parent_id']):
        parent_body = df.loc[df['entry_id'] == row['parent_id'], 'body']
        if not parent_body.empty:
            return check_context_mismatch(row['body'], parent_body.values[0])
        else:
            return "Missing Parent"
    return False

df['context_mismatch'] = df.apply(detect_context_mismatch, axis=1)

df.to_csv("thread_summaries_with_mismatch.csv", index=False)

# --- Performance Metrics ---
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

bleu_scores, rouge_scores = [], []

for _, row in df.iterrows():
    if pd.notna(row['reconstructed_thread']) and pd.notna(row['summary']):
        reference = row['reconstructed_thread'].split()
        candidate = row['summary'].split()
        bleu_scores.append(sentence_bleu([reference], candidate))
        rouge = scorer.score(row['reconstructed_thread'], row['summary'])
        rouge_scores.append(rouge['rougeL'].fmeasure)

# --- Perplexity Calculation ---
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2").to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

def calculate_perplexity(text):
    if not text or len(text.split()) < 5:  # Avoid very short texts
        return np.nan
    try:
        encodings = tokenizer(text, return_tensors="pt").input_ids.to(model.device)
        with torch.no_grad():
            loss = model(encodings, labels=encodings).loss.item()
        perplexity = torch.exp(torch.tensor(loss)).item()
        return perplexity if np.isfinite(perplexity) else np.nan  # Avoid infinite values
    except:
        return np.nan

perplexities = [calculate_perplexity(summary) for summary in df['summary'].dropna()]
valid_perplexities = [p for p in perplexities if not np.isnan(p)]

# --- Semantic Similarity ---
similarities = []
for _, row in df.iterrows():
    if pd.notna(row['reconstructed_thread']) and pd.notna(row['summary']):
        embeddings = similarity_model.encode([row['reconstructed_thread'], row['summary']], convert_to_tensor=True)
        similarity = util.pytorch_cos_sim(embeddings[0], embeddings[1]).item()
        similarities.append(similarity)

# --- Final Results ---
avg_bleu = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0
avg_rouge = sum(rouge_scores) / len(rouge_scores) if rouge_scores else 0
avg_perplexity = sum(valid_perplexities) / len(valid_perplexities) if valid_perplexities else np.nan
avg_similarity = sum(similarities) / len(similarities) if similarities else 0

print(f"Average BLEU Score: {avg_bleu:.4f}")
print(f"Average ROUGE-L Score: {avg_rouge:.4f}")
print(f"Average Perplexity: {avg_perplexity:.4f}")
print(f"Average Semantic Similarity: {avg_similarity:.4f}")


Device set to use cuda:0
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Average BLEU Score: 0.1142
Average ROUGE-L Score: 0.4294
Average Perplexity: 283.0098
Average Semantic Similarity: 0.6705
