In [None]:
# semantic similarity

import re
import pandas as pd
import json
from sentence_transformers import SentenceTransformer, util

# --- Cleaning ---
def clean_text(text):
    """Remove quotation marks and footnote-style markers, then tidy whitespace.

    Args:
        text: The string to clean.

    Returns:
        The string with every match of the noise patterns below deleted,
        runs of whitespace collapsed to one space, and both ends trimmed.
    """
    noise_patterns = (
        r'["“”«»‘’]',        # straight, curly and angle quotation marks
        r'\[\d+\]',           # bracketed numbers such as [1] or [12]
        r'\d+\)',             # digits followed by a closing parenthesis, e.g. 1)
        r'[¹²³⁴⁵⁶⁷⁸⁹⁰]',     # superscript digits
    )
    for pattern in noise_patterns:
        text = re.sub(pattern, '', text)

    # Deleting markers can leave doubled spaces behind, so normalise last.
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

# --- Load SBERT model ---
model = SentenceTransformer('all-MiniLM-L6-v2')

# One dict per scored relation; turned into a DataFrame further down.
similarity_scores = []

# Embeddings keyed by cleaned span text. A span that takes part in several
# relations is encoded once instead of once per relation.
embedding_cache = {}

with open('project-3-at-2025-05-19-09-43-1d74e9d7.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# --- Process each document ---
# Each document is expected to carry annotations[0]['result']: a list mixing
# 'labels' items (text spans) and 'relation' items that link two span ids.
# NOTE(review): this looks like a Label Studio export — confirm the schema.
for doc in data:
    try:
        # `or` also covers an empty or null 'annotations' value. The .get()
        # default only covers a missing key, so `[][0]` used to raise
        # IndexError and an unannotated document was reported as an error.
        annotations = doc.get('annotations') or [{}]
        results = annotations[0].get('result') or []

        span_dict = {}
        # Collect spans: id -> cleaned text and labels.
        for item in results:
            if item.get('type') == 'labels' and 'value' in item:
                span_id = item.get('id')
                text = item['value'].get('text', '')
                labels = item['value'].get('labels', [])
                if span_id and text:
                    span_dict[span_id] = {
                        'text': clean_text(text),
                        'labels': labels
                    }

        # Process relations: score every relation whose two ends are known spans.
        for item in results:
            if item.get('type') != 'relation':
                continue

            from_id = item.get('from_id')
            to_id = item.get('to_id')
            relation_labels = item.get('labels', [])
            relation_label = relation_labels[0] if relation_labels else "Unknown"

            if from_id not in span_dict or to_id not in span_dict:
                continue

            text1 = span_dict[from_id]['text']
            text2 = span_dict[to_id]['text']
            label1 = span_dict[from_id]['labels']
            label2 = span_dict[to_id]['labels']

            # Skip pairs where either cleaned text has fewer than three
            # whitespace-separated tokens.
            if len(text1.split()) < 3 or len(text2.split()) < 3:
                continue

            for span_text in (text1, text2):
                if span_text not in embedding_cache:
                    embedding_cache[span_text] = model.encode(span_text, convert_to_tensor=True)
            emb1 = embedding_cache[text1]
            emb2 = embedding_cache[text2]

            similarity = util.pytorch_cos_sim(emb1, emb2).item()

            similarity_scores.append({
                "text1": text1,
                "text2": text2,
                "similarity": round(similarity, 4),
                "label1": label1,
                "label2": label2,
                "relation": relation_label
            })

    except Exception as e:
        # Best effort: report the failing document and carry on with the rest.
        print(f"Error processing doc {doc.get('id', 'unknown')}: {e}")
        continue

# --- Save or inspect results ---
# Passing the columns explicitly keeps the schema when no pair was scored;
# without it an empty frame has no 'similarity' column and the filters below
# raise KeyError.
result_df = pd.DataFrame(
    similarity_scores,
    columns=["text1", "text2", "similarity", "label1", "label2", "relation"],
)
print(result_df.head())

result_df.to_csv("semantic_similarity_output.csv", index=False)

# Pairs exported per relation at the end of this cell.
high_sim_df = result_df[result_df['similarity'] >= 0.8]
# Cut-offs used for the counts and the printed listings.
thresholds = [0.8, 0.85]

print("Number of pairs in each similarity group:")
for t in thresholds:
    count = (result_df['similarity'] >= t).sum()
    print(f"Similarity >= {t}: {count} pairs")

# Optional: print pairs grouped by threshold with relation
for t in thresholds:
    print(f"\n--- Pairs with similarity >= {t} ---")
    filtered = result_df[result_df['similarity'] >= t]
    if filtered.empty:
        print("No pairs found.")
        continue

    for idx, row in filtered.iterrows():
        print(f"Pair {idx}:")
        print(f"Relation: {row['relation']}")
        print(f"Text 1: {row['text1']}")
        print(f"Label 1: {row['label1']}")
        print(f"Text 2: {row['text2']}")
        print(f"Label 2: {row['label2']}")
        print(f"Similarity: {row['similarity']:.3f}")
        print("-" * 50)

# One CSV per relation label for the high-similarity pairs.
for relation, group in high_sim_df.groupby('relation'):
    # The label becomes part of a file name, so path separators are replaced;
    # otherwise a label such as "A/B" would point into a missing directory.
    safe_relation = re.sub(r'[\\/]', '_', str(relation))
    filename = f'high_similarity_pairs_relation_{safe_relation}.csv'
    group.to_csv(filename, index=False)
    print(f"Saved {len(group)} pairs with relation '{relation}' to {filename}")



In [None]:
# extracting chains of elaborations

import pandas as pd
from collections import defaultdict

def extract_elaboration_restatement_chains(df, min_length=3):
    """Group Elaboration/Restatement satellites under their nucleus text.

    Only rows whose 'relation' is 'Elaboration' or 'Restatement' are used.
    A row contributes when its 'label2' contains 'N'; its 'text2' is then the
    nucleus key and ('relation', 'text1', 'label1') is recorded under it.

    Args:
        df: DataFrame with columns 'relation', 'text1', 'text2', 'label1'
            and 'label2'. 'label2' values may be lists or the string form of
            a list (e.g. "['N']"); any other value counts as no labels.
        min_length: Minimum number of satellites a nucleus needs to be kept.

    Returns:
        dict mapping nucleus text to a list of
        (relation, satellite_text, satellite_label) tuples, in row order.

    Raises:
        ValueError, SyntaxError: if a string 'label2' is not a Python literal.
    """
    import ast  # local import so the function does not depend on cell-level imports

    relevant_relations = ['Elaboration', 'Restatement']
    df_filtered = df[df['relation'].isin(relevant_relations)].copy()

    def parse_label(x):
        if isinstance(x, list):
            return x
        if isinstance(x, str):
            # SECURITY: this used to be eval(x), which executes arbitrary
            # expressions found in the data. literal_eval only accepts Python
            # literals and raises on anything else.
            return ast.literal_eval(x)
        return []

    df_filtered['label2'] = df_filtered['label2'].apply(parse_label)

    chains = defaultdict(list)
    for _, row in df_filtered.iterrows():
        if 'N' in row['label2']:
            key = row['text2']  # nucleus
            chains[key].append((row['relation'], row['text1'], row['label1']))

    # Keep only nuclei with enough satellites; return a plain dict.
    long_chains = {k: v for k, v in chains.items() if len(v) >= min_length}
    return long_chains

# ---- Extract chains ----
chains = extract_elaboration_restatement_chains(result_df, min_length=3)

# One flat record per (nucleus, satellite) pair, printed as it is collected.
rows = []

for nucleus, supports in chains.items():
    print(f"\nNucleus:\n{nucleus}")
    for rel, sat_text, sat_label in supports:
        print(f"  → {rel}: {sat_text} [Label: {sat_label}]")
        rows.append({
            'Nucleus': nucleus,
            'Relation': rel,
            'Satellite Text': sat_text,
            'Satellite Label': sat_label
        })

# Explicit columns so the CSV always has its header row. With no chains the
# frame used to have no columns at all, and code that reads the file back and
# groups by 'Nucleus' then failed.
chains_df = pd.DataFrame(
    rows,
    columns=['Nucleus', 'Relation', 'Satellite Text', 'Satellite Label'],
)
print(len(chains_df))
chains_df.to_csv("elaboration_chains.csv", index=False)

def load_chains_as_dict(csv_path):
    """Read a chains CSV and rebuild the nucleus -> satellites mapping.

    Args:
        csv_path: Path of a CSV with the columns 'Nucleus', 'Relation',
            'Satellite Text' and 'Satellite Label'.

    Returns:
        dict mapping each distinct 'Nucleus' value to a list of
        (Relation, Satellite Text, Satellite Label) tuples, one per row of
        that group. Values are whatever pandas read from the file.
    """
    table = pd.read_csv(csv_path)
    satellite_fields = ('Relation', 'Satellite Text', 'Satellite Label')
    return {
        nucleus: [
            tuple(record[field] for field in satellite_fields)
            for _, record in members.iterrows()
        ]
        for nucleus, members in table.groupby('Nucleus')
    }

# Read the chains back from the CSV file.
chains = load_chains_as_dict("elaboration_chains.csv")

# One text block per nucleus: a header followed by one arrow line per satellite.
texts_to_analyze = [
    (
        f"Nucleus:\n{nucleus}\n"
        + "".join(
            f"→ {rel}: {sat_text} [Label: {sat_label}]\n"
            for rel, sat_text, sat_label in satellites
        )
    ).strip()
    for nucleus, satellites in chains.items()
]

# Blocks are separated by a rule line in the single string built for the API.
api_input_text = "\n\n---\n\n".join(texts_to_analyze)

In [None]:
# identifying fallacies using LLMs (initial stage using Claude, with stance 8 being the example)

# Use the explicit %pip magic: it installs into the running kernel's
# environment and does not depend on automagic resolving a bare `pip`.
%pip install anthropic
import anthropic
import os

# Never hard-code the key: assigning a literal here would overwrite a real key
# already exported in the environment and would end up in the saved notebook.
# Prompt for it only when it is missing.
if not os.environ.get("ANTHROPIC_API_KEY"):
    from getpass import getpass
    os.environ["ANTHROPIC_API_KEY"] = getpass("Anthropic API key: ")

# No api_key argument is passed, so the client is left to pick the key up
# from the ANTHROPIC_API_KEY environment variable ensured above.
client = anthropic.Anthropic()

def process_with_claude(content, system_prompt, model="claude-3-7-sonnet-20250219",
                        max_tokens=8000, temperature=0.0):
    """Send one user message to Claude and return the reply text.

    Uses the module-level `client`.

    Args:
        content: Text sent as the single user message.
        system_prompt: Text passed as the request's system prompt.
        model: Model identifier; defaults to the one this notebook was run with.
        max_tokens: Value passed as the request's `max_tokens`.
        temperature: Sampling temperature passed to the request.

    Returns:
        The `text` of the first content block of the response.

    Raises:
        Exception: any error from the request or from reading the response is
            printed and then re-raised unchanged.
    """
    print("Sending request to Claude API...")

    try:
        response = client.messages.create(
            model=model,
            system=system_prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            messages=[
                {"role": "user", "content": content}
            ]
        )
        # Return the text of the first content block.
        return response.content[0].text
    except Exception as e:
        print(f"Error calling Claude API: {str(e)}")
        raise

# One text block per nucleus: a header followed by one arrow line per satellite.
texts_to_analyze = [
    (
        f"Nucleus:\n{nucleus}\n"
        + "".join(
            f"→ {rel}: {sat_text} [Label: {sat_label}]\n"
            for rel, sat_text, sat_label in satellites
        )
    ).strip()
    for nucleus, satellites in chains.items()
]

api_input_text = "\n\n---\n\n".join(texts_to_analyze)

# Instruction sentences, joined with single spaces into the system prompt.
system_prompt = " ".join((
    "You are an expert in rhetorical theory and informal logic.",
    "Analyze the following discourse chains for potential fallacies according to informal logic.",
    "Meanwhile, pay attention to the rhetorical relations that are present in the chains that contain fallacies.",
    "See if there are potential correlation or patterns.",
    "Return your findings with references to the relevant text.",
))

# All chains go to Claude in one request; the reply is printed as-is.
fallacy_analysis = process_with_claude(api_input_text, system_prompt)
print(fallacy_analysis)

In [None]:
# validation step using both Claude and Gemini; code for google colab
# Use the explicit %pip magic: it installs into the running kernel's
# environment and does not depend on automagic resolving a bare `pip`.
%pip install -q -U google-generativeai

import google.generativeai as genai
import os
from google.colab import userdata

# Read the key from Colab's secret store and hand it to the SDK. The key was
# previously fetched but never passed on, so the SDK was never given it.
GEMINI_API_KEY = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GEMINI_API_KEY)

def process_with_gemini(content, model_name='gemini-1.5-pro-latest',
                        max_output_tokens=4000, temperature=0.0):
    """Ask Gemini to analyse the given discourse chains for fallacies.

    The fixed instructions and `content` are sent together as one prompt.

    Args:
        content: Text of the discourse chains, appended after the instructions.
        model_name: Gemini model identifier; defaults to the one this notebook
            was run with.
        max_output_tokens: Value passed as the generation config's output cap.
        temperature: Sampling temperature passed to the generation config.

    Returns:
        The response text, or None when the request or reading the response
        raises (the error is printed, not re-raised).
    """
    model = genai.GenerativeModel(model_name)

    user_prompt = (
        "You are an expert in rhetorical theory and informal logic. "
        "Analyze the following discourse chains for potential fallacies according to informal logic. "
        "Meanwhile, pay attention to the rhetorical relations that are present in the chains that contain fallacies. "
        "See if there are potential correlation or patterns. "
        "Return your findings with references to the relevant text."
        # Blank line between instructions and data; previously the first chain
        # was glued directly onto the last instruction sentence.
        f"\n\n{content}"
    )

    try:
        response = model.generate_content(
            user_prompt,
            generation_config=genai.GenerationConfig(
                max_output_tokens=max_output_tokens,
                temperature=temperature,
            )
        )
        # Reading .text sits inside the try on purpose, so a failure here is
        # handled the same way as a failed request.
        return response.text
    except Exception as e:
        # Deliberate best effort: report and return None so the caller can
        # continue with the remaining runs.
        print(f"Error calling Gemini API: {str(e)}")
        return None

# One text block per nucleus: a header followed by one arrow line per satellite.
texts_to_analyze = [
    (
        f"Nucleus:\n{nucleus}\n"
        + "".join(
            f"→ {rel}: {sat_text} [Label: {sat_label}]\n"
            for rel, sat_text, sat_label in satellites
        )
    ).strip()
    for nucleus, satellites in chains.items()
]

api_input_text = "\n\n---\n\n".join(texts_to_analyze)

# --- Run the analysis twice ---
# The same input is sent twice; each result is printed, or a failure notice
# when the call returned nothing.

print("\n--- FIRST RUN OF GEMINI ANALYSIS ---")
fallacy_analysis_run1 = process_with_gemini(api_input_text)
print(fallacy_analysis_run1 or "First Gemini run failed.")

print("\n--- SECOND RUN OF GEMINI ANALYSIS ---")
fallacy_analysis_run2 = process_with_gemini(api_input_text)
print(fallacy_analysis_run2 or "Second Gemini run failed.")