project/
│
├── main.py
├── config.py
├── chunking.py
├── embeddings.py
├── vector_store.py
├── retrieval.py
├── timeline.py
├── claims.py
├── reasoning.py
├── cache.py
├── pathway_rag.py      # optional but impressive
└── llm_utils.py


In [2]:
import pandas as pd
import os

In [3]:
# Load the labelled training CSV from a hard-coded local path and display it.
# The preview below shows the columns: id, book_name, char, caption, content, label.
df=pd.read_csv('/Users/Shreyanshsingh/Downloads/train.csv')
df

Unnamed: 0,id,book_name,char,caption,content,label
0,46,In Search of the Castaways,Thalcave,,Thalcave’s people faded as colonists advanced;...,consistent
1,137,The Count of Monte Cristo,Faria,The Origin of His Connection with the Count of...,"Suspected again in 1815, he was re-arrested an...",contradict
2,74,In Search of the Castaways,Kai-Koumou,,Before each fight he studied the crack-pattern...,consistent
3,109,The Count of Monte Cristo,Noirtier,The Complexity of Family and Personal Life,Villefort’s drift toward the royalists disappo...,contradict
4,104,The Count of Monte Cristo,Noirtier,Involvement and Turning Point in the French Re...,His parents were targeted in a reprisal for su...,consistent
...,...,...,...,...,...,...
75,90,The Count of Monte Cristo,Noirtier,A Double Life in the Napoleonic Era,To obtain royalist intelligence from the Vendé...,consistent
76,100,The Count of Monte Cristo,Noirtier,Early Life and Political Awakening,Growing up in Paris he devoured Voltaire and R...,consistent
77,138,The Count of Monte Cristo,Faria,Addendum to Character Relationships,Long political warfare severed him from his fa...,consistent
78,130,The Count of Monte Cristo,Faria,The Answer to a Lingering Question,What seemed an epileptic fit was in fact sudde...,contradict


In [4]:
# Load the test CSV from a hard-coded local path and display it.
# The preview below shows the same columns as the training file minus `label`.
df1=pd.read_csv('/Users/Shreyanshsingh/Downloads/test.csv')
df1

Unnamed: 0,id,book_name,char,caption,content
0,95,The Count of Monte Cristo,Noirtier,The Fatal Decision of the Hundred Days,Learning that Villefort meant to denounce him ...
1,136,The Count of Monte Cristo,Faria,Escape and Secret Life,From 1800 onward he lived quietly on a small i...
2,59,In Search of the Castaways,Thalcave,,"Posing as a relay-station hand, he slipped cap..."
3,60,In Search of the Castaways,Thalcave,,First rescue: in 1852 an avalanche buried a si...
4,124,The Count of Monte Cristo,Faria,Foreshadowing of Relationships,On the Marseille quay he noticed young Caderou...
5,111,The Count of Monte Cristo,Noirtier,Wisdom and Influence in the Post-Revolution Era,Though bodily strength ebbed he still pulled s...
6,135,The Count of Monte Cristo,Faria,Secret Society and Political Struggle,A failed 1796 coup landed him in a Roman priso...
7,27,In Search of the Castaways,Tom Ayrton/Ben Joyce,,"At twelve he ran away to the docks, worked as ..."
8,110,The Count of Monte Cristo,Noirtier,The Complexity of Family and Personal Life,He kept a locked study full of revolutionary p...
9,42,In Search of the Castaways,Tom Ayrton/Ben Joyce,,He accepted a lucrative berth on the British m...


In [5]:
# Global Configuration Constants

# --- BEST CASE HYPERPARAMETERS ---
# CHUNK_SIZE / CHUNK_OVERLAP are the default window and overlap (counted in
# tokenizer tokens) of smart_chunk().
CHUNK_SIZE = 500              # Reduced from 1000 for higher precision
CHUNK_OVERLAP = 200           # Balanced overlap for context
# NOTE(review): the four constants below are not referenced by any code visible
# in this notebook export (evaluate() hard-codes k=15 and its own thresholds,
# build_timeline() hard-codes 0.25 / 0.75) — confirm whether they are still needed.
RETRIEVAL_TOP_K = 10          # Higher K for Reranker to sift through
RERANK_TOP_K = 3              # Final chunks passed to logic loop
# NOTE(review): the "Slightly more sensitive" remark that trailed PHASE_SPLITS
# presumably described this threshold — confirm.
CONTRADICTION_THRESHOLD = 0.35
PHASE_SPLITS={"early":0.2,"middle":0.35,"late":0.45}

In [6]:
import transformers
!pip install -U sentence-transformers



In [7]:
import sys
!{sys.executable} -m pip install -U sentence-transformers



In [8]:
def smart_chunk(
    text,
    tokenizer,
    chunk_size=CHUNK_SIZE,
    overlap=CHUNK_OVERLAP
):
    """Split ``text`` into overlapping token windows.

    Args:
        text: Text to split.
        tokenizer: Object with ``encode(text)`` returning a sliceable token
            sequence and ``decode(tokens)`` returning a string.
        chunk_size: Maximum number of tokens per chunk.
        overlap: Number of tokens shared by consecutive chunks; must be
            smaller than ``chunk_size``.

    Returns:
        A list of dicts with keys ``chunk_id``, ``text``, ``start``, ``end``
        (token offsets, ``end`` exclusive) and ``position_ratio``
        (``start`` divided by the total token count).

    Raises:
        ValueError: If ``chunk_size - overlap`` is not positive. The window
            would never advance, which previously caused an endless loop.
    """
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    tokens = tokenizer.encode(text)
    total_tokens = len(tokens)

    chunks = []
    for chunk_id, start in enumerate(range(0, total_tokens, step)):
        end = min(start + chunk_size, total_tokens)
        chunks.append({
            "chunk_id": chunk_id,
            "text": tokenizer.decode(tokens[start:end]),
            "start": start,
            "end": end,
            # max(1, ...) keeps the division safe; always present on every chunk
            "position_ratio": start / max(1, total_tokens)
        })

    return chunks


In [9]:
import hashlib
import json
import os




from sentence_transformers import SentenceTransformer

# Module-level sentence-embedding model shared by get_embedding() and evaluate().
# NOTE(review): the device is pinned to "mps", so this cell presumably fails on
# machines without Apple's Metal backend — use "cpu" there.
# NOTE(review): trust_remote_code=True permits code shipped with the model repo
# to run locally — confirm this checkpoint actually requires it.
_local_embedder = SentenceTransformer(
    "intfloat/e5-base-v2",
    device="mps" ,
    # or "cpu"
    trust_remote_code=True
)

def get_embedding(text: str):
    """Encode ``text`` with the shared local embedder, requesting normalized output."""
    vector = _local_embedder.encode(text, normalize_embeddings=True)
    return vector


# Integrated cache functions


In [10]:
import sys
!{sys.executable} -m pip install faiss-cpu



In [11]:
!pip install faiss-cpu
import faiss
import numpy as np

class FAISSStore:
    """Flat inner-product FAISS index paired with one metadata entry per vector.

    ``metadata[i]`` describes the i-th vector added to ``index``.
    """

    def __init__(self, dim):
        """Create an empty store for vectors of dimensionality ``dim``."""
        self.index = faiss.IndexFlatIP(dim)
        self.metadata = []

    def add(self, embedding, meta):
        """Append one embedding vector together with its metadata entry."""
        self.index.add(np.array([embedding]).astype("float32"))
        self.metadata.append(meta)

    def search(self, embedding, k=8):
        """Return the metadata of up to ``k`` nearest vectors, best match first.

        FAISS pads the result with index -1 when fewer than ``k`` vectors are
        stored. Those slots are dropped; indexing ``metadata[-1]`` would
        otherwise silently return the last entry (or raise on an empty store).
        """
        _, ids = self.index.search(
            np.array([embedding]).astype("float32"), k
        )
        return [self.metadata[i] for i in ids[0] if i >= 0]



In [12]:
import hashlib

def deduplicate(chunks):
    """
    Keep the first chunk for each distinct "text" value, preserving input order.
    """
    fingerprints = set()
    kept = []

    for chunk in chunks:
        digest = hashlib.sha256(chunk["text"].encode()).hexdigest()
        if digest in fingerprints:
            continue
        fingerprints.add(digest)
        kept.append(chunk)

    return kept


In [13]:


# Fixed, claim-independent retrieval prompts about the protagonist.
# NOTE(review): not referenced by any function visible in this notebook export.
STATIC_QUERIES = [
    "protagonist acts of violence or physical conflict",
    "protagonist moral decisions and ethical choices",
    "protagonist interactions with authority figures",
    "protagonist fear, motivation, or belief statements",
    "protagonist childhood or formative experiences"
]
def generate_claim_queries(claim_text):
    """Return two retrieval queries for a claim: one supporting, one opposing."""
    supporting = f"scenes related to {claim_text}"
    opposing = f"actions that contradict {claim_text}"
    return [supporting, opposing]
def retrieve_relevant_chunks(store, claims, top_k=6):
    """Collect the ``top_k`` store hits for every claim into one flat list.

    Each claim dict's "claim" text is embedded and searched in ``store``.
    Hits are appended in claim order and are not de-duplicated.

    Raises:
        RuntimeError: If a returned hit has no "position_ratio" key.
    """
    collected = []

    for entry in claims:
        query_vec = get_embedding(entry["claim"])

        for hit in store.search(query_vec, k=top_k):
            # Later stages bucket by position, so fail loudly if it is missing
            if "position_ratio" not in hit:
                raise RuntimeError(
                    "FAISS returned chunk without position_ratio. "
                    "Check store.add() logic."
                )
            collected.append(hit)

    return collected




In [14]:
import re

# -----------------------------
# Knowledge priors
# -----------------------------
# Verbs that extract_actions_traits() looks for as whole words in the
# lower-cased text. Only the exact form matches its \b...\b pattern, so
# inflections such as "killed" are not detected.
ACTION_VERBS = [
    "kill", "attack", "fight", "escape", "save", "protect",
    "betray", "help", "steal", "lie", "confess", "refuse"
]

# Character-trait adjectives matched the same way as ACTION_VERBS.
TRAIT_KEYWORDS = [
    "brave", "coward", "honest", "cruel", "kind",
    "fearful", "loyal", "greedy", "selfish"
]

# Reliability weight per evidence type returned by infer_evidence_type().
# NOTE: a later cell assigns EVIDENCE_WEIGHTS again with different values;
# whichever assignment ran last is the one extract_actions_traits() reads.
EVIDENCE_WEIGHTS = {
    "ACTION": 1.0,
    "SELF_STATEMENT": 0.7,
    "THIRD_PARTY_STATEMENT": 0.4
}

# -----------------------------
# Evidence inference
# -----------------------------
def infer_evidence_type(text):
    """Classify a passage as SELF_STATEMENT, THIRD_PARTY_STATEMENT or ACTION.

    A first-person pronoun ("I", "my", "me", "myself") anywhere in the text
    yields SELF_STATEMENT. Otherwise a reporting phrase ("he said",
    "she said", "they said", "was rumored") yields THIRD_PARTY_STATEMENT.
    Everything else is treated as ACTION.

    Pronouns are matched on word boundaries, so they are also found at the
    start or end of the text and next to punctuation ("I killed him",
    "It was me."), which the earlier space-delimited test missed.
    """
    text_l = text.lower()

    if re.search(r"\b(?:i|my|me|myself)\b", text_l):
        return "SELF_STATEMENT"

    if any(p in text_l for p in ["he said", "she said", "they said", "was rumored"]):
        return "THIRD_PARTY_STATEMENT"

    return "ACTION"

# -----------------------------
# Action / trait extraction
# -----------------------------
def extract_actions_traits(text):
    """Extract known action verbs, trait words and an evidence weight from ``text``.

    Returns a dict with "actions" and "traits" (vocabulary entries found as
    whole words in the lower-cased text, in vocabulary order),
    "evidence_type" (from infer_evidence_type) and "weight" (the matching
    EVIDENCE_WEIGHTS entry).
    """
    lowered = text.lower()

    def _present(vocabulary):
        # Whole-word search for each vocabulary entry
        return [word for word in vocabulary if re.search(rf"\b{word}\b", lowered)]

    found_actions = _present(ACTION_VERBS)
    found_traits = _present(TRAIT_KEYWORDS)
    kind = infer_evidence_type(text)

    return {
        "actions": found_actions,
        "traits": found_traits,
        "evidence_type": kind,
        "weight": EVIDENCE_WEIGHTS[kind]
    }

# -----------------------------
# Timeline construction
def build_timeline(enriched_items):
    """Bucket items into early / middle / late by their "position_ratio".

    Ratios below 0.25 are "early", below 0.75 "middle", everything else
    "late". Items without the key count as ratio 0. Input order is kept
    inside each bucket.
    """
    buckets = {"early": [], "middle": [], "late": []}
    for entry in enriched_items:
        ratio = entry.get("position_ratio", 0)
        phase = "early" if ratio < 0.25 else ("middle" if ratio < 0.75 else "late")
        buckets[phase].append(entry)
    return buckets

def calculate_name_overlap(text1, text2):
    """Return the share of capitalized words common to both texts.

    Every word made of one ASCII capital followed by lowercase ASCII letters
    counts as a "name", sentence-initial words included. The result is the
    size of the intersection divided by the size of the smaller name set,
    or 1.0 when either text has no such word.
    """
    capitalized = re.compile(r'\b[A-Z][a-z]+\b')
    first = set(capitalized.findall(text1))
    second = set(capitalized.findall(text2))
    if not (first and second):
        return 1.0  # nothing to compare
    shared = first & second
    return len(shared) / min(len(first), len(second))



In [15]:
# ===============================
# Contradiction + Weighting Logic
# ===============================

# ---- Evidence reliability ----
# NOTE: rebinds the EVIDENCE_WEIGHTS defined in the earlier "Knowledge priors"
# cell with lower values for the two statement types.
EVIDENCE_WEIGHTS = {
    "ACTION": 1.0,
    "SELF_STATEMENT": 0.6,
    "THIRD_PARTY_STATEMENT": 0.3
}

# ---- Behavior severity ----
# Looked up by behavior_weight(); actions missing from this table fall back
# to 0.7 there.
ACTION_WEIGHTS = {
    "kill": 1.0,
    "attack": 0.8,
    "betray": 0.7,
    "steal": 0.6,
    "lie": 0.4,
    "help": 0.3,
    "protect": 0.4
}

# Looked up by behavior_weight(); traits missing from this table fall back
# to 0.6 there.
TRAIT_WEIGHTS = {
    "violent": 0.9,
    "cruel": 0.9,
    "coward": 0.6,
    "greedy": 0.6,
    "kind": 0.4,
    "brave": 0.4,
    "honest": 0.5
}

# ---- Narrative phase ----
# Multiplier per timeline phase, read by compute_contradiction_score().
# NOTE: a later cell (just above evaluate()) assigns PHASE_WEIGHTS again.
PHASE_WEIGHTS = {
    "early": 0.8,
    "middle": 0.98,
    "late": 1
}

# NOTE(review): not referenced by any function visible in this notebook export.
NEGATION_PAIRS = [
    ("violent", "non-violent"),
    ("cruel", "kind"),
    ("coward", "brave"),
    ("selfish", "selfless"),
    ("kill", "never killed")
]

def _norm(text):
    return text.lower().strip()
    

# ---- Semantic Antonym Clusters ----
# Each entry is a pair of word sets (A, B). contradicts() flags a collision
# when a word from one set occurs in the claim and a word from the other set
# occurs in the fact, in either direction. Words are compared lower-cased.
# "abbé" is spelled with its accent; the previous mis-encoded literal could
# never match the novel text.
CLASH_CLUSTERS = [
    ({"banker", "clerk", "merchant"}, {"count", "noble", "aristocrat", "abbé"}),
    ({"sailor", "captain", "mate"}, {"soldier", "general", "colonel"}),
    ({"forgave", "pardon", "mercy", "peace", "forget"}, {"revenge", "vengeance", "punish", "retribution", "plot"}),
    ({"innocent", "honest", "truthful", "loyal"}, {"guilty", "traitor", "betrayed", "liar", "deceived", "spy"}),
    ({"alive", "living", "survived", "safe"}, {"dead", "killed", "corpse", "died", "murdered", "execution"}),
    ({"immediately", "fast", "quickly", "soon"}, {"years", "decades", "long", "slowly", "eventually"}),
    ({"rich", "wealth", "fortune"}, {"poor", "poverty", "destitute", "bankrupt"})
]

import torch
import torch
import torch.nn.functional as F

def behavior_weight(fact_data):
    """
    Return the intensity weight of a fact dict as the maximum of several candidates.

    Candidates: a 0.5 baseline; the ACTION_WEIGHTS entry of each listed action
    (0.7 when unlisted); the TRAIT_WEIGHTS entry of each listed trait (0.6 when
    unlisted); and 0.9 when the string form of the whole dict contains one of
    the critical substrings.
    """
    candidates = [0.5]

    # Severity of each extracted action / trait
    candidates += [
        ACTION_WEIGHTS.get(action.lower(), 0.7)
        for action in fact_data.get("actions", [])
    ]
    candidates += [
        TRAIT_WEIGHTS.get(trait.lower(), 0.6)
        for trait in fact_data.get("traits", [])
    ]

    # Safety net: substring scan over the stringified dict, so a critical
    # word still counts even when extraction missed the verb
    serialized = str(fact_data).lower()
    for marker in ("murder", "prison", "betray", "never", "secret"):
        if marker in serialized:
            candidates.append(0.9)
            break

    return max(candidates)
    

def contradicts(claim_text, claim_emb, fact_item):
    """Heuristically decide whether a retrieved passage contradicts a claim.

    Combines the cosine similarity of the two embeddings with lexical
    signals: a CLASH_CLUSTERS collision, a negation-word mismatch, and the
    share of claim words that do not occur in the passage.

    Args:
        claim_text: The claim sentence.
        claim_emb: Claim embedding as a NumPy array or torch tensor.
        fact_item: Dict holding the passage under "text" and its embedding
            under "embedding".

    Returns:
        ``(is_contradiction, similarity)``. ``(False, 0.0)`` is returned
        when either embedding is missing.
    """
    fact_text = fact_item.get("text", "").lower()
    fact_emb = fact_item.get("embedding")
    # Without both vectors there is nothing to compare
    if fact_emb is None or claim_emb is None:
        return False, 0.0

    # 1. Similarity
    c_tensor = torch.from_numpy(claim_emb).float() if isinstance(claim_emb, np.ndarray) else claim_emb.float()
    f_tensor = torch.from_numpy(fact_emb).float() if isinstance(fact_emb, np.ndarray) else fact_emb.float()
    sim = F.cosine_similarity(c_tensor.unsqueeze(0), f_tensor.unsqueeze(0)).item()

    # 2. Word sets. Tokens come from a plain whitespace split, so attached
    #    punctuation stays part of the token ("died." is not "died").
    claim_words = set(_norm(claim_text).split())
    fact_words = set(fact_text.split())

    # 3. Cluster collision: claim uses one side of a pair, fact the other
    cluster_collision = False
    for group_a, group_b in CLASH_CLUSTERS:
        if (claim_words & group_a and fact_words & group_b) or \
           (claim_words & group_b and fact_words & group_a):
            cluster_collision = True
            break

    # 4. Negation mismatch: exactly one of the two texts contains a negator
    negators = {"not", "never", "no", "neither", "refused", "denied", "without"}
    neg_mismatch = bool(claim_words & negators) != bool(fact_words & negators)

    is_con = False

    # CASE A: explicit clash, or negation mismatch on a similar passage
    if cluster_collision or (neg_mismatch and sim > 0.5):
        is_con = True

    # CASE B: very similar vectors but mostly different words, which suggests
    # a substituted fact. Skipped for an empty claim to avoid dividing by zero.
    elif sim > 0.70:
        if claim_words:
            diff = len(claim_words - fact_words) / len(claim_words)
            if diff > 0.6:  # more than 60% of the claim words are absent
                is_con = True

    # CASE C: moderately similar passage with a negation mismatch
    elif 0.30 < sim < 0.60:
        if neg_mismatch:
            is_con = True

    return is_con, sim
def compute_contradiction_score(claims, timeline):
    """Sum, over all claims, the strongest contradiction found in the timeline.

    Each claim is checked with contradicts() against every timeline item,
    scanning the late, middle and early phases in that order. Only the
    highest-scoring contradiction per claim is added to the total.

    Args:
        claims: Dicts with "claim", "weight" and optionally "emb".
        timeline: Mapping of phase name to a list of item dicts. Items are
            read for "text", "facts" and optionally "rerank_score".

    Returns:
        ``(total_score, details)``; ``details`` holds one dict
        (claim, fact excerpt, delta) per claim that had a contradiction.
    """
    running_total = 0.0
    evidence_log = []

    for claim in claims:
        # Best contradiction is tracked separately for every claim
        top_delta = 0.0
        top_evidence = None
        claim_vec = claim.get('emb')

        for phase in ("late", "middle", "early"):
            for entry in timeline.get(phase, []):
                flagged, similarity = contradicts(claim["claim"], claim_vec, entry)
                if not flagged:
                    continue

                intensity = behavior_weight(entry["facts"])
                phase_factor = PHASE_WEIGHTS.get(phase, 1.0)

                # Relevance grows with similarity and with the rerank score
                relevance = (similarity + 0.5) * (entry.get("rerank_score", 1.0) / 2.0)
                delta = (0.5 + (claim["weight"] * intensity)) * phase_factor * relevance

                if delta > top_delta:
                    top_delta = delta
                    top_evidence = {
                        "claim": claim["claim"],
                        "fact": entry["text"][:200],
                        "delta": round(delta, 3)
                    }

        # Only the strongest contradiction per claim contributes
        if top_delta > 0:
            running_total += top_delta
            evidence_log.append(top_evidence)

    return running_total, evidence_log


In [16]:
def extract_claims(backstory_text):
    """Split a backstory into sentence-level claims with a priority weight.

    Sentences are split after ``.``, ``!`` or ``?`` followed by any
    whitespace, newlines included. Sentences shorter than ten words are
    dropped. A sentence gets weight 0.8 when one of its words starts with a
    high-stakes term ("betrayed", "killed", "lied", "stole", "murdered",
    "secret", "never"), otherwise 0.5. Matching is anchored at the start of
    a word, so "replied" no longer counts as "lied" while "secretly" still
    counts as "secret".

    Args:
        backstory_text: Backstory string. Any non-string value (for example
            a NaN from a CSV cell) yields no claims.

    Returns:
        List of ``{"claim": str, "weight": float}`` dicts in sentence order.
    """
    if not isinstance(backstory_text, str):
        return []

    sentences = re.split(r'(?<=[.!?])\s+', backstory_text.strip())
    high_stakes = re.compile(
        r"\b(?:betrayed|killed|lied|stole|murdered|secret|never)",
        re.IGNORECASE,
    )

    claims = []
    for sentence in sentences:
        if len(sentence.split()) < 10:
            continue
        # Boost weight for actionable claims
        weight = 0.8 if high_stakes.search(sentence) else 0.5
        claims.append({
            "claim": sentence.strip(),
            "weight": weight
        })
    return claims

In [17]:
#!pip install pathway
#import pathway as pw

#class StreamingRAG(pw.Schema):
    #text: str

#def pathway_stream(chunks):
    #table = pw.debug.table_from_list(
        #[{"text": c["text"]} for c in chunks],
        #schema=StreamingRAG
    #)

    # You can add real-time ingestion here
    #return table

In [18]:
class UnifiedStoryDataset:
    """Labelled (novel, backstory) samples backed by a CSV and a novels directory.

    Each item is a dict with "id", "novel" (full text), "backstory" and
    "label" (1 = consistent, 0 = contradict). Novel texts are read once per
    book and cached.
    """

    # Text labels as they appear in the training CSV preview
    _LABEL_MAP = {"consistent": 1, "contradict": 0}

    def __init__(self, csv_path, novels_dir):
        """Load the CSV and check that all required columns are present.

        Raises:
            ValueError: If a required column is missing.
        """
        self.df = pd.read_csv(csv_path)
        self.novels_dir = novels_dir
        self._novel_cache = {}

        required_cols = {"id", "book_name", "content", "label"}
        missing = required_cols - set(self.df.columns)
        if missing:
            raise ValueError(f"CSV missing required columns: {sorted(missing)}")

    def __len__(self):
        """Return the number of rows, so the dataset works with len()-based loaders."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the sample at positional index ``idx``.

        Raises:
            FileNotFoundError: If the novel file cannot be found.
            ValueError: If the label is neither a known label string nor
                convertible to int.
        """
        row = self.df.iloc[idx]
        book = row["book_name"]

        if book not in self._novel_cache:
            novel_path = os.path.join(self.novels_dir, book)
            # book_name carries no extension in the CSV; fall back to "<name>.txt"
            if not os.path.exists(novel_path):
                novel_path = f"{novel_path}.txt"
            with open(novel_path, encoding="utf-8") as f:
                self._novel_cache[book] = f.read()

        raw_label = row["label"]
        label_key = raw_label.strip().lower() if isinstance(raw_label, str) else None
        if label_key in self._LABEL_MAP:
            label = self._LABEL_MAP[label_key]
        else:
            label = int(raw_label)  # numeric labels pass through unchanged

        return {
            "id": row["id"],
            "novel": self._novel_cache[book],
            "backstory": row["content"],
            "label": label
        }


In [19]:
import pandas as pd
import os

class BackstoryDataset:
    """
    Each item = {
        id, novel, backstory, label (optional)
    }

    Rows come from a CSV; the novel text is read from
    ``<novels_dir>/<book_name>.txt`` and cached per book, so each novel is
    read from disk only once.
    """

    LABEL_MAP = {
        "consistent": 1,
        "contradict": 0
    }

    def __init__(self, csv_path, novels_dir):
        """Load the CSV and check that the required columns are present.

        Raises:
            ValueError: If a required column is missing; the message names
                the missing columns.
        """
        self.df = pd.read_csv(csv_path)
        self.novels_dir = novels_dir
        self._novel_cache = {}

        required_cols = {"id", "book_name", "content"}
        missing = required_cols - set(self.df.columns)
        if missing:
            raise ValueError(f"CSV missing required columns: {sorted(missing)}")

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.df)

    def _load_novel(self, book_name):
        """Return the text of ``book_name``, reading the file on first use only."""
        if book_name not in self._novel_cache:
            novel_path = os.path.join(
                self.novels_dir,
                f"{book_name}.txt"
            )

            if not os.path.exists(novel_path):
                raise FileNotFoundError(f"Novel not found: {novel_path}")

            with open(novel_path, encoding="utf-8") as f:
                self._novel_cache[book_name] = f.read()

        return self._novel_cache[book_name]

    def __getitem__(self, idx):
        """Return the sample at positional index ``idx``.

        Raises:
            FileNotFoundError: If the novel file does not exist.
            ValueError: If a label is present but not in LABEL_MAP.
        """
        row = self.df.iloc[idx]

        sample = {
            "id": row["id"],
            "novel": self._load_novel(row["book_name"]),
            "backstory": row["content"]
        }

        # The label column is optional (the test CSV has none)
        if "label" in self.df.columns:
            label_str = str(row["label"]).strip().lower()
            if label_str not in self.LABEL_MAP:
                raise ValueError(f"Unknown label: {label_str}")
            sample["label"] = self.LABEL_MAP[label_str]

        return sample


In [20]:
class DataLoader:
    """Minimal one-sample-at-a-time loader over an indexable dataset with a length."""

    def __init__(self, dataset, shuffle=False):
        """Remember the dataset and prepare the list of indices to visit."""
        self.dataset = dataset
        self.shuffle = shuffle
        self.indices = [*range(len(dataset))]

    def __iter__(self):
        """Yield every sample once; the index list is reshuffled in place first when enabled."""
        if self.shuffle:
            import random
            random.shuffle(self.indices)

        yield from (self.dataset[position] for position in self.indices)


In [21]:
def train(loader, tokenizer):
    """Run evaluate() over labelled samples and report accuracy.

    Despite the name, nothing is fitted: every sample is scored with the
    rule-based evaluate() and the prediction is compared with its label.

    Args:
        loader: Iterable of sample dicts with "novel", "backstory" and a
            label under "label", "gold_label" or "target".
        tokenizer: Tokenizer forwarded to evaluate().

    Returns:
        ``(accuracy, errors)``; ``errors`` lists id, prediction, label and
        explanation for every misclassified sample.
    """
    correct = 0
    total = 0
    errors = []
    store_cache = {}

    for i, sample in enumerate(loader):
        # 1. Label lookup. Keys are tested against None explicitly so that a
        #    legitimate label of 0 is not mistaken for "missing".
        raw_label = None
        for key in ("label", "gold_label", "target"):
            candidate = sample.get(key)
            if candidate is not None:
                raw_label = candidate
                break

        if raw_label is None:
            print(f"[WARN] Skipping sample {i}: No label found. Keys available: {list(sample.keys())}")
            continue

        # 2. Standardize the label to 1 / 0. Any string other than
        #    "consistent" (ignoring case and surrounding whitespace) maps to 0.
        if isinstance(raw_label, str):
            true_label = 1 if raw_label.strip().lower() == "consistent" else 0
        else:
            true_label = int(raw_label)

        # 3. Predict; the FAISS store cache is shared across samples
        pred, expl = evaluate(
            novel=sample["novel"],
            backstory=sample["backstory"],
            tokenizer=tokenizer,
            store_cache=store_cache
        )

        # 4. Compare against the standardized label
        if pred == true_label:
            correct += 1
        else:
            errors.append({
                "id": sample.get("id", i),
                "prediction": pred,
                "label": true_label,
                "explanation": expl
            })

        total += 1

    acc = correct / total if total > 0 else 0.0
    print("\n--- TRAINING FINISHED ---")
    print(f"Total Processed: {total}")
    print(f"Final Accuracy: {acc:.3f}")

    return acc, errors

In [22]:
from sentence_transformers import CrossEncoder

# Load a lightweight cross-encoder; evaluate() uses it to score (claim, chunk) pairs.
# NOTE(review): the checkpoint name points to an MS MARCO passage-ranking model, so it
# presumably scores query/passage relevance, not entailment or "B follows A" — confirm
# against the model card before treating its score as contradiction evidence.
re_ranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')


In [23]:
import re
import numpy as np

# Rebinds the PHASE_WEIGHTS defined in the earlier weighting cell.
# compute_contradiction_score() resolves the name at call time, so it uses these
# values once this cell has run. evaluate() below does not read PHASE_WEIGHTS.
PHASE_WEIGHTS = {"late": 1.2, "middle": 1.0, "early": 1.4}

def evaluate(novel, backstory, tokenizer, store_cache):
    """Predict whether a backstory is consistent (1) or contradictory (0) with a novel.

    The novel is chunked, embedded and indexed once per distinct novel; the
    resulting store is kept in ``store_cache``. Each claim extracted from the
    backstory retrieves 15 chunks, which are reranked and checked with
    contradicts(). The strongest contradiction per claim feeds two totals
    that decide the verdict.

    Args:
        novel: Full novel text.
        backstory: Backstory text to verify.
        tokenizer: Tokenizer passed to smart_chunk().
        store_cache: Dict of already-built stores; updated in place.

    Returns:
        ``(prediction, explanation)``, where ``explanation`` is
        ``{"label": prediction, "details": [...]}``.
    """
    # 1. Fetch or build the FAISS store, keyed by the hash of the first 1000 characters
    cache_key = hash(novel[:1000])
    if cache_key in store_cache:
        store = store_cache[cache_key]
    else:
        passages = smart_chunk(novel, tokenizer)
        store = FAISSStore(dim=768)
        # Batch size 32 keeps memory use modest on the local machine
        vectors = _local_embedder.encode(
            [p["text"] for p in passages],
            normalize_embeddings=True,
            batch_size=32
        )
        for passage, vector in zip(passages, vectors):
            store.add(vector, {
                "text": passage["text"],
                "embedding": vector,
                "position_ratio": passage["position_ratio"]
            })
        store_cache[cache_key] = store

    # 2. Claim extraction; with no claims the backstory counts as consistent
    claims = extract_claims(backstory)
    if not claims:
        return 1, {"label": 1, "details": []}

    # 3. Claim-by-claim processing
    friction_sum = 0.0
    strongest = 0.0
    findings = []

    for claim in claims:
        claim_text = claim["claim"]
        claim_vec = get_embedding(claim_text)
        claim_weight = claim.get("weight", 0.6)

        hits = store.search(claim_vec, k=15)
        if not hits:
            continue

        rerank_scores = re_ranker.predict([[claim_text, h["text"]] for h in hits])

        claim_peak = 0.0
        peak_text = None

        for hit, score in zip(hits, rerank_scores):
            # Gate: positive rerank score, or a stored 'score' above 0.75
            if not (score > 0.0 or hit.get('score', 0) > 0.75):
                continue
            # Identity check: skip passages sharing too few capitalized words
            if calculate_name_overlap(claim_text, hit["text"]) < 0.2:
                continue
            # Logic check
            flagged, _ = contradicts(claim_text, claim_vec, hit)
            if not flagged:
                continue

            # Sigmoid of the rerank score, scaled by the claim weight
            confidence = 1.0 / (1.0 + np.exp(-score))
            impact = (1.0 + claim_weight) * (confidence * 3.5)

            if impact > claim_peak:
                claim_peak = impact
                peak_text = hit["text"]

        friction_sum += claim_peak
        if claim_peak > strongest:
            strongest = claim_peak

        if claim_peak > 0.4:
            findings.append({
                "claim": claim_text,
                "evidence": peak_text,
                "impact": round(claim_peak, 3)
            })

    # 4. Verdict: one strong hit or enough accumulated friction means contradiction
    prediction = 0 if (strongest > 1.4 or friction_sum > 2.5) else 1

    return prediction, {
        "label": prediction,
        "details": findings
    }

In [24]:
def evaluate_dataset(loader, tokenizer):
    """Run evaluate() on every sample of ``loader`` and collect the predictions.

    Labels are not read, so this also works for the unlabelled test split.

    Args:
        loader: Iterable of sample dicts with "id", "novel" and "backstory".
        tokenizer: Tokenizer forwarded to evaluate().

    Returns:
        List of ``{"id", "prediction", "explanation"}`` dicts in loader order.
    """
    results = []
    # One FAISS store per novel, shared across samples. evaluate() derives the
    # cache key itself, so no key is computed here.
    store_cache = {}

    for sample in loader:
        pred, explanation = evaluate(
            novel=sample["novel"],
            backstory=sample["backstory"],
            tokenizer=tokenizer,
            store_cache=store_cache
        )

        results.append({
            "id": sample["id"],
            "prediction": pred,
            "explanation": explanation
        })

    return results



In [25]:

def get_dataloader(csv_path, novels_dir, shuffle=False):
    """Return a DataLoader over a BackstoryDataset built from the given CSV and novels directory."""
    return DataLoader(BackstoryDataset(csv_path, novels_dir), shuffle=shuffle)


In [26]:
def build_novel_index(novel_text, tokenizer):
    """Chunk a novel, embed each chunk and return a populated FAISSStore.

    Every stored metadata dict carries "text", "embedding" and
    "position_ratio". The embedding is included because contradicts()
    reads it from the retrieved item and reports no contradiction when it
    is absent.
    """
    chunks = smart_chunk(novel_text, tokenizer)
    store = FAISSStore(dim=768)  # hard-coded; must equal the embedder's vector size

    for c in chunks:
        emb = get_embedding(c["text"])
        store.add(
            emb,
            {
                "text": c["text"],
                "embedding": emb,
                "position_ratio": c["position_ratio"]
            }
        )

    return store


In [27]:
def build_novel_store(novel_text, tokenizer):
    """
    Build FAISS index ONCE for a novel.

    The novel is chunked with smart_chunk(), each chunk is embedded with
    get_embedding() and stored with metadata keys "text", "embedding" and
    "position_ratio". The embedding is kept in the metadata because
    contradicts() reads it from the retrieved item and reports no
    contradiction when it is absent.
    """
    chunks = smart_chunk(novel_text, tokenizer)

    store = FAISSStore(dim=768)  # must match embedding model

    for chunk in chunks:
        emb = get_embedding(chunk["text"])
        store.add(
            emb,
            {
                "text": chunk["text"],
                "embedding": emb,
                "position_ratio": chunk["position_ratio"]
            }
        )

    return store


In [28]:
import pickle
import os

def load_store(book_name, index_dir="."):
    """
    Unpickle the prebuilt store that belongs to ``book_name``.

    ``book_name`` may carry a ".txt" suffix and surrounding whitespace; both
    are removed before the lookup.

    Raises:
        ValueError: If no index file is registered for the book.
        FileNotFoundError: If the registered file is missing in ``index_dir``.
    """
    # Known books and the file name of their index
    index_files = {
        "In Search of the Castaways": "castaways.index",
        "The Count of Monte Cristo": "montecristo.index"
    }

    # Normalize
    book_name = book_name.replace(".txt", "").strip()

    filename = index_files.get(book_name)
    if filename is None:
        raise ValueError(f"No index available for book: {book_name}")

    index_path = os.path.join(index_dir, filename)
    if not os.path.exists(index_path):
        raise FileNotFoundError(f"Index file not found: {index_path}")

    # NOTE(review): pickle.load can execute arbitrary code contained in the
    # file; only load index files you created yourself.
    with open(index_path, "rb") as handle:
        return pickle.load(handle)



In [29]:



# Evaluate test set



In [None]:
# ===============================
# Tokenizer
# ===============================

class SimpleWordTokenizer:
    """Whitespace tokenizer offering the encode/decode pair that smart_chunk() calls."""

    def encode(self, text):
        """Split ``text`` on runs of whitespace and return the list of words."""
        words = text.split()
        return words

    def decode(self, tokens):
        """Join ``tokens`` into one string separated by single spaces."""
        separator = " "
        return separator.join(tokens)


def get_tokenizer():
    """Return a new SimpleWordTokenizer instance."""
    tokenizer_instance = SimpleWordTokenizer()
    return tokenizer_instance


# ===============================
# Paths
# ===============================

# Directory holding one "<book_name>.txt" file per novel (hard-coded local path).
novels_base_dir = "/Users/Shreyanshsingh/Downloads/data/novels"


# ===============================
# Loaders
# ===============================
# IMPORTANT:
# get_dataloader() RETURNS A DataLoader (not a dataset)

# Labelled split; shuffled, so the sample order differs between passes.
train_loader = get_dataloader(
    "/Users/Shreyanshsingh/Downloads/train.csv",
    novels_base_dir,
    shuffle=True
)

# Test split (no label column in the preview above); kept in file order.
test_loader = get_dataloader(
    "/Users/Shreyanshsingh/Downloads/test.csv",
    novels_base_dir,
    shuffle=False
)

# Whitespace tokenizer used for chunking the novels.
tokenizer = get_tokenizer()


# ===============================
# Sanity check ONE sample
# ===============================

store_cache = {}  # required: evaluate() stores the per-novel FAISS store here

sample = next(iter(train_loader))  # take the first sample of a fresh (shuffled) pass

# Smoke test on a single sample before running the full loops
pred, expl = evaluate(
    novel=sample["novel"],
    backstory=sample["backstory"],
    tokenizer=tokenizer,
    store_cache=store_cache
)

print("Sample prediction:", pred)
print("Explanation:", expl)



# Score the whole labelled split. train() creates its own store cache, so the
# store built for the smoke test above is not reused.
train_acc, errors = train(train_loader, tokenizer)
print(f"Train Accuracy: {train_acc:.3f}")


# Predict on the test split and print one line per sample.
test_results = evaluate_dataset(test_loader, tokenizer)

print(f"Generated predictions for {len(test_results)} test samples")
for r in test_results:
    print(f"ID: {r['id']} → Prediction: {r['prediction']}")

import pandas as pd

# Write id / prediction / explanation rows to CSV and report the path that
# was actually written (the old message named a different file).
results_path = "/Users/Shreyanshsingh/Downloads/Final_Results_2.csv"
pd.DataFrame(test_results).to_csv(results_path, index=False)
print(f"Saved {results_path}")

Sample prediction: 1
Explanation: {'label': 1, 'details': []}

--- TRAINING FINISHED ---
Total Processed: 80
Final Accuracy: 0.662
Train Accuracy: 0.662


In [None]:
# Colab-only cell: turns on the custom (third-party) widget manager.
# NOTE(review): google.colab is presumably unavailable in the local macOS
# environment the cells above use — confirm where this cell is meant to run.
from google.colab import output
output.enable_custom_widget_manager()

Support for third party widgets will remain active for the duration of the session. To disable support:

In [None]:
# Colab-only cell: turns the custom widget manager off again.
from google.colab import output
output.disable_custom_widget_manager()