In [3]:
!pip install -q pathway sentence-transformers faiss-cpu pandas numpy tqdm ollama scikit-learn
print(1)

python(74553) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


1


In [None]:
# ============================================================
# KDSH 2026 — ULTIMATE ROBUST HYBRID PIPELINE
# Pathway + Dual-RAG + ML + CoT reasoning
# ============================================================

import os, re, glob, json
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
import faiss
import ollama
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# ============================================================
# CONFIG
# ============================================================
OLLAMA_MODEL = "phi3:latest" 
EMBED_MODEL = "all-MiniLM-L6-v2"
RAG_K = 15
MIN_SIM_FOR_LLM = 0.22  # Trigger reasoning for even moderate matches
MIN_PAR_LEN = 20

embedder = SentenceTransformer(EMBED_MODEL)

# ============================================================
# ADVANCED MAPPING & DOSSIER BUILDING
# ============================================================
def clean_text(t):
    t = str(t)
    t = t.replace("“", '"').replace("”", '"').replace("’", "'")
    return re.sub(r"\s+", " ", t).strip()

def canon_char(name): return re.sub(r"\s+", " ", str(name).lower().strip())
def canon_book(name): return name.lower().replace(".txt", "").strip()

def get_alias_regex(char):
    parts = char.split()
    aliases = {char}
    if len(parts) >= 2:
        aliases.add(parts[-1])
        for t in ["mr", "mrs", "miss", "sir", "ms", "count"]:
            aliases.add(f"{t} {parts[-1]}")
    return r"|".join([rf"\b{re.escape(a)}\b" for a in aliases])

# ============================================================
# DUAL-INDEX RAG (LOCAL + GLOBAL)
# ============================================================
FAISS_LOCAL = {}
FAISS_GLOBAL = {}

def build_indices(train_df, test_df):
    required_chars = defaultdict(set)
    for df in [train_df, test_df]:
        for r in df.itertuples():
            required_chars[canon_book(r.book_name)].add(canon_char(r.char))

    for path in glob.glob("*.txt"):
        book_key = canon_book(path)
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            text = f.read()
            if "*** START" in text: text = text.split("*** START", 1)[1]
            if "*** END" in text: text = text.split("*** END", 1)[0]
            
            paras = [clean_text(p) for p in text.split("\n\n") if len(p) > MIN_PAR_LEN]
            if not paras: continue
            
            # 1. Global Index (Safety Net)
            g_emb = embedder.encode(paras, normalize_embeddings=True)
            g_idx = faiss.IndexFlatIP(g_emb.shape[1])
            g_idx.add(g_emb)
            FAISS_GLOBAL[book_key] = (g_idx, paras)
            
            # 2. Local Mapping (Character specific with windows)
            book_chars = required_chars.get(book_key, set())
            for char in book_chars:
                regex = re.compile(get_alias_regex(char), re.I)
                char_paras = []
                for i, p in enumerate(paras):
                    if regex.search(p):
                        # Add window context (i-1, i, i+1)
                        window = " ".join(paras[max(0, i-1):min(len(paras), i+2)])
                        char_paras.append(window)
                
                if char_paras:
                    char_paras = list(set(char_paras))
                    l_emb = embedder.encode(char_paras, normalize_embeddings=True)
                    l_idx = faiss.IndexFlatIP(l_emb.shape[1])
                    l_idx.add(l_emb)
                    FAISS_LOCAL[f"{book_key}::{char}"] = (l_idx, char_paras)

def hybrid_retrieve(book, char, query, k=RAG_K):
    book, char = canon_book(book), canon_char(char)
    res = []
    
    # Check Local First
    local_key = f"{book}::{char}"
    if local_key in FAISS_LOCAL:
        idx, paras = FAISS_LOCAL[local_key]
        q_emb = embedder.encode([query], normalize_embeddings=True)
        s, ids = idx.search(q_emb, min(k, len(paras)))
        res = [(paras[i], float(s[0][j])) for j, i in enumerate(ids[0])]
    
    # If weak results, fall back to Global
    if not res or res[0][1] < 0.30:
        if book in FAISS_GLOBAL:
            idx, paras = FAISS_GLOBAL[book]
            q_emb = embedder.encode([query], normalize_embeddings=True)
            s, ids = idx.search(q_emb, k)
            res = list(set(res + [(paras[i], float(s[0][j])) for j, i in enumerate(ids[0])]))
            res = sorted(res, key=lambda x: x[1], reverse=True)[:k]
    return res

# ============================================================
# ML FEATURES
# ============================================================
def extract_features(ret, query):
    if not ret: return [0.0] * 6
    scores = [s for _, s in ret]
    return [max(scores), np.mean(scores), np.std(scores), np.percentile(scores, 75), len(ret), scores[0] - scores[-1]]

# ============================================================
# ROBUST CoT OLLAMA JUDGE
# ============================================================
def ollama_judge_robust(book, query, evidence):
    ev_text = "\n".join([f"SCENE {i}: {p}" for i, (p, _) in enumerate(evidence[:6])])
    
    prompt = f"""
Act as a Senior Continuity Editor. Your task is to find FACTUAL CONTRADICTIONS.

CLAIM TO TEST: "{query}"
BOOK CONTEXT: {book}

EVIDENCE FROM TEXT:
{ev_text}

THOUGHT PROCESS:
1. What is the specific fact asserted in the CLAIM? (e.g., Character X is dead, or Character Y is at location Z)
2. Does the EVIDENCE mention this fact?
3. Is there a direct contradiction? (Example: Claim says "Married", Evidence says "Single"). 
4. Note: If the evidence is just about a different topic, it is CONSISTENT (1).

Return JSON ONLY:
{{
  "analysis": "Identify specific scenes that contradict or support.",
  "verdict": 0 or 1
}}
0 = CONTRADICT, 1 = CONSISTENT
"""
    try:
        r = ollama.chat(model=OLLAMA_MODEL, messages=[{"role": "user", "content": prompt}], options={"temperature": 0})
        content = r["message"]["content"]
        match = re.search(r"\{.*\}", content, re.DOTALL)
        if match:
            res = json.loads(match.group(0))
            return int(res.get("verdict", 1))
    except:
        pass
    return 1 # Default to Consistent (Safe bet in literature)

# ============================================================
# EXECUTION PIPELINE
# ============================================================
def main():
    # 1. Load Data
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")
    
    def prep(df):
        df["Full_Query"] = df.apply(lambda r: clean_text(f"{r['caption']} : {r['content']}") if pd.notna(r['caption']) else clean_text(r['content']), axis=1)
        if "label" in df.columns:
            df["label"] = df["label"].str.lower().map({"consistent": 1, "contradict": 0})
        return df

    train_df = prep(train_df)
    test_df = prep(test_df)

    # 2. Indexing
    print("Building Global and Local Indices...")
    build_indices(train_df, test_df)

    # 3. ML Training
    print("Extracting Training Features...")
    X = [extract_features(hybrid_retrieve(r.book_name, r.char, r.Full_Query), r.Full_Query) for r in train_df.itertuples()]
    y = train_df["label"].values
    X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.15, stratify=y, random_state=42)
    
    rf = RandomForestClassifier(n_estimators=1000, max_depth=12, class_weight="balanced", random_state=42).fit(X_tr, y_tr)

    # 4. Hybrid Inference
    def run_inference(df_subset):
        results = []
        for r in tqdm(df_subset.itertuples(), total=len(df_subset)):
            ret = hybrid_retrieve(r.book_name, r.char, r.Full_Query)
            feats = extract_features(ret, r.Full_Query)
            ml_prob = rf.predict_proba([feats])[0][1] # Prob(Consistent)
            
            # Logic: If evidence found, let CoT verify.
            if ret and feats[0] > MIN_SIM_FOR_LLM:
                llm_verdict = ollama_judge_robust(r.book_name, r.Full_Query, ret)
                
                # Confidence-Weighted Verdict
                if llm_verdict == 0:
                    final = 0 if ml_prob < 0.75 else 1 # High bar for LLM contradiction
                else:
                    final = 1 if ml_prob > 0.30 else 0
            else:
                final = 1 if ml_prob > 0.45 else 0
            results.append(final)
        return results

    # 5. Validation
    val_indices = train_df.index[len(X_tr):]
    val_preds = run_inference(train_df.iloc[val_indices])
    acc = (np.array(val_preds) == y_val).mean()
    print(f"\nFINAL SYSTEM ACCURACY: {acc:.4f}")

    # 6. Test/Submission
    print("Generating Submission...")
    test_preds = run_inference(test_df)
    test_df["prediction"] = ["consistent" if p == 1 else "contradict" for p in test_preds]
    test_df[["id", "prediction"]].to_csv("submission.csv", index=False)

if __name__ == "__main__":
    main()

Building Global and Local Indices...
Extracting Training Features...


100%|███████████████████████████████████████████| 12/12 [11:08<00:00, 55.71s/it]



FINAL SYSTEM ACCURACY: 0.7500
Generating Submission...


 40%|██████████████▍                     | 24/60 [2:03:02<18:31:12, 1852.00s/it]