In [None]:
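# Optional cleanup before re-running this cell: uncomment the lines below to
# free ports 5000/5050 and stop stray flask/ngrok processes left by a previous run.
#!kill -9 $(lsof -t -i:5000) 2>/dev/null
#!kill -9 $(lsof -t -i:5050) 2>/dev/null
#!pkill -f flask 2>/dev/null || true
#!pkill -f ngrok 2>/dev/null || true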

!pip install -q flask flask-cors pyngrok transformers accelerate torch rapidfuzz spacy spacy-llm beautifulsoup4 requests networkx ipysigma python-louvain pandas scikit-learn

import torch, json, re, gc, time
import os, tempfile
import pathlib
import networkx as nx
from ipysigma import Sigma
import community.community_louvain as community_louvain
import requests
from bs4 import BeautifulSoup
from flask import Flask, request, jsonify
from flask_cors import CORS
from pyngrok import ngrok
from threading import Thread
from transformers import AutoTokenizer, AutoModelForCausalLM
from google.colab import userdata
from rapidfuzz import fuzz, process
import spacy
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Whitelist of known actors, grouped by category name.
# The flattened list built from these values is what `role` values in
# extracted triplets are matched against (exactly or fuzzily).
actor_lists = {
    # The EIT itself and its Knowledge and Innovation Communities.
    "EIT Organizations": [
        "European Institute of Innovation and Technology (EIT)",
        "EIT Food", "EIT Health", "EIT InnoEnergy", "EIT Digital",
        "EIT RawMaterials", "EIT Manufacturing", "EIT Urban Mobility",
        "EIT Climate-KIC", "EIT Culture & Creativity"
    ],
    # Academic and research partners (includes university hospitals).
    "Universities / Research Institutes": [
        "KU Leuven", "KTH Royal Institute of Technology", "Universitat Politècnica de Catalunya",
        "Eindhoven University of Technology (TU/e)", "Instituto Superior Técnico",
        "ESADE Business School", "Politecnico di Torino", "École Polytechnique",
        "Aalto University", "Université Paris Sciences et Lettres (PSL)", "Université Paris-Saclay",
        "Grenoble INP Institute of Technology", "École des Ponts ParisTech (ENPC)",
        "AGH University of Science and Technology", "Imperial College London",
        "University of Oxford", "University of Cambridge", "IESE Business School",
        "IMIM – Hospital del Mar Medical Research Institute",
        "Delft University of Technology", "University of Porto",
        "University of Debrecen", "University of Luxembourg",
        "Leitat Technology Center", "Fraunhofer Institute",
        "Technical University of Munich", "RWTH Aachen University",
        "Czech Technical University in Prague",
        "Uppsala University", "Tartu University Hospital",
        "North Lisbon University Hospital Centre", "Hospital Clínic",
        "University of Maastricht", "Eötvös Lorand University", "KTH Institute",
        "University of Pécs"
    ],
    # Large established companies.
    "Companies / Corporates": [
        "Schneider Electric", "Siemens Healthineers", "Roche", "Bayer", "Sanofi",
        "Johnson & Johnson", "IBM", "Philips", "Bosch", "Microsoft", "GE Healthcare",
        "URGO Group", "Ferrer", "Matmut", "ABB", "ENGIE", "Airbus", "ArcelorMittal",
        "Veolia", "TotalEnergies", "Nestlé", "Danone", "PepsiCo", "Unilever",
        "Shell", "Vattenfall", "Iberdrola", "Enel", "Equinor", "Nokia", "Ericsson"
    ],
    # Startups and small/medium enterprises.
    "Startups / SMEs": [
        "iLoF", "Sleepiz", "Optellum", "Idoven", "PIPRA", "Antegenes",
        "Clinomic", "Unhindr", "Leuko", "Ochre Bio",
        "Hearts Radiant", "Allelica", "SolasCure", "Peptomyc",
        "Oxford Endovascular", "Tubulis", "SideROS", "Emperra",
        "FasTeesH", "MEDIKURA", "SpinDiag", "Selio Medical",
        "Damibu", "Telomium", "Tracegrow", "Entremo",
        "Recycleye", "InnoTractor", "LMAD", "OvaExpert",
        "Feno", "Ganymed Robotics",
        "ABLE Human Motion", "FLOWTION", "SeizeIT", "NanoRacks",
        "AMEN New Technologies", "CroíValve"
    ],
    # EU institutions, ministries and other public bodies.
    "Government / Public Sector": [
        "European Commission", "European Union", "European Parliament",
        "European Investment Fund (EIF)", "European Investment Bank (EIB)",
        "National Health Service (UK)", "Ministry of Human Capacities (Hungary)",
        "German Federal Ministry of Education and Research", "City of Debrecen",
        "Spanish Ministry of Science and Innovation",
        "French Ministry of Higher Education and Research",
        "Italian Ministry for Economic Development",
        "Polish Ministry of Climate and Environment",
        "European Council", "European Court of Auditors"
    ],
    # Alliances, programmes, consortia and foundations.
    "Networks / Consortia / Foundations": [
        "European Battery Alliance", "European Youth Energy Network",
        "CommUnity+", "Foundation for Management and Industrial Research (MIR)",
        "Enterprise Europe Network", "HealthTech For Care",
        "Venture Centre of Excellence (VCoE)",
        "BioMed Alliance", "WE Health Consortium", "CLOSE Consortium",
        "InnoEnergy Alliance", "RawMaterials Academy", "Urban Mobility Academy",
        "Climate-KIC Alumni Association", "EIT Digital Alumni", "RIS Hub Network",
        "Regional Innovation Scheme (RIS)", "EIT Community Booster",
        "Supernovas Programme", "Fondation de l'Avenir",
        "Polish Medical Mission", "BRIGHT Project Innovation Team"
    ],
    # Funds, accelerators and public funding agencies.
    "Investors / Funding Bodies": [
        "Santander InnoEnergy Climate Fund", "EBA Strategic Battery Materials Fund",
        "Aescuvest Crowdfunding Platform", "Zafir Capital", "Alta Life Sciences",
        "YES!Delft", "Startup Wise Guys", "Speedinvest", "Seedcamp",
        "EASME – Executive Agency for SMEs", "Horizon Europe",
        "European Innovation Council (EIC)", "Business Finland",
        "Vinnova", "CDTI Spain", "Bpifrance"
    ]
}

# Flatten every category into one list, keeping category and in-category order.
ALL_ACTORS = [
    actor_name
    for group_members in actor_lists.values()
    for actor_name in group_members
]

# Case-insensitive lookup table: lowercased name -> canonical spelling.
ACTOR_LOWER_MAP = dict(zip((entry.lower() for entry in ALL_ACTORS), ALL_ACTORS))

print(f"Loaded {len(ALL_ACTORS)} actors\n")

# LOAD SPACY
# Load the small English pipeline. If the model package is not installed,
# download it once and load it again.
print("Loading SpaCy English model...")
try:
    nlp_spacy = spacy.load("en_core_web_sm")
    print(" SpaCy loaded successfully")
except OSError:
    # spacy.load raises OSError when the model cannot be found; only that case
    # should trigger a download (a bare `except:` would also hide unrelated
    # failures and even KeyboardInterrupt).
    print("Downloading SpaCy English model...")
    import subprocess
    import sys
    # Use the interpreter running this code so the model lands in the same
    # environment, and fail loudly if the download itself fails.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp_spacy = spacy.load("en_core_web_sm")
    print(" SpaCy downloaded and loaded")

# LOAD MISTRAL
# Tokenizer and model come from the same checkpoint; the model is loaded in
# half precision with automatic device placement.
print("\nLoading Mistral 7B...")
_mistral_checkpoint = "mistralai/Mistral-7B-Instruct-v0.3"

mistral_tokenizer = AutoTokenizer.from_pretrained(_mistral_checkpoint)
# Reuse the end-of-sequence token as the padding token.
mistral_tokenizer.pad_token = mistral_tokenizer.eos_token

_mistral_load_options = {
    "torch_dtype": torch.float16,
    "device_map": "auto",
    "low_cpu_mem_usage": True,
}
mistral_model = AutoModelForCausalLM.from_pretrained(
    _mistral_checkpoint, **_mistral_load_options
)
print(" Mistral loaded\n")

# HELPER FUNCTIONS
def fuzzy_match_actor(name, threshold=60):
    """Map a free-text organisation name onto a canonical actor name.

    An exact case-insensitive match against ``ACTOR_LOWER_MAP`` is tried
    first; otherwise the best ``token_sort_ratio`` match in ``ALL_ACTORS``
    is returned if it scores at least ``threshold``.

    Args:
        name: Candidate organisation name (typically produced by the LLM).
        threshold: Minimum fuzzy score (0-100) required to accept a match.

    Returns:
        The canonical actor name, or ``None`` when ``name`` is not a usable
        string or nothing scores at or above ``threshold``.
    """
    if not isinstance(name, str):
        return None

    # Normalise once so the exact and the fuzzy lookup see the same query,
    # and so whitespace-only input is rejected by the length check.
    query = name.strip().lower()
    if len(query) < 2:
        return None

    exact = ACTOR_LOWER_MAP.get(query)
    if exact is not None:
        return exact

    # `processor=str.lower` makes the comparison case-insensitive on the
    # choices as well; the returned choice is still the canonical spelling.
    result = process.extractOne(
        query,
        ALL_ACTORS,
        scorer=fuzz.token_sort_ratio,
        processor=str.lower,
        score_cutoff=threshold
    )

    return result[0] if result else None

def validate_triplet(triplet):
    """Check one extracted triplet and normalise it in place.

    Args:
        triplet: Dict with (at least) ``role``, ``practice`` and
            ``counterrole`` entries, as produced by the extraction step.

    Returns:
        ``(triplet, "valid")`` when the triplet passes every check; ``role``
        is replaced by the matched canonical actor name and ``practice`` /
        ``counterrole`` are stored stripped. Otherwise ``(None, reason)``
        where ``reason`` is a short human-readable rejection cause.
    """
    role_raw = triplet.get("role", "")
    counterrole_raw = triplet.get("counterrole", "")
    practice = triplet.get("practice", "")

    if not role_raw or not counterrole_raw or not practice:
        return None, "Missing fields"

    # LLM output is not guaranteed to use strings (lists or numbers happen);
    # reject those instead of crashing on .strip().
    if not all(isinstance(value, str) for value in (role_raw, counterrole_raw, practice)):
        return None, "Invalid field type"

    role_raw = role_raw.strip()
    counterrole_raw = counterrole_raw.strip()
    practice = practice.strip()

    if len(role_raw) < 2 or len(counterrole_raw) < 3:
        return None, "Too short"

    # Every entry must be lowercase: the comparison below is done on the
    # lowercased counterrole (a capitalised entry would never match).
    generic_terms = {
        "us", "we", "you", "they", "them", "it", "this", "that",
        "people", "businesses", "companies", "organizations", "partners",
        "stakeholders", "members", "community", "projects", "programmes",
        "opportunities", "solutions", "services", "products",
        "europe", "innovation", "development", "healthcare", "health"
    }

    if counterrole_raw.lower() in generic_terms:
        return None, "Generic counterrole"

    if len(counterrole_raw) > 100:
        return None, "Too long"

    vague_practices = {"has", "is", "does", "makes", "gets", "uses"}
    if practice.lower() in vague_practices:
        return None, "Vague practice"

    # The role must resolve to one of the known actors.
    role_matched = fuzzy_match_actor(role_raw, threshold=60)

    if not role_matched:
        return None, "Role not in list"

    triplet["role"] = role_matched
    triplet["practice"] = practice
    triplet["counterrole"] = counterrole_raw

    return triplet, "valid"

def chunk_text(text, chunk_size=1500):
    """Split ``text`` into sentence-aligned chunks of roughly ``chunk_size`` characters.

    Sentences are detected by whitespace following ``.``, ``!`` or ``?`` and
    are never split, so a single sentence longer than ``chunk_size`` becomes
    its own oversized chunk. Returns a list of stripped, non-empty strings.
    """
    pieces = []
    pending = []      # sentences (each with a trailing space) of the open chunk
    pending_len = 0   # total length of everything in `pending`

    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if pending_len + len(sentence) > chunk_size:
            # The sentence does not fit: close the open chunk and start anew.
            closed = "".join(pending).strip()
            if closed:
                pieces.append(closed)
            pending = []
            pending_len = 0
        pending.append(sentence + " ")
        pending_len += len(sentence) + 1

    tail = "".join(pending).strip()
    if tail:
        pieces.append(tail)

    return pieces

def make_base_prompt(text_chunk, user_prompt=""):
    """Build the extraction prompt for one text chunk.

    The prompt lists the first 30 entries of ``ALL_ACTORS`` as allowed roles,
    states the extraction rules, optionally appends the caller's extra
    instructions, and ends with the chunk followed by a ``JSON:`` cue.
    """
    shown_actors = "\n".join(f"  • {entry}" for entry in ALL_ACTORS[:30])
    hidden_count = len(ALL_ACTORS)-30

    sections = [f"""Extract organizational relationships as JSON.

CONSTRAINT: 'role' MUST be from this list:
{shown_actors}
  • ... and {hidden_count} more actors

RULES:
1. role: Organization taking action (from list above)
2. practice: Specific action verb (e.g., "fund", "partner with", "support")
3. counterrole: Specific named entity (not generic terms)
4. context: Exact sentence

Output ONLY valid JSON array."""]

    # Extra instructions are included only when they contain non-whitespace text.
    extra = user_prompt.strip() if user_prompt else ""
    if extra:
        sections.append(f"\n\nADDITIONAL INSTRUCTIONS:\n{extra}")

    sections.append(f"\n\nTEXT:\n{text_chunk}\n\nJSON:")

    return "".join(sections)

# EXTRACTION FUNCTIONS
def _parse_triplet_json(raw_output):
    """Extract a list of JSON objects from raw LLM output.

    Handles Markdown code fences, a leading ``JSON:`` / ``OUTPUT:`` label and
    surrounding chatter. A JSON array is tried first, then a single JSON
    object (wrapped into a one-element list). Non-dict array items are
    dropped. Returns ``[]`` when nothing parseable is found.
    """
    cleaned = raw_output.strip()

    if "```json" in cleaned:
        cleaned = cleaned.split("```json")[1].split("```")[0].strip()
    elif "```" in cleaned:
        cleaned = cleaned.split("```")[1].split("```")[0].strip()

    for prefix in ("JSON:", "OUTPUT:"):
        if cleaned.upper().startswith(prefix):
            cleaned = cleaned[len(prefix):].strip()

    # Try the outermost [...] span first, then the outermost {...} span.
    # Each attempt slices the same string with its own indices.
    for opener, closer in (("[", "]"), ("{", "}")):
        start, end = cleaned.find(opener), cleaned.rfind(closer)
        if start == -1 or end < start:
            continue
        try:
            parsed = json.loads(cleaned[start:end + 1])
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            parsed = [parsed]
        return [item for item in parsed if isinstance(item, dict)]

    return []


def _generate_completion(prompt, default_confidence):
    """Run Mistral on ``prompt`` and return ``(completion_text, avg_confidence)``.

    ``avg_confidence`` is the mean, over generated steps, of the highest
    softmax probability in that step's scores; ``default_confidence`` is used
    when no step scores are available.
    """
    inputs = mistral_tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=3000
    ).to(mistral_model.device)

    with torch.no_grad():
        outputs = mistral_model.generate(
            **inputs,
            max_new_tokens=700,
            temperature=0.3,
            do_sample=True,
            pad_token_id=mistral_tokenizer.eos_token_id,
            return_dict_in_generate=True,
            output_scores=True
        )

    # The returned sequence starts with the prompt tokens. Slice them off by
    # length instead of string-replacing the prompt, which silently fails when
    # decoding does not reproduce the prompt text exactly (or it was truncated).
    prompt_length = inputs["input_ids"].shape[1]
    completion = mistral_tokenizer.decode(
        outputs.sequences[0][prompt_length:],
        skip_special_tokens=True
    ).strip()

    token_confidences = [
        torch.max(torch.nn.functional.softmax(step_scores[0], dim=-1)).item()
        for step_scores in outputs.scores
    ]
    if token_confidences:
        avg_confidence = sum(token_confidences) / len(token_confidences)
    else:
        avg_confidence = default_confidence

    return completion, avg_confidence


def _validate_candidates(candidates, confidence):
    """Attach ``model_confidence`` to each candidate and keep the valid ones."""
    accepted = []
    for candidate in candidates:
        triplet = dict(candidate)
        triplet["model_confidence"] = round(confidence, 3)
        for field in ("role", "practice", "counterrole", "context"):
            triplet.setdefault(field, "")

        # Skip malformed items rather than letting one bad field discard
        # every triplet extracted from this chunk.
        if not all(isinstance(triplet[field], str) for field in ("role", "practice", "counterrole")):
            continue

        validated, _reason = validate_triplet(triplet)
        if validated:
            accepted.append(validated)

    return accepted


def extract_with_mistral(text, user_prompt):
    """Extract validated triplets from ``text`` using the plain Mistral prompt.

    Args:
        text: Text chunk to analyse.
        user_prompt: Optional extra instructions appended to the base prompt.

    Returns:
        List of validated triplet dicts (possibly empty). Any error is logged
        and results in an empty list.
    """
    try:
        prompt = make_base_prompt(text, user_prompt)
        completion, confidence = _generate_completion(prompt, default_confidence=0.5)
        return _validate_candidates(_parse_triplet_json(completion), confidence)

    except Exception as e:
        # Best effort per chunk, but report the failure instead of hiding it.
        print(f"Mistral error: {e}")
        return []
    finally:
        torch.cuda.empty_cache()
        gc.collect()


def extract_with_spacy_llm(text, user_prompt):
    """Extract validated triplets using spaCy hints plus Mistral.

    spaCy supplies named entities and verb lemmas, and known actors are
    detected by substring search; these hints are embedded in the prompt sent
    to Mistral. ``user_prompt`` is accepted for interface parity with
    ``extract_with_mistral`` but is not used in this prompt.

    Returns:
        List of validated triplet dicts (possibly empty). Any error is logged
        and results in an empty list.
    """
    try:
        doc = nlp_spacy(text[:5000])
        entities = [(ent.text, ent.label_) for ent in doc.ents]

        text_lower = text.lower()
        potential_roles = [actor for actor in ALL_ACTORS if actor.lower() in text_lower]

        # Fall back to a fixed set of roles when no known actor is mentioned.
        if not potential_roles:
            potential_roles = ["EIT Health", "EIT Food", "EIT InnoEnergy"]

        verbs = [token.lemma_ for token in doc if token.pos_ == "VERB"][:20]

        organizations_hint = ', '.join(potential_roles[:10])
        entities_hint = ', '.join(f"{ent_text} ({ent_label})" for ent_text, ent_label in entities[:15])
        verbs_hint = ', '.join(verbs[:15])

        spacy_enhanced_prompt = f"""Extract organizational relationships using NLP analysis:

DETECTED ORGANIZATIONS: {organizations_hint}
NAMED ENTITIES: {entities_hint}
KEY VERBS: {verbs_hint}

Task: Extract triplets (role → practice → counterrole)
- role: MUST be from detected organizations above
- practice: Action verb (preferably from key verbs)
- counterrole: Specific named entity (from detected entities or text)
- context: Exact sentence from text

STRICT RULES:
1. role must match detected organizations
2. counterrole must be specific (NOT generic terms like "partners", "stakeholders")
3. practice must be a clear action

Text to analyze:
{text[:2000]}

Output ONLY JSON array:
[{{"role": "...", "practice": "...", "counterrole": "...", "context": "..."}}]

JSON:"""

        completion, confidence = _generate_completion(
            spacy_enhanced_prompt, default_confidence=0.75
        )
        return _validate_candidates(_parse_triplet_json(completion), confidence)

    except Exception as e:
        print(f"SpaCy-LLM error: {e}")
        return []
    finally:
        torch.cuda.empty_cache()
        gc.collect()

# FLASK APP
# Single application object; the route handlers below register on it.
app = Flask(__name__)
# Enable CORS for the app so a browser UI served from another origin can call
# the API. NOTE(review): no origin restriction is configured here — confirm
# that is intended before exposing the tunnel publicly.
CORS(app)

@app.route("/extract_triplets", methods=["POST"])
def extract_endpoint():
    """Extract triplets from posted text, chunk by chunk.

    Expected JSON body:
        text (str, required): Text to analyse.
        model (str): ``"Mistral 7B"`` (default) or ``"SpacyLLM"``; unknown
            values fall back to Mistral.
        user_prompt (str): Extra instructions for the prompt.
        max_triplets (int): Stop once this many triplets were collected;
            missing, ``null`` or ``0`` means unlimited.

    Returns:
        JSON with the chunk count, the formatted triplets and the model used,
        or a 400 response with an ``error`` message for invalid input.
    """
    CHUNK_SIZE = 1500

    # silent=True turns an unparsable body into None so it can be reported as
    # a JSON error like every other validation failure.
    data = request.get_json(force=True, silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400

    text = data.get("text") or ""
    if not isinstance(text, str) or not text.strip():
        return jsonify({"error": "No text provided"}), 400

    model_choice = str(data.get("model") or "Mistral 7B").strip()

    user_prompt = data.get("user_prompt") or ""
    if not isinstance(user_prompt, str):
        user_prompt = str(user_prompt)

    # Falsy values (missing, null, 0, "") mean "no limit"; anything else must
    # be a positive integer, otherwise the length comparisons below would fail.
    max_triplets = None
    raw_limit = data.get("max_triplets")
    if raw_limit:
        try:
            max_triplets = int(raw_limit)
        except (TypeError, ValueError):
            return jsonify({"error": "max_triplets must be an integer"}), 400
        if max_triplets <= 0:
            return jsonify({"error": "max_triplets must be positive"}), 400

    model_map = {
        "Mistral 7B": "mistral",
        "SpacyLLM": "spacy-llm"
    }

    model_key = model_map.get(model_choice, "mistral")

    model_names = {
        "mistral": "Mistral 7B",
        "spacy-llm": "SpaCy-LLM (SpaCy NER + Mistral)"
    }

    print(f"\n{'='*70}")
    print(f"EXTRACTION STARTED")
    print(f"{'='*70}")
    print(f"  Text: {len(text):,} characters")
    print(f"  Model: {model_names[model_key]}")
    print(f"  User prompt: {'Yes' if user_prompt else 'No'}")
    print(f"  Max triplets: {max_triplets if max_triplets else 'Unlimited'}")
    print(f"{'='*70}")

    chunks = chunk_text(text, chunk_size=CHUNK_SIZE)
    total_chunks = len(chunks)

    print(f"Total chunks: {total_chunks}")
    print(f"Estimated time: {total_chunks*2/60:.1f} minutes")
    print(f"{'='*70}\n")

    all_triplets = []

    # Character offsets used only to label log output.
    # NOTE(review): these look like section boundaries of one specific
    # concatenated corpus — confirm before relying on the labels.
    health_end = 111637
    food_end = health_end + 122735

    # Approximate offset of the current chunk in the input: running total of
    # chunk lengths (chunks are usually shorter than CHUNK_SIZE).
    char_pos = 0

    for i, chunk in enumerate(chunks):
        if max_triplets and len(all_triplets) >= max_triplets:
            print(f"\n Reached max triplets limit: {max_triplets}")
            print(f"  Stopping at chunk {i+1}/{total_chunks}")
            break

        if char_pos < health_end:
            current_org = "EIT Health"
        elif char_pos < food_end:
            current_org = "EIT Food"
        else:
            current_org = "EIT InnoEnergy"
        char_pos += len(chunk) + 1

        print(f"\n{'='*70}")
        print(f"CHUNK {i+1}/{total_chunks} | {current_org}")
        print(f"MODEL: {model_names[model_key]}")
        print(f"{'='*70}")

        try:
            if model_key == "spacy-llm":
                triples = extract_with_spacy_llm(chunk, user_prompt)
            else:
                triples = extract_with_mistral(chunk, user_prompt)

            if triples:
                for triple in triples:
                    if max_triplets and len(all_triplets) >= max_triplets:
                        print(f"\n Reached limit during chunk processing")
                        break

                    all_triplets.append(triple)

                    role = triple.get('role', 'Unknown')
                    practice = triple.get('practice', 'Unknown')
                    counterrole = triple.get('counterrole', 'Unknown')

                    print(f"  {role} → {practice} → {counterrole}")

                if max_triplets and len(all_triplets) >= max_triplets:
                    print(f"\n Stopping: reached {max_triplets} triplets")
                    break

                print(f"\n Total so far: {len(all_triplets)} triplets")
            else:
                print(f"  No valid triplets found")
                print(f" Total so far: {len(all_triplets)} triplets")

            # Short pause between chunks.
            time.sleep(0.3)

        except Exception as e:
            # One failing chunk must not abort the whole request.
            print(f" Error: {str(e)[:100]}")
            continue

    print(f"\n{'='*70}")
    print(f" EXTRACTION COMPLETE")
    print(f"{'='*70}")
    print(f"  Model: {model_names[model_key]}")
    print(f"  Total triplets: {len(all_triplets)}")
    print(f"{'='*70}\n")

    # Shape the triplets the way the UI consumes them; `validated` starts
    # as None.
    formatted_triplets = [
        {
            "id": idx + 1,
            "text": triple.get("context", "No context"),
            "community": "EIT Community",
            "extracted": {
                "role": triple.get("role", "Unknown"),
                "practice": triple.get("practice", "Unknown"),
                "counterrole": triple.get("counterrole", "Unknown")
            },
            "confidence": triple.get("model_confidence", 0.5),
            "validated": None
        }
        for idx, triple in enumerate(all_triplets)
    ]

    return jsonify({
        "total_chunks": total_chunks,
        "total_triplets": len(formatted_triplets),
        "triplets": formatted_triplets,
        "model_used": model_names[model_key],
        "status": "success"
    })

@app.route("/scrape_url", methods=["POST"])
def scrape_endpoint():
    """Fetch a web page and return its main text, limited to 1500 words.

    Expected JSON body: ``{"url": "<http(s) URL>"}``.

    Headings, paragraphs, list items and small text-only divs are collected
    from the main content area in document order, whitespace-normalised and
    de-duplicated. Returns JSON with ``text``, ``source`` and ``stats``; 400
    for invalid input, 500 when fetching or parsing fails.
    """
    from urllib.parse import urlparse

    data = request.get_json(force=True, silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400

    url = data.get("url") or ""
    if not isinstance(url, str) or not url.strip():
        return jsonify({"error": "No URL provided"}), 400
    url = url.strip()

    # Reject anything that is not a plain web URL before making a request.
    # NOTE(review): this endpoint still fetches arbitrary user-supplied hosts
    # (SSRF risk) — restrict targets if the tunnel is shared beyond trusted users.
    try:
        parsed_url = urlparse(url)
    except ValueError:
        return jsonify({"error": "Invalid URL"}), 400
    if parsed_url.scheme not in ("http", "https") or not parsed_url.netloc:
        return jsonify({"error": "URL must start with http:// or https://"}), 400

    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Remove unwanted elements
        for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'iframe', 'form', 'button']):
            element.decompose()

        # Target main content area
        main_content = (
            soup.find('main') or
            soup.find('article') or
            soup.find(class_=['content', 'main-content', 'article-body', 'post-content']) or
            soup.find(id=['content', 'main', 'article']) or
            soup.body
        )

        if not main_content:
            return jsonify({"error": "Could not find main content"}), 400

        # Text must be strictly longer than this many characters to be kept.
        min_length = {
            'h1': 10, 'h2': 10, 'h3': 10, 'h4': 10, 'h5': 10, 'h6': 10,
            'p': 20,
            'li': 15,
        }

        # One pass in document order, so headings stay next to the paragraphs
        # they introduce instead of being grouped by tag type.
        text_blocks = []
        for element in main_content.find_all(list(min_length) + ['div']):
            text = element.get_text().strip()
            if element.name == 'div':
                # Only small, mostly text-only divs; larger ones are containers
                # whose children are picked up individually.
                if len(element.find_all()) >= 3 or not (30 < len(text) < 500):
                    continue
            elif len(text) <= min_length[element.name]:
                continue
            text_blocks.append(text)

        # Remove duplicates while preserving order
        seen = set()
        unique_texts = []

        for text in text_blocks:
            normalized = ' '.join(text.split())

            if normalized and normalized not in seen:
                seen.add(normalized)
                unique_texts.append(normalized)

        # Join all text
        full_text = "\n\n".join(unique_texts)

        # Limit to 1500 WORDS
        MAX_WORDS = 1500
        words = full_text.split()
        truncated = len(words) > MAX_WORDS

        if truncated:
            cleaned_text = ' '.join(words[:MAX_WORDS])

            # Prefer ending on the last sentence terminator, but only when it
            # lies within the final 100 characters; otherwise mark the cut.
            cut_point = max(
                cleaned_text.rfind('.'),
                cleaned_text.rfind('!'),
                cleaned_text.rfind('?'),
            )

            if cut_point > len(cleaned_text) - 100:
                cleaned_text = cleaned_text[:cut_point + 1]
            else:
                cleaned_text = cleaned_text + "..."
        else:
            cleaned_text = full_text

        word_count = len(cleaned_text.split())

        print(f"\n{'='*70}")
        print(f"✓ WEB SCRAPING COMPLETE")
        print(f"{'='*70}")
        print(f"  URL: {url}")
        print(f"  Text blocks extracted: {len(unique_texts)}")
        print(f"  Total words (raw): {len(words)}")
        print(f"  Words returned: {word_count}")
        print(f"  Characters: {len(cleaned_text)}")
        print(f"  Truncated: {'Yes' if truncated else 'No'}")
        print(f"{'='*70}\n")

        return jsonify({
            "text": cleaned_text,
            "source": url,
            "stats": {
                "total_blocks": len(unique_texts),
                "total_words": len(words),
                "returned_words": word_count,
                "returned_chars": len(cleaned_text),
                "truncated": truncated
            }
        })

    except Exception as e:
        return jsonify({"error": str(e)}), 500

def _pick_strongest_edges(rows, cols, weights, threshold, max_edges):
    """Return up to ``max_edges`` ``(i, j, weight)`` tuples with weight >= threshold.

    ``rows``/``cols``/``weights`` are parallel arrays describing node pairs.
    The result is ordered by descending weight; ties keep their input order.
    """
    keep = np.flatnonzero(weights >= threshold)
    ranked = keep[np.argsort(-weights[keep], kind="stable")[:max_edges]]
    return [(int(rows[k]), int(cols[k]), float(weights[k])) for k in ranked]


@app.route("/render_network", methods=["POST"])
def render_network_endpoint():
    """Build a similarity network from the embeddings CSV and render it as HTML.

    Reads ``/content/gt_graph_embedded.csv`` (columns ``embedding`` holding a
    JSON list per row, and ``category_groups_list``), links rows whose cosine
    similarity reaches the 95th percentile (90th if that gives fewer than 100
    edges), keeps at most 10000 edges, detects Louvain communities and writes
    an ipysigma HTML view.

    Returns:
        JSON with the HTML and graph statistics; 400 when the CSV is missing,
        500 with the traceback on any other failure.
    """
    csv_path = "/content/gt_graph_embedded.csv"

    if not os.path.exists(csv_path):
        return jsonify({"error": f"Upload gt_graph_embedded.csv to /content/ first"}), 400

    try:
        print(f"\n{'='*70}")
        print("BUILDING NETWORK FROM CSV")
        print(f"{'='*70}")

        df = pd.read_csv(csv_path)
        print(f" Loaded: {len(df)} rows, {len(df.columns)} columns")

        # Parse embeddings from JSON column
        embeddings = np.vstack(df['embedding'].apply(json.loads).tolist())
        print(f" Embeddings shape: {embeddings.shape}")

        # Calculate similarity
        print(" Calculating similarity matrix...")
        sim = cosine_similarity(embeddings)
        print(f" Similarity range: {sim.min():.3f} to {sim.max():.3f}")

        # All unordered node pairs (upper triangle without the diagonal).
        pair_rows, pair_cols = np.triu_indices_from(sim, k=1)
        pair_weights = sim[pair_rows, pair_cols]

        def percentile_threshold(pct):
            # With fewer than two rows there are no pairs to take a
            # percentile of; 1.0 simply yields an edgeless graph.
            if pair_weights.size == 0:
                return 1.0
            return float(np.percentile(pair_weights, pct))

        # HIGHER threshold to reduce edges
        SIM_THRESHOLD = percentile_threshold(95)
        print(f" Threshold: {SIM_THRESHOLD:.3f} (95th percentile)")

        # Build graph
        G = nx.Graph()

        # Nodes are keyed by row position so they line up with the rows and
        # columns of the similarity matrix used for the edges.
        for pos, raw_categories in enumerate(df['category_groups_list']):
            label = ""
            if not pd.isna(raw_categories):
                label = str(raw_categories).split(',')[0].strip()[:50]
            G.add_node(pos, label=label or f"Node_{pos}")

        print(f" Added {G.number_of_nodes()} nodes")

        # Add edges with LIMIT
        MAX_EDGES = 10000
        G.add_weighted_edges_from(
            _pick_strongest_edges(pair_rows, pair_cols, pair_weights, SIM_THRESHOLD, MAX_EDGES)
        )

        print(f" Added {G.number_of_edges()} edges")

        # If still too few edges, lower threshold
        if G.number_of_edges() < 100:
            SIM_THRESHOLD = percentile_threshold(90)
            print(f"Few edges. Lowering to 90th percentile: {SIM_THRESHOLD:.3f}")

            G.clear_edges()
            G.add_weighted_edges_from(
                _pick_strongest_edges(pair_rows, pair_cols, pair_weights, SIM_THRESHOLD, MAX_EDGES)
            )

            print(f" Added {G.number_of_edges()} edges with lower threshold")

        # Community detection
        print("Detecting communities...")
        part = (
            community_louvain.best_partition(G, weight="weight")
            if G.number_of_edges() > 0
            else {n: 0 for n in G.nodes()}
        )

        nx.set_node_attributes(G, part, "cluster_id")
        num_communities = len(set(part.values()))
        print(f" Detected {num_communities} communities")

        # Color by community: spread community ids over the hue circle.
        node_colors = {
            n: f"hsl({int((part[n] * 360) / max(1, num_communities))}, 70%, 50%)"
            for n in G.nodes()
        }

        # Size by degree, clamped to the range 8..30.
        node_sizes = {n: max(8, min(30, int(G.degree(n)) * 3)) for n in G.nodes()}

        # Generate visualization
        print("Generating visualization...")
        rendered_graph = G
        source = "CSV embeddings (top similarities)"

        # The HTML is read back into memory, so the directory can be removed
        # as soon as rendering is done.
        with tempfile.TemporaryDirectory(prefix="eit_sigma_") as tmpdir:
            html_path = os.path.join(tmpdir, "network.html")

            try:
                Sigma.write_html(
                    G, html_path,
                    fullscreen=True,
                    node_color=node_colors,
                    node_size=node_sizes,
                    node_label="label",
                    default_edge_type="curve",
                    default_node_label_size=12,
                    clickable_edges=False,
                    node_border_color_from='node',
                    edge_color='#cccccc'
                )
            except Exception as viz_error:
                print(f"Visualization error: {viz_error}")
                print(" Retrying with fewer edges...")

                G_small = nx.Graph()
                G_small.add_nodes_from(G.nodes(data=True))

                # Take only top 5000 edges
                edges_sorted = sorted(G.edges(data=True), key=lambda x: x[2].get('weight', 0), reverse=True)
                G_small.add_edges_from(edges_sorted[:5000])

                print(f"Reduced to {G_small.number_of_edges()} edges")

                Sigma.write_html(
                    G_small, html_path,
                    fullscreen=True,
                    node_color=node_colors,
                    node_size=node_sizes,
                    node_label="label",
                    default_edge_type="curve",
                    default_node_label_size=12
                )
                rendered_graph = G_small
                source = "CSV embeddings (reduced)"

            html = pathlib.Path(html_path).read_text(encoding="utf-8", errors="ignore")

        print(f"{'='*70}")
        print(" NETWORK COMPLETE")
        print(f"{'='*70}")
        print(f"  Nodes: {rendered_graph.number_of_nodes()}")
        print(f"  Edges: {rendered_graph.number_of_edges()}")
        print(f"  Communities: {num_communities}")
        print(f"{'='*70}\n")

        return jsonify({
            "html": html,
            "num_nodes": rendered_graph.number_of_nodes(),
            "num_edges": rendered_graph.number_of_edges(),
            "num_communities": num_communities,
            "similarity_threshold": float(SIM_THRESHOLD),
            "source": source
        })

    except Exception as e:
        import traceback
        error_details = traceback.format_exc()
        print(f"{'='*70}")
        print(f" NETWORK ERROR")
        print(f"{'='*70}")
        print(error_details)
        print(f"{'='*70}\n")
        # NOTE(review): the full traceback is returned to the client — fine
        # for debugging, reconsider if the endpoint is exposed publicly.
        return jsonify({"error": f"Failed: {str(e)}", "details": error_details}), 500

@app.route("/models", methods=["GET"])
def get_models():
    """List the extraction models a client may request."""
    available = [
        {"id": "Mistral 7B", "name": "Mistral 7B", "type": "LLM"},
        {"id": "SpacyLLM", "name": "SpaCy-LLM Hybrid", "type": "Hybrid"},
    ]
    return jsonify({"models": available})

# START SERVER
# Open an ngrok tunnel to the local port, print the public URL for the UI,
# run Flask in a background thread and keep this cell/script alive until it
# is interrupted.
SERVER_PORT = 5050  # one place for the port used by both the tunnel and Flask

print("\n" + "="*70)
print("STARTING SERVER...")
print("="*70 + "\n")

# The ngrok token is read from the Colab secret named "NGROK".
authtoken = userdata.get("NGROK")
ngrok.set_auth_token(authtoken)
public_url = ngrok.connect(SERVER_PORT)

print("\n" + "="*70)
print(" BACKEND READY")
print("="*70)
print("\n PASTE THIS URL IN YOUR UI:\n")
print(f"   {public_url.public_url}")
print("\n" + "="*70)
print("\n Quick Setup:")
print("   1. Copy the URL above")
print("   2. Open your HTML file")
print("   3. Find: const BACKEND_BASE = '...'")
print("   4. Paste the URL (no /endpoint needed)")
print("\n Models: Mistral 7B | SpaCy-LLM")
print(" Network: CSV embeddings (gt_graph_embedded.csv)")
print(" Scraping: 1500 word limit, no duplicates")
print("\n" + "="*70 + "\n")

# daemon=True: a non-daemon server thread would keep the interpreter alive
# after the loop below prints "Stopped.", so a script run could never exit.
Thread(
    target=lambda: app.run(port=SERVER_PORT, debug=False, use_reloader=False),
    daemon=True,
).start()

try:
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    print("\nStopped.")