In [1]:
!pip install requests chromadb sentence-transformers pymupdf gradio arxiv python-docx reportlab matplotlib plotly pandas networkx --quiet

import os, re, json, requests, fitz, time
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions
import gradio as gr
from docx import Document
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
import arxiv
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from collections import Counter
import networkx as nx

# ------------------------------
# API Keys
# ------------------------------
# Credentials are read from the environment so that no secret is stored in the
# notebook itself. Export OPENROUTER_API_KEY and RAPIDAPI_KEY before running.
# NOTE(review): keys that were previously committed in this file should be
# treated as leaked and rotated with the providers.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "")
RAPIDAPI_KEY = os.environ.get("RAPIDAPI_KEY", "")

# API Hosts (RapidAPI host names, also sent as the x-rapidapi-host header)
AI_DETECTION_HOST = "ai-detection4.p.rapidapi.com"
PLAGIARISM_HOST = "plagiarism-checker-and-auto-citation-generator-multi-lingual.p.rapidapi.com"
HUMANIZE_HOST = "humanize-ai-content-paraphrasing-api.p.rapidapi.com"
TEXTGEARS_HOST = "textgears-textgears-v1.p.rapidapi.com"

# ------------------------------
# Session Storage
# ------------------------------
session_messages = []          # chat history as {"role", "content"} dicts
session_docs = []              # documents added this session: {"title", "text", "source"}
current_research_topic = None  # last topic supplied by the user, if any
has_uploaded_pdfs = False      # set once PDFs have been ingested

# ------------------------------
# Quality Check Functions
# ------------------------------
def detect_ai(text):
    """Return the AI-content score reported by the detection API.

    Posts ``text`` to the AI-detection endpoint and returns the ``aiScore``
    field of the JSON reply as a float (0.0 when the field is absent).
    Any failure is printed and reported as a score of 0.
    """
    endpoint = f"https://{AI_DETECTION_HOST}/v1/ai-detection-rapid-api"
    request_headers = {
        "x-rapidapi-key": RAPIDAPI_KEY,
        "x-rapidapi-host": AI_DETECTION_HOST,
        "Content-Type": "application/json",
    }
    body = {"text": text, "lang": "en"}
    try:
        response = requests.post(endpoint, headers=request_headers, json=body, timeout=30)
        response.raise_for_status()
        score = response.json().get("aiScore", 0)
        return float(score)
    except Exception as e:
        print(f"❌ AI Detection Error: {e}")
        return 0

def check_plagiarism(text):
    """Return the plagiarism percentage reported by the plagiarism API.

    Posts ``text`` (English, citations and source scraping disabled) and
    returns the ``plagiarismPercentage`` field of the JSON reply as a float
    (0.0 when the field is absent). Any failure is printed and reported as 0.
    """
    endpoint = f"https://{PLAGIARISM_HOST}/plagiarism"
    request_headers = {
        "x-rapidapi-key": RAPIDAPI_KEY,
        "x-rapidapi-host": PLAGIARISM_HOST,
        "Content-Type": "application/json",
    }
    body = {
        "text": text,
        "language": "en",
        "includeCitations": False,
        "scrapeSources": False,
    }
    try:
        response = requests.post(endpoint, headers=request_headers, json=body, timeout=30)
        response.raise_for_status()
        percentage = response.json().get("plagiarismPercentage", 0)
        return float(percentage)
    except Exception as e:
        print(f"❌ Plagiarism Check Error: {e}")
        return 0

def humanize_text(text):
    """Return a paraphrased ("humanized") version of ``text``.

    Posts ``text`` to the paraphrasing endpoint and returns the stripped
    ``humanized`` field of the JSON reply, falling back to ``text`` when the
    field is absent. Any failure is printed and the input is returned unchanged.
    """
    endpoint = f"https://{HUMANIZE_HOST}/v1/paraphrase?raw=true"
    request_headers = {
        "x-rapidapi-key": RAPIDAPI_KEY,
        "x-rapidapi-host": HUMANIZE_HOST,
        "Content-Type": "application/json",
    }
    try:
        response = requests.post(endpoint, headers=request_headers, json={"text": text}, timeout=30)
        response.raise_for_status()
        rewritten = response.json().get("humanized", text)
        return rewritten.strip()
    except Exception as e:
        print(f"❌ Humanize Error: {e}")
        return text

def grammar_check(text):
    """Return the grammar errors the TextGears API reports for ``text``.

    The payload is sent form-encoded (``data=``). The result is the
    ``response.errors`` entry of the JSON reply, or an empty list when that
    entry is missing or the request fails (failures are printed).
    """
    endpoint = f"https://{TEXTGEARS_HOST}/grammar"
    request_headers = {
        "x-rapidapi-key": RAPIDAPI_KEY,
        "x-rapidapi-host": TEXTGEARS_HOST,
    }
    form = {"text": text, "language": "en-US"}
    try:
        response = requests.post(endpoint, headers=request_headers, data=form, timeout=30)
        response.raise_for_status()
        envelope = response.json().get("response", {})
        return envelope.get("errors", [])
    except Exception as e:
        print(f"❌ Grammar Check Error: {e}")
        return []

def correct_text_with_openrouter(text):
    """Ask an OpenRouter chat model to fix grammar and spelling in ``text``.

    Returns the stripped content of the first choice in the reply. Any
    failure is printed and the input is returned unchanged.
    """
    endpoint = "https://openrouter.ai/api/v1/chat/completions"
    request_headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": "https://researchmate-ai.com",
        "X-Title": "ResearchMate AI",
    }
    system_turn = {
        "role": "system",
        "content": "Fix grammar and spelling errors. Maintain academic tone. Return ONLY the corrected text.",
    }
    user_turn = {"role": "user", "content": f"Fix grammar:\n\n{text}"}
    body = {
        "model": "mistralai/mistral-7b-instruct:free",
        "messages": [system_turn, user_turn],
    }
    try:
        response = requests.post(endpoint, headers=request_headers, json=body, timeout=60)
        response.raise_for_status()
        first_choice = response.json()["choices"][0]
        return first_choice["message"]["content"].strip()
    except Exception as e:
        print(f"❌ Grammar Correction Error: {e}")
        return text

def process_section_quality(text, section_name, ai_thresh=15, plag_thresh=10, max_humanize_attempts=2):
    """
    Run the quality pipeline on a single paper section.

    Steps:
    1. Score the text with detect_ai / check_plagiarism.
    2. While either score exceeds its threshold, humanize the text, up to
       ``max_humanize_attempts`` times, re-scoring after each attempt.
    3. If it is still flagged, ask the LLM for a paraphrase of the original text.
    4. Run a grammar check; request a correction when more than two issues are found.
    5. Build the report from the scores of the text that is actually returned.

    Args:
        text: Section content. Empty or shorter than 30 stripped characters is skipped.
        section_name: Section title, used in the paraphrase prompt and log messages.
        ai_thresh: Highest AI score that still passes.
        plag_thresh: Highest plagiarism percentage that still passes.
        max_humanize_attempts: Upper bound on humanize_text calls.

    Returns:
        ``(processed_text, report)`` where ``report`` has the keys ``ai_score``,
        ``plagiarism_score``, ``grammar_issues`` and ``status``.
    """
    if not text or len(text.strip()) < 30:
        return text, {"ai_score": 0, "plagiarism_score": 0, "grammar_issues": 0, "status": "skipped"}

    def _flagged(ai, plag):
        return ai > ai_thresh or plag > plag_thresh

    # Initial checks
    ai_score = detect_ai(text)
    plag_score = check_plagiarism(text)
    processed_text = text
    scored_text = text  # the exact text that ai_score / plag_score describe

    if _flagged(ai_score, plag_score):
        for _ in range(max_humanize_attempts):
            try:
                processed_text = humanize_text(processed_text)
            except Exception as e:
                print(f"❌ Humanize step failed for {section_name}: {e}")
            time.sleep(0.8)
            ai_score = detect_ai(processed_text)
            plag_score = check_plagiarism(processed_text)
            scored_text = processed_text
            if not _flagged(ai_score, plag_score):
                break

        # If still flagged, ask OpenRouter to paraphrase to "human academic style"
        if _flagged(ai_score, plag_score):
            prompt = (
                f"Paraphrase the following section to make it read like authentic, human-written academic prose. "
                f"Keep the technical meaning exactly the same, keep citations/values, but change wording and sentence rhythm. "
                f"Return ONLY the paraphrased section.\n\nSECTION NAME: {section_name}\n\n{text}"
            )
            messages = [
                {"role": "system", "content": "You are a skilled academic writer. Paraphrase text to sound human and natural."},
                {"role": "user", "content": prompt}
            ]
            try:
                paraphrased = get_ai_response(messages)
            except Exception as e:
                print(f"❌ Paraphrase step failed for {section_name}: {e}")
                paraphrased = None
            # get_ai_response reports failures as text: either "[AI ERROR] ..." or the
            # "⚠️ All models ..." notice. Neither may replace the section content.
            if paraphrased and not paraphrased.startswith(("[AI ERROR]", "⚠️")):
                processed_text = paraphrased

    # Grammar check and correction if many errors
    grammar_errors = grammar_check(processed_text)
    if grammar_errors and len(grammar_errors) > 2:
        try:
            corrected = correct_text_with_openrouter(processed_text)
        except Exception as e:
            print(f"❌ Grammar correction failed for {section_name}: {e}")
            corrected = None
        # Ignore empty corrections so a bad model reply cannot wipe the section.
        if corrected and corrected != processed_text:
            processed_text = corrected
            grammar_errors = grammar_check(processed_text)

    # Re-score only when the text changed since the last scoring; otherwise the
    # existing scores already describe the returned text.
    if processed_text != scored_text:
        ai_score = detect_ai(processed_text)
        plag_score = check_plagiarism(processed_text)

    status = "⚠️ needs review" if _flagged(ai_score, plag_score) else "✅ passed"
    report = {
        "ai_score": ai_score,
        "plagiarism_score": plag_score,
        "grammar_issues": len(grammar_errors),
        "status": status
    }
    return processed_text, report

# ------------------------------
# OpenRouter AI
# ------------------------------
# Free OpenRouter models, tried in rotation when one is rate limited.
FREE_MODELS = [
    "deepseek/deepseek-r1-0528-qwen3-8b:free",
    "meta-llama/llama-3.2-3b-instruct:free",
    "google/gemini-flash-1.5:free",
    "mistralai/mistral-7b-instruct:free",
    "nousresearch/hermes-3-llama-3.1-405b:free"
]
# Index into FREE_MODELS of the model to try first; persists across calls.
current_model_index = 0

def get_ai_response(messages, max_retries=3):
    """
    Send ``messages`` to OpenRouter and return the reply text.

    The current model is tried up to ``max_retries`` times with exponential
    back-off between attempts. An HTTP 429 moves on to the next model in
    FREE_MODELS; each model is tried at most once per call.

    Args:
        messages: Chat messages in the OpenAI-style ``{"role", "content"}`` format.
        max_retries: Attempts per model before a request error is reported.

    Returns:
        The content of the first choice on success. Failures are returned as
        text rather than raised: a string starting with "[AI ERROR]" for
        request/response problems, or the "⚠️ All models ..." notice when every
        model was rate limited.
    """
    global current_model_index

    url = "https://openrouter.ai/api/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        "HTTP-Referer": "https://researchmate-ai.com",
        "X-Title": "ResearchMate AI"
    }

    for _ in range(len(FREE_MODELS)):
        model = FREE_MODELS[current_model_index]

        for attempt in range(max_retries):
            try:
                payload = {"model": model, "messages": messages}

                if attempt > 0:
                    time.sleep(2 ** attempt)

                r = requests.post(url, headers=headers, data=json.dumps(payload), timeout=60)

                if r.status_code == 429:
                    # Rate limited: stop retrying this model; rotation happens below.
                    break

                r.raise_for_status()  # Raise an exception for HTTP errors

                # Attempt to parse JSON, handle potential empty or non-JSON response
                try:
                    data = r.json()
                except json.JSONDecodeError:
                    return f"[AI ERROR] Invalid JSON response from model '{model}'. Response: {r.text}"

                if "choices" in data and len(data["choices"]) > 0:
                    return data["choices"][0]["message"]["content"]
                return f"[AI ERROR] No choices found in response from model '{model}'. Response: {data}"

            except requests.exceptions.RequestException as req_e:
                # Catch specific request exceptions for better error reporting
                if attempt == max_retries - 1:
                    return f"[AI ERROR] Request failed for model '{model}' after {max_retries} attempts: {str(req_e)}"
            except Exception as e:
                # Catch any other unexpected errors during the process
                if attempt == max_retries - 1:
                    return f"[AI ERROR] An unexpected error occurred with model '{model}' after {max_retries} attempts: {str(e)}"

        # Advance exactly once per model. Advancing both at the 429 check and
        # here would skip every other model in the rotation.
        current_model_index = (current_model_index + 1) % len(FREE_MODELS)

    return "⚠️ All models are currently rate limited or encountering persistent errors. Please wait and try again."


# ------------------------------
# RAG Setup
# ------------------------------
# Best-effort reset of Chroma's shared client cache before creating the client.
# NOTE(review): presumably this lets the notebook cell be re-run without a
# stale cached client — confirm against the installed chromadb version.
try:
    chromadb.api.client.SharedSystemClient.clear_system_cache()
except Exception as e:
    # Catch Exception rather than everything so Ctrl-C / SystemExit still propagate.
    print("Chroma cache reset skipped:", e)

# On-disk vector store, sentence-transformer embeddings, and the single
# collection that holds every document added during the session.
client = chromadb.PersistentClient(path="./chromadb_session")
embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
collection = client.get_or_create_collection(name="papers_session", embedding_function=embedding_fn)

def add_texts_to_rag(texts, names, source_type="pdf"):
    """Store each (text, name) pair in the Chroma collection and in session_docs.

    The Chroma id is the name with every character other than word characters,
    '-', '.' and spaces removed, cut to 200 characters. A failed Chroma insert
    is printed; the document is recorded in session_docs either way.
    """
    for body, title in zip(texts, names):
        safe_id = re.sub(r'[^\w\-\. ]', '', title)
        safe_id = safe_id[:200]
        meta = {"title": title, "source": source_type}
        try:
            collection.add(documents=[body], metadatas=[meta], ids=[safe_id])
        except Exception as e:
            print("Chroma add error:", e)
        session_docs.append({"title": title, "text": body, "source": source_type})

def query_rag(user_query, research_topic=None):
    """
    Answer ``user_query`` from the session's documents via retrieval + LLM.

    - If PDFs were added this session, retrieve only PDF chunks and answer from them.
    - Otherwise, if a topic is known, make sure arXiv abstracts for it are
      indexed (fetching them on first use) and answer from those.
    - Otherwise, ask the user to upload PDFs or set a topic.

    Args:
        user_query: The user's question.
        research_topic: Topic to use; falls back to the module-level
            ``current_research_topic`` when omitted.

    Returns:
        The LLM reply, or a "❌ ..." / "[RAG ERROR] ..." message string.
    """
    try:
        topic = research_topic or current_research_topic
        has_pdfs = any(d.get("source") == "pdf" for d in session_docs)
        has_arxiv = any(d.get("source") == "arxiv" for d in session_docs)

        if has_pdfs:
            # Restrict retrieval to PDF documents so arXiv abstracts already in
            # the collection cannot be presented as "PDF content".
            results = collection.query(query_texts=[user_query], n_results=3, where={"source": "pdf"})
            docs = results["documents"][0] if results["documents"] else []
            pdf_context = "\n\n".join(d for d in docs if d)

            if not pdf_context:
                return "❌ No relevant content found in your uploaded PDFs."

            messages = [
                {"role": "system", "content": "You are a research assistant. Answer based on provided PDF content."},
                {"role": "user", "content": f"CONTENT:\n{pdf_context}\n\nQUESTION: {user_query}"}
            ]
            return get_ai_response(messages)

        if topic:
            if not has_arxiv:
                papers = fetch_research_papers(topic, max_results=5)
                if not papers:
                    return "❌ Could not fetch arXiv papers."
                # Filter once so titles and abstracts stay paired; filtering only the
                # abstracts would shift every title after a paper without an abstract.
                with_abstract = [p for p in papers if p.get("abstract")]
                add_texts_to_rag(
                    [p["abstract"] for p in with_abstract],
                    [p["title"] for p in with_abstract],
                    source_type="arxiv",
                )

            results = collection.query(query_texts=[user_query], n_results=3, where={"source": "arxiv"})
            docs = results["documents"][0] if results["documents"] else []
            arxiv_context = "\n\n".join(d for d in docs if d)

            messages = [
                {"role": "system", "content": f"Research assistant for {topic}."},
                {"role": "user", "content": f"TOPIC: {topic}\n\nCONTEXT:\n{arxiv_context}\n\nQUESTION: {user_query}"}
            ]
            return get_ai_response(messages)

        return "❌ Please upload PDFs or set a research topic."

    except Exception as e:
        return f"[RAG ERROR] {e}"

# ------------------------------
# PDF Utilities
# ------------------------------
def extract_text_from_pdfs(pdf_files):
    """
    Extract the full text of each PDF.

    Args:
        pdf_files: Iterable of uploaded files. Each item is either an object
            exposing the file path as ``.name`` or a plain path string.
            ``None`` is treated as "no files".

    Returns:
        ``(texts, names)``: two parallel lists with the extracted text and the
        base file name of every PDF that could be read. Unreadable files are
        reported with a printed message and skipped.
    """
    texts, names = [], []
    for pdf_file in pdf_files or []:
        try:
            path = getattr(pdf_file, "name", pdf_file)
            # The context manager closes the document even if a page fails to parse.
            with fitz.open(path) as doc:
                pdf_text = "".join(page.get_text() for page in doc)
            file_name = os.path.basename(path)
        except Exception as e:
            print(f"Error reading PDF: {e}")
            continue
        texts.append(pdf_text)
        names.append(file_name)
    return texts, names

# ------------------------------
# arXiv Utilities
# ------------------------------
def fetch_research_papers(topic, max_results=5):
    """
    Search arXiv for ``topic`` and return the most relevant papers.

    Args:
        topic: Free-text search query.
        max_results: Maximum number of papers to return.

    Returns:
        A list of dicts with the keys ``title``, ``authors`` (list of names),
        ``abstract`` and ``pdf_url``. Errors are printed; whatever was collected
        before the error (possibly nothing) is returned.
    """
    papers = []
    try:
        search = arxiv.Search(query=topic, max_results=max_results, sort_by=arxiv.SortCriterion.Relevance)
        # Search.results() is deprecated in arxiv.py 2.x; Client.results() is the
        # supported way to iterate a search.
        for result in arxiv.Client().results(search):
            papers.append({
                "title": result.title,
                "authors": [a.name for a in result.authors],
                "abstract": result.summary,
                "pdf_url": result.pdf_url
            })
    except Exception as e:
        print("arXiv fetch error:", e)
    return papers

# ------------------------------
# Chat Integration
# ------------------------------
def combined_chat(idea_topic, pdf_files, user_input):
    """Handle one chat turn: update session state, then answer via query_rag.

    A non-empty ``idea_topic`` becomes the current research topic. PDFs are
    ingested only while ``has_uploaded_pdfs`` is still False. The question and
    the answer are both appended to session_messages.
    """
    global current_research_topic, has_uploaded_pdfs

    if not user_input:
        return "❌ Please enter a question."

    if idea_topic:
        current_research_topic = idea_topic

    needs_ingest = bool(pdf_files) and not has_uploaded_pdfs
    if needs_ingest:
        extracted, filenames = extract_text_from_pdfs(pdf_files)
        if extracted:
            add_texts_to_rag(extracted, filenames, source_type="pdf")
            has_uploaded_pdfs = True

    session_messages.append(dict(role="user", content=user_input))
    reply = query_rag(user_input, research_topic=current_research_topic)
    session_messages.append(dict(role="assistant", content=reply))

    return reply

# ------------------------------
# Enhanced Paper Generator with Quality Processing
# ------------------------------
def parse_paper_sections(paper_text):
    """
    Split a paper into its canonical sections.

    Headings are recognised case-insensitively when they stand alone on a line
    (optionally followed by ':' or '-'). Aliases are folded into the canonical
    names: Related Work / Background -> Literature Review, Methods / Materials
    and Methods -> Methodology, Results / Discussion -> Results and Discussion,
    Conclusions -> Conclusion. When several headings map to the same section,
    their contents are joined with a blank line instead of overwriting.

    If no heading is found, the text is split on blank lines and paragraphs
    are assigned by simple keyword heuristics.

    Args:
        paper_text: Raw paper text; ``None`` is treated as empty.

    Returns:
        A dict with exactly the seven canonical keys, in canonical order.
        Sections without content map to "".
    """
    canonical = ["Abstract", "Introduction", "Literature Review", "Methodology",
                 "Results and Discussion", "Conclusion", "References"]
    # Lower-cased heading text -> canonical section name.
    heading_to_section = {name.lower(): name for name in canonical}
    heading_to_section.update({
        "related work": "Literature Review",
        "background": "Literature Review",
        "methods": "Methodology",
        "materials and methods": "Methodology",
        "results": "Results and Discussion",
        "discussion": "Results and Discussion",
        "conclusions": "Conclusion",
    })
    sections = {}

    def _append(key, content):
        # Accumulate instead of overwrite so no content is dropped; empty
        # content (e.g. a repeated bare heading) never clobbers earlier text.
        if not content:
            return
        existing = sections.get(key, "")
        sections[key] = f"{existing}\n\n{content}" if existing else content

    # Normalize newlines
    text = re.sub(r'\r\n?', '\n', paper_text or "").strip()

    # A heading is one of the known names alone on its line, optionally
    # followed by up to two of ':', '-' or newline characters.
    heading_pattern = re.compile(
        r'^\s*(Abstract|Introduction|Literature Review|Related Work|Background|Methodology|Methods|Materials and Methods|Results|Results and Discussion|Discussion|Conclusion|Conclusions|References)\s*[:\n\r\-]{0,2}\s*$',
        re.IGNORECASE | re.MULTILINE
    )

    matches = list(heading_pattern.finditer(text))

    if matches:
        for i, m in enumerate(matches):
            head = m.group(1).strip()
            # Content runs from the end of this heading to the start of the next.
            start = m.end()
            end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
            _append(heading_to_section.get(head.lower(), head), text[start:end].strip())
    else:
        # Fallback: split by blank lines and map paragraphs heuristically.
        parts = [p.strip() for p in re.split(r'\n{2,}', text) if p.strip()]
        for i, part in enumerate(parts):
            lowered = part.lower()
            if i == 0:
                # A short opening paragraph is taken to be the abstract.
                _append("Abstract" if len(part.split()) < 300 else "Introduction", part)
            elif "method" in lowered or "experiment" in lowered:
                _append("Methodology", part)
            elif "result" in lowered or "discussion" in lowered:
                _append("Results and Discussion", part)
            elif "conclusion" in lowered:
                _append("Conclusion", part)
            elif "reference" in lowered:
                _append("References", part)
            elif "Introduction" not in sections:
                _append("Introduction", part)
            else:
                # Remaining uncategorized paragraphs accumulate here.
                _append("Literature Review", part)

    # Canonical order, with an empty placeholder for anything missing.
    return {key: sections.get(key, "") for key in canonical}


def generate_paper_with_quality_check(topic, enable_quality_check=True, progress=gr.Progress()):
    """
    Generate a sectioned research paper on ``topic`` and export it as DOCX and PDF.

    Pipeline:
    - Fetch arXiv abstracts as optional context and request the whole paper
      with explicit headings.
    - Parse the reply into the canonical sections.
    - For missing/empty sections, ask the LLM for that section only.
    - Run process_section_quality on every section except References
      (skipped entirely when ``enable_quality_check`` is False).
    - Rebuild the paper, write the files under ``generated_papers/`` and build
      the quality report text.

    Args:
        topic: Research topic; an empty value returns an error message.
        enable_quality_check: When False, sections pass through unchecked.
        progress: Gradio progress tracker used to report pipeline stages.

    Returns:
        ``(paper_text, docx_path, pdf_path, quality_report_text)``. On failure
        the first element is an error message, both paths are None and the
        report is "".
    """
    section_order = ["Abstract", "Introduction", "Literature Review", "Methodology",
                     "Results and Discussion", "Conclusion", "References"]

    def _is_ai_failure(reply):
        # get_ai_response reports failures as text: either "[AI ERROR] ..." or the
        # "⚠️ All models ..." notice. Neither is usable paper content.
        return not reply or reply.startswith(("[AI ERROR]", "⚠️"))

    if not topic:
        return "❌ Please enter a research topic.", None, None, ""

    progress(0, desc="Fetching research papers...")
    papers = fetch_research_papers(topic, max_results=5)
    abstracts = " ".join([p["abstract"] for p in papers if p.get("abstract")])

    progress(0.15, desc="Requesting structured paper from model...")
    # Strong instruction to produce clear labeled sections EXACTLY as required
    system_prompt = "You are an academic writer. Produce a complete research paper. Use the exact headings: Abstract, Introduction, Literature Review, Methodology, Results and Discussion, Conclusion, References. Return sections in that order. Do NOT include any introductory or conversational text before the Abstract or between sections other than the headings. Start directly with 'Abstract'."
    user_prompt = (
        f"Create a comprehensive research paper on '{topic}' with these sections:\n\n"
        "1. Abstract (150-250 words)\n2. Introduction\n3. Literature Review\n4. Methodology\n5. Results and Discussion\n6. Conclusion\n7. References\n\n"
        f"Context (optional - use to enrich content):\n{abstracts}\n\n"
        "Start your response directly with the 'Abstract' section."
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]
    paper_text = get_ai_response(messages)
    if _is_ai_failure(paper_text):
        return f"❌ Could not generate paper: {paper_text}", None, None, ""

    # Drop any conversational filler before the first recognised section heading.
    first_heading_match = re.search(r'^\s*(Abstract|Introduction|Literature Review|Related Work|Background|Methodology|Methods|Materials and Methods|Results|Results and Discussion|Discussion|Conclusion|Conclusions|References)\s*[:\n\r\-]{0,2}\s*$', paper_text, re.IGNORECASE | re.MULTILINE)
    if not first_heading_match:
        # No heading at all: the response is malformed or pure filler.
        return f"❌ Generated paper did not contain expected section headings. Raw output: {paper_text}", None, None, ""
    paper_text = paper_text[first_heading_match.start():]

    progress(0.35, desc="Parsing sections...")
    sections = parse_paper_sections(paper_text)

    # Request empty sections individually, using the other sections and the abstracts as context.
    progress(0.45, desc="Ensuring all sections present...")
    section_names = list(sections)
    for position, name in enumerate(section_names):
        content = sections[name]
        if content and content.strip():
            continue
        progress(0.45 + 0.1 * (position / len(section_names)), desc=f"Generating missing {name}...")
        prompt = (
            f"Produce ONLY the '{name}' section for a research paper on '{topic}'. Keep academic tone. "
            "If data/results are not available, create plausible placeholder results clearly labeled as synthetic. "
            "Return only the section content (no extra headings or introductory phrases)."
        )
        context = "\n\n".join([f"{k}:\n{v}" for k, v in sections.items() if v and k != name])
        messages = [
            {"role": "system", "content": "You are an academic writer producing missing sections. Respond ONLY with the content of the requested section."},
            {"role": "user", "content": f"{prompt}\n\nContext:\n{context}\n\nAbstracts:{abstracts}"}
        ]
        sec_text = get_ai_response(messages)
        if _is_ai_failure(sec_text):
            # fallback: short placeholder
            sec_text = f"[MISSING] {name} could not be generated automatically."
        sections[name] = sec_text.strip()
        time.sleep(0.7)  # small pause between LLM requests

    progress(0.60, desc="Processing quality for sections...")
    processed_sections = {}
    section_reports = {}
    for idx, (section_name, section_content) in enumerate(sections.items()):
        is_references = section_name.lower() == "references"
        # References are never rewritten; everything is passed through when checks are off.
        if is_references or not enable_quality_check:
            processed_sections[section_name] = section_content
            section_reports[section_name] = {"ai_score": 0, "plagiarism_score": 0, "grammar_issues": 0, "status": "skipped" if is_references else "not-checked"}
            continue

        progress(0.60 + 0.35 * (idx / max(1, len(sections)-1)), desc=f"Processing {section_name}...")
        processed_content, report = process_section_quality(section_content, section_name)
        processed_sections[section_name] = processed_content
        section_reports[section_name] = report
        # small pause to avoid rate limits
        time.sleep(0.6)

    # Reconstruct the paper in canonical order; the heading is always present
    # even when the section content is empty.
    ordered_text = "\n\n".join(
        f"{name}\n{processed_sections.get(name, '')}".strip() for name in section_order
    )

    # Create files
    progress(0.95, desc="Generating files...")
    os.makedirs("generated_papers", exist_ok=True)
    # Fall back to a fixed name when the topic contains no filename-safe character.
    safe_name = re.sub(r'[^\w\-\. ]', '', topic)[:50] or "paper"

    # DOCX
    docx_path = f"generated_papers/{safe_name}.docx"
    doc = Document()
    doc.add_heading(topic, level=1)
    for name in section_order:
        content = processed_sections.get(name, "")
        doc.add_heading(name, level=2)  # Always add the heading
        if content:
            for para in re.split(r'\n+', content):
                if para.strip():
                    doc.add_paragraph(para.strip())
        else:
            doc.add_paragraph("[Section content not generated]")  # Placeholder for empty content
    doc.save(docx_path)

    # PDF (simple streaming, keeps headings)
    pdf_path = f"generated_papers/{safe_name}.pdf"
    c = canvas.Canvas(pdf_path, pagesize=letter)

    def _start_text():
        t = c.beginText(50, 750)
        t.setFont("Times-Roman", 11)
        return t

    textobject = _start_text()

    def _pdf_line(line):
        # Start a new page before writing below the bottom margin; applied to
        # headings and spacer lines as well as content.
        nonlocal textobject
        if textobject.getY() < 60:
            c.drawText(textobject)
            c.showPage()
            textobject = _start_text()
        textobject.textLine(line)

    for name in section_order:
        content = processed_sections.get(name, "")
        _pdf_line(name)  # Always add the heading
        if content:
            for line in content.split("\n"):
                # hard-wrap at 100 characters per PDF line
                for start in range(0, len(line), 100):
                    _pdf_line(line[start:start + 100])
        else:
            _pdf_line("[Section content not generated]")  # Placeholder for empty content
        _pdf_line("")  # blank line between sections
    c.drawText(textobject)
    c.save()

    # Build quality report string
    quality_report_text = ""
    if enable_quality_check:
        report_lines = ["📊 QUALITY CHECK REPORT\n" + "="*60 + "\n\n"]
        for section, report in section_reports.items():
            report_lines.append(f"📄 {section}:\n")
            report_lines.append(f"   AI Score: {report['ai_score']:.1f}%\n")
            report_lines.append(f"   Plagiarism: {report['plagiarism_score']:.1f}%\n")
            report_lines.append(f"   Grammar Issues: {report['grammar_issues']}\n")
            report_lines.append(f"   Status: {report['status']}\n\n")
        quality_report_text = "".join(report_lines)

    progress(1.0, desc="Complete!")
    return ordered_text, docx_path, pdf_path, quality_report_text


# ------------------------------
# Analytics Functions (keeping existing ones)
# ------------------------------
def get_session_statistics():
    """Summarise the current session as a dict of counts.

    Covers document totals (overall, PDF, arXiv), message totals (overall,
    user, assistant), the total number of characters across documents and
    the average document length (0 when there are no documents).
    """
    doc_count = len(session_docs)
    message_count = len(session_messages)
    from_user = sum(1 for m in session_messages if m["role"] == "user")
    from_assistant = sum(1 for m in session_messages if m["role"] == "assistant")
    char_total = sum(len(entry["text"]) for entry in session_docs)
    mean_length = char_total / doc_count if doc_count > 0 else 0
    pdf_total = sum(1 for entry in session_docs if entry.get("source") == "pdf")
    arxiv_total = sum(1 for entry in session_docs if entry.get("source") == "arxiv")

    summary = {}
    summary["total_documents"] = doc_count
    summary["pdf_documents"] = pdf_total
    summary["arxiv_documents"] = arxiv_total
    summary["total_messages"] = message_count
    summary["user_messages"] = from_user
    summary["ai_messages"] = from_assistant
    summary["total_characters"] = char_total
    summary["avg_doc_length"] = mean_length
    return summary

def get_top_keywords(top_n=15):
    """Return the ``top_n`` most frequent words across all session documents.

    Words are lower-cased; stopwords and words of three characters or fewer
    are ignored. The result is a list of ``(word, count)`` pairs.
    """
    ignored = set(["the","and","for","with","that","this","from","are","was","were","have","has","using","our","can","which","these","their","been","into","than","more","also","will","such","when","there","other","through","about","some","only","would","between"])
    corpus = " ".join(entry["text"] for entry in session_docs).lower()
    tokens = re.findall(r'\b\w+\b', corpus)
    tally = Counter(tok for tok in tokens if tok not in ignored and len(tok) > 3)
    return tally.most_common(top_n)

def plot_keyword_bar():
    """Build a bar chart of the top keywords.

    Returns an empty titled figure when there are no keywords yet.
    """
    top = get_top_keywords()
    if top:
        frame = pd.DataFrame(top, columns=["Keyword","Count"])
        return px.bar(frame, x="Keyword", y="Count", title="Top Keywords", color="Count")

    placeholder = go.Figure()
    placeholder.update_layout(title="Top Keywords", plot_bgcolor="white")
    return placeholder

def generate_insights_report_session():
    """Render the session statistics and top ten keywords as a text report."""
    stats = get_session_statistics()
    keywords = get_top_keywords(10)

    topic_info = f"\n🎯 Topic: {current_research_topic}" if current_research_topic else ""
    source_info = f"\n📄 Source: {'PDFs' if has_uploaded_pdfs else 'arXiv'}"

    if keywords:
        keyword_summary = ', '.join([word for word, _count in keywords[:10]])
    else:
        keyword_summary = 'None'

    # The leading and trailing empty entries reproduce the blank first line
    # and the final newline of the report.
    lines = [
        "",
        "📊 Session Insights",
        '=' * 60,
        f"{topic_info}{source_info}",
        "",
        f"📚 Documents: {stats['total_documents']} ({stats['pdf_documents']} PDFs, {stats['arxiv_documents']} arXiv)",
        f"💬 Interactions: {stats['total_messages']}",
        f"📝 Characters: {stats['total_characters']:,}",
        "",
        f"🔑 Keywords: {keyword_summary}",
        "",
    ]
    return "\n".join(lines)

def refresh_all_analytics():
    """Recompute both analytics outputs.

    Returns:
        A ``(report_text, keyword_figure)`` tuple: the text report is
        generated first, then the keyword bar chart.
    """
    return generate_insights_report_session(), plot_keyword_bar()

# ------------------------------
# Gradio UI
# ------------------------------
# Root Gradio app; every component and event handler below is registered on `demo`.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Page header rendered as Markdown.
    gr.Markdown("""
    # 🧠 ResearchMate AI — Advanced Research Assistant with Quality Control
    ### 📚 Upload PDFs OR Set Topic | ✅ AI Detection, Plagiarism Check, Humanization & Grammar Correction
    """)

    with gr.Tabs():
        # Tab 1: question answering over uploaded PDFs or a research topic.
        with gr.Tab("💬 Chat Assistant"):
            with gr.Row():
                clear_btn = gr.Button("🔄 Clear Session", variant="secondary")

            # Inputs: topic, optional PDFs and the user's question.
            idea_input = gr.Textbox(label="🎯 Research Topic", placeholder="e.g., Machine Learning in Healthcare")
            pdf_upload = gr.Files(label="📄 Upload PDFs (Optional)", file_types=[".pdf"])
            # Read-only boxes: upload/clear status and the generated answer.
            pdf_status = gr.Textbox(label="📋 Status", interactive=False, lines=3)
            chat_input = gr.Textbox(label="❓ Your Question", placeholder="Ask anything...", lines=2)
            output_box = gr.Textbox(label="🤖 AI Answer", interactive=False, lines=12)
            btn = gr.Button("🚀 Send", variant="primary")

            def clear_session():
                """Reset the in-memory session state and empty the document store.

                The message history, document list, research topic and the
                uploaded-PDF flag are reset first, so they are cleared even
                when the store cannot be emptied.

                Returns:
                    A ``(status_message, answer_text)`` pair for the status
                    box and the answer box. The answer text is always empty;
                    the status carries a warning if the store reset failed.
                """
                global session_messages, session_docs, current_research_topic, has_uploaded_pdfs
                session_messages, session_docs = [], []
                current_research_topic, has_uploaded_pdfs = None, False
                try:
                    # `collection` is presumably the chromadb collection created
                    # earlier in the file. Current chromadb releases reject an
                    # empty `where` filter on delete, so fetch the stored ids
                    # and delete by id instead.
                    stored_ids = collection.get()["ids"]
                    if stored_ids:
                        collection.delete(ids=stored_ids)
                except Exception as exc:
                    # Best effort: keep the UI responsive, but tell the user that
                    # old documents may still be retrievable.
                    return f"⚠️ Session cleared, but the document index was not reset: {exc}", ""
                return "✅ Session cleared!", ""

            # Clearing takes no inputs; its two return values fill the status and answer boxes.
            clear_btn.click(clear_session, outputs=[pdf_status, output_box])

            def upload_handler(idea_topic, pdf_files):
                """Record the research topic and index any uploaded PDFs.

                Args:
                    idea_topic: Text from the topic box; stored as the current
                        research topic when non-empty.
                    pdf_files: Uploaded files, or an empty/None value when no
                        PDFs are selected.

                Returns:
                    A status string describing what happened (topic set, PDFs
                    processed, extraction failure, or the error message).
                """
                global current_research_topic, has_uploaded_pdfs
                if idea_topic:
                    current_research_topic = idea_topic

                # Without PDFs there is nothing to index; only report the topic.
                if not pdf_files:
                    if current_research_topic:
                        return f"✅ Topic set: '{current_research_topic}'"
                    return "⚠️ Set topic or upload PDFs"

                try:
                    pdf_texts, pdf_names = extract_text_from_pdfs(pdf_files)
                    if not pdf_texts:
                        return "❌ Failed to extract text"
                    add_texts_to_rag(pdf_texts, pdf_names, source_type="pdf")
                    has_uploaded_pdfs = True
                    return f"✅ Processed {len(pdf_texts)} PDF(s)"
                except Exception as exc:
                    return f"❌ Error: {exc!s}"

            # upload_handler runs whenever the file selection changes; in the code shown
            # here the topic textbox has no event of its own, so the topic is read then.
            pdf_upload.change(upload_handler, inputs=[idea_input, pdf_upload], outputs=pdf_status)
            # Send passes topic, files and question to combined_chat (defined elsewhere
            # in the file) and shows its result in the answer box.
            btn.click(combined_chat, inputs=[idea_input, pdf_upload, chat_input], outputs=output_box)

        # Tab 2: generate a paper for a topic, optionally running the quality pipeline.
        with gr.Tab("📝 Paper Generator with Quality Control"):
            gr.Markdown("""
            Generate research papers with automatic quality checking:
            - ✅ AI Content Detection
            - ✅ Plagiarism Checking
            - ✅ Text Humanization
            - ✅ Grammar Correction
            """)

            # Inputs: the topic and a toggle (on by default) for quality processing.
            topic_box = gr.Textbox(label="Research Topic", placeholder="e.g., Quantum Computing in Cryptography")
            quality_checkbox = gr.Checkbox(label="Enable Quality Processing (AI Detection, Plagiarism, Humanization, Grammar)", value=True)

            # Read-only outputs: the paper text and the quality report.
            paper_output = gr.Textbox(label="Generated Paper", lines=20, interactive=False)
            quality_report_box = gr.Textbox(label="📊 Quality Report", lines=10, interactive=False)

            # File components through which the generated DOCX and PDF are offered.
            with gr.Row():
                docx_download = gr.File(label="📥 Download DOCX")
                pdf_download = gr.File(label="📥 Download PDF")

            gen_btn = gr.Button("✍️ Generate Paper", variant="primary")

            # generate_paper_with_quality_check is defined elsewhere in the file; it is
            # expected to return four values in the order of `outputs` — TODO confirm.
            gen_btn.click(
                generate_paper_with_quality_check,
                inputs=[topic_box, quality_checkbox],
                outputs=[paper_output, docx_download, pdf_download, quality_report_box]
            )

        # Tab 3: text overview of the session plus a keyword bar chart.
        with gr.Tab("📊 Analytics"):
            refresh_btn = gr.Button("🔄 Refresh", variant="primary")
            insights_text = gr.Textbox(label="Overview", lines=15, interactive=False)
            keyword_plot_box = gr.Plot(label="Keywords")

            # refresh_all_analytics returns (report, figure), matching the two outputs.
            refresh_btn.click(refresh_all_analytics, outputs=[insights_text, keyword_plot_box])
            # Also bound to the Blocks load event so the tab is filled without pressing Refresh.
            demo.load(refresh_all_analytics, outputs=[insights_text, keyword_plot_box])

# Start the app; share=True additionally requests a public Gradio share URL.
demo.launch(share=True)

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.8/20.8 MB[0m [31m59.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m89.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m72.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://5dc6837e9189a2c0ed.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


