<a href="https://colab.research.google.com/github/rishi10-tech/rishi10-tech-AI-Research-Copilot/blob/main/LearnIQai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q transformers sentence-transformers faiss-cpu pypdf gradio torch numpy

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m330.6/330.6 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import numpy as np
import faiss
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import gradio as gr

In [4]:
# Embedding model
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Lightweight open-source LLM
llm = pipeline(
    "text-generation",
    model="google/flan-t5-base",
    max_length=512
)

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/282 [00:00<?, ?it/s]



generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Passing `generation_config` together with generation-related arguments=({'max_length'}) is deprecated and will be removed in future versions. Please pass either a `generation_config` object OR all generation parameters explicitly, but not both.
The model 'T5ForConditionalGeneration' is not supported for text-generation. Supported models are ['PeftModelForCausalLM', 'AfmoeForCausalLM', 'ApertusForCausalLM', 'ArceeForCausalLM', 'AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BitNetForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'BltForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'CwmForCausalLM', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DeepseekV2ForCausalLM', 'DeepseekV3ForCausalLM', 'DiffLlamaForCa

In [5]:
chunks = []
index = None

In [6]:
def read_pdf(file):
    reader = PdfReader(file.name)
    text = ""
    for page in reader.pages:
        if page.extract_text():
            text += page.extract_text()
    return text

In [7]:
def chunk_text(text, chunk_size=500):
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

In [8]:
def build_vector_store(text):
    global chunks, index
    chunks = chunk_text(text)
    embeddings = embedder.encode(chunks, show_progress_bar=False)

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings.astype("float32"))

In [9]:
def retrieve_context(query, top_k=3):
    if index is None:
        return "No document uploaded."

    query_embedding = embedder.encode([query])
    _, indices = index.search(query_embedding.astype("float32"), top_k)

    return " ".join([chunks[i] for i in indices[0]])

In [10]:
def answer_question(query):
    context = retrieve_context(query)

    prompt = f"""
    Answer the question using only the context below.

    Context:
    {context}

    Question:
    {query}
    """

    return llm(prompt)[0]["generated_text"]

In [11]:
def explain_equation(query):
    context = retrieve_context(query)

    prompt = f"""
    Explain the equation found in the context.

    Context:
    {context}

    Explain clearly:
    1. Meaning of variables
    2. Purpose of equation
    3. How it is implemented in real systems
    """

    return llm(prompt)[0]["generated_text"]

In [12]:
def detect_equation_type(text):
    t = text.lower()
    if "loss" in t or "error" in t:
        return "loss"
    if "sum" in t or "∑" in t:
        return "sum"
    if "matrix" in t or "vector" in t:
        return "matrix"
    return "general"

In [13]:
def recommend_language(eq_type):
    return {
        "loss": "Python",
        "sum": "C++",
        "matrix": "Python",
        "general": "C"
    }.get(eq_type, "Python")

In [14]:
def python_code():
    return """# Python implementation
import numpy as np

def compute(y, y_hat):
    return np.sum((y - y_hat) ** 2)
"""

def cpp_code():
    return """// C++ implementation
#include <vector>
using namespace std;

double compute(const vector<double>& y, const vector<double>& y_hat) {
    double loss = 0;
    for (int i = 0; i < y.size(); i++)
        loss += (y[i] - y_hat[i]) * (y[i] - y_hat[i]);
    return loss;
}
"""

def java_code():
    return """// Java implementation
public static double compute(double[] y, double[] yHat) {
    double loss = 0;
    for (int i = 0; i < y.length; i++)
        loss += Math.pow(y[i] - yHat[i], 2);
    return loss;
}
"""

def c_code():
    return """// C implementation
double compute(double y[], double y_hat[], int n) {
    double loss = 0;
    for (int i = 0; i < n; i++)
        loss += (y[i] - y_hat[i]) * (y[i] - y_hat[i]);
    return loss;
}
"""

In [15]:
def equation_to_code(equation_text, language):
    eq_type = detect_equation_type(equation_text)

    if language == "Auto":
        language = recommend_language(eq_type)

    if language == "Python":
        return python_code()
    if language == "C++":
        return cpp_code()
    if language == "Java":
        return java_code()
    if language == "C":
        return c_code()

    return "Unsupported language."

In [17]:
css_code = """
#answer_box textarea {
    height: 350px !important;
    font-size: 16px;
}
"""

with gr.Blocks(css=css_code) as demo:

    gr.Markdown("## 📘 AI Research Copilot (RAG + Equation → Code)")

    pdf = gr.File(label="Upload Research PDF", file_types=[".pdf"])

    mode = gr.Radio(
        ["Normal", "Equation"],
        value="Normal",
        label="Answer Mode"
    )

    question = gr.Textbox(label="Ask a question", lines=2)

    answer = gr.Textbox(
        label="Answer",
        elem_id="answer_box",
        lines=15
    )

    submit = gr.Button("Get Answer")

    lang = gr.Dropdown(
        ["Auto", "Python", "C", "C++", "Java"],
        value="Auto",
        label="Select Code Language"
    )

    code = gr.Code(
        label="Equation → Code Implementation",
        language="python"
    )

    gen_code = gr.Button("Generate Code")

    pdf.change(
        lambda f: build_vector_store(read_pdf(f)),
        pdf,
        []
    )

    submit.click(
        lambda q, m: explain_equation(q) if m == "Equation" else answer_question(q),
        [question, mode],
        answer
    )

    gen_code.click(
        equation_to_code,
        [question, lang],
        code
    )

demo.launch(debug=False)

  with gr.Blocks(css=css_code) as demo:


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://9221e7b5b055cdcbfa.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


