In [4]:
# Pinned to the version pip resolved when this cell was last run, so a fresh
# environment reproduces the same dependency set.
%pip install langchain-google-genai==2.1.12

Defaulting to user installation because normal site-packages is not writeable
Collecting langchain-google-genai
  Downloading langchain_google_genai-2.1.12-py3-none-any.whl.metadata (7.1 kB)
Collecting google-ai-generativelanguage<1,>=0.7 (from langchain-google-genai)
  Downloading google_ai_generativelanguage-0.9.0-py3-none-any.whl.metadata (10 kB)
Collecting filetype<2,>=1.2 (from langchain-google-genai)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Downloading langchain_google_genai-2.1.12-py3-none-any.whl (50 kB)
Downloading filetype-1.2.0-py2.py3-none-any.whl (19 kB)
Downloading google_ai_generativelanguage-0.9.0-py3-none-any.whl (1.4 MB)
[2K   [38;2;114;156;31m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.4/1.4 MB[0m [31m27.8 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: filetype, google-ai-generativelanguage, langchain-google-genai
[2K  Attempting un

In [6]:
# Pinned to the version pip resolved when this cell was last run, so a fresh
# environment reproduces the same dependency set.
%pip install langchain-chroma==0.2.6

Defaulting to user installation because normal site-packages is not writeable
Collecting langchain-chroma
  Downloading langchain_chroma-0.2.6-py3-none-any.whl.metadata (1.1 kB)
Collecting chromadb>=1.0.20 (from langchain-chroma)
  Downloading chromadb-1.4.0-cp39-abi3-macosx_11_0_arm64.whl.metadata (7.2 kB)
Collecting build>=1.0.3 (from chromadb>=1.0.20->langchain-chroma)
  Downloading build-1.3.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pybase64>=1.4.1 (from chromadb>=1.0.20->langchain-chroma)
  Downloading pybase64-1.4.3-cp39-cp39-macosx_11_0_arm64.whl.metadata (8.7 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb>=1.0.20->langchain-chroma)
  Downloading uvicorn-0.39.0-py3-none-any.whl.metadata (6.8 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb>=1.0.20->langchain-chroma)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb>=1.0.20->langchain-chroma)
  Downloading onnxruntime-1.19.2-cp39-cp39-ma

In [8]:
import os
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.document_loaders import WebBaseLoader, PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# Configuration: Adding website links and data sources

In [10]:
def load_data(pdf_directory_path, web_links=None):
    """Load the web pages and PDF files that make up the knowledge base.

    Loading is best-effort: a page (or the PDF directory) that fails to load
    is reported on stdout and skipped, so the function always returns a list.

    Args:
        pdf_directory_path: Directory path handed to ``PyPDFDirectoryLoader``.
        web_links: Optional iterable of URLs to fetch with ``WebBaseLoader``.
            When omitted, the five devinci.fr "international" pages are used.

    Returns:
        list: Documents from the web pages (in ``web_links`` order) followed
        by the documents loaded from the PDF directory.
    """
    if web_links is None:
        web_links = [
            "https://www.devinci.fr/international/welcome-desk/",
            "https://www.devinci.fr/international/universites-partenaires/",
            "https://www.devinci.fr/international/doubles-diplomes-internationaux/",
            "https://www.devinci.fr/international/exchange-students/",
            "https://www.devinci.fr/international/programmes-en-anglais/",
        ]

    docs = []

    # 1. Load websites one at a time so a single failing URL cannot abort the rest.
    print("Loading Websites...")
    for link in web_links:
        try:
            docs.extend(WebBaseLoader(link).load())
            print(f"Successfully loaded: {link}")
        except Exception as e:
            print(f"Failed to load {link}: {e}")

    # 2. Load every PDF the directory loader finds under pdf_directory_path.
    print(f"\nLoading PDFs from directory: {pdf_directory_path}...")
    try:
        pdf_docs = PyPDFDirectoryLoader(pdf_directory_path).load()
        docs.extend(pdf_docs)
        print(f"Successfully loaded {len(pdf_docs)} PDF documents.")
    except Exception as e:
        print(f"Error loading PDFs: {e}")

    return docs

# CALL THE FUNCTION
# Ensure a folder named 'ESILV_docs' exists in the working directory and contains your PDFs
documents = load_data("ESILV_docs")

print(f"\nTotal documents loaded: {len(documents)}")

Loading Websites...
Successfully loaded: https://www.devinci.fr/international/welcome-desk/
Successfully loaded: https://www.devinci.fr/international/universites-partenaires/
Successfully loaded: https://www.devinci.fr/international/doubles-diplomes-internationaux/
Successfully loaded: https://www.devinci.fr/international/exchange-students/
Successfully loaded: https://www.devinci.fr/international/programmes-en-anglais/

Loading PDFs from directory: ESILV_docs...
Successfully loaded 178 PDF documents.

Total documents loaded: 183


# Chunking and Vector Store Creation
### We need to break the text into smaller pieces (chunks) and turn them into numbers (embeddings) so the AI can search through them.

In [12]:
import time
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import Chroma

# 1. Chunking: size-1000 pieces with an overlap of 200 between neighbours.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(documents)
print(f"Documents split into {len(splits)} chunks.")

# 2. Embedding model used to vectorise each chunk.
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

# Chroma vector store, kept on disk under ./chroma_db.
persist_directory = "./chroma_db"
vectorstore = Chroma(
    persist_directory=persist_directory,
    embedding_function=embeddings,
)

# 3. Smart Batch Processing Function
def add_documents_smartly(splits, batch_size=5, store=None, max_retries=3,
                          retry_wait_seconds=60, pause_seconds=2):
    """Embed ``splits`` into a vector store in small batches, retrying on errors.

    Each batch is sent with ``store.add_documents``. A failing batch is
    retried up to ``max_retries`` times, waiting ``retry_wait_seconds``
    between attempts. If a batch still fails, processing stops and the
    remaining batches are not attempted.

    Args:
        splits: Sequence of document chunks (must support ``len`` and slicing).
        batch_size: Number of chunks sent per ``add_documents`` call.
        store: Object exposing ``add_documents(batch)``. Defaults to the
            module-level ``vectorstore``.
        max_retries: Maximum attempts per batch.
        retry_wait_seconds: Pause between failed attempts of the same batch.
        pause_seconds: Pause after every successful batch, to stay under the
            API's rate limit.

    Returns:
        int: Number of chunks that were successfully added. Compare it with
        ``len(splits)`` to detect a partial run.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if store is None:
        store = vectorstore  # module-level Chroma store

    total_splits = len(splits)
    print(f"Starting embedding of {total_splits} chunks...")

    embedded = 0
    for i in range(0, total_splits, batch_size):
        batch = splits[i : i + batch_size]

        for attempt in range(1, max_retries + 1):
            try:
                store.add_documents(batch)
            except Exception as e:
                # Report the real error: not every failure is a rate limit.
                print(f"Batch starting at chunk {i} failed: {type(e).__name__}: {e} "
                      f"(Attempt {attempt}/{max_retries})")
                # Only wait when another attempt will actually follow.
                if attempt < max_retries:
                    print(f"Waiting {retry_wait_seconds} seconds before retrying...")
                    time.sleep(retry_wait_seconds)
            else:
                embedded += len(batch)
                print(f"Processed chunks {i} to {min(i + batch_size, total_splits)}")
                # Small pause to be nice to the API
                time.sleep(pause_seconds)
                break
        else:
            # Every attempt for this batch failed: give up on the whole run.
            print("Stopping script due to persistent errors. Try again later.")
            break

    return embedded

# RUN IT
# Embed all chunks into the Chroma store, batch by batch.
add_documents_smartly(splits)

# Expose the store as a retriever.
# NOTE(review): the message below is printed unconditionally, even when
# add_documents_smartly stopped early after repeated errors.
retriever = vectorstore.as_retriever()
print("\nVector store created successfully!")

Documents split into 499 chunks.
Starting embedding of 499 chunks...


  vectorstore = Chroma(embedding_function=embeddings, persist_directory=persist_directory)


Processed chunks 0 to 5
Processed chunks 5 to 10
Processed chunks 10 to 15
Processed chunks 15 to 20
Processed chunks 20 to 25
Processed chunks 25 to 30
Processed chunks 30 to 35
Processed chunks 35 to 40
Processed chunks 40 to 45
Processed chunks 45 to 50
Processed chunks 50 to 55
Processed chunks 55 to 60
Processed chunks 60 to 65
Processed chunks 65 to 70
Processed chunks 70 to 75
Processed chunks 75 to 80
Processed chunks 80 to 85
Processed chunks 85 to 90
Processed chunks 90 to 95
Processed chunks 95 to 100
Processed chunks 100 to 105
Processed chunks 105 to 110
Processed chunks 110 to 115
Processed chunks 115 to 120
Processed chunks 120 to 125
Processed chunks 125 to 130
Processed chunks 130 to 135
Processed chunks 135 to 140
Processed chunks 140 to 145
Processed chunks 145 to 150
Processed chunks 150 to 155
Processed chunks 155 to 160
Processed chunks 160 to 165
Processed chunks 165 to 170
Processed chunks 170 to 175
Processed chunks 175 to 180
Processed chunks 180 to 185
Proces

# Creating the RAG Chain
### This sets up the "Brain" (Gemini) and tells it how to answer.

In [28]:
# Upgrade the LangChain packages and chromadb to their latest releases.
# NOTE(review): versions are unpinned, and pip's output says the kernel may
# need a restart before the upgraded packages are picked up.
%pip install -U langchain langchain-community langchain-google-genai chromadb


I0000 00:00:1767379295.884724 14613681 fork_posix.cc:71] Other threads are currently calling into gRPC, skipping fork() handlers


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [35]:
# Install the google-genai package.
# NOTE(review): no visible cell imports google-genai directly; confirm this
# install is still needed. Restart the kernel afterwards if pip asks for it.
%pip install google-genai

I0000 00:00:1767380077.570406 14613681 fork_posix.cc:71] Other threads are currently calling into gRPC, skipping fork() handlers


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [37]:
import getpass
import os
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_community.vectorstores import Chroma
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# SECURITY: the API key must never be written into the notebook.
# It is read from the GOOGLE_API_KEY environment variable; if that is not set,
# it is requested through a hidden prompt so it never appears in the cell source.
# The key that was previously hardcoded in this cell must be treated as leaked:
# revoke it and generate a new one.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Google API key: ")

print("1. API Key set.")

# Embedding model (same model identifier as the indexing cell).
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

# Re-open the Chroma store persisted under chroma_db and wrap it as a retriever.
vectorstore = Chroma(embedding_function=embeddings, persist_directory="chroma_db")
retriever = vectorstore.as_retriever()
print("2. Database re-connected.")

# Chat model that writes the answers.
llm = ChatGoogleGenerativeAI(temperature=0.3, model="gemini-3-flash-preview")

# System instructions. The {context} placeholder is presumably filled with the
# retrieved chunks by the stuff-documents chain (confirm against LangChain docs).
system_prompt = (
    "You are a helpful assistant for a college project. "
    "Use the provided context to answer the question. "
    "If the user asks in French, answer in French. "
    "If the user asks in English, answer in English. "
    "If you don't know, strictly say you don't know.\n\n"
    "{context}"
)

# Two-message chat template: system instructions, then the user's question.
prompt = ChatPromptTemplate.from_messages([("system", system_prompt), ("human", "{input}")])

# Answering chain (LLM + prompt), then the retrieval chain that feeds it.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

print("3. System fully repaired and ready!")

def ask_question(question):
    """Send ``question`` through the module-level ``rag_chain`` and print the result.

    Prints the question, the chain's ``"answer"`` field, and a 50-dash
    divider. Nothing is returned.
    """
    result = rag_chain.invoke({"input": question})
    divider = "-" * 50
    print("Q:", question)
    print("A:", result["answer"])
    print(divider)


1. API Key set.
2. Database re-connected.
3. System fully repaired and ready!


# Testing the System
## Now you can ask questions based on the PDFs and Websites you loaded.

In [38]:
# Test 1: Asking about dates (from "25 26 Calendrier académique PGE")
ask_question("What are the academic calendar dates for 2025-2026?")

Q: What are the academic calendar dates for 2025-2026?
A: Based on the provided document, the academic calendar dates for 2025-2026 include a schedule running from **January 12, 2026, to July 25, 2026**. Key dates and periods mentioned are:

**Internship Periods (Engineering Cursus):**
*   **A2 (After preparatory cycle):** June 1, 2026, to October 5, 2026 (12 weeks).
*   **A1 and A3:** Starting June 8, 2026.
*   **A4:** April 13, 2026, to September 7, 2026 (16 to 20 weeks).
*   **A5:** Starting February 2, 2026 (20 to 24 weeks).

**Internship Periods (Bachelor Cursus):**
*   **BIN1:** June 8, 2026, to August 31, 2026 (6 to 8 weeks).
*   **BIN2:** Dates to be specified (10 to 12 weeks).

**Holiday and Optional Internship Start Dates:**
*   Depending on the program, these begin on **June 1, June 8, or July 6, 2026**.

**General Calendar Markers:**
*   The schedule lists various activities (Exams, Public Holidays "F", etc.) between January and July 2026.
*   Specific public holidays are i

In [39]:
# Test 2: Asking about the structure of the engineering cycle (Years 3-5 and the S11 extension)
ask_question("Can you explain the structure of the engineering cycle, including Year 3, Year 4, Year 5, and the S11 extension of studies?")


Q: Can you explain the structure of the engineering cycle, including Year 3, Year 4, Year 5, and the S11 extension of studies?
A: Based on the provided document, the structure of the engineering cycle (Cursus Ing√©nieur) for the 2025-2026 academic year is as follows:

### **Year 3 (A3) - 1st Year of the Engineering Cycle**
*   **Internship Type:** Discovery internship (*Stage de d√©couverte de l‚Äôentreprise*).
*   **Duration/Timing:** This is an optional internship starting from June 8, 2026.

### **Year 4 (A4) - 2nd Year of the Engineering Cycle**
*   **Internship Type:** Technical internship (*Stage technique*).
*   **Duration:** 16 to 20 weeks.
*   **Timing:** Between April 13 and September 7, 2026.

### **Year 5 (A5) - 3rd Year of the Engineering Cycle**
*   **Curriculum Components:**
    *   Courses, Majors, Core Track, and Specific Tracks.
    *   **Project:** Presentation by EGPE and launch in September.
    *   **Requirements:** Master thesis, research report on the internship