# GreenPower RAG System v04

## Pipeline complet:
1. Setup Qdrant (base vectorielle locale)
2. Chargement documents GreenPower via Gradio
3. Chunking + Embeddings (sentence-transformers)
4. Stockage Qdrant vector index
5. RAG pipeline : query → retrieval → Mistral
6. Interface Gradio interactive

## Corrections v04:
- Fix méthode search() → query_points() pour Qdrant
- Fix création collection avant upsert
- Fix imports langchain
- Import JSON et CSV (only pdf, docx, txt before...)

In [11]:
# Cell 1: Installation des dépendances
!pip install -q langchain-mistralai langchain-community langchain-text-splitters
!pip install -q qdrant-client gradio sentence-transformers
!pip install -q pypdf python-docx python-dotenv

In [None]:
# Cell 2: Imports
import os
from pathlib import Path
import gradio as gr
from langchain_mistralai import ChatMistralAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
import uuid
from typing import List
import pypdf
import docx
import json
import csv
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

print("Imports OK")

In [13]:
def load_env_file(path) -> None:
    """Copy KEY=VALUE pairs from a dotenv-style file into ``os.environ``.

    Blank lines, ``#`` comment lines and lines without ``=`` are ignored.
    Values overwrite any variable already present in the environment.

    Args:
        path: Location of the env file; a missing file is silently skipped.
    """
    path = Path(path)
    if not path.exists():
        return
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#') or '=' not in line:
                continue
            key, value = line.split('=', 1)
            key = key.strip()
            value = value.strip()
            if not key:
                # "=value" has no variable name; os.environ rejects empty keys
                continue
            # Drop one pair of matching surrounding quotes so KEY="abc" yields
            # abc instead of "abc" (otherwise a quoted API key is stored with
            # its quotes and rejected by the remote service).
            if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                value = value[1:-1]
            os.environ[key] = value


env_path = Path('.env')
load_env_file(env_path)

# Settings read from the environment (populated above from .env when present)
MISTRAL_API_KEY = os.getenv('MISTRAL_API_KEY')
QDRANT_URL = os.getenv('QDRANT_URL', 'URL')  # Use :memory: for local or cloud URL
QDRANT_API_KEY = os.getenv('QDRANT_API_KEY', None)

In [None]:
def load_env_file(path) -> None:
    """Copy KEY=VALUE pairs from a dotenv-style file into ``os.environ``.

    Defined in this cell so the cell can be run on its own. Blank lines,
    ``#`` comment lines and lines without ``=`` are ignored. Values overwrite
    any variable already present in the environment.

    Args:
        path: Location of the env file; a missing file is silently skipped.
    """
    path = Path(path)
    if not path.exists():
        return
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#') or '=' not in line:
                continue
            key, value = line.split('=', 1)
            key = key.strip()
            value = value.strip()
            if not key:
                # "=value" has no variable name; os.environ rejects empty keys
                continue
            # Drop one pair of matching surrounding quotes so KEY="abc" yields
            # abc instead of "abc".
            if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                value = value[1:-1]
            os.environ[key] = value


env_path = Path('.env')
load_env_file(env_path)

MISTRAL_API_KEY = os.getenv('MISTRAL_API_KEY')
# ':memory:' selects the embedded in-memory store; anything else is treated as
# the URL of a Qdrant server / cloud instance.
QDRANT_URL = os.getenv('QDRANT_URL', ':memory:')
QDRANT_API_KEY = os.getenv('QDRANT_API_KEY', None)

if not MISTRAL_API_KEY:
    # Surface the misconfiguration now instead of at the first question asked
    print("[WARN] MISTRAL_API_KEY non defini: les appels au LLM echoueront")

# Chunking configuration
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# Initialize components (single source of truth for the model name)
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
llm = ChatMistralAI(model='mistral-small-latest', mistral_api_key=MISTRAL_API_KEY, temperature=0.7)

# 'URL' was the old placeholder default and '' an unset variable: neither is a
# usable endpoint, so both fall back to in-memory mode. In-memory mode must be
# requested through `location`; passing ':memory:' as `url` would be treated as
# a remote address.
_qdrant_in_memory = QDRANT_URL in (':memory:', 'URL', '')
if _qdrant_in_memory:
    qdrant_client = QdrantClient(location=':memory:')
else:
    qdrant_client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)
COLLECTION_NAME = "greenpower_docs"

# Text splitter (sizes are measured in characters via len)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)

print("Clients initialises")
# Report the mode actually in use rather than always claiming in-memory
print(f"   - Qdrant: {'Mode in-memory' if _qdrant_in_memory else 'Serveur ' + QDRANT_URL}")
print(f"   - Embeddings: {EMBEDDING_MODEL}")
print(f"   - LLM: Mistral Small")

In [None]:
# Cell 5: Create the Qdrant collection
# Get the embedding dimension by generating one test embedding
test_embedding = embeddings.embed_query("test")
VECTOR_SIZE = len(test_embedding)

def create_collection_if_not_exists():
    """Ensure the Qdrant collection named by COLLECTION_NAME exists.

    Lists the collections known to ``qdrant_client``; when COLLECTION_NAME is
    absent, creates it with VECTOR_SIZE-dimensional cosine vectors. A status
    line is printed in both cases.

    Raises:
        Exception: any error from the client is printed, then re-raised.
    """
    try:
        existing_names = {col.name for col in qdrant_client.get_collections().collections}

        if COLLECTION_NAME not in existing_names:
            cosine_vectors = VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE)
            qdrant_client.create_collection(
                collection_name=COLLECTION_NAME,
                vectors_config=cosine_vectors,
            )
            print(f"[OK] Collection '{COLLECTION_NAME}' creee (dimension: {VECTOR_SIZE})")
        else:
            print(f"[INFO] Collection '{COLLECTION_NAME}' existe deja")
    except Exception as err:
        print(f"[WARN] Erreur creation collection: {err}")
        raise

# Create the collection when this cell runs
create_collection_if_not_exists()

In [None]:
# Cell 6: Fonctions utilitaires

def _json_to_text(data) -> str:
    """Render parsed JSON as plain text.

    Objects become one ``key: value`` line per top-level key (nested dicts and
    lists are pretty-printed as JSON); arrays become one pretty-printed JSON
    block per item separated by blank lines; scalars use ``str()``.
    """
    if isinstance(data, dict):
        return "\n".join(
            f"{key}: {json.dumps(value, ensure_ascii=False, indent=2)}"
            if isinstance(value, (dict, list))
            else f"{key}: {value}"
            for key, value in data.items()
        )
    if isinstance(data, list):
        return "\n\n".join(json.dumps(item, ensure_ascii=False, indent=2) for item in data)
    return str(data)


def extract_text_from_file(file_path: str) -> str:
    """Extract the text of a PDF, DOCX, TXT, JSON or CSV file.

    Args:
        file_path: Path of the file to read; the format is chosen from the
            (case-insensitive) file extension.

    Returns:
        The extracted text. Failures are reported in-band rather than raised:
        an unsupported extension yields ``"Format non supporte: <ext>"`` and
        any read/parse error yields ``"Erreur lecture fichier <name>: <err>"``.
    """
    path = Path(file_path)
    suffix = path.suffix.lower()

    try:
        if suffix == '.pdf':
            with open(path, 'rb') as f:
                reader = pypdf.PdfReader(f)
                # `or ""` guards against pages for which no text comes back
                return "".join((page.extract_text() or "") + "\n" for page in reader.pages)

        if suffix in ('.docx', '.doc'):
            # NOTE: '.doc' is attempted with the same reader; a file it cannot
            # open surfaces through the error message below.
            document = docx.Document(str(path))
            return "\n".join(para.text for para in document.paragraphs)

        # 'utf-8-sig' reads plain UTF-8 and also strips a leading BOM, which
        # would otherwise make json.load fail and corrupt the first CSV header.
        if suffix == '.txt':
            with open(path, 'r', encoding='utf-8-sig') as f:
                return f.read()

        if suffix == '.json':
            with open(path, 'r', encoding='utf-8-sig') as f:
                return _json_to_text(json.load(f))

        if suffix == '.csv':
            # newline='' lets the csv module handle line endings and quoted
            # newlines itself, as its documentation requires.
            with open(path, 'r', encoding='utf-8-sig', newline='') as f:
                return "\n".join(
                    ", ".join(f"{key}: {value}" for key, value in row.items())
                    for row in csv.DictReader(f)
                )

        return f"Format non supporte: {path.suffix}"

    except Exception as e:
        return f"Erreur lecture fichier {path.name}: {str(e)}"

def chunk_and_embed(text: str, source: str) -> "List[PointStruct]":
    """Split text into chunks, embed them, and wrap them as Qdrant points.

    Args:
        text: Full document text to split with the module-level text_splitter.
        source: Label stored in each point's payload under "source".

    Returns:
        One PointStruct per chunk, each with a random UUID id, the chunk's
        embedding as vector, and a payload holding "text", "source" and
        "chunk_index". Empty when the splitter produces no chunks.
    """
    chunks = text_splitter.split_text(text)
    if not chunks:
        # Nothing to embed (e.g. no extractable text): skip the embedding call
        return []

    chunk_embeddings = embeddings.embed_documents(chunks)

    return [
        PointStruct(
            id=str(uuid.uuid4()),
            vector=embedding,
            payload={
                "text": chunk,
                "source": source,
                "chunk_index": i,
            },
        )
        for i, (chunk, embedding) in enumerate(zip(chunks, chunk_embeddings))
    ]

print("Fonctions utilitaires definies")

In [None]:
# Cell 7: Fonction d'upload de documents

def upload_documents(files):
    """Extract, chunk, embed and store each uploaded document in Qdrant.

    Args:
        files: Uploaded items; each is either an object exposing the file path
            as ``.name`` or the path itself. May be None or empty.

    Returns:
        A text report with one status line per file ([OK], [WARN] or [ERREUR])
        followed by the total number of chunks stored, or a single error line
        when no file was provided.
    """
    if not files:
        return "[ERREUR] Aucun fichier uploade"

    # Make sure the collection exists before the first upsert
    create_collection_if_not_exists()

    total_chunks = 0
    results = []

    for file in files:
        # Resolve the display name before the try block so the except handler
        # never refers to an unbound or stale `filename`.
        file_path = file.name if hasattr(file, 'name') else file
        filename = Path(str(file_path)).name
        try:
            text = extract_text_from_file(file_path)

            # Match the exact in-band error prefixes emitted by
            # extract_text_from_file; testing only for "Erreur" would reject
            # any real document that happens to begin with that word.
            if text.startswith(("Erreur lecture fichier", "Format non supporte")):
                results.append(f"[WARN] {filename}: {text}")
                continue

            points = chunk_and_embed(text, filename)
            if not points:
                # No chunks means nothing to index; do not send an empty upsert
                results.append(f"[WARN] {filename}: aucun texte exploitable")
                continue

            qdrant_client.upsert(
                collection_name=COLLECTION_NAME,
                points=points
            )

            total_chunks += len(points)
            results.append(f"[OK] {filename}: {len(points)} chunks")

        except Exception as e:
            # Best effort: one failing file must not abort the whole batch
            results.append(f"[ERREUR] {filename}: Erreur - {str(e)}")

    summary = f"\n\n**Total: {total_chunks} chunks** stockes dans '{COLLECTION_NAME}'"
    return "\n".join(results) + summary

print("Fonction upload_documents definie")

In [None]:
# Cell 8: Fonction RAG

def search_and_answer(question: str, top_k: int = 3):
    """Retrieve the most relevant chunks from Qdrant and answer with Mistral.

    Args:
        question: The user's question; empty or whitespace-only input is
            rejected with a warning message.
        top_k: Number of chunks to retrieve (coerced to int, since a UI slider
            may deliver a float).

    Returns:
        A Markdown string containing the answer, the distinct sources
        consulted and the number of chunks used; or a "[WARN] ..." /
        "[ERREUR] ..." message when there is nothing to answer or a step fails.
    """
    if not question or not question.strip():
        return "[WARN] Veuillez poser une question"

    try:
        # 1. Embed the question
        question_embedding = embeddings.embed_query(question)

        # 2. Vector search. query_points() is the current client API; search()
        #    is only kept as a fallback for older qdrant-client releases.
        limit = int(top_k)
        if hasattr(qdrant_client, "query_points"):
            search_results = qdrant_client.query_points(
                collection_name=COLLECTION_NAME,
                query=question_embedding,
                limit=limit,
                with_payload=True,
            ).points
        else:
            search_results = qdrant_client.search(
                collection_name=COLLECTION_NAME,
                query_vector=question_embedding,
                limit=limit
            )

        # 3. Collect the retrieved chunks and their distinct sources
        if not search_results:
            return "[WARN] Aucun document trouve. Uploadez d'abord des documents."

        contexts = []
        sources = []
        for point in search_results:
            source = point.payload.get("source", "Unknown")
            # Label each chunk with its source so the model is actually able
            # to follow the "cite the sources" instruction in the prompt.
            contexts.append(f"[Source: {source}]\n{point.payload['text']}")
            if source not in sources:
                sources.append(source)

        context_text = "\n\n---\n\n".join(contexts)

        # 4. Build the prompt
        prompt = f"""Tu es un assistant qui repond aux questions en te basant UNIQUEMENT sur le contexte fourni.

Contexte:
{context_text}

Question: {question}

Instructions:
- Reponds de maniere claire et concise
- Base-toi UNIQUEMENT sur le contexte fourni
- Si l'information n'est pas dans le contexte, dis-le clairement
- Cite les sources quand pertinent

Reponse:"""

        # 5. Generate the answer with Mistral
        response = llm.invoke(prompt)
        answer = response.content

        # 6. Format the final Markdown output
        output = f"""## Reponse

{answer}

---

**Sources consultees:** {', '.join(sources)}
**{len(contexts)} chunks** analyses
"""

        return output

    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing
        return f"[ERREUR] {str(e)}\n\nDetails technique: {type(e).__name__}"

print("Fonction search_and_answer definie")

In [None]:
# Cell 9: Gradio interface
# Builds a three-tab UI bound to `demo`: document upload, question answering,
# and a static information page. Components appear in creation order.

with gr.Blocks(title="GreenPower RAG v04", theme=gr.themes.Soft()) as demo:
    # Page header shown above the tabs
    gr.Markdown(
        """
        # GreenPower RAG System v04
        
        **Retrieval-Augmented Generation** pour vos documents GreenPower
        
        ### Workflow:
        1. **Upload** vos documents (PDF, DOCX, TXT, JSON, CSV)
        2. **Ask** vos questions sur le contenu
        3. Obtenez des reponses basees sur vos documents!
        """
    )
    
    # Tab 1: select files and send them to upload_documents
    with gr.Tab("Upload Documents"):
        gr.Markdown(
            """
            ### Upload vos documents
            
            Formats supportes: **PDF**, **DOCX**, **TXT**, **JSON**, **CSV**
            
            Les documents seront:
            - Decoupes en chunks
            - Vectorises avec sentence-transformers
            - Stockes dans Qdrant
            """
        )
        
        # Multi-file picker restricted to the listed extensions
        file_input = gr.File(
            file_count="multiple",
            label="Selectionnez vos documents",
            file_types=[".pdf", ".docx", ".doc", ".txt", ".json", ".csv"]
        )
        upload_btn = gr.Button("Upload et Traiter", variant="primary")
        upload_output = gr.Textbox(
            label="Resultat",
            lines=10,
            placeholder="Les resultats d'upload apparaitront ici..."
        )
        
        # Clicking the button passes the selected files to upload_documents and
        # shows its returned report in the textbox
        upload_btn.click(
            upload_documents,
            inputs=file_input,
            outputs=upload_output
        )
    
    # Tab 2: ask a question and display the answer from search_and_answer
    with gr.Tab("Ask Questions"):
        gr.Markdown(
            """
            ### Posez vos questions
            
            Le systeme va:
            1. Rechercher les passages pertinents
            2. Generer une reponse avec Mistral
            3. Citer les sources utilisees
            """
        )
        
        question_input = gr.Textbox(
            label="Votre question",
            placeholder="Ex: Quels sont les objectifs de GreenPower pour 2025?",
            lines=3
        )
        
        # Slider value is passed as the top_k argument of search_and_answer
        top_k_slider = gr.Slider(
            minimum=1,
            maximum=10,
            value=3,
            step=1,
            label="Nombre de chunks a recuperer",
            info="Plus de chunks = plus de contexte (mais plus lent)"
        )
        
        ask_btn = gr.Button("Obtenir la Reponse", variant="primary")
        
        answer_output = gr.Markdown(
            label="Reponse",
            value="*La reponse apparaitra ici...*"
        )
        
        # Clicking the button calls search_and_answer(question, top_k) and
        # renders the returned Markdown
        ask_btn.click(
            search_and_answer,
            inputs=[question_input, top_k_slider],
            outputs=answer_output
        )
        
        # Example rows, each a [question, top_k] pair for the two inputs above
        gr.Examples(
            examples=[
                ["Quels sont les objectifs principaux de l'entreprise?", 3],
                ["Quelle est la strategie de developpement?", 5],
                ["Quels sont les projets en cours?", 3],
            ],
            inputs=[question_input, top_k_slider],
        )
    
    # Tab 3: static description; the f-string interpolates EMBEDDING_MODEL,
    # COLLECTION_NAME, CHUNK_SIZE and CHUNK_OVERLAP once, when the UI is built
    with gr.Tab("Info"):
        gr.Markdown(
            f"""
            ### Configuration Technique
            
            - **Vector DB:** Qdrant (in-memory)
            - **Embeddings:** {EMBEDDING_MODEL}
            - **LLM:** Mistral Small
            - **Collection:** {COLLECTION_NAME}
            - **Chunk size:** {CHUNK_SIZE} caracteres
            - **Overlap:** {CHUNK_OVERLAP} caracteres
            
            ### Notes
            
            - Le mode in-memory ne persiste pas les donnees entre les redemarrages
            - Pour la production, utilisez Qdrant en mode serveur
            - Les embeddings sont generes localement (CPU)
            
            ### Corrections v04
            
            - Fix `search()` -> `query_points()` (API Qdrant correcte)
            - Fix creation collection avant upsert
            - Fix imports langchain
            - Gestion erreurs amelioree
            - Meilleur formatage des reponses
            - Support ajoute pour JSON et CSV
            """
        )

# Start the Gradio server, framed by a banner in the cell output
banner_rule = "=" * 60
print("\n" + banner_rule)
print("Lancement de l'interface Gradio...")
print(banner_rule)

# Local-only server on a fixed port, no public share link, errors shown in the UI
launch_options = {
    "server_name": "127.0.0.1",
    "server_port": 7650,
    "share": False,
    "show_error": True,
}
demo.launch(**launch_options)