# Data Preprocessing

In [1]:
import pandas as pd
import numpy as np

### Chargement des données True et Fake

In [2]:
# Chargement des données csv 
df_true = pd.read_csv('/home/fadilatou/PROJETS/rag_fake_news/data/True.csv')
df_fake = pd.read_csv('/home/fadilatou/PROJETS/rag_fake_news/data/Fake.csv')
# Affichage
print(df_true.head())
print(df_fake.head())

# Création de la colonne label
df_true["label"] = "True"
df_fake["label"] = "Fake"

# Fusionner les deux datasets
df = pd.concat([df_true, df_fake], ignore_index=True)

# Afficher les premières lignes
print(df.head(2))

                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept transgender recruits o...   
2  Senior U.S. Republican senator: 'Let Mr. Muell...   
3  FBI Russia probe helped by Australian diplomat...   
4  Trump wants Postal Service to charge 'much mor...   

                                                text       subject  \
0  WASHINGTON (Reuters) - The head of a conservat...  politicsNews   
1  WASHINGTON (Reuters) - Transgender people will...  politicsNews   
2  WASHINGTON (Reuters) - The special counsel inv...  politicsNews   
3  WASHINGTON (Reuters) - Trump campaign adviser ...  politicsNews   
4  SEATTLE/WASHINGTON (Reuters) - President Donal...  politicsNews   

                 date  
0  December 31, 2017   
1  December 29, 2017   
2  December 31, 2017   
3  December 30, 2017   
4  December 29, 2017   
                                               title  \
0   Donald Trump Sends Out Embarrassing Ne

###  Exploration de données 

In [3]:
# Les dimensions de chaque df
print(df.shape)

# Les types de données
print(f" types de données:",df.dtypes)
# Les valeurs de la colonne labels
print(f"Valeur du label:",df['label'].value_counts())


(44898, 5)
 types de données: title      object
text       object
subject    object
date       object
label      object
dtype: object
Valeur du label: label
Fake    23481
True    21417
Name: count, dtype: int64


### Nettoyage du Dataset

In [4]:
# # # La somme des valeurs manquantes
print(f"Les valeurs manquantes :", df.isnull().sum())

# # Détection des doublons
print(f"Nombre de doublons:", df.duplicated().sum())

# Supprimer les doublons
df = df.drop_duplicates(subset=['title', 'text'])

# Vérification
print(".csv après nettoyage :")
print("Doublons restants :", df.duplicated(subset=['title', 'text']).sum())


Les valeurs manquantes : title      0
text       0
subject    0
date       0
label      0
dtype: int64
Nombre de doublons: 209
.csv après nettoyage :
Doublons restants : 0


In [5]:
# # Sauvegarde du dataset propre 
# df.to_csv("/home/fadilatou/PROJETS/rag_fake_news/data/news_clean.csv", index=False)

### Application des fonction de nettoyage sur les colonnes text et title

In [6]:
from test_cleaning import clean_text_pipeline

# Test de fonction de nettoyage
df["text"] = df["text"].apply(clean_text_pipeline)
df["title"] = df["title"].apply(clean_text_pipeline)

fake news politics


### Chunkings : Découpage des données de la colonne text

In [39]:
def chunk_text(text, chunk_size=200, overlap=50):
    """
    Découpe un texte en plusieurs morceaux (chunks) avec un chevauchement optionnel.

    text : str → le texte à découper
    chunk_size : int → nombre de mots par chunk
    overlap : int → nombre de mots qui se chevauchent entre deux chunks
    """
    words = text.split()
    chunks = []

    # Taille du pas entre deux chunks
    step = chunk_size - overlap
    for i in range(0, len(words), step):
        chunk = words[i:i + chunk_size]
        chunks.append(" ".join(chunk))
    return chunks

# Application du chunking
df['text'] = df['text'].apply(lambda x: chunk_text(x, chunk_size=200, overlap=50))
display(df["text"].head())

AttributeError: 'list' object has no attribute 'split'

### Embeddings des chunks

In [None]:

# Explosion du DataFrame pour avoir 1 chunk / ligne 
df_chunks = df.explode("text").reset_index(drop=True)
print(f" Nombre total de chunks : {len(df_chunks)}")

✅ Nombre total de chunks : 76491


In [26]:
import ollama
model_name = "all-minilm:latest"

def generate_embeddings_ollama(texts, model=model_name, batch_size=100):
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        response = ollama.embed(model=model, input=batch)
        embeddings = response.get("embeddings", [])
        all_embeddings.extend(embeddings)
        print(f"✅ Batch {i//batch_size + 1} traité ({len(all_embeddings)}/{len(texts)})")
    return all_embeddings


df_test = df_chunks.head(5000)
embeddings = generate_embeddings_ollama(df_test["text"].tolist())
print(f"Nombre d'embeddings générés: {len(embeddings)}")
print(f"Type du premier embedding: {type(embeddings[0])}")
print(f"Longueur du premier embedding: {len(embeddings[0])}")
print(f"Aperçu du premier embedding: {embeddings[0][:5]}")


✅ Batch 1 traité (100/5000)
✅ Batch 2 traité (200/5000)
✅ Batch 3 traité (300/5000)
✅ Batch 4 traité (400/5000)
✅ Batch 5 traité (500/5000)
✅ Batch 6 traité (600/5000)
✅ Batch 7 traité (700/5000)
✅ Batch 8 traité (800/5000)
✅ Batch 9 traité (900/5000)
✅ Batch 10 traité (1000/5000)
✅ Batch 11 traité (1100/5000)
✅ Batch 12 traité (1200/5000)
✅ Batch 13 traité (1300/5000)
✅ Batch 14 traité (1400/5000)
✅ Batch 15 traité (1500/5000)
✅ Batch 16 traité (1600/5000)
✅ Batch 17 traité (1700/5000)
✅ Batch 18 traité (1800/5000)
✅ Batch 19 traité (1900/5000)
✅ Batch 20 traité (2000/5000)
✅ Batch 21 traité (2100/5000)
✅ Batch 22 traité (2200/5000)
✅ Batch 23 traité (2300/5000)
✅ Batch 24 traité (2400/5000)
✅ Batch 25 traité (2500/5000)
✅ Batch 26 traité (2600/5000)
✅ Batch 27 traité (2700/5000)
✅ Batch 28 traité (2800/5000)
✅ Batch 29 traité (2900/5000)
✅ Batch 30 traité (3000/5000)
✅ Batch 31 traité (3100/5000)
✅ Batch 32 traité (3200/5000)
✅ Batch 33 traité (3300/5000)
✅ Batch 34 traité (3400/5000

### Normalisation des embeddings

In [None]:
def normalize_vectors(vectors):
    normalize = []
    for v in vectors:
        norm = np.linalg.norm(v)
        if norm == 0:
            normalize.append(v)
        else:
            normalize.append(v / norm)
    return normalize

emb_normalize = normalize_vectors(embeddings)
print(f"Nombre d'embeddings: {len(emb_normalize)}")
print(f"Exemple taille d’un embedding : {len(emb_normalize[0])}")
print(f"Shape d'un embedding: {emb_normalize[0].shape}") 

Nombre d'embeddings: 5000
Exemple taille d’un embedding : 384
Shape d'un embedding: (384,)
0.02383402641242069


## Insertions des données dans chromaDB

In [None]:
import chromadb

chroma_path = "/home/fadilatou/PROJETS/rag_fake_news/data"
chroma_client = chromadb.PersistentClient(path=chroma_path)

collection = chroma_client.get_or_create_collection(
    name="news_collection",
    embedding_function=None
)

# Création d’IDs et métadonnées cohérentes
df_test["chunk_id"] = [f"chunk_{i}" for i in range(len(df_test))]
df_test["metadata"] = df_test.apply(
    lambda row: {"label": row["label"], "title": row["title"]},
    axis=1
)

collection.upsert(
    documents=df_test["text"].tolist(),
    ids=df_test["chunk_id"].tolist(),
    embeddings=emb_normalize,
    metadatas=df_test["metadata"].tolist()
)
print(f" {len(df_test)} chunks insérés dans ChromaDB avec succès.")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["chunk_id"] = [f"chunk_{i}" for i in range(len(df_test))]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["metadata"] = df_test.apply(


 5000 chunks insérés dans ChromaDB avec succès.


## Le test utilisateur .query

In [40]:
# Tets utilisateur
from chromadb.utils import embedding_functions
from ollama import generate

ollama_embed = embedding_functions.OllamaEmbeddingFunction(model_name="all-minilm")

# Texte utilisateur
text = "Barack Obama was elected U.S. president in 2008"

# Embedding + normalisation
query_embedding = ollama_embed([text])
query_embedding = normalize_vectors(query_embedding)

# Récupération des chunks les plus proches
n_chunks = 10
results = collection.query(
    query_embeddings=query_embedding,
    n_results=n_chunks
)
print(results)



### RAG ( Resonning by Retrieval)

In [46]:
import ollama
import chromadb
import numpy as np

#  CONFIGURATION 
CHROMA_PATH = "/home/fadilatou/PROJETS/rag_fake_news/data"
COLLECTION_NAME = "news_collection"
EMBED_MODEL = "all-minilm:latest"
GEN_MODEL = "phi3:mini"
TOP_K = 5

#  Fonction principale RAG 
def rag_analysis(user_text: str, top_k: int = TOP_K):
    """
    Étape complète RAG :
    1. Nettoyage du texte utilisateur
    2. Vectorisation et normalisation
    3. Recherche dans ChromaDB
    4. Construction du prompt et génération du verdict
    """

    #  Nettoyage avec ta fonction clean_text_pipeline 
    clean_text = clean_text_pipeline(user_text)

    #  Vectorisation avec ta fonction generate_embeddings_ollama 
    embeddings = generate_embeddings_ollama([clean_text])

    #  Normalisation avec ta fonction normalize_vectors 
    normalized_emb = normalize_vectors(embeddings)[0]  # on récupère le premier (et seul) embedding

    # Connection à ChromaDB
    chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
    collection = chroma_client.get_or_create_collection(name=COLLECTION_NAME, embedding_function=None)

    #  Recherche dans la base Chroma des chunks les plus similaires
    results = collection.query(
        query_embeddings=[normalized_emb],
        n_results=top_k,
        include=["documents", "metadatas", "distances"]
    )

    # Récupération et assemblage du contexte
    retrieved_chunks = results["documents"][0]
    context = "\n".join(retrieved_chunks)

    # --- Création du prompt en anglais ---
    prompt = f"""
You are a fact-checking assistant. 
Use the context below to decide if the following statement is TRUE or FAKE. 
If information is missing or contradictory, answer "INSUFFICIENT EVIDENCE".
Provide a clear justification in English.

Context:
{context}

Statement to analyze:
"{user_text}"

Respond in this format:
VERDICT: <TRUE | FAKE | INSUFFICIENT EVIDENCE>
JUSTIFICATION:
- <brief explanation based on the context>
SOURCES:
- <titles or labels used>
"""

# Appel du modèle Ollama (par exemple phi3:mini)
    response = ollama.chat(
        model="phi3:mini",
        messages=[
            {"role": "system", "content": "You are an expert in news verification."},
            {"role": "user", "content": prompt}
        ]
    )
    
    # Retourne le résultat formaté
    answer = response["message"]["content"]
    return {
        "test": user_text,
        "context_used": context,
        "model_response": answer
    }

# --- Exemple d'utilisation ---
user_test = "Donald Trump won the 2020 U.S. presidential election."
result = rag_analysis(user_test)

print("=== TEST ===")
print(result["test"])
print("\n=== CONTEXT ===")
print(result["context_used"][:1000])  # affiche un aperçu du contexte
print("\n=== MODEL RESPONSE ===")
print(result["model_response"])

✅ Batch 1 traité (1/1)
=== TEST ===
Donald Trump won the 2020 U.S. presidential election.

=== CONTEXT ===
campaign voters arlington county suburban democratic stronghold bordering washington said national politics important votes trump talks draining swamp gillespie kind swamp said nick peacemaker works marketing considered republican trump won partys presidential nomination peacemaker said gillespie shift closer trumps policies securing republican gubernatorial nomination local races country democratic mayor bill de blasio new york marty walsh boston easily won reelection voters picking mayors detroit atlanta seattle charlotte north carolina
certified winner said currently assessing legal options fight result democrats claimed historic gains virginias statehouse month partys big wave victories republican donald trump won white house year nov general election virginia republicans held seats democrats house delegates majority state senate
washington reuters president donald trump congr