<a href="https://colab.research.google.com/github/ridvanyigit/LLMs/blob/main/pdf_RAG_Gradio_App_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **1.SECTION**

In [None]:
# @title Install Required Libraries

!pip install langchain langchain-openai langchain-chroma
!pip install PyPDF2
!pip install plotly
!pip install scikit-learn
!pip install gradio
!pip install python-dotenv
!pip install -U langchain-community

In [None]:
# @title imports

import os
import glob
from dotenv import load_dotenv
import gradio as gr
from google.colab import drive, files
import PyPDF2
import io

In [None]:
# @title imports for langchain, plotly and Chroma

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
# @title Colab-Optimized Settings

# --- Chat model ---
MODEL = "gpt-4o-mini"  # model name handed to ChatOpenAI

# --- Chunking (passed to the text splitter as chunk_size / chunk_overlap) ---
CHUNK_SIZE, CHUNK_OVERLAP = 800, 150

# --- Retrieval and storage ---
MAX_RETRIEVAL_DOCS = 10  # "k" used when building the retriever
db_name = "vector_db"    # directory used as the Chroma persist_directory

In [None]:
# @title Mount Google Drive

# Mount Google Drive so the PDF folder under /content/drive is reachable.
drive.mount('/content/drive')
print("✅ Google Drive connected successfully!")

# Set OpenAI API Key (from Colab Secrets)

print("🔑 Setting OpenAI API Key...")
from getpass import getpass
from google.colab import userdata
try:
    api_key = userdata.get('OPENAI_API_KEY')
    if not api_key:
        # Treat an empty secret like a missing one so the manual fallback runs.
        raise ValueError("OPENAI_API_KEY secret is empty")
    os.environ['OPENAI_API_KEY'] = api_key
    print("✅ API Key successfully retrieved from Colab Secrets!")
except Exception:
    print("❌ Could not retrieve API Key from Colab Secrets!")
    print("Please go to the 🔑 Secrets section in the left menu and add 'OPENAI_API_KEY'")
    # getpass hides the typed value, so the key is not echoed into the
    # cell output (which would be saved together with the notebook).
    api_key = getpass("Alternatively, enter your API Key here: ").strip()
    if not api_key:
        # Fail here rather than later with an opaque authentication error.
        raise ValueError("No OpenAI API key provided.")
    os.environ['OPENAI_API_KEY'] = api_key
    print("✅ API Key set manually!")


# **2.SECTION**

In [None]:
# @title Read PDF Files

def read_pdf_files(pdf_folder="/content/drive/MyDrive/MyPDFs"):
    """Read every ``*.pdf`` file in a folder into ``Document`` objects.

    Args:
        pdf_folder: Directory searched (non-recursively) for ``*.pdf`` files.
            Defaults to the Google Drive folder this notebook was written for.

    Returns:
        A list with one ``Document`` per PDF whose extracted text is not
        blank. ``page_content`` is the text of all pages, each followed by a
        newline; ``metadata`` carries ``source`` (file name), ``doc_type``
        (always ``"certificate"``) and ``file_path``. An empty list is
        returned when the folder holds no PDF files or does not exist.

    Files that fail to parse are reported on stdout and skipped.
    """
    documents = []

    print(f"📂 Reading PDF files from: {pdf_folder}")

    # Sorted so the files are processed in the same order on every run
    # (glob gives no ordering guarantee).
    pdf_files = sorted(glob.glob(os.path.join(pdf_folder, "*.pdf")))

    if not pdf_files:
        print("❌ No PDF files found!")
        # isdir (not exists) so listdir is never called on a plain file path.
        print(f"Folder contents: {os.listdir(pdf_folder) if os.path.isdir(pdf_folder) else 'Folder not found'}")
        return []

    print(f"📄 Found {len(pdf_files)} PDF files\n")

    for pdf_file in pdf_files:
        # Bound before the try block so the error message below can always use it.
        filename = os.path.basename(pdf_file)
        try:
            print(f"📖 Reading: {filename}")

            with open(pdf_file, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # `or ""` guards against a page whose extraction yields None
                # (defensive; such a page contributes only its newline).
                text = "".join(
                    (page.extract_text() or "") + "\n"
                    for page in pdf_reader.pages
                )

            if text.strip():  # If not empty
                documents.append(
                    Document(
                        page_content=text,
                        metadata={
                            "source": filename,
                            "doc_type": "certificate",  # Certificate/Diploma category
                            "file_path": pdf_file
                        }
                    )
                )
                print(f"✅ {filename} read successfully ({len(text)} characters)\n")
            else:
                print(f"⚠️ {filename} is empty or unreadable\n")

        except Exception as e:
            # Best effort: report the failing file and continue with the rest.
            print(f"❌ Error reading {filename}: {str(e)}")

    return documents

In [None]:
# @title Upload PDF Files

# Load every PDF from the configured folder into memory.
documents = read_pdf_files()

if not documents:
    # Raise instead of calling exit(): in a notebook exit() shuts the kernel
    # down (losing the Drive mount and all state), whereas an exception only
    # stops this cell and any "Run all" in progress.
    raise RuntimeError("❌ No documents could be loaded! Please check the file paths.")

print(f"📚 A total of {len(documents)} documents loaded")

In [None]:
# @title Split Documents into Chunks

print("✂️ Splitting documents into chunks...")

# Separators are listed from coarsest (blank line) to finest (empty string).
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)

# One flat list of chunks across all loaded documents.
chunks = text_splitter.split_documents(documents)
print(f"📝 Total {len(chunks)} chunks created")

In [None]:
# @title Create Vector Store

# Embedding model for the vector store, created with OpenAIEmbeddings' defaults.
embeddings = OpenAIEmbeddings()

# If you want to use free embeddings instead, uncomment the line below:
#embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [None]:
# Start from a clean slate: remove any previously persisted database directory
import shutil

if os.path.exists(db_name):
    shutil.rmtree(db_name)

# Embed all chunks and persist them under `db_name`
vectorstore = Chroma.from_documents(
    persist_directory=db_name,
    embedding=embeddings,
    documents=chunks,
)

print(f"✅ Vectorstore created with {vectorstore._collection.count()} documents")

# Vector analysis: how many vectors were stored and how long each one is
print("📊 Performing vector analysis...")

collection = vectorstore._collection
count = collection.count()

# Fetch a single stored record just to measure the embedding length.
_first_record = collection.get(limit=1, include=["embeddings"])
sample_embedding = _first_record["embeddings"][0]
dimensions = len(sample_embedding)

print(f"📈 {count:,} vectors, {dimensions:,} dimensions")

In [None]:
# @title 2D Visualization

def _hover_labels(sources, documents_text):
    """Build one hover label per point: file name plus the first 150 characters of the chunk."""
    return [f"File: {s}<br>Text: {d[:150]}..." for s, d in zip(sources, documents_text)]


def _tsne_projection(vectors, n_components):
    """Reduce ``vectors`` to ``n_components`` dimensions with a seeded t-SNE."""
    # Perplexity is capped at 30 and kept below the number of samples.
    perplexity = min(30, len(vectors) - 1)
    reducer = TSNE(n_components=n_components, random_state=42, perplexity=perplexity)
    return reducer.fit_transform(vectors)


def create_visualizations():
    """Plot the stored embeddings as interactive 2D (and, if possible, 3D) t-SNE maps.

    Reads embeddings, chunk texts and metadata from the module-level
    ``collection``. Points are coloured by their ``source`` metadata value;
    chunks without one are grouped under ``'Unknown'``.

    The 2D figure is always shown when at least two vectors exist; the 3D
    figure is shown only when there are more than 10 vectors. With fewer than
    two vectors nothing is plotted, because the perplexity passed to t-SNE
    would not be positive.

    Returns:
        None. Figures are displayed via ``fig.show()``.
    """
    print("🎨 Preparing visualizations...")

    result = collection.get(include=['embeddings', 'documents', 'metadatas'])
    vectors = np.array(result['embeddings'])
    documents_text = result['documents']
    metadatas = result['metadatas']

    if len(vectors) < 2:
        # min(30, n - 1) would be <= 0 here and t-SNE would raise.
        print(f"⚠️ Only {len(vectors)} vector(s) available; t-SNE needs at least 2. Skipping visualizations.")
        return

    # Color by file names
    sources = [(metadata or {}).get('source', 'Unknown') for metadata in metadatas]
    # Sorted so every file gets the same colour on every run.
    unique_sources = sorted(set(sources))
    color_map = {source: f"hsl({i*360/len(unique_sources)}, 70%, 60%)"
                 for i, source in enumerate(unique_sources)}
    colors = [color_map[source] for source in sources]

    # Shared by both figures.
    hover_text = _hover_labels(sources, documents_text)

    # 2D t-SNE
    print("📉 Calculating 2D t-SNE...")
    reduced_2d = _tsne_projection(vectors, 2)

    # 2D Plot
    fig_2d = go.Figure(data=[go.Scatter(
        x=reduced_2d[:, 0],
        y=reduced_2d[:, 1],
        mode='markers',
        marker=dict(size=8, color=colors, opacity=0.7),
        text=hover_text,
        hoverinfo='text'
    )])

    fig_2d.update_layout(
        title='📊 2D Vector Map of Your PDF Documents',
        xaxis_title='Dimension 1',
        yaxis_title='Dimension 2',
        width=800,
        height=600,
        margin=dict(r=20, b=10, l=10, t=40)
    )
    fig_2d.show()

    # 3D t-SNE (only if enough data)
    if len(vectors) > 10:
        print("📈 Calculating 3D t-SNE...")
        reduced_3d = _tsne_projection(vectors, 3)

        fig_3d = go.Figure(data=[go.Scatter3d(
            x=reduced_3d[:, 0],
            y=reduced_3d[:, 1],
            z=reduced_3d[:, 2],
            mode='markers',
            marker=dict(size=6, color=colors, opacity=0.8),
            text=hover_text,
            hoverinfo='text'
        )])

        fig_3d.update_layout(
            title='📊 3D Vector Map of Your PDF Documents',
            scene=dict(xaxis_title='Dimension 1', yaxis_title='Dimension 2', zaxis_title='Dimension 3'),
            width=900,
            height=700,
            margin=dict(r=20, b=10, l=10, t=40)
        )
        fig_3d.show()

# Create visualizations: render the t-SNE maps of the stored embeddings now
create_visualizations()

# **3.SECTION**

In [None]:
# @title Set up RAG Chain

print("🤖 Setting up AI Chat system...")

# Retrieval side: return the MAX_RETRIEVAL_DOCS closest chunks for each query
retriever = vectorstore.as_retriever(search_kwargs=dict(k=MAX_RETRIEVAL_DOCS))

# Conversation history, exposed to the chain under the 'chat_history' key
memory = ConversationBufferMemory(return_messages=True, memory_key='chat_history')

# Generation side
llm = ChatOpenAI(model_name=MODEL, temperature=0.7)

# Wire retriever, memory and model into one conversational chain
conversation_chain = ConversationalRetrievalChain.from_llm(
    memory=memory,
    retriever=retriever,
    llm=llm,
)

In [None]:
# @title Test question

# Smoke test: send one question through the chain and show the answer
test_query = "Who is Ridvan Yigit?"
result = conversation_chain.invoke(dict(question=test_query))

print(f"Test Answer: {result['answer']}", "✅ System successfully set up!", sep="\n")

In [None]:
# @title Debug and troubleshooting version with callback

# Banner introducing the debug helpers; one print call, one argument per output line
print(
    "\n" + "="*50,
    "🔍 DEBUG AND TROUBLESHOOTING SECTION",
    "="*50,
    "If you experience issues in the system, run the code below:",
    "This code shows how the RAG system works in the background\n",
    sep="\n",
)

def debug_rag_system():
    """Build a separate RAG chain with stdout callbacks and run one sample query.

    The chain gets its own model, memory and retriever (same settings as the
    main chain) plus a ``StdOutCallbackHandler``, so it does not touch the
    main chain's conversation history.

    Returns:
        The ``ConversationalRetrievalChain`` created for debugging.
    """
    from langchain_core.callbacks import StdOutCallbackHandler

    print("🔧 Setting up the RAG system in debug mode...")

    # All components are created inline; the callback handler prints the
    # chain's intermediate steps to stdout.
    verbose_chain = ConversationalRetrievalChain.from_llm(
        llm=ChatOpenAI(temperature=0.7, model_name=MODEL),
        retriever=vectorstore.as_retriever(search_kwargs={"k": MAX_RETRIEVAL_DOCS}),
        memory=ConversationBufferMemory(memory_key='chat_history', return_messages=True),
        callbacks=[StdOutCallbackHandler()],
    )

    print("\n🔍 Running debug test query...")
    print("Below you will see step-by-step how the RAG system works:\n")

    debug_query = "Which certificates are there and from which institutions?"
    debug_result = verbose_chain.invoke({"question": debug_query})

    print(
        f"\n📋 Debug Test Result:",
        f"Question: {debug_query}",
        f"Answer: {debug_result['answer']}",
        "\n✅ Debug test completed!",
        sep="\n",
    )

    return verbose_chain

# Tell the reader how to start the debug walkthrough
print(
    "💡 If you face issues, run this command:",
    "debug_chain = debug_rag_system()",
    "This will show you detailed how the system works.\n",
    sep="\n",
)

In [None]:
# Run the debug walkthrough and keep the returned chain for further inspection
debug_chain = debug_rag_system()

In [None]:
# @title Additional function to test retrieval quality

def test_retrieval(query, k=5):
    """Print and return the ``k`` chunks the vector store ranks closest to ``query``.

    Args:
        query: Text to search for.
        k: Number of chunks to fetch (default 5).

    Returns:
        The list returned by ``vectorstore.similarity_search``.
    """
    print(f"🔍 Retrieval Test - Query: '{query}'")
    print(f"📊 Fetching top {k} closest document chunks...\n")

    docs = vectorstore.similarity_search(query, k=k)

    # One report per hit: source file, a 200-character preview, and chunk length.
    for i, doc in enumerate(docs, start=1):
        print(
            f"📄 Result {i}:",
            f"   File: {doc.metadata.get('source', 'Unknown')}",
            f"   Content: {doc.page_content[:200]}...",
            f"   Character Count: {len(doc.page_content)}",
            "-" * 50,
            sep="\n",
        )

    return docs

# Usage examples for the retrieval test helper
print(
    "🔧 Additional debug function:",
    "test_retrieval('certificate', k=3)  # Tests retrieval quality",
    "test_retrieval('diploma', k=5)     # Tests diploma searches\n",
    sep="\n",
)

In [None]:
# @title Gradio Chat Interface

def chat(message, history):
    """Answer one Gradio chat message through the shared conversation chain.

    Args:
        message: The user's latest message.
        history: Conversation so far as supplied by Gradio; it is not read
            here.

    Returns:
        The chain's ``"answer"`` string, or an apology containing the error
        text when anything goes wrong.
    """
    try:
        answer = conversation_chain.invoke({"question": message})["answer"]
    except Exception as e:
        # Surface the failure in the chat window instead of raising.
        return f"Sorry, an error occurred: {str(e)}"
    return answer

# Start Gradio interface

print("🚀 Starting chat interface...")

# Starter prompts offered in the chat UI
_example_questions = [
    "What certificates do I have?",
    "What is the latest certificate I received?",
    "From which institutions did I get certificates?",
    "Summarize the content of my certificates",
    "What topics are covered in these documents?"
]

# Gradio interface
interface = gr.ChatInterface(
    chat,
    title="📚 Certificate & Diploma AI Assistant",
    description="You can query information in your PDF documents. You can ask questions in Turkish!",
    examples=_example_questions,
    type="messages",
    theme=gr.themes.Soft(),
)

# Launch interface
# NOTE(review): share=True creates a public link, so anyone holding it can
# query the indexed PDFs — confirm this is intended.
_launch_options = dict(
    share=True,  # Create public link
    inbrowser=True,
    height=600,
)
interface.launch(**_launch_options)