In [None]:
# Unpack /content/drive/MyDrive/LLM_DATASET.zip into the local workspace.

import zipfile
import os

zip_path = "/content/drive/MyDrive/LLM_DATASET.zip"
extract_path = "/content/LLM_DATASET"

# Make sure the target directory is there (no error if it already exists)
os.makedirs(extract_path, exist_ok=True)

# Unpack every member of the archive beneath extract_path
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)

print("Extraction complete!")

Extraction complete!


In [None]:
print("Installing required dependencies...")
import subprocess
import sys

# Run pip through the interpreter that is executing this notebook so the
# packages are installed into the kernel's environment, not into whichever
# `pip` happens to be first on PATH.
# moviepy is held below 2.0 because a later cell imports `moviepy.editor`,
# which no longer exists in the 2.x series.
subprocess.run(
    [sys.executable, "-m", "pip", "install",
     "PyPDF2", "SpeechRecognition", "pydub", "moviepy<2",
     "python-pptx", "Pillow", "PyMuPDF"],
    check=True,
)

Installing required dependencies...


CompletedProcess(args=['pip', 'install', 'PyPDF2', 'SpeechRecognition', 'pydub', 'moviepy', 'python-pptx', 'Pillow', 'PyMuPDF'], returncode=0)

In [None]:
import os
import subprocess
import PyPDF2
import speech_recognition as sr
from pydub import AudioSegment
from moviepy.editor import VideoFileClip
import shutil
import tempfile
import json

def extract_pdf_text(pdf_path):
    """
    Read every page of a PDF and return the combined text.

    Args:
        pdf_path (str): Location of the PDF on disk

    Returns:
        str: Text of all pages in order, each page followed by a blank line.
            No exception is raised on failure; instead a string starting with
            "Error extracting text from" is returned in place of the text.
    """
    try:
        with open(pdf_path, 'rb') as handle:
            pages = PyPDF2.PdfReader(handle).pages
            # Pages are read while the file is still open
            page_texts = [pages[index].extract_text() + "\n\n" for index in range(len(pages))]

        return "".join(page_texts)
    except Exception as e:
        return f"Error extracting text from {pdf_path}: {str(e)}"

def extract_pdf_images(pdf_path, output_folder):
    """
    Extract the images embedded in a PDF file.

    Images are written to ``<output_folder>/<pdf name>_images``. The
    ``pdfimages`` command (poppler-utils) is tried first; if it cannot be
    started or exits with an error, PyMuPDF (``fitz``) is used instead.

    Args:
        pdf_path (str): Path to the PDF file
        output_folder (str): Folder under which the image sub-folder is created

    Returns:
        list: Paths of the image files present in the image sub-folder after
            extraction. Empty when extraction fails or when the PyMuPDF
            fallback is needed but not installed.
    """
    # With -j, pdfimages saves JPEG streams as .jpg and every other image as
    # .ppm/.pbm, so those two extensions must count as extracted images too.
    image_extensions = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif', '.ppm', '.pbm')

    try:
        # Folder that receives the extracted images
        pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
        image_folder = os.path.join(output_folder, f"{pdf_name}_images")
        os.makedirs(image_folder, exist_ok=True)

        try:
            subprocess.run(["pdfimages", "-j", pdf_path, os.path.join(image_folder, "img")],
                           check=True, stderr=subprocess.DEVNULL)
            print(f"Extracted images from {pdf_path} using pdfimages")
        except (OSError, subprocess.CalledProcessError):
            # OSError: pdfimages is missing or cannot be executed.
            # CalledProcessError: it ran but returned a non-zero exit status.
            # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed here.
            print("pdfimages not available, trying PyMuPDF...")
            try:
                import fitz  # PyMuPDF
            except ImportError:
                print("PyMuPDF not installed. Install with: pip install PyMuPDF")
                return []

            image_count = 0
            doc = fitz.open(pdf_path)
            try:
                for page_num, page in enumerate(doc):
                    for img_index, img in enumerate(page.get_images(full=True)):
                        # The first field of each image entry is its xref number
                        base_image = doc.extract_image(img[0])
                        image_name = f"img_{page_num+1}_{img_index+1}.{base_image['ext']}"

                        with open(os.path.join(image_folder, image_name), "wb") as img_file:
                            img_file.write(base_image["image"])
                        image_count += 1
            finally:
                # Release the document even if one of the images fails
                doc.close()

            print(f"Extracted {image_count} images from {pdf_path} using PyMuPDF")

        # Report whatever image files are now in the folder
        return [os.path.join(image_folder, f) for f in os.listdir(image_folder)
                if f.lower().endswith(image_extensions)]
    except Exception as e:
        print(f"Error extracting images from {pdf_path}: {str(e)}")
        return []

def convert_video_to_audio(video_path, audio_path):
    """
    Convert a video file to audio for transcription.

    The audio track is written to audio_path with the 'pcm_s16le' codec.
    Failures are reported with a printed message instead of an exception, so
    callers that need the file should check that audio_path exists afterwards.

    Args:
        video_path (str): Path to the video file
        audio_path (str): Path to save the extracted audio
    """
    video = None
    try:
        video = VideoFileClip(video_path)
        if video.audio is None:
            # Without this check the failure shows up as an opaque
            # "'NoneType' object has no attribute ..." message.
            print(f"Error converting video to audio: no audio track found in {video_path}")
            return
        video.audio.write_audiofile(audio_path, codec='pcm_s16le')
    except Exception as e:
        print(f"Error converting video to audio: {str(e)}")
    finally:
        # Always release the clip, whether or not the conversion worked
        if video is not None:
            video.close()

def transcribe_audio(audio_path):
    """
    Turn the speech in an audio file into text.

    The file at audio_path is first rewritten in place as mono, 16000 Hz WAV,
    then passed to the recognizer's ``recognize_google`` method.

    Args:
        audio_path (str): Path to the audio file (it is overwritten)

    Returns:
        str: The recognized text, or a string starting with
            "Error transcribing audio:" if any step fails
    """
    try:
        # Normalise to mono / 16 kHz and write the result back over the input
        normalized = AudioSegment.from_file(audio_path).set_channels(1).set_frame_rate(16000)
        normalized.export(audio_path, format="wav")

        recognizer = sr.Recognizer()
        with sr.AudioFile(audio_path) as source:
            return recognizer.recognize_google(recognizer.record(source))
    except Exception as e:
        return f"Error transcribing audio: {str(e)}"

def extract_video_frames(video_path, output_folder, frames_per_second=1):
    """
    Extract frames from a video file.

    Frames are saved as ``frame_0000.jpg``, ``frame_0001.jpg``, ... inside
    ``<output_folder>/<video name>_frames``. Sampling starts at 0 s, advances
    by ``1 / frames_per_second`` seconds and stops before the last whole
    second of the clip.

    Args:
        video_path (str): Path to the video file
        output_folder (str): Path to save the extracted frames
        frames_per_second (float): Number of frames to extract per second;
            must be greater than 0 (values above 1 are supported)

    Returns:
        list: Paths to extracted frames (empty if anything fails)
    """
    video = None
    try:
        if frames_per_second <= 0:
            raise ValueError("frames_per_second must be greater than 0")

        video_name = os.path.splitext(os.path.basename(video_path))[0]
        frames_folder = os.path.join(output_folder, f"{video_name}_frames")
        os.makedirs(frames_folder, exist_ok=True)

        video = VideoFileClip(video_path)
        last_second = int(video.duration)

        # Timestamps are derived from the frame index instead of an integer
        # step: int(1 / frames_per_second) is 0 for any rate above 1, which
        # used to make range() fail and no frames were produced.
        timestamps = []
        index = 0
        while index / frames_per_second < last_second:
            timestamps.append(index / frames_per_second)
            index += 1

        # Extract frames
        frame_paths = []
        for i, timestamp in enumerate(timestamps):
            frame_path = os.path.join(frames_folder, f"frame_{i:04d}.jpg")
            video.save_frame(frame_path, timestamp)
            frame_paths.append(frame_path)

        print(f"Extracted {len(frame_paths)} frames from {video_path}")
        return frame_paths
    except Exception as e:
        print(f"Error extracting frames from {video_path}: {str(e)}")
        return []
    finally:
        # Release the clip on both the success and the failure path
        if video is not None:
            video.close()

def transcribe_video_with_whisper(video_path, output_path, model_size="medium"):
    """
    Transcribe video using the ``whisper`` command-line tool.
    This requires whisper to be installed: pip install openai-whisper

    The audio track is first extracted to a temporary WAV file next to
    output_path; whisper is run on that file and its ``.txt`` output is moved
    to output_path. The temporary WAV file is removed afterwards, whether or
    not the transcription succeeded.

    Args:
        video_path (str): Path to the video file
        output_path (str): Path to save the transcription
        model_size (str): Whisper model size ('tiny', 'base', 'small', 'medium', 'large')

    Returns:
        str: Path to the transcript file, or a message starting with
            "Error using Whisper:" if any step fails
    """
    audio_path = None
    try:
        # dirname() is "" for a bare file name; use the current directory then
        output_dir = os.path.dirname(output_path) or "."
        os.makedirs(output_dir, exist_ok=True)

        # Extract audio from video to a temporary file
        audio_filename = f"temp_{os.path.basename(os.path.splitext(video_path)[0])}.wav"
        audio_path = os.path.join(output_dir, audio_filename)

        # The converter only prints on failure, so verify that it produced a file
        convert_video_to_audio(video_path, audio_path)
        if not os.path.exists(audio_path):
            raise RuntimeError(f"audio extraction produced no file for {video_path}")

        # Arguments are passed as a list without a shell, so paths containing
        # spaces or shell metacharacters reach whisper unchanged.
        cmd = ["whisper", audio_path, "--model", model_size,
               "--output_dir", output_dir, "--output_format", "txt"]
        subprocess.run(cmd, check=True)

        # Whisper typically saves the file as {audio_filename}.txt
        whisper_output = os.path.join(output_dir, os.path.splitext(audio_filename)[0] + ".txt")

        # If the file exists, rename it to match our desired output path
        if os.path.exists(whisper_output) and whisper_output != output_path:
            shutil.move(whisper_output, output_path)
            print(f"Moved transcript to: {output_path}")

        return output_path
    except Exception as e:
        error_msg = f"Error using Whisper: {str(e)}"
        print(error_msg)
        return error_msg
    finally:
        # Clean up temporary audio file, also when a step above failed
        if audio_path and os.path.exists(audio_path):
            os.remove(audio_path)

def extract_pptx_text(pptx_path):
    """
    Collect the text of a PowerPoint (PPTX) file, slide by slide.

    The result starts with the presentation title (when the core properties
    carry one). Each slide then contributes a "Slide N:" header, its title,
    the text of its shapes and its speaker notes, followed by a dashed
    separator line.

    Args:
        pptx_path (str): Path to the PPTX file

    Returns:
        str: The collected text, or a string starting with "Error" when
            python-pptx is missing or the file cannot be processed
    """
    try:
        # Imported lazily so a missing library is reported as a message
        from pptx import Presentation

        deck = Presentation(pptx_path)
        parts = []

        # Presentation-level title from the document properties
        if hasattr(deck, 'core_properties') and deck.core_properties.title:
            parts.append(f"Title: {deck.core_properties.title}\n\n")

        for slide_num, slide in enumerate(deck.slides, 1):
            parts.append(f"Slide {slide_num}:\n")

            # Slide title, when the slide has a title shape holding text
            if slide.shapes.title and slide.shapes.title.has_text_frame:
                parts.append(f"Title: {slide.shapes.title.text}\n")

            # Text of every shape on the slide
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    parts.append(f"{shape.text}\n")
                elif hasattr(shape, "text_frame"):
                    parts.extend(f"{paragraph.text}\n" for paragraph in shape.text_frame.paragraphs)

            # Speaker notes, if present and non-empty
            if slide.has_notes_slide and slide.notes_slide.notes_text_frame:
                notes_text = slide.notes_slide.notes_text_frame.text
                if notes_text:
                    parts.append(f"Notes: {notes_text}\n")

            parts.append("\n" + "-" * 40 + "\n\n")

        return "".join(parts)
    except ImportError:
        return f"Error: python-pptx library not installed. Please run: pip install python-pptx"
    except Exception as e:
        return f"Error extracting text from {pptx_path}: {str(e)}"

def extract_pptx_images(pptx_path, output_folder):
    """
    Save the pictures contained in a PowerPoint (PPTX) file.

    Every shape whose type is PICTURE is written to
    ``<output_folder>/<pptx name>_images/slide_<slide>_image_<shape>.<ext>``.
    Extensions other than jpg, jpeg, png, gif and bmp are replaced by 'jpg'.

    Args:
        pptx_path (str): Path to the PPTX file
        output_folder (str): Folder under which the image sub-folder is created

    Returns:
        list: Paths of the saved images (empty if a library is missing or
            an error occurs)
    """
    try:
        from pptx import Presentation
        from pptx.enum.shapes import MSO_SHAPE_TYPE
        import io
        from PIL import Image

        deck_name = os.path.splitext(os.path.basename(pptx_path))[0]
        images_folder = os.path.join(output_folder, f"{deck_name}_images")
        os.makedirs(images_folder, exist_ok=True)

        accepted_exts = ('jpg', 'jpeg', 'png', 'gif', 'bmp')
        image_paths = []

        for slide_num, slide in enumerate(Presentation(pptx_path).slides, 1):
            for shape_num, shape in enumerate(slide.shapes, 1):
                # Only picture shapes carry an image to save
                if shape.shape_type != MSO_SHAPE_TYPE.PICTURE:
                    continue

                picture = shape.image
                extension = picture.ext.lower() if hasattr(picture, 'ext') else 'jpg'
                if extension not in accepted_exts:
                    extension = 'jpg'

                image_path = os.path.join(images_folder, f"slide_{slide_num}_image_{shape_num}.{extension}")
                with open(image_path, 'wb') as f:
                    f.write(picture.blob)

                image_paths.append(image_path)

        print(f"Extracted {len(image_paths)} images from {pptx_path}")
        return image_paths
    except ImportError:
        print(f"Error: python-pptx or PIL library not installed. Please run: pip install python-pptx Pillow")
        return []
    except Exception as e:
        print(f"Error extracting images from {pptx_path}: {str(e)}")
        return []

def process_folder(folder_path, output_folder, whisper_model_size="medium", extract_images=True, frames_per_second=0):
    """
    Process all PDF, PowerPoint, and video files directly inside a folder.

    For each supported file a ``<name>.txt`` is written to output_folder (the
    PDF/PPTX text or the video transcript). Images and frames are stored in
    sub-folders of output_folder. A summary of everything produced is saved
    as ``extraction_results.json`` in output_folder. Sub-directories are
    skipped; files of any other type are listed in the summary with type None.

    Note: the text file name is the input name without its extension, so two
    inputs that differ only by extension (e.g. ``a.pdf`` and ``a.pptx``) write
    to the same ``a.txt``.

    Args:
        folder_path (str): Path to the folder containing files
        output_folder (str): Path to save the extracted text, images, and transcriptions
        whisper_model_size (str): Whisper model size to use for transcription
        extract_images (bool): Whether to extract images from PDFs and PPTXs
        frames_per_second (float): Number of frames to extract per second from videos. Set to 0 to disable frame extraction.

    Returns:
        dict: Maps each file name to a dict with the keys "file", "type",
            "text_path", "images" and "frames"
    """
    os.makedirs(output_folder, exist_ok=True)

    # Dictionary to store extraction results
    extraction_results = {}

    # Sorted so that files are processed, and reported, in a stable order
    files = sorted(os.listdir(folder_path))

    # Process each file
    for file in files:
        file_path = os.path.join(folder_path, file)

        # Skip directories
        if os.path.isdir(file_path):
            continue

        # Get file extension
        _, ext = os.path.splitext(file)
        ext = ext.lower()

        file_results = {"file": file, "type": None, "text_path": None, "images": [], "frames": []}
        output_path = os.path.join(output_folder, os.path.splitext(file)[0] + ".txt")

        # Process PDF files
        if ext == '.pdf':
            print(f"Processing PDF: {file}")
            file_results["type"] = "pdf"

            # Extract text
            text = extract_pdf_text(file_path)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(text)
            file_results["text_path"] = output_path

            # Extract images if requested
            if extract_images:
                image_paths = extract_pdf_images(file_path, output_folder)
                file_results["images"] = image_paths
                print(f"Extracted {len(image_paths)} images from {file}")

        # Process PowerPoint files
        elif ext in ['.pptx', '.ppt']:
            print(f"Processing PowerPoint: {file}")
            file_results["type"] = "powerpoint"

            if ext == '.pptx':
                text = extract_pptx_text(file_path)
                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(text)
                file_results["text_path"] = output_path

                # Extract images if requested
                if extract_images:
                    image_paths = extract_pptx_images(file_path, output_folder)
                    file_results["images"] = image_paths
                    print(f"Extracted {len(image_paths)} images from {file}")
            else:
                print(f"Warning: .ppt format not supported directly. Please convert to .pptx")

        # Process video files
        elif ext in ['.mp4', '.avi', '.mov', '.mkv', '.flv', '.wmv']:
            print(f"Processing video: {file}")
            file_results["type"] = "video"

            # Extract frames if requested and frames_per_second > 0
            if extract_images and frames_per_second > 0:
                frame_paths = extract_video_frames(file_path, output_folder, frames_per_second)
                file_results["frames"] = frame_paths
                print(f"Extracted {len(frame_paths)} frames from {file}")
            else:
                print(f"Skipping frame extraction for {file}")

            # Using Whisper for better transcription results
            print(f"Transcribing with Whisper: {file}")
            transcript_path = transcribe_video_with_whisper(file_path, output_path, whisper_model_size)

            # The Whisper helper signals failure by returning a message that
            # starts with "Error"; fall back to basic transcription then
            if isinstance(transcript_path, str) and transcript_path.startswith("Error"):
                print(f"Whisper failed, falling back to basic transcription: {file}")
                temp_audio_path = os.path.join(output_folder, f"temp_{os.path.splitext(file)[0]}.wav")
                convert_video_to_audio(file_path, temp_audio_path)
                transcript = transcribe_audio(temp_audio_path)

                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(transcript)

                # Clean up temporary files
                if os.path.exists(temp_audio_path):
                    os.remove(temp_audio_path)

                file_results["text_path"] = output_path
            else:
                print(f"Whisper transcription saved to: {transcript_path}")
                file_results["text_path"] = transcript_path

                # Verify file exists
                if not os.path.exists(transcript_path):
                    print(f"WARNING: Expected transcript file not found at {transcript_path}")
                else:
                    print(f"Confirmed transcript file exists at {transcript_path}")
                    # Print first few lines of transcript for verification
                    try:
                        with open(transcript_path, 'r', encoding='utf-8') as f:
                            # readline() returns "" at end of file, so a
                            # transcript shorter than three lines is previewed
                            # instead of being reported as unreadable
                            first_lines = "".join(f.readline() for _ in range(3))
                        print(f"First few lines of transcript: {first_lines}")
                    except Exception as e:
                        print(f"Could not read transcript file: {e}")

        # Add to results dictionary
        extraction_results[file] = file_results

    # Save extraction results as JSON
    results_path = os.path.join(output_folder, "extraction_results.json")
    with open(results_path, 'w', encoding='utf-8') as f:
        json.dump(extraction_results, f, indent=2)

    print(f"Processing complete. Results saved to {output_folder}")
    print(f"Extraction summary saved to {results_path}")

    return extraction_results

if __name__ == "__main__":
    import sys  # local import: used to run pip through the current interpreter

    # For Google Colab usage - no command line arguments needed
    # Just set your input and output folder paths directly:
    input_folder = "/content/LLM_DATASET/LLM_DATASET/BST"  # Change this to your input folder path
    output_folder = "/content/output_folder"  # Change this to your output folder path
    whisper_model = "medium"  # Options: 'tiny', 'base', 'small', 'medium', 'large'
    extract_images = True  # Set to False if you only want text
    frames_per_second = 0  # Set to 0 to disable video frame extraction

    # Install required dependencies first.
    # pip is run via sys.executable so packages go into the kernel's own
    # environment; moviepy is kept below 2.0 because this cell imports
    # `moviepy.editor`, which the 2.x series no longer provides.
    print("Installing required dependencies...")
    subprocess.run([sys.executable, "-m", "pip", "install", "PyPDF2", "SpeechRecognition", "pydub",
                    "moviepy<2", "python-pptx", "Pillow", "PyMuPDF"], check=True)

    # Tools are looked up on PATH with shutil.which instead of being run inside
    # a bare `except:`, which would also hide unrelated errors and interrupts.

    # Check if poppler-utils is installed (for pdfimages), install if not
    if shutil.which("pdfimages"):
        print("poppler-utils is already installed.")
    else:
        print("Installing poppler-utils for better PDF image extraction...")
        subprocess.run(["apt-get", "update"], check=True)
        subprocess.run(["apt-get", "install", "-y", "poppler-utils"], check=True)
        print("poppler-utils installed successfully.")

    # Check if whisper is installed, install if not
    if shutil.which("whisper"):
        print("Whisper is already installed.")
    else:
        print("Installing Whisper...")
        subprocess.run([sys.executable, "-m", "pip", "install", "openai-whisper"], check=True)
        print("Whisper installed successfully.")

    # Check if FFmpeg is installed, install if not
    if shutil.which("ffmpeg"):
        print("FFmpeg is already installed.")
    else:
        print("Installing FFmpeg...")
        # Refresh the package index here too: the poppler branch above may not have run
        subprocess.run(["apt-get", "update"], check=True)
        subprocess.run(["apt-get", "install", "-y", "ffmpeg"], check=True)
        print("FFmpeg installed successfully.")

    print(f"Processing files from {input_folder}")
    process_folder(input_folder, output_folder, whisper_model, extract_images, frames_per_second)
    print(f"Done! Extracted texts, images, and transcriptions saved to {output_folder}")
    # Only claim that frame extraction was disabled when it actually was
    if frames_per_second <= 0:
        print(f"Note: Video frame extraction is disabled (frames_per_second=0)")

  if event.key is 'enter':



Installing required dependencies...
Installing poppler-utils for better PDF image extraction...
poppler-utils installed successfully.
Installing Whisper...
Whisper installed successfully.
FFmpeg is already installed.
Processing files from /content/LLM_DATASET/LLM_DATASET/BST
Processing PowerPoint: Class2_Unit3_Tree_BST_DynamicInsert.pptx
Extracted 13 images from /content/LLM_DATASET/LLM_DATASET/BST/Class2_Unit3_Tree_BST_DynamicInsert.pptx
Extracted 13 images from Class2_Unit3_Tree_BST_DynamicInsert.pptx
Processing PowerPoint: Class4_Unit3_Trees_BST_ArrayInsert.pptx
Extracted 21 images from /content/LLM_DATASET/LLM_DATASET/BST/Class4_Unit3_Trees_BST_ArrayInsert.pptx
Extracted 21 images from Class4_Unit3_Trees_BST_ArrayInsert.pptx
Processing PowerPoint: Class3_Unit3_Trees_BSTDeletion.pptx
Extracted 14 images from /content/LLM_DATASET/LLM_DATASET/BST/Class3_Unit3_Trees_BSTDeletion.pptx
Extracted 14 images from Class3_Unit3_Trees_BSTDeletion.pptx
Processing complete. Results saved to /cont

In [None]:
# Install dependencies
!pip install -U "transformers[torch]" "tokenizers>=0.21,<0.22"
!pip install langchain langchain_community
!pip install faiss-cpu
!pip install accelerate bitsandbytes sentence-transformers tqdm huggingface_hub

# Check for GPU and install faiss-gpu if available
import torch
if torch.cuda.is_available():
    print(f"✅ GPU is available: {torch.cuda.get_device_name(0)}")
    !pip install faiss-gpu==1.7.2
else:
    print("⚠️ WARNING: GPU not available. Processing will be much slower.")

# Set environment variable to avoid tokenizer parallelism warning
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Hugging Face authentication
from huggingface_hub import login

# Either securely prompt for the token, or hardcode it (not recommended)
# Uncomment below if you want to enter securely
# import getpass
# HF_TOKEN = getpass.getpass("Enter your Hugging Face token: ")
HF_TOKEN = "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"  # Replace with your actual token

login(token=HF_TOKEN)

# Test model access with AutoTokenizer
from transformers import AutoTokenizer

try:
    tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-27b-it", token=HF_TOKEN)
    print("✅ Successfully authenticated and can access Gemma model!")
except Exception as e:
    print(f"❌ Error accessing Gemma model: {e}")
    print("Please check your Hugging Face token and make sure you've accepted the model terms.")


Collecting langchain_community
  Downloading langchain_community-0.3.21-py3-none-any.whl.metadata (2.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain_community)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

✅ Successfully authenticated and can access Gemma model!


In [None]:
import os
import glob
import json
import torch
import numpy as np
from tqdm import tqdm
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
from huggingface_hub import login

# Safely import FAISS.
# The LangChain FAISS wrapper is imported with two fallbacks. The order of
# the steps matters, because each one is only reached after the previous
# import has raised ImportError.
try:
    from langchain_community.vectorstores import FAISS
except ImportError:
    print("Error importing FAISS from langchain. Trying direct import...")
    try:
        # Check whether the underlying faiss package is importable, then retry the wrapper
        import faiss
        print("Successfully imported faiss directly.")
        from langchain_community.vectorstores import FAISS
    except ImportError:
        # Last resort: install faiss-cpu with pip and retry once more.
        # os.system does not raise when the command fails; in that case the
        # import below raises ImportError and the cell stops here.
        print("Could not import faiss. Installing faiss-cpu...")
        os.system("pip install faiss-cpu")
        from langchain_community.vectorstores import FAISS

# Set up paths
INPUT_DIR = "/content/output_folder"  # Directory containing extracted text files
VECTOR_DB_PATH = "/content/vector_db"  # Path to save the vector database
# Hugging Face token: taken from the HF_TOKEN environment variable so that no
# real token has to be pasted into the notebook. "**" is only a placeholder
# that is used when the variable is not set.
HF_TOKEN = os.environ.get("HF_TOKEN", "**")

# Model configuration - using Mistral 7B Instruct v0.3
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"  # Mistral 7B Instruction model
CHUNK_SIZE = 1200  # Chunk size for document splitting
CHUNK_OVERLAP = 300  # Overlap between chunks
MAX_NEW_TOKENS = 768  # Maximum new tokens to generate
TEMPERATURE = 0.6  # Temperature for generation

# Create necessary directories
os.makedirs(VECTOR_DB_PATH, exist_ok=True)

def load_extracted_texts(input_dir):
    """
    Load all extracted text files from the output directory.

    Every ``*.txt`` file directly inside input_dir becomes one Document. When
    ``extraction_results.json`` is present, its per-file information
    (source file, file type, whether images/frames exist) is added to the
    metadata of the matching text file. Files with fewer than 50 characters of
    content, and files that only hold an extraction error message, are skipped.

    Args:
        input_dir (str): Directory containing extracted text files

    Returns:
        list: List of Document objects with text content and metadata
    """
    print("Loading extracted text files...")
    documents = []

    # Messages that the extraction functions write into a .txt file in place
    # of real content when they fail; they must not be indexed as documents.
    error_prefixes = (
        "Error extracting text from",
        "Error transcribing audio",
        "Error: python-pptx library not installed",
    )

    # Try to load extraction summary if available
    summary_path = os.path.join(input_dir, "extraction_results.json")
    file_metadata = {}

    if os.path.exists(summary_path):
        try:
            with open(summary_path, 'r', encoding='utf-8') as f:
                extraction_results = json.load(f)

            # Extract metadata from the summary, keyed by text file name
            for filename, info in extraction_results.items():
                if info.get("text_path"):
                    file_metadata[os.path.basename(info["text_path"])] = {
                        "source_file": filename,
                        "file_type": info.get("type", "unknown"),
                        "has_images": len(info.get("images", [])) > 0 or len(info.get("frames", [])) > 0
                    }
        except Exception as e:
            print(f"Error loading extraction summary: {e}")

    # Get all text files, sorted so the document order is reproducible
    text_files = sorted(glob.glob(os.path.join(input_dir, "*.txt")))

    # Load each text file
    for text_file in tqdm(text_files):
        filename = os.path.basename(text_file)
        try:
            with open(text_file, 'r', encoding='utf-8') as f:
                text = f.read()

            stripped = text.strip()

            # Skip empty files or very short files
            if len(stripped) < 50:
                print(f"Skipping short/empty file: {filename}")
                continue

            # Skip files that contain an error message instead of content
            if stripped.startswith(error_prefixes):
                print(f"Skipping failed extraction: {filename}")
                continue

            # Work on a copy so the summary metadata itself is not modified
            meta = dict(file_metadata.get(filename, {}))
            meta.update({
                "source": filename,
                "path": text_file
            })

            documents.append(Document(page_content=text, metadata=meta))
        except Exception as e:
            print(f"Error loading {text_file}: {e}")

    print(f"Loaded {len(documents)} documents")
    return documents

def create_vector_database(documents):
    """
    Create a vector database from the documents.

    The documents are split into chunks of CHUNK_SIZE characters with
    CHUNK_OVERLAP overlap, embedded with the
    "sentence-transformers/all-MiniLM-L6-v2" model and stored in a FAISS index
    that is also saved to VECTOR_DB_PATH.

    Args:
        documents (list): List of Document objects

    Returns:
        FAISS: FAISS vector store

    Raises:
        ValueError: If there are no documents, or splitting yields no chunks
        Exception: Any error raised while building or saving the index is
            printed and re-raised
    """
    # Fail early with a clear message instead of an obscure error from the
    # index construction further down
    if not documents:
        raise ValueError("No documents to index: the input document list is empty")

    print("Splitting documents into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        length_function=len,
        separators=["\n\n", "\n", ". ", " ", ""]
    )

    try:
        chunks = text_splitter.split_documents(documents)
        print(f"Split into {len(chunks)} chunks")
    except Exception as e:
        print(f"Error during text splitting: {e}")
        # Fallback to simpler splitting if recursive splitter fails
        from langchain.text_splitter import CharacterTextSplitter
        text_splitter = CharacterTextSplitter(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP,
            separator="\n"
        )
        chunks = text_splitter.split_documents(documents)
        print(f"Used fallback splitter. Split into {len(chunks)} chunks")

    if not chunks:
        raise ValueError("No documents to index: splitting produced no chunks")

    print("Creating embeddings (this may take a while)...")
    # Use a sentence-transformer model for embeddings
    embedding_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"}
    )

    print("Building vector database...")
    try:
        vector_db = FAISS.from_documents(chunks, embedding_model)
        print(f"Saving vector database to {VECTOR_DB_PATH}")
        vector_db.save_local(VECTOR_DB_PATH)
        return vector_db
    except Exception as e:
        print(f"Error building vector database: {e}")
        raise

def load_vector_database():
    """
    Return the FAISS store saved under VECTOR_DB_PATH, building it if needed.

    When ``index.faiss`` exists it is loaded. If loading fails, the saved
    index files are deleted and a new database is built from the text files
    in INPUT_DIR, exactly as when no index exists at all.

    Returns:
        FAISS: FAISS vector store
    """
    index_file = os.path.join(VECTOR_DB_PATH, "index.faiss")

    if os.path.exists(index_file):
        print("Loading existing vector database...")
        try:
            device = "cuda" if torch.cuda.is_available() else "cpu"
            embedding_model = HuggingFaceEmbeddings(
                model_name="sentence-transformers/all-MiniLM-L6-v2",
                model_kwargs={"device": device}
            )
            # NOTE(review): allow_dangerous_deserialization=True permits loading
            # the pickled index.pkl; only use this with an index you created yourself.
            return FAISS.load_local(VECTOR_DB_PATH, embedding_model, allow_dangerous_deserialization=True)
        except Exception as e:
            print(f"Error loading vector database: {e}")
            print("Creating a new vector database instead...")
            # Delete the unusable index files before rebuilding
            for stale_name in ("index.faiss", "index.pkl"):
                stale_path = os.path.join(VECTOR_DB_PATH, stale_name)
                if os.path.exists(stale_path):
                    os.remove(stale_path)

    print("Creating a new vector database...")
    return create_vector_database(load_extracted_texts(INPUT_DIR))

def setup_mistral_model():
    """
    Build the text-generation pipeline for the model named by MODEL_ID.

    The model is loaded with 4-bit NF4 quantization (double quantization on)
    and device_map="auto". Generation samples with TEMPERATURE, top_p=0.9 and
    at most MAX_NEW_TOKENS new tokens.

    Returns:
        pipeline: Hugging Face text generation pipeline

    Raises:
        Exception: Any error during loading is printed and re-raised
    """
    print(f"Setting up model: {MODEL_ID}")

    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

        # bfloat16 compute when CUDA is present, float32 otherwise
        compute_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=compute_dtype,
        )

        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            device_map="auto",
            quantization_config=quantization_config,
            trust_remote_code=True,
        )

        # Sampling settings applied to every call of the pipeline
        generation_settings = {
            "max_new_tokens": MAX_NEW_TOKENS,
            "do_sample": True,
            "temperature": TEMPERATURE,
            "top_p": 0.9,
        }
        return pipeline("text-generation", model=model, tokenizer=tokenizer, **generation_settings)
    except Exception as e:
        print(f"Error setting up Mistral model: {e}")
        raise

def generate_summary(query, vector_db, generator, num_docs=5):
    """
    Generate a summary of the relevant documents based on a query.

    The ``num_docs`` most similar chunks are retrieved from ``vector_db``,
    each chunk is capped at 1500 characters, and the chunks are embedded in
    a Mistral ``[INST]`` prompt that is passed to ``generator``.

    Args:
        query (str): Query to search for
        vector_db (FAISS): Vector database; only ``similarity_search(query, k=...)``
            is used
        generator (pipeline): Text generation pipeline; called as
            ``generator(prompt)`` and expected to return
            ``[{"generated_text": ...}]``
        num_docs (int): Number of documents to retrieve

    Returns:
        str: Generated summary with the prompt echo, anything after the first
        ``</s>`` and surrounding whitespace removed. If nothing is retrieved a
        fixed "no relevant documents" message is returned; if any step raises,
        an error message string is returned instead of propagating.
    """
    print(f"Retrieving documents for query: '{query}'")

    try:
        # Retrieve relevant documents
        retrieved_docs = vector_db.similarity_search(query, k=num_docs)

        # Handle case where no documents are retrieved
        if not retrieved_docs:
            return "No relevant documents found for your query."

        # Construct context from retrieved documents
        max_chunk_chars = 1500  # limit very long chunks to prevent context overflow
        sections = []
        for i, doc in enumerate(retrieved_docs, start=1):
            source = doc.metadata.get("source", f"Document {i}")
            content = doc.page_content
            if len(content) > max_chunk_chars:
                content = content[:max_chunk_chars] + "..."
            sections.append(f"\n--- Document {i}: {source} ---\n{content}\n")
        context = "".join(sections)

        # Mistral-specific prompt format with proper instruction format
        # NOTE(review): the prompt carries a literal "<s>"; if the tokenizer
        # also prepends its own BOS token the sequence starts with two of
        # them -- confirm against the tokenizer configuration.
        prompt = f"""<s>[INST] I need you to summarize information from several documents based on a specific query.

Query: {query}

Here are relevant passages from the documents:
{context}

Please provide a comprehensive and accurate summary that addresses the query based solely on the information in these documents. Include the most important points and insights, but keep your response concise and well-organized. [/INST]"""

        print("Generating summary with Mistral...")
        # Generate the summary
        result = generator(prompt)

        # Extract the generated text
        generated = result[0]["generated_text"]

        # Remove the prompt only when the generator actually echoed it;
        # slicing blindly would cut into the answer of a generator that
        # returns just the newly generated text.
        if generated.startswith(prompt):
            generated = generated[len(prompt):]

        # Drop everything from the first end-of-sequence marker onwards and
        # always trim surrounding whitespace (the model output normally
        # starts with a space or newline right after "[/INST]").
        summary = generated.split("</s>", 1)[0].strip()

        return summary
    except Exception as e:
        print(f"Error during summary generation: {e}")
        return f"An error occurred while generating the summary: {str(e)}"

def save_conversation(queries, summaries):
    """
    Write the query/summary pairs to ``conversation_history.json``.

    The two lists are paired positionally with ``zip``, so any surplus
    entries in the longer list are not written. The file is created in
    ``INPUT_DIR`` and overwritten on every call.

    Args:
        queries (list): List of queries
        summaries (list): List of summaries
    """
    history = [
        {"query": question, "summary": answer}
        for question, answer in zip(queries, summaries)
    ]

    output_path = os.path.join(INPUT_DIR, "conversation_history.json")
    with open(output_path, 'w', encoding='utf-8') as handle:
        json.dump(history, handle, indent=2)

    print(f"Saved conversation history to {output_path}")

def interactive_rag():
    """
    Run the RAG system in an interactive mode.

    Loads the vector database and the generation pipeline, then reads
    queries from standard input until the user types ``exit``/``quit`` or the
    input stream ends (EOF or Ctrl-C at the prompt). Typing ``save`` writes
    the history collected so far. Input is stripped before it is
    interpreted, so commands with stray whitespace are recognised and blank
    lines are ignored instead of being sent to the model.

    Whatever history exists when the loop ends is saved once more. If either
    the database or the model cannot be set up, the error is printed and the
    function returns without entering the loop.
    """
    print("Loading vector database...")
    try:
        vector_db = load_vector_database()
    except Exception as e:
        print(f"Failed to load vector database: {e}")
        return

    print("Setting up Mistral model...")
    try:
        generator = setup_mistral_model()
    except Exception as e:
        print(f"Failed to set up Mistral model: {e}")
        return

    print("\n=== RAG System with Mistral-7B-Instruct-v0.3 Ready ===")
    print("Enter your query (or 'exit' to quit, 'save' to save conversation):")

    queries = []
    summaries = []

    while True:
        try:
            query = input("\nQuery: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupt at the prompt ends the session
            # normally so the history below is still written.
            print("\nInput closed; ending session.")
            break

        # Blank input would otherwise trigger a full retrieval + generation.
        if not query:
            continue

        command = query.lower()
        if command in ('exit', 'quit'):
            break
        if command == 'save':
            save_conversation(queries, summaries)
            continue

        try:
            summary = generate_summary(query, vector_db, generator)
            print("\n=== Summary ===")
            print(summary)

            # Save to conversation history
            queries.append(query)
            summaries.append(summary)
        except Exception as e:
            print(f"Error generating summary: {e}")

    # Save conversation at the end
    if queries:
        save_conversation(queries, summaries)

def batch_summarize(queries):
    """
    Run every query in ``queries`` through the RAG pipeline and persist the answers.

    The vector database and the generation pipeline are set up once and
    reused for all queries. A query whose summary raises is recorded as
    ``"Error: <message>"`` rather than aborting the batch. The mapping is
    written to ``rag_summaries.json`` inside ``INPUT_DIR``.

    Args:
        queries (list): List of query strings

    Returns:
        dict: Dictionary mapping queries to summaries; an empty dict if
        setup or saving fails (the error is printed).
    """
    def _summarize_or_error(question, db, gen):
        # Per-query failures become an error string in the results.
        try:
            return generate_summary(question, db, gen)
        except Exception as err:
            return f"Error: {str(err)}"

    try:
        vector_db = load_vector_database()
        generator = setup_mistral_model()

        results = {}
        for query in tqdm(queries):
            print(f"Processing query: {query}")
            results[query] = _summarize_or_error(query, vector_db, generator)

        # Persist the whole batch as one JSON object
        output_path = os.path.join(INPUT_DIR, "rag_summaries.json")
        with open(output_path, 'w', encoding='utf-8') as handle:
            json.dump(results, handle, indent=2)

        print(f"Saved summaries to {output_path}")
        return results
    except Exception as err:
        print(f"Error in batch summarization: {err}")
        return {}

if __name__ == "__main__":
    # Choose whether to run in interactive or batch mode.
    # The value is spelled out in full: previously a misspelled mode string
    # only reached batch mode through a catch-all `else`, so any typo ran the
    # multi-minute batch job silently.
    mode = "batch"  # "interactive" or "batch"

    if mode == "interactive":
        interactive_rag()
    elif mode == "batch":
        # Example predefined queries for batch mode
        queries = [
            "What are the main concepts covered in these documents?",
            "Summarize the key points about bst",
            "What algorithms are discussed in the lectures?",
            "Explain the main technical concepts in simple terms",
            "What is threaded bst"
        ]
        batch_summarize(queries)
    else:
        raise ValueError(f"Unknown mode {mode!r}; expected 'interactive' or 'batch'")

Loading existing vector database...
Setting up model: mistralai/Mistral-7B-Instruct-v0.3


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0
  0%|          | 0/5 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing query: What are the main concepts covered in these documents?
Retrieving documents for query: 'What are the main concepts covered in these documents?'
Generating summary with Mistral...


 20%|██        | 1/5 [00:31<02:05, 31.45s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing query: Summarize the key points about bst
Retrieving documents for query: 'Summarize the key points about bst'
Generating summary with Mistral...


 40%|████      | 2/5 [01:14<01:55, 38.44s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing query: What algorithms are discussed in the lectures?
Retrieving documents for query: 'What algorithms are discussed in the lectures?'
Generating summary with Mistral...


 60%|██████    | 3/5 [02:04<01:26, 43.37s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing query: Explain the main technical concepts in simple terms
Retrieving documents for query: 'Explain the main technical concepts in simple terms'
Generating summary with Mistral...


 80%|████████  | 4/5 [02:43<00:41, 41.74s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Processing query: What is threaded bst
Retrieving documents for query: 'What is threaded bst'
Generating summary with Mistral...


100%|██████████| 5/5 [03:21<00:00, 40.37s/it]

Saved summaries to /content/output_folder/rag_summaries.json





In [None]:
import os
import glob
import json
import torch
import numpy as np
from tqdm import tqdm
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
from huggingface_hub import login

# Safely import FAISS
# Three attempts, each only reached when the previous one raised ImportError:
#   1. import the FAISS vector store wrapper from langchain_community;
#   2. import the `faiss` package directly, then retry the wrapper import;
#   3. install `faiss-cpu` with pip, then retry the wrapper import once more.
try:
    from langchain_community.vectorstores import FAISS
except ImportError:
    print("Error importing FAISS from langchain. Trying direct import...")
    try:
        import faiss
        print("Successfully imported faiss directly.")
        from langchain_community.vectorstores import FAISS
    except ImportError:
        print("Could not import faiss. Installing faiss-cpu...")
        # The exit status of the pip command is not checked, and the import
        # below is not guarded: if the install did not help, the ImportError
        # propagates and the cell fails here.
        os.system("pip install faiss-cpu")
        from langchain_community.vectorstores import FAISS

# Set up paths
EXISTING_VECTOR_DB_PATH = "/content/vector_db"  # Path to existing vector database
NEW_FILES_DIR = "/content/new_files"  # Directory containing new text files to add

# Hugging Face access token. It is read from the HF_TOKEN environment variable
# so the secret is never stored in the notebook; an empty string means "not set".
# SECURITY: a real token used to be hardcoded on this line. Anything that was
# ever committed or shared must be treated as leaked and revoked in the
# Hugging Face account settings.
HF_TOKEN = os.environ.get("HF_TOKEN", "")

# Model configuration
CHUNK_SIZE = 1200  # Chunk size for document splitting
CHUNK_OVERLAP = 300  # Overlap between chunks

def load_new_text_files(input_dir):
    """
    Load all new text files to be added to the vector database.

    Every ``*.txt`` file directly inside ``input_dir`` is read as UTF-8, in
    sorted path order so the resulting document order is reproducible. Files
    whose stripped content is shorter than 50 characters are skipped. When
    ``extraction_results.json`` exists in the same directory, its entries are
    matched to the text files by the basename of their ``text_path`` and
    contribute ``source_file``, ``file_type`` and ``has_images`` metadata.

    Args:
        input_dir (str): Directory containing new text files

    Returns:
        list: List of Document objects with text content and metadata. Each
        document's metadata always has ``source`` (file name), ``path`` and
        ``is_new=True``. Files that cannot be read are reported and skipped.
    """
    print(f"Loading new text files from {input_dir}...")
    documents = []

    # Try to load extraction summary if available
    summary_path = os.path.join(input_dir, "extraction_results.json")
    file_metadata = {}

    if os.path.exists(summary_path):
        try:
            with open(summary_path, 'r', encoding='utf-8') as f:
                extraction_results = json.load(f)

            # Extract metadata from the summary
            for filename, info in extraction_results.items():
                if info.get("text_path"):
                    file_metadata[os.path.basename(info["text_path"])] = {
                        "source_file": filename,
                        "file_type": info.get("type", "unknown"),
                        # `or` tolerates entries whose images/frames value is
                        # null; one such entry used to abort loading of all
                        # remaining metadata.
                        "has_images": bool(info.get("images") or info.get("frames")),
                    }
        except Exception as e:
            print(f"Error loading extraction summary: {e}")

    # Get all text files. The directory part is escaped so glob
    # metacharacters in the path ("[", "*", "?") are taken literally, and the
    # result is sorted because glob order is filesystem-dependent.
    text_files = sorted(glob.glob(os.path.join(glob.escape(input_dir), "*.txt")))

    # Load each text file
    for text_file in tqdm(text_files):
        filename = os.path.basename(text_file)
        try:
            with open(text_file, 'r', encoding='utf-8') as f:
                text = f.read()

            # Skip empty files or very short files
            if len(text.strip()) < 50:
                print(f"Skipping short/empty file: {filename}")
                continue

            # Copy the looked-up metadata so the per-document additions do
            # not leak back into the shared lookup table.
            meta = dict(file_metadata.get(filename, {}))
            meta.update({
                "source": filename,
                "path": text_file,
                "is_new": True  # Mark as new document
            })

            documents.append(Document(page_content=text, metadata=meta))
        except Exception as e:
            print(f"Error loading {text_file}: {e}")

    print(f"Loaded {len(documents)} new documents")
    return documents

def process_documents_into_chunks(documents):
    """
    Split documents into overlapping chunks ready for embedding.

    A ``RecursiveCharacterTextSplitter`` configured with ``CHUNK_SIZE`` and
    ``CHUNK_OVERLAP`` is tried first. If splitting raises, the error is
    printed and a newline-based ``CharacterTextSplitter`` with the same size
    settings is used instead; errors from that fallback propagate.

    Args:
        documents (list): List of Document objects

    Returns:
        list: List of chunks
    """
    print("Splitting documents into chunks...")

    # Size settings shared by the primary and the fallback splitter
    size_options = {
        "chunk_size": CHUNK_SIZE,
        "chunk_overlap": CHUNK_OVERLAP,
    }
    primary_splitter = RecursiveCharacterTextSplitter(
        length_function=len,
        separators=["\n\n", "\n", ". ", " ", ""],
        **size_options,
    )

    try:
        pieces = primary_splitter.split_documents(documents)
        print(f"Split into {len(pieces)} chunks")
        return pieces
    except Exception as err:
        print(f"Error during text splitting: {err}")
        # Simpler splitter, imported only when it is actually needed
        from langchain.text_splitter import CharacterTextSplitter
        fallback_splitter = CharacterTextSplitter(separator="\n", **size_options)
        pieces = fallback_splitter.split_documents(documents)
        print(f"Used fallback splitter. Split into {len(pieces)} chunks")
        return pieces

def update_vector_database(new_files_dir, vector_db_path):
    """
    Append the text files in ``new_files_dir`` to an existing FAISS store.

    The embedding model (``sentence-transformers/all-MiniLM-L6-v2``, on CUDA
    when available) is created first. The store at ``vector_db_path`` must
    already contain ``index.faiss``; it is loaded, the new files are read and
    chunked, the chunks are added, and the store is saved back to the same
    path.

    Args:
        new_files_dir (str): Directory containing new text files
        vector_db_path (str): Path to the existing vector database

    Returns:
        FAISS: Updated FAISS vector store. The unchanged store is returned
        when there is nothing to add. ``None`` is returned when the store is
        missing, cannot be loaded, or adding/saving fails.
    """
    # Set up the embedding model
    device = "cuda" if torch.cuda.is_available() else "cpu"
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={"device": device},
    )

    # The index file is the marker for an existing database
    index_file = os.path.join(vector_db_path, "index.faiss")
    if not os.path.exists(index_file):
        print(f"Error: Vector database not found at {vector_db_path}")
        print("Please make sure the vector database exists before updating.")
        return None

    print(f"Loading existing vector database from {vector_db_path}...")
    try:
        # allow_dangerous_deserialization is passed as True here, so only
        # load stores from a trusted location.
        store = FAISS.load_local(vector_db_path, embedder, allow_dangerous_deserialization=True)
        print(f"Successfully loaded existing vector database with {len(store.index_to_docstore_id)} entries")
    except Exception as err:
        print(f"Error loading vector database: {err}")
        return None

    # Read the new files; nothing to do when none qualify
    fresh_documents = load_new_text_files(new_files_dir)
    if not fresh_documents:
        print("No new documents to add. Vector database remains unchanged.")
        return store

    fresh_chunks = process_documents_into_chunks(fresh_documents)
    if not fresh_chunks:
        print("No chunks created from new documents. Vector database remains unchanged.")
        return store

    print(f"Adding {len(fresh_chunks)} new chunks to the vector database...")
    try:
        store.add_documents(fresh_chunks)
        print("Successfully added new documents to vector database")

        print(f"Saving updated vector database to {vector_db_path}")
        store.save_local(vector_db_path)
        print("Vector database successfully updated and saved")
    except Exception as err:
        print(f"Error updating vector database: {err}")
        return None

    return store

def main():
    """
    Update the vector database at ``/content/vector_db`` from ``/content/output_folder``.

    The source folder is created if it is missing, the update is run, and a
    one-line success or failure message is printed depending on whether the
    update returned a truthy store.
    """
    # Configuration - modify these paths as needed
    db_dir, source_dir = "/content/vector_db", "/content/output_folder"

    # Make sure the folder that is scanned for new files exists
    os.makedirs(source_dir, exist_ok=True)

    print(f"Updating vector database at {db_dir} with files from {source_dir}")

    outcome = update_vector_database(source_dir, db_dir)

    # A truthy result is the updated store and could be queried right away
    status = "Vector database update complete!" if outcome else "Vector database update failed."
    print(status)

# Entry point: run the vector database update when this cell/script is
# executed directly (not when the code is imported as a module).
if __name__ == "__main__":
    main()

Updating vector database at /content/vector_db with files from /content/output_folder
Loading existing vector database from /content/vector_db...
Successfully loaded existing vector database with 8 entries
Loading new text files from /content/output_folder...


100%|██████████| 6/6 [00:00<00:00, 10284.36it/s]

Loaded 6 new documents
Splitting documents into chunks...
Split into 32 chunks
Adding 32 new chunks to the vector database...
Successfully added new documents to vector database
Saving updated vector database to /content/vector_db
Vector database successfully updated and saved
Vector database update complete!



