In [4]:
import os
import pandas as pd
import PyPDF2
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA


In [36]:

import os
import pandas as pd
import PyPDF2

# Set the path to your data folder
data_folder = 'CMPE280_DataSources_20241107_Hackathon'

# Function to extract text from CSV files
def extract_text_from_csv(filepath):
    df = pd.read_csv(filepath)
    return " ".join(df.astype(str).values.flatten().tolist())

# Function to extract text from PDF files
def extract_text_from_pdf(filepath):
    text = ""
    with open(filepath, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page_num in range(len(pdf_reader.pages)):
            page_text = pdf_reader.pages[page_num].extract_text() or ""
            text += page_text
    return text

# Extract data from all files in the folder
documents = []

for filename in os.listdir(data_folder):
    filepath = os.path.join(data_folder, filename)
    if filename.endswith('.csv'):
        documents.append(extract_text_from_csv(filepath))
    elif filename.endswith('.pdf'):
        documents.append(extract_text_from_pdf(filepath))

# Combine all extracted text into one large string
full_text = " ".join(documents)



# Step 5: Split the Text and Create Embeddings Using FAISS

In [37]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Split the text into chunks for embedding
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
text_chunks = text_splitter.split_text(full_text)

# Create embeddings for each chunk using HuggingFace
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Create a FAISS index to store the embeddings for fast similarity search
faiss_index = FAISS.from_texts(text_chunks, embeddings)

faiss_index.save_local("faiss_store")


INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
DEBUG:urllib3.connectionpool:Resetting dropped connection: huggingface.co
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/modules.json HTTP/11" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/config_sentence_transformers.json HTTP/11" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/README.md HTTP/11" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/modules.json HTTP/11" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/sentence_bert_config.j

# Step 6: Initialize ChatGroq Using Environment Variables and Set Up the Retrieval Chain

In [43]:
from groq import Groq
from langchain_groq import ChatGroq
from dotenv import load_dotenv
import os
from duckduckgo_search import DDGS
import logging
from pprint import pprint

# Load environment variables from .env file
load_dotenv()

# Initialize the ChatGroq model using API key from environment variable
llm = ChatGroq(api_key=os.getenv('GROQ_API_KEY'), max_tokens=200)  # Set max tokens to limit response length

# Create a retriever from the FAISS index
retriever = faiss_index.as_retriever()

# Configure logging with pretty print
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger()
pp = pprint

# DuckDuckGo search function using DDGS class
def duckduckgo_search(query):
    logger.info("Performing DuckDuckGo search...")
    
    ddgs = DDGS()  # Initialize the DuckDuckGo search object
    search_results = list(ddgs.text(query, max_results=3))  # Retrieve the top 3 results
    
    if search_results:
        logger.info("DuckDuckGo search successful.")
        pp(search_results)  # Pretty print the search results for debugging
        # Formatting the results to provide them as context
        return "\n".join([f"{result['title']}: {result['body']} (URL: {result['href']})" for result in search_results])
    else:
        logger.warning("No relevant results found from DuckDuckGo.")
        return "No relevant results found from DuckDuckGo."

# Function to query the system with chunk reduction strategies
def query_system(query):
    logger.info(f"Received query: {query}")

    # Retrieve relevant context using the FAISS retriever
    relevant_texts = retriever.get_relevant_documents(query)
    
    # Limit the number of relevant chunks to reduce context size
    MAX_CHUNKS = 5
    if len(relevant_texts) > MAX_CHUNKS:
        logger.info(f"Retrieved {len(relevant_texts)} chunks. Limiting to {MAX_CHUNKS} most relevant chunks.")
        relevant_texts = relevant_texts[:MAX_CHUNKS]  # Keep only the top 5 relevant chunks

    # Combine the selected chunks into a single context
    context = "\n".join([text.page_content for text in relevant_texts])

    # If the combined context is still too long, truncate or summarize
    MAX_CONTEXT_LENGTH = 2000
    if len(context) > MAX_CONTEXT_LENGTH:
        logger.info(f"Context size ({len(context)}) exceeds {MAX_CONTEXT_LENGTH} characters. Truncating context.")
        context = context[:MAX_CONTEXT_LENGTH] + "... [truncated]"

    # If context is still too small or unavailable, fall back to DuckDuckGo
    if len(context.strip()) == 0:
        logger.warning("No relevant information found in FAISS index.")
        search_results = duckduckgo_search(query)
        context = "No relevant internal data was found. Here are some web search results:\n" + search_results
        source_info = "Note: The information provided below is based on a DuckDuckGo web search."
    else:
        logger.info("Using information retrieved from the FAISS index.")
        context += "\n\nNote: Additional web information can be provided if needed."
        source_info = "Note: The following information was retrieved internally from the FAISS index."

    # Prepare the messages for ChatGroq invocation with updated prompt for styled and concise responses
    messages = [
        (
            "system",
            "You are a helpful assistant that provides answers in a well-structured, styled format. "
            "Use bullet points, lists, or sections to make the response easy to read. Keep your response concise, "
            "Keep the output tokens as less as possible, and only provide the most critical information needed to answer the user's question. "
            "Avoid excessive details, and focus on clarity and brevity.\n\n"
            "Context:\n"
            f"{context}\n\n"
            f"{source_info} Please clearly indicate if any part of the answer is based on external web searches."
        ),
        ("user", query)
    ]

    # Invoke the model with the provided messages
    logger.info("Invoking the ChatGroq model with the provided context and query...")
    response = llm.invoke(messages)

    # Print debug information about the type of data source used
    if "DuckDuckGo" in source_info:
        logger.info("The answer is based on a DuckDuckGo web search.")
    else:
        logger.info("The answer is based on information retrieved from the FAISS index.")
    
    pp(response.content)  # Pretty print the final response for better debugging

    return response.content


DEBUG:httpx:load_ssl_context verify=True cert=None trust_env=True http2=False
DEBUG:httpx:load_verify_locations cafile='/Users/shivavardhineedi/Desktop/semester3/CMPE-280/Hackathon_RAG/rag/lib/python3.9/site-packages/certifi/cacert.pem'
DEBUG:httpx:load_ssl_context verify=True cert=None trust_env=True http2=False
DEBUG:httpx:load_verify_locations cafile='/Users/shivavardhineedi/Desktop/semester3/CMPE-280/Hackathon_RAG/rag/lib/python3.9/site-packages/certifi/cacert.pem'


# Step 7: Query the System

In [47]:
# Example query to test the system
query = "What is KMP algorithm?"
response = query_system(query)
print(response)


INFO:root:Received query: What is KMP algorithm?
INFO:root:Context size (2942514) exceeds 2000 characters. Truncating context.
INFO:root:Using information retrieved from the FAISS index.
INFO:root:Invoking the ChatGroq model with the provided context and query...
DEBUG:groq._base_client:Request options: {'method': 'post', 'url': '/openai/v1/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'system', 'content': "You are a helpful assistant that provides answers in a well-structured, styled format. Use bullet points, lists, or sections to make the response easy to read. Keep your response concise, Keep the output tokens as less as possible, and only provide the most critical information needed to answer the user's question. Avoid excessive details, and focus on clarity and brevity.\n\nContext:\n1960 nan nan nan 1961 nan nan nan 1962 nan nan nan 1963 nan nan nan 1964 nan nan nan 1965 nan nan nan 1966 nan nan nan 1967 nan nan nan 1968 nan nan nan 1969 nan 0.072826324 0.

('The KMP (Knuth-Morris-Pratt) algorithm is a string matching algorithm that '
 'searches for a pattern within a text in a more efficient way than a '
 'brute-force approach. It utilizes a preprocessed pattern to skip certain '
 'comparisons, achieving a time complexity of O(n + m) where n is the length '
 'of the text and m is the length of the pattern. The KMP algorithm is useful '
 'in various applications, such as text editing, searching for specific '
 'patterns in DNA sequences, and more.\n'
 '\n'
 'The KMP algorithm works by creating a prefix function, which calculates the '
 'length of the longest proper prefix of the pattern that is also a suffix of '
 'the pattern. During the search process, when a mismatch occurs, the '
 'algorithm uses the information from the prefix function to skip characters '
 'in the text instead of starting the comparison from the beginning of the '
 'pattern.\n'
 '\n'
 'The KMP algorithm consists of two main steps:\n'
 '\n'
 '1.')
The KMP (Knuth-Morr