In [None]:
import os

import nltk
import openai
import PyPDF2
from nltk.tokenize import sent_tokenize, word_tokenize
from openai import OpenAI

In [None]:
def read_pdf(file_path):
    """
    Reads a PDF file and extracts the text.

    Pages are extracted in order and joined with a newline so that the last
    word of one page does not run into the first word of the next. Pages for
    which no text is extracted are skipped.

    Reading is best-effort: if opening or parsing fails, the error is printed
    and whatever text was collected before the failure is returned (an empty
    string if nothing was read).

    :param file_path: Path to the PDF file.
    :return: Extracted text from the PDF.
    """
    page_texts = []
    try:
        with open(file_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                page_text = page.extract_text()
                # A page may yield no text (None or ""); skipping it avoids a
                # TypeError that would otherwise cut the document short.
                if page_text:
                    page_texts.append(page_text)
    except Exception as e:
        print(f"Error reading PDF: {e}")

    return "\n".join(page_texts)

In [None]:
def tokenize_and_chunk(text, chunk_size):
    """
    Tokenizes the text into sentences and chunks them into groups of approximately chunk_size tokens.

    Sentences are never split: a single sentence longer than chunk_size
    becomes a chunk of its own, so a chunk can exceed chunk_size in that case.
    No empty chunks are produced.

    :param text: The text to be tokenized and chunked.
    :param chunk_size: The maximum number of tokens per chunk.
    :return: A list of text chunks, each containing approximately chunk_size tokens.
    """
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in nltk.sent_tokenize(text):
        # Word-tokenize the sentence only to measure its length in tokens.
        token_length = len(nltk.word_tokenize(sentence))

        # Flush only when there is something to flush. Without the
        # `current_chunk` check, an over-long sentence arriving while the
        # chunk is empty would emit "" as a chunk.
        if current_chunk and current_length + token_length > chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0

        current_chunk.append(sentence)
        current_length += token_length

    # Whatever is left after the last sentence forms the final chunk.
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

In [None]:
from openai import OpenAI

# With no api_key argument the SDK reads the key from the OPENAI_API_KEY
# environment variable, so no secret has to be written into the notebook.
client = OpenAI()

def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for ``text`` from the given embedding model.

    Newlines in the text are replaced with spaces before it is sent, and the
    request goes through the module-level ``client``.

    :param text: The text to embed.
    :param model: Name of the embedding model to use.
    :return: The ``embedding`` of the first item in the response data.
    """
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding


In [None]:
# Forward slashes work on Windows as well as POSIX systems and avoid
# backslash escape-sequence pitfalls inside string literals.
pdf_path = "data/pdf1.pdf"  # Replace with the path to your PDF file
pdf_text = read_pdf(pdf_path)

chunk_size = 1024  # Define your desired chunk size in terms of tokens
chunks = tokenize_and_chunk(pdf_text, chunk_size)

In [None]:
import psycopg2

# Connect to PostgreSQL. The password is read from the PGPASSWORD environment
# variable instead of being stored in the notebook.
conn = psycopg2.connect(
    dbname="postgres",
    user="postgres",
    password=os.getenv("PGPASSWORD"),
    host="localhost",
    port="5432"
)

In [None]:
# Notebook-wide cursor on `conn`; the insert, index and search cells run their SQL through it.
cursor = conn.cursor()


In [None]:
# Embed every chunk in `chunks` and keep (text, embedding) pairs for the database insert.
chunk_embeddings = []
for text_chunk in chunks:
    # An empty chunk would be rejected by the embeddings endpoint, and a
    # whitespace-only one carries no content worth storing, so skip both.
    if not text_chunk.strip():
        continue

    # Generate the embedding for this chunk
    embedding_vector = get_embedding(text_chunk, model='text-embedding-3-small')

    # Store the chunk and its embedding as a tuple in the list
    chunk_embeddings.append((text_chunk, embedding_vector))

In [None]:
# SQL insert statement to store the embedding in vectordatabase
insert_query = """
INSERT INTO vectordatabase (text, embedding)
VALUES (%s, %s::vector)
RETURNING id;
"""

# `with conn` wraps the whole batch in one transaction: it commits if every
# insert succeeds and rolls back if any of them raises, so a failed run does
# not leave a partial set of rows behind. It does not close the connection,
# which the index and search cells still use.
with conn:
    for text_chunk, embedding_vector in chunk_embeddings:
        cursor.execute(insert_query, (text_chunk, embedding_vector))

        # RETURNING id yields a single row holding the new row's id.
        inserted_id = cursor.fetchone()[0]
        print(f"Inserted embedding with ID: {inserted_id}")

In [None]:
# Length of the first chunk in characters (each chunk is a string), not in tokens.
len(chunks[0])

In [None]:
# Naming the index lets IF NOT EXISTS make this cell safe to re-run; an
# unnamed CREATE INDEX would add one more index on every run.
create_index_query = """
CREATE INDEX IF NOT EXISTS vectordatabase_embedding_ivfflat_idx
ON vectordatabase USING ivfflat (embedding vector_l2_ops) WITH (lists = 100);
"""

# Execute the index creation: commit on success, roll back on error. The
# cursor and connection are left open because the search cell still uses them.
with conn:
    cursor.execute(create_index_query)

print("Index created successfully.")

In [None]:
# Embed the question so it can be compared against the stored chunk embeddings.
query_vector = get_embedding(
    "what is the Endocoder used in transformer",
    model='text-embedding-3-small',
)

# Nearest-neighbour search: order rows with the `<->` operator against the
# query vector and keep the first five.
nn_query = """
SELECT id, text, embedding
FROM vectordatabase
ORDER BY embedding <-> %s::vector
LIMIT 5;
"""

cursor.execute(nn_query, (query_vector,))
results = cursor.fetchall()

# Show the id and text of every hit; the embedding column is fetched but not printed.
for row in results:
    row_id, row_text = row[0], row[1]
    print(f"ID: {row_id}, Text: {row_text}")

# No later cell in this notebook uses the shared cursor/connection, so release them.
cursor.close()
conn.close()

In [None]:
# The question being asked, and the text column (second field) of each search hit.
query_text = "what is the Endocoder used in transformer"
retrieved_chunks = [chunk_text for _row_id, chunk_text, *_rest in results]

In [None]:
def create_contextual_prompt(query_text, retrieved_chunks):
    """Build the prompt text that pairs the retrieved context with the query.

    The chunks are joined with blank lines and placed under a "Context:"
    heading, followed by the query and a trailing "Answer:" cue.

    :param query_text: The user's question.
    :param retrieved_chunks: Iterable of chunk strings to use as context.
    :return: The assembled prompt string.
    """
    joined_context = "\n\n".join(retrieved_chunks)
    return f"Context:\n{joined_context}\n\nQuery: {query_text}\n\nAnswer:"

In [None]:
def generate_response_with_context(prompt, model="gpt-3.5-turbo"):
    """Send ``prompt`` to the chat model and return the text of its reply.

    The request goes through the module-level ``client`` with a fixed system
    message and a temperature of 0.7.

    :param prompt: The user message to send.
    :param model: Name of the chat model to use.
    :return: The message content of the first choice in the response.
    """
    system_message = {
        "role": "system",
        "content": "You are a helpful assistant. your task is to as give me answer in  layman language as much as possible",
    }
    user_message = {"role": "user", "content": prompt}

    completion = client.chat.completions.create(
        model=model,
        messages=[system_message, user_message],
        temperature=0.7,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
def retrieve_relevant_chunks(query_text, model='text-embedding-3-small', top_k=5):
    """
    Embeds the query and returns the text of the closest stored chunks.

    A new database connection is opened for each call and is always closed
    again, even if the query raises. The database password is read from the
    PGPASSWORD environment variable.

    :param query_text: The question to search for.
    :param model: Embedding model used for the query vector.
    :param top_k: Maximum number of chunks to return (default 5).
    :return: A list of chunk texts, in the order the query returns them.
    """
    query_vector = get_embedding(query_text, model=model)

    # Only the text column is returned to the caller, so the stored
    # embeddings are not fetched.
    nn_query = """
    SELECT text
    FROM vectordatabase
    ORDER BY embedding <-> %s::vector
    LIMIT %s;
    """

    conn = psycopg2.connect(
        dbname="postgres",
        user="postgres",
        password=os.getenv("PGPASSWORD"),
        host="localhost",
        port="5432",
    )
    # Nested try/finally guarantees the cursor and the connection are both
    # released when execute/fetch fails.
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(nn_query, (query_vector, top_k))
            rows = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        conn.close()

    return [row[0] for row in rows]

In [None]:
def answer_query(query_text):
    """Answer a question using retrieved chunks as context.

    Retrieves the relevant chunks for the query, builds a contextual prompt
    from them and returns the model's response. If nothing is retrieved, a
    fixed "No relevant information found." message is returned instead.

    :param query_text: The user's question.
    :return: The generated answer, or the fallback message.
    """
    context_chunks = retrieve_relevant_chunks(query_text)
    if context_chunks:
        return generate_response_with_context(
            create_contextual_prompt(query_text, context_chunks)
        )
    return "No relevant information found."

In [None]:
# End-to-end run: retrieve context for the question, then print the model's answer.
query = "Explain how transformer works"

response = answer_query(query)
print(response)

In [None]:
# Sends the raw question as the prompt, with no retrieved context, and overwrites
# `response` from the previous cell; the result is not displayed here.
# NOTE(review): presumably a no-context baseline to compare against the RAG answer — confirm.
response=generate_response_with_context(query)

In [None]:
import gradio as gr


In [None]:
# `answer_query` is already defined in an earlier cell, so it is reused here
# rather than redefined.

# Plain text-in / text-out Gradio UI around answer_query (no custom CSS is applied).
iface = gr.Interface(
    fn=answer_query,
    inputs="text",
    outputs="text",
    title="RAG Query Answering System",
    description="Ask a question and get an AI-generated response based on relevant information retrieved from a database.",
)

# Launch the interface
iface.launch()