In [14]:
import os

import nltk
import openai 
import PyPDF2
from openai import OpenAI

In [15]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [16]:
# Two-sentence sample paragraph used to try out NLTK's sentence tokenizer in the next cell.
text = "Natural language processing (NLP) is a field of computer science, artificial intelligence and computational linguistics concerned with the interactions between computers and human (natural) languages, and, in particular, concerned with programming computers to fruitfully process large natural language corpora. Challenges in natural language processing frequently involve natural language understanding, natural language generation (frequently from formal, machine-readable logical forms), connecting language and machine perception, managing human-computer dialog systems, or some combination thereof."
  


In [17]:
  
# Split the sample paragraph into a list of sentences and print that list.
print(sent_tokenize(text)) 


['Natural language processing (NLP) is a field of computer science, artificial intelligence and computational linguistics concerned with the interactions between computers and human (natural) languages, and, in particular, concerned with programming computers to fruitfully process large natural language corpora.', 'Challenges in natural language processing frequently involve natural language understanding, natural language generation (frequently from formal, machine-readable logical forms), connecting language and machine perception, managing human-computer dialog systems, or some combination thereof.']


In [19]:
def read_pdf(file_path):
    """
    Reads a PDF file and extracts the text of every page.

    Reading is best-effort: if the file cannot be opened or parsed, the error
    is printed and whatever text was extracted before the failure is returned
    (an empty string if nothing was read).

    :param file_path: Path to the PDF file.
    :return: Extracted text from the PDF, pages concatenated in order.
    """
    page_texts = []
    try:
        with open(file_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                # A page may yield no text at all (None or ""), e.g. when it
                # has no text layer. Treat that as an empty page so one such
                # page does not abort extraction of all the pages after it.
                page_texts.append(page.extract_text() or "")
    except Exception as e:
        # Deliberately non-fatal: report the problem and return what we have.
        print(f"Error reading PDF: {e}")

    return "".join(page_texts)

In [20]:
def tokenize_and_chunk(text, chunk_size):
    """
    Splits the text into sentences and groups consecutive sentences into chunks
    of at most chunk_size tokens where possible.

    Token counts come from ``nltk.word_tokenize``. Sentences are never split:
    a single sentence longer than chunk_size becomes a chunk of its own, so
    such a chunk can exceed chunk_size. Sentences within a chunk are joined
    with a single space.

    :param text: The text to be tokenized and chunked.
    :param chunk_size: The target maximum number of tokens per chunk.
    :return: A list of non-empty text chunks (empty list for empty text).
    """
    # Tokenize the text into sentences
    sentences = nltk.sent_tokenize(text)

    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        # Tokenize the sentence to count tokens
        token_length = len(nltk.word_tokenize(sentence))

        # Flush only when there is something to flush. Without the
        # `current_chunk` guard, an over-long first sentence (or one right
        # after a flush) would emit an empty "" chunk.
        if current_chunk and current_length + token_length > chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0

        # Add the sentence to the current chunk
        current_chunk.append(sentence)
        current_length += token_length

    # Add the last chunk
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

In [35]:
from openai import OpenAI

# Read the API key from the OPENAI_API_KEY environment variable instead of
# embedding it in the notebook, where it would be committed and shared.
# (The previous literal was also missing its closing quote, a SyntaxError.)
# Raises KeyError if the variable is not set.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Embedding helper
def get_embedding(text, model="text-embedding-3-small"):
    """
    Requests an embedding for a single piece of text.

    Newlines in the text are replaced with spaces before the request is made.

    :param text: The text to embed.
    :param model: Name of the embedding model passed to the API.
    :return: The ``embedding`` of the first item in the response data.
    """
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding


In [23]:
# Forward slash on purpose: "data\pdf1.pdf" relied on the invalid escape
# sequence "\p" (a warning in current Python) and only worked on Windows.
# "data/pdf1.pdf" opens correctly on Windows, macOS and Linux.
pdf_path = "data/pdf1.pdf"  # Replace with the path to your PDF file
pdf_text = read_pdf(pdf_path)

# Chunk size is measured in the tokens counted by tokenize_and_chunk.
chunk_size = 1024  # Define your desired chunk size in terms of tokens
chunks = tokenize_and_chunk(pdf_text, chunk_size)

In [24]:
# Number of chunks produced from the PDF text.
len(chunks)

8

In [68]:
import psycopg2

# Connect to PostgreSQL.
# Connection settings come from the standard PG* environment variables so
# that no credential is stored in the notebook. Non-secret settings fall back
# to the previous local defaults. The password has no fallback: when
# PGPASSWORD is unset, None is passed, which psycopg2 omits from the
# connection string so the client library's own defaults apply.
conn = psycopg2.connect(
    dbname=os.environ.get("PGDATABASE", "postgres"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD"),
    host=os.environ.get("PGHOST", "localhost"),
    port=os.environ.get("PGPORT", "5432"),
)

In [69]:
# Cursor on the shared connection; the insert, index and query cells below all use it.
cursor = conn.cursor()


In [40]:
# Embed every entry of `chunks` and keep each chunk paired with its vector
# as a (text_chunk, embedding_vector) tuple, in the original chunk order.
chunk_embeddings = [
    (text_chunk, get_embedding(text_chunk, model='text-embedding-3-small'))
    for text_chunk in chunks
]

In [42]:
# Display the (chunk text, embedding) pairs.
# NOTE(review): this renders every chunk and every full vector; consider
# showing only a slice to keep the saved notebook small.
chunk_embeddings

[('Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works. Attention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.comNoam Shazeer∗\nGoogle Brain\nnoam@google.comNiki Parmar∗\nGoogle Research\nnikip@google.comJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.comAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.eduŁukasz Kaiser∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗ ‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer,\nbased solely on attention mechanisms, dispensing with recurrence and co

In [61]:
# SQL insert statement to store the embedding in vectordatabase.
# The embedding is passed as a parameter and cast to pgvector's `vector` type.
insert_query = """
INSERT INTO vectordatabase (text, embedding)
VALUES (%s, %s::vector)
RETURNING id;
"""

# NOTE(review): this is a plain INSERT, so re-running the cell stores the same
# chunks again unless the table itself enforces uniqueness.
inserted_ids = []
try:
    for text_chunk, embedding_vector in chunk_embeddings:
        cursor.execute(insert_query, (text_chunk, embedding_vector))
        # Read the RETURNING row right after the statement that produced it.
        inserted_ids.append(cursor.fetchone()[0])
    # One commit for the whole batch: either every chunk is stored or none is.
    conn.commit()
except Exception:
    # Leave the shared connection usable for later cells after a failure.
    conn.rollback()
    raise

for inserted_id in inserted_ids:
    print(f"Inserted embedding with ID: {inserted_id}")

# `cursor` and `conn` are intentionally left open here: the index-creation and
# query cells further down use the same objects, so closing them at this point
# breaks a top-to-bottom run of the notebook.

Inserted embedding with ID: 1
Inserted embedding with ID: 2
Inserted embedding with ID: 3
Inserted embedding with ID: 4
Inserted embedding with ID: 5
Inserted embedding with ID: 6
Inserted embedding with ID: 7
Inserted embedding with ID: 8


In [51]:
# Length of the first chunk in characters (len() of a str), not in tokens.
len(chunks[0])

6232

In [65]:
# Build an IVFFlat index for L2-distance search on the embedding column.
# The index is named and created with IF NOT EXISTS so that re-running this
# cell does not build a second, duplicate index.
# NOTE(review): `lists = 100` is far more lists than this small table
# warrants; confirm against pgvector's sizing guidance before relying on the
# index for recall.
create_index_query = """
CREATE INDEX IF NOT EXISTS vectordatabase_embedding_idx
ON vectordatabase USING ivfflat (embedding vector_l2_ops) WITH (lists = 100);
"""

# Execute the index creation
try:
    cursor.execute(create_index_query)
    conn.commit()
except Exception:
    # Leave the shared connection usable for later cells after a failure.
    conn.rollback()
    raise

print("Index created successfully.")

# `cursor` and `conn` are intentionally left open: the query cell below still
# needs them.

Index created successfully.


In [70]:
# Embed the question with the same model that was used for the stored chunks.
query_vector = get_embedding("what is the Endocoder used in transformer", model='text-embedding-3-small')

# Five nearest chunks by pgvector's `<->` distance operator.
# Only id and text are read below, so the embedding column is not fetched.
nn_query = """
SELECT id, text
FROM vectordatabase
ORDER BY embedding <-> %s::vector
LIMIT 5;
"""

try:
    cursor.execute(nn_query, (query_vector,))
    results = cursor.fetchall()
finally:
    # Last use of the shared cursor and connection in this notebook; close
    # them even if the query fails.
    cursor.close()
    conn.close()

for row in results:
    print(f"ID: {row[0]}, Text: {row[1]}")

ID: 2, Text: To the best of our knowledge, however, the Transformer is the first transduction model relying
entirely on self-attention to compute representations of its input and output without using sequence-
aligned RNNs or convolution. In the following sections, we will describe the Transformer, motivate
self-attention and discuss its advantages over models such as [17, 18] and [9]. 3 Model Architecture
Most competitive neural sequence transduction models have an encoder-decoder structure [ 5,2,35]. Here, the encoder maps an input sequence of symbol representations (x1, ..., x n)to a sequence
of continuous representations z= (z1, ..., z n). Given z, the decoder then generates an output
sequence (y1, ..., y m)of symbols one element at a time. At each step the model is auto-regressive
[10], consuming the previously generated symbols as additional input when generating the next. 2Figure 1: The Transformer - model architecture. The Transformer follows this overall architecture using sta

In [78]:
# The question being asked, and the text column (index 1) of each row
# returned by the nearest-neighbour query.
query_text = "what is the Endocoder used in transformer"
retrieved_chunks = [result_row[1] for result_row in results]

In [94]:
def create_contextual_prompt(query_text, retrieved_chunks):
    """
    Builds the prompt sent to the chat model.

    The prompt has three parts separated by blank lines: a "Context:" section
    holding the retrieved chunks (themselves separated by blank lines), a
    "Query:" line with the question, and a trailing "Answer:" cue.

    :param query_text: The user's question.
    :param retrieved_chunks: Iterable of chunk strings to use as context.
    :return: The assembled prompt string.
    """
    sections = (
        "Context:\n" + "\n\n".join(retrieved_chunks),
        f"Query: {query_text}",
        "Answer:",
    )
    return "\n\n".join(sections)

In [137]:
def generate_response_with_context(prompt, model="gpt-3.5-turbo"):
    """
    Sends the prompt to the chat completions API and returns the reply text.

    The request carries a fixed system message asking for layman-level answers,
    followed by the prompt as the user message, at temperature 0.7.

    :param prompt: Text sent as the user message.
    :param model: Name of the chat model passed to the API.
    :return: The message content of the first choice in the response.
    """
    system_message = {
        "role": "system",
        "content": "You are a helpful assistant. your task is to as give me answer in  layman language as much as possible",
    }
    user_message = {"role": "user", "content": prompt}

    completion = client.chat.completions.create(
        model=model,
        messages=[system_message, user_message],
        temperature=0.7,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [121]:
def retrieve_relevant_chunks(query_text, model='text-embedding-3-small', top_k=5):
    """
    Returns the stored chunks closest to the query.

    The query is embedded with ``get_embedding`` and compared against the
    ``embedding`` column of ``vectordatabase`` using pgvector's ``<->``
    operator. A dedicated connection is opened for the lookup and is always
    closed again, even if the query fails.

    Connection settings are read from the PG* environment variables
    (PGDATABASE, PGUSER, PGPASSWORD, PGHOST, PGPORT). No password is stored in
    the code; when PGPASSWORD is unset, None is passed, which psycopg2 omits
    from the connection string.

    :param query_text: The user's question.
    :param model: Embedding model name; should match the model used to embed
        the stored chunks.
    :param top_k: Maximum number of chunks to return (default 5).
    :return: List of chunk texts, nearest first.
    """
    query_vector = get_embedding(query_text, model=model)

    # Only the text is returned to the caller, so the embedding column is not
    # fetched back from the database.
    nn_query = """
    SELECT text
    FROM vectordatabase
    ORDER BY embedding <-> %s::vector
    LIMIT %s;
    """

    conn = psycopg2.connect(
        dbname=os.environ.get("PGDATABASE", "postgres"),
        user=os.environ.get("PGUSER", "postgres"),
        password=os.environ.get("PGPASSWORD"),
        host=os.environ.get("PGHOST", "localhost"),
        port=os.environ.get("PGPORT", "5432"),
    )
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(nn_query, (query_vector, top_k))
            results = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Release the connection whether or not the query succeeded.
        conn.close()

    return [row[0] for row in results]

In [117]:
def answer_query(query_text):
    """
    Answers a question using retrieved chunks as context.

    Retrieves the relevant chunks for the question, builds a contextual prompt
    from them and returns the model's reply. If nothing is retrieved, a fixed
    "No relevant information found." message is returned instead.

    :param query_text: The user's question.
    :return: The generated answer, or the fallback message.
    """
    retrieved_chunks = retrieve_relevant_chunks(query_text)
    if retrieved_chunks:
        prompt = create_contextual_prompt(query_text, retrieved_chunks)
        return generate_response_with_context(prompt)
    return "No relevant information found."

In [124]:
# Question used for the retrieval-augmented run below.
query = "Explain how transformer works"

# Retrieve chunks for the question, build the prompt and print the model's answer.
response = answer_query(query)
print(response)

The Transformer is a model architecture that uses self-attention to process input and output data without relying on traditional recurrent neural networks or convolutional layers. Here's a simpler explanation of how it works:

1. **Input Representation**: In the Transformer model, the input sequence is represented by symbol representations that are transformed into continuous representations.

2. **Encoder and Decoder**: The Transformer consists of an encoder and a decoder. The encoder converts the input sequence into continuous representations, while the decoder generates the output sequence step by step based on the encoder's representations.

3. **Self-Attention Mechanism**: The key feature of the Transformer is the self-attention mechanism. This mechanism allows each position in the input sequence to focus on other positions, capturing dependencies between different parts of the sequence.

4. **Multi-Head Attention**: Instead of using single attention, the Transformer employs multi

In [125]:
# Baseline for comparison: send the raw question to the model with no
# retrieved context. Overwrites `response` from the retrieval-augmented run.
response=generate_response_with_context(query)

In [126]:
# Show the answer currently held in `response`.
response


'A transformer is like a magic box that can change the voltage of electricity. It has two coils of wire called the primary and secondary coils. When you pass electricity through the primary coil, it creates a magnetic field. This magnetic field then induces a voltage in the secondary coil. By changing the number of turns in each coil, a transformer can increase or decrease the voltage of electricity. This helps in transferring electricity efficiently over long distances and matching the right voltage for different devices.'

ChatCompletion(id='chatcmpl-9z2LdDO6PetTMbRqy8Z2po3HrCKkE', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="A LLM, or Large Language Model, is a type of artificial intelligence model designed to understand, generate, and manipulate human language at a large scale. These models are typically trained on vast amounts of text data, enabling them to learn the intricacies of human language patterns, grammar, context, and even some level of reasoning.\n\nKey features of LLMs include:\n\n1. **Scale**: LLMs contain billions to trillions of parameters, which are the quantities adjusted during training that help the model to learn language representations.\n\n2. **Pre-training and Fine-tuning**: LLMs often undergo a two-step training process—first, pre-training on a broad dataset to learn general language characteristics, and then fine-tuning on a more specific dataset for particular tasks.\n\n3. **Versatility**: They can be used for various ta

"A LLM, or Large Language Model, is a type of artificial intelligence model designed to understand, generate, and manipulate human language at a large scale. These models are typically trained on vast amounts of text data, enabling them to learn the intricacies of human language patterns, grammar, context, and even some level of reasoning.\n\nKey features of LLMs include:\n\n1. **Scale**: LLMs contain billions to trillions of parameters, which are the quantities adjusted during training that help the model to learn language representations.\n\n2. **Pre-training and Fine-tuning**: LLMs often undergo a two-step training process—first, pre-training on a broad dataset to learn general language characteristics, and then fine-tuning on a more specific dataset for particular tasks.\n\n3. **Versatility**: They can be used for various tasks such as text generation, translation, summarization, sentiment analysis, and more.\n\n4. **Contextual Understanding**: LLMs can generate contextually releva

'A LLM stands for Master of Laws, which is an advanced postgraduate degree in law that allows individuals to specialize in a particular area of law or jurisdiction. It is typically pursued by individuals who already have a law degree and want to further their legal education and expertise in a specific area.'

In [130]:
import gradio as gr


  from .autonotebook import tqdm as notebook_tqdm


In [143]:
# NOTE(review): this redefines answer_query, which is already defined with
# the same behavior earlier in the notebook; consider removing one of the two.
def answer_query(query_text):
    """
    Answers a question using retrieved chunks as context.

    Retrieves the relevant chunks for the question, builds a contextual prompt
    from them and returns the model's reply. If nothing is retrieved, a fixed
    "No relevant information found." message is returned instead.

    :param query_text: The user's question.
    :return: The generated answer, or the fallback message.
    """
    retrieved_chunks = retrieve_relevant_chunks(query_text)
    if retrieved_chunks:
        prompt = create_contextual_prompt(query_text, retrieved_chunks)
        return generate_response_with_context(prompt)
    return "No relevant information found."

# Build a simple text-in / text-out Gradio UI around answer_query.
# No custom CSS is passed to gr.Interface here; earlier comments about a
# dark-mode stylesheet described styling that this call never applied.
iface = gr.Interface(
    fn=answer_query,
    inputs="text",
    outputs="text",
    title="RAG Query Answering System",
    description="Ask a question and get an AI-generated response based on relevant information retrieved from a database.",
    # No `css=` argument is given.
)

# Launch the interface
iface.launch()

Running on local URL:  http://127.0.0.1:7868

To create a public link, set `share=True` in `launch()`.


