In [1]:
!pip install llama-index llama-parse langchain-community openai pinecone-client groq gradio
# Imports
import os
from google.colab import userdata
import re
import nest_asyncio
import openai
from openai import OpenAI
from pinecone import Pinecone
from groq import Groq
import gradio as gr

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from llama_parse import LlamaParse
from langchain.vectorstores import Pinecone
import pinecone


Collecting llama-index
  Downloading llama_index-0.10.64-py3-none-any.whl.metadata (11 kB)
Collecting llama-parse
  Downloading llama_parse-0.4.9-py3-none-any.whl.metadata (4.4 kB)
Collecting langchain-community
  Downloading langchain_community-0.2.11-py3-none-any.whl.metadata (2.7 kB)
Collecting openai
  Downloading openai-1.40.3-py3-none-any.whl.metadata (22 kB)
Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting groq
  Downloading groq-0.9.0-py3-none-any.whl.metadata (13 kB)
Collecting gradio
  Downloading gradio-4.41.0-py3-none-any.whl.metadata (15 kB)
Collecting llama-index-agent-openai<0.3.0,>=0.1.4 (from llama-index)
  Downloading llama_index_agent_openai-0.2.9-py3-none-any.whl.metadata (729 bytes)
Collecting llama-index-cli<0.2.0,>=0.1.2 (from llama-index)
  Downloading llama_index_cli-0.1.13-py3-none-any.whl.metadata (1.5 kB)
Collecting llama-index-core<0.11.0,>=0.10.64 (from llama-index)
  Downloading llama_index_core-0

In [6]:

# Setup
# Copy the Colab secrets into environment variables so the keys are never
# written into the notebook itself.
os.environ['LLAMA_CLOUD_API_KEY'] = userdata.get("LLAMAPARSE")
os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI')



# Patch asyncio so a nested event loop can run inside the notebook's own loop.
nest_asyncio.apply()

# Use the secret loaded above rather than a literal placeholder string.
parser = LlamaParse(
    api_key=os.environ['LLAMA_CLOUD_API_KEY'],
    result_type="markdown"
)

# Route PDFs through LlamaParse; the reader returns a list of parsed documents.
file_extractor = {".pdf": parser}
documents = SimpleDirectoryReader(input_files=['data/LLAMA-TEST.pdf'], file_extractor=file_extractor).load_data()
print(len(documents))

openai.api_key = os.environ['OPENAI_API_KEY']
client = OpenAI()

# Re-import on purpose: the earlier `from langchain.vectorstores import Pinecone`
# rebinds the name `Pinecone`, and the client class from the pinecone package
# is the one needed here.
from pinecone import Pinecone
# Prefer a key supplied through the environment; the literal is the original
# placeholder and is only kept as a fallback so existing behaviour is unchanged.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY", "PINECONE API KEY"))
index = pc.Index("chat")

groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY", 'GROQ API KEY'))

# Functions
def tokenize_paragraphs(text):
    """Split text into blank-line-separated paragraphs and number them.

    Args:
        text: Raw text of one page.

    Returns:
        list[dict]: ``{'text': paragraph, 'section_no': n}`` entries numbered
        from 1 in reading order. Empty or whitespace-only input yields an
        empty list rather than a single blank section.
    """
    # A run of blank (or whitespace-only) lines counts as one separator.
    paragraphs = re.split(r'\n\s*\n', text.strip())
    # re.split on '' returns [''], so drop empty pieces before numbering;
    # a blank section carries nothing to embed or retrieve.
    non_empty = [para for para in paragraphs if para]
    return [
        {'text': para, 'section_no': number}
        for number, para in enumerate(non_empty, start=1)
    ]

def metadata_documents_section(pages=None):
    """Break every page into numbered paragraph sections.

    Args:
        pages: Iterable of dicts with ``'page_no'`` and ``'text'`` keys. When
            omitted, the module-level ``metadata_documents`` list is used, so
            the original zero-argument call keeps working.

    Returns:
        list[dict]: one ``{'page_no': ..., 'data': [...]}`` entry per page,
        where ``data`` holds ``{'section': text, 'section_number': n}`` dicts
        in the order produced by ``tokenize_paragraphs``.
    """
    if pages is None:
        # Fall back to the notebook-level list for backward compatibility.
        pages = metadata_documents
    return [
        {
            'page_no': page['page_no'],
            'data': [
                {'section': part['text'], 'section_number': part['section_no']}
                for part in tokenize_paragraphs(page['text'])
            ],
        }
        for page in pages
    ]

def generate_embedding(text):
    """Return the embedding vector for ``text``.

    The request goes through the module-level OpenAI ``client`` using the
    "text-embedding-3-small" model; the embedding of the first (only) item in
    the response is returned.
    """
    embedding_response = client.embeddings.create(
        model="text-embedding-3-small",
        input=text,
    )
    first_item = embedding_response.data[0]
    return first_item.embedding

def query_pinecone(query_text, top_k=1):
    """Embed a question and fetch the closest stored sections from Pinecone.

    Args:
        query_text: The user's question.
        top_k: Number of nearest sections to request. Defaults to 1, which is
            what the original hard-coded call used.

    Returns:
        list[dict]: one entry per match, each with
        ``'metadata'`` (``page_no`` and ``section_no``) and ``'content'``
        (the first element of the stored ``content`` list).
    """
    query_embedding = generate_embedding(query_text)
    # Only metadata is read below, so the stored vectors are not requested back.
    result = index.query(
        vector=query_embedding,
        top_k=top_k,
        include_values=False,
        include_metadata=True,
    )

    return [
        {
            'metadata': {
                'page_no': match['metadata']['page_no'],
                'section_no': match['metadata']['section_no'],
            },
            # Sections are upserted with their text wrapped in a one-item list.
            'content': match['metadata']['content'][0],
        }
        for match in result['matches']
    ]

def collect_content(retrieved_docs):
    """Return the 'content' value of every retrieved document, in order."""
    return [doc['content'] for doc in retrieved_docs]

def generate_response(query, only_content, docs):
    """Ask the Groq chat model to answer ``query`` from the retrieved context.

    Args:
        query: The user's question, sent as the user message.
        only_content: List of context passages (strings) to ground the answer.
        docs: Retrieved documents; accepted for interface compatibility but
            not used by this function.

    Returns:
        The text content of the first completion choice.
    """
    # Join the passages first. Writing `"...".join(...)` directly after the
    # instruction literals makes Python concatenate the adjacent literals
    # before calling join, so the whole instruction text becomes the separator
    # and vanishes entirely when there is a single passage.
    context = "\n---\n".join(only_content)
    system_message = (
        "You are a helpful AI assistant. Answer the question using the provided context.\n\n"
        "CONTEXT:\n"
        + context
    )
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": query}
    ]
    response = groq_client.chat.completions.create(
        model="llama3-70b-8192",
        messages=messages
    )
    return response.choices[0].message.content


# Number pages by their position in the parsed list. enumerate avoids the
# repeated list scans of documents.index(page), which would also hand equal
# pages the same (first) page number.
metadata_documents = [
    {'text': page.text, 'page_no': page_no}
    for page_no, page in enumerate(documents, start=1)
]
metadata = metadata_documents_section()

# One Pinecone record per section. IDs are sequential strings starting at "0";
# the section text is stored in metadata as a one-item list, which is the shape
# query_pinecone reads back.
vectors = []
for page in metadata:
    for section in page['data']:
        vectors.append({
            'id': str(len(vectors)),
            'values': generate_embedding(section['section']),
            'metadata': {
                'page_no': page['page_no'],
                'section_no': section['section_number'],
                'content': [section['section']],
            },
        })

# Send the records in bounded slices so a large document does not become one
# oversized request.
UPSERT_BATCH_SIZE = 100
for start in range(0, len(vectors), UPSERT_BATCH_SIZE):
    index.upsert(vectors[start:start + UPSERT_BATCH_SIZE])

# Smoke test: look up a sample question and display the two closest sections.
index.query(vector=generate_embedding("when was japan attacked?"), top_k=2, include_values=False, include_metadata=True)



Started parsing the file under job_id 74f94e80-1035-4182-ada0-79df89113e3e
55


{'matches': [{'id': '49',
              'metadata': {'content': ['On 7 December 1941, while German '
                                       'armies were freezing before Moscow, '
                                       'Japan suddenly pushed the United '
                                       'States into the struggle by attacking '
                                       'the American naval base at Pearl '
                                       'Harbor, Hawaii. Four days later Hitler '
                                       'declared war on the United States. '
                                       'President Roosevelt called on Congress '
                                       'for immediate and massive expansion of '
                                       'the armed forces. Twenty years of '
                                       'neglect and indifference, however, '
                                       'could not be overcome in a few days.'],
                           'page_no': 

In [12]:
# Retrieve the closest stored section(s) for a sample question.
user_query = "who was hitler"
retrieved_docs = query_pinecone(user_query)

# The raw passages go to the LLM; the page/section of the top hit is kept so it
# can be shown next to the answer.
only_content = collect_content(retrieved_docs)
metadata = retrieved_docs[0]['metadata']

answer = generate_response(user_query, only_content, retrieved_docs)

In [19]:
# Show the answer, its source location and the retrieved passage, separated by rules.
print("\n--------\n".join(str(part) for part in (answer, metadata, retrieved_docs[0]['content'])))


Adolf Hitler was an Austrian-born German politician who was the leader of the Nazi Party and the dictator of Germany from 1934 to 1945. He rose to power during the 1920s and 1930s, and his policies and beliefs led to the deaths of millions of people during World War II and the Holocaust.

Early Life:

Hitler was born on April 20, 1889, in Braunau am Inn, Austria-Hungary, to Alois Hitler and Klara Pölzl. His father was a customs officer, and the family moved frequently during Hitler's childhood. Hitler was a poor student and dropped out of high school at the age of 16. He twice attempted to enter the Academy of Fine Arts Vienna, but was rejected.

Military Service:

In 1913, Hitler moved to Germany and served in the German Army during World War I. He was a dispatch runner and was decorated with the Iron Cross for bravery. He was gassed and temporarily blinded in 1918, and was hospitalized until the end of the war.

Political Career:

After the war, Hitler became involved in politics and

In [11]:
import gradio as gr


def processing(query):
    """Answer ``query`` with the RAG pipeline and cite where the context came from.

    Args:
        query: Question typed into the Gradio text box.

    Returns:
        str: the model's answer followed by the page and section number of the
        top retrieved passage, or a short notice when nothing was retrieved.
    """
    retrieved_docs = query_pinecone(query)
    if not retrieved_docs:
        # Without a match there is no context to cite; avoid indexing [0].
        return "No matching context was found for this question."

    metadata = retrieved_docs[0]['metadata']
    only_content = collect_content(retrieved_docs)
    # Answer the question that was actually submitted, not the notebook-level
    # `user_query` left over from an earlier cell.
    answer = generate_response(query, only_content, retrieved_docs)
    # Echo the answer to the cell output as well.
    print(answer)
    return str(answer)+'\n\n Context From : \n'+f"Page No : {metadata['page_no']}\tSection No : {metadata['section_no']}"

# One text box in, one text box out, backed by the processing() pipeline.
demo = gr.Interface(fn=processing, inputs=["text"], outputs=["text"])

# debug=True keeps the cell running so errors and logs stay visible in the output.
demo.launch(debug=True)

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://eb52c14b767a7a1e95.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


According to the passage, the B-29 bomber was used to bomb Japan.
The bomber used to bomb Japan during World War II was the B-29 Superfortress. Specifically, the B-29 was used in the Pacific Theater to carry out strategic bombing missions against Japanese cities and industrial targets.

The most famous B-29s used in these missions were:

1. Enola Gay: Which dropped the atomic bomb "Little Boy" on Hiroshima on August 6, 1945.
2. Bockscar: Which dropped the atomic bomb "Fat Man" on Nagasaki on August 9, 1945.

These bombing missions, along with the Soviet Union's declaration of war on Japan, contributed to Japan's eventual surrender and the end of World War II.
The bomber used to bomb Japan during World War II was the B-29 Superfortress. The B-29 was a four-engine heavy bomber developed by Boeing and used by the United States Army Air Forces (USAAF) to conduct strategic bombing missions against Japan.

The B-29 was a significant improvement over earlier bombers, with a larger payload cap

