In [2]:
import openai
import os
from dotenv import load_dotenv
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")
openai.api_key = api_key
base_url = "http://0.0.0.0:8000" 
openai.base_url =base_url

In [6]:
import hashlib
from IPython.display import Markdown, display
import json
from pathlib import Path
import os
import textwrap
from typing import List, Union

from llama_index import (
    ServiceContext,
    VectorStoreIndex
)
from llama_index.callbacks import CallbackManager, OpenInferenceCallbackHandler
from llama_index.callbacks.open_inference_callback import (
    as_dataframe,
    QueryData,
    NodeData,
)
from llama_index.node_parser import SimpleNodeParser
import pandas as pd
from tqdm import tqdm

In [7]:
from llama_index import download_loader

SimpleWebPageReader = download_loader("SimpleWebPageReader")

loader = SimpleWebPageReader()

In [8]:
class ParquetCallback:
    def __init__(
        self, data_path: Union[str, Path], max_buffer_length: int = 1000
    ):
        self._data_path = Path(data_path)
        self._data_path.mkdir(parents=True, exist_ok=False)
        self._max_buffer_length = max_buffer_length
        self._batch_index = 0

    def __call__(
        self,
        query_data_buffer: List[QueryData],
        node_data_buffer: List[NodeData],
    ) -> None:
        if len(query_data_buffer) > self._max_buffer_length:
            query_dataframe = as_dataframe(query_data_buffer)
            file_path = self._data_path / f"log-{self._batch_index}.parquet"
            query_dataframe.to_parquet(file_path)
            self._batch_index += 1
            query_data_buffer.clear()  # ⚠️ clear the buffer or it will keep growing forever!
            node_data_buffer.clear() 

In [5]:
import pandas as pd
query_dataframes = []
data_path = "../prototype-search-application/resources/parquet/demo-graz.parquet"

df = pd.read_parquet(data_path)

In [6]:

df['warc_date']=df['warc_date'].apply(lambda x: x.strftime('%Y-%m-%d %H:%M:%S'))


In [7]:
# Assuming df is your DataFrame
from llama_index.schema import Document

docs = []
for index, row in df.iterrows():
    metadata = dict(row)
    metadata['plain_text'] = ""
    docs.append(Document(doc_id= row["id"], text=row["plain_text"],metadata=metadata ))


In [11]:
from langchain.chat_models import ChatOpenAI
from llama_index import LLMPredictor, ServiceContext,StorageContext,load_index_from_storage

callback_handler = OpenInferenceCallbackHandler()
callback_manager = CallbackManager([callback_handler])
llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0.9, model_name="mistral",api_key=api_key,base_url=base_url))
service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor,
    callback_manager=callback_manager
)
import re
path = "demo-graz.parquet"
folder_name = re.sub('[^A-Za-z0-9]+', '_', path.strip())

try:
    storage_context = StorageContext.from_defaults(persist_dir= f"../storage/index/{folder_name}")
    graz_index = load_index_from_storage(storage_context)
    index_loaded = True
    print("Index loaded.")
except:
    index_loaded = False
    
if not index_loaded:
    parser = SimpleNodeParser.from_defaults(chunk_size=1024)
    nodes = parser.get_nodes_from_documents(docs,show_progress=True)
    print("Building index...")
    graz_index = VectorStoreIndex(
        nodes, service_context=service_context,show_progress=True
    )
    graz_index.storage_context.persist(persist_dir= f"../storage/index/{folder_name}")


from llama_index import (
    Prompt,
    get_response_synthesizer,
)
from llama_index.prompts import PromptTemplate


response_template = """
## Question

{question}


## Answer
```
{response}
```

"""
template = (
    "We have provided context information below. \n"
    "---------------------\n"
    "{context_str}"
    "\n---------------------\n"
    "Given this information, please answer the question: {query_str}\n"
    "Find information from different urls. \n"
    "Find top 10 distinct related url which related to query. Rename field as results. \n"
    "Find some webp image urls among related urls. Name field as image_url. \n"
    "Reduce each document to url and title and one small sentence. Rename snippet to sentence. \n"
    "Result should includes url, tittle, sentence."
    "Return the response in json format.\n"
)
qa_template = PromptTemplate(template)
response_synthesizer = get_response_synthesizer(text_qa_template= qa_template)
graz_engine = graz_index.as_query_engine(similarity_top_k = 3,response_synthesizer=response_synthesizer)

def query(query: str, query_engine=graz_engine):
    
    response_md = query_engine.query(query)
    display(Markdown(response_template.format(
        question=query,
        response=response_md,
    
    )))
    return json.loads(str(response_md))

ValueError: Unknown model 'mistral'. Please provide a valid OpenAI model name in: gpt-4, gpt-4-32k, gpt-4-1106-preview, gpt-4-vision-preview, gpt-4-0613, gpt-4-32k-0613, gpt-4-0314, gpt-4-32k-0314, gpt-3.5-turbo, gpt-3.5-turbo-16k, gpt-3.5-turbo-1106, gpt-3.5-turbo-0613, gpt-3.5-turbo-16k-0613, gpt-3.5-turbo-0301, text-davinci-003, text-davinci-002, gpt-3.5-turbo-instruct, text-ada-001, text-babbage-001, text-curie-001, ada, babbage, curie, davinci, gpt-35-turbo-16k, gpt-35-turbo

In [27]:
data = query("'Graz'")
data


## Question

'Graz'


## Answer
```
{
  "results": [
    {
      "url": "https://en.wikipedia.org/wiki/Graz",
      "title": "Graz - Wikipedia",
      "sentence": "Graz is the capital city of the Austrian state of Styria and second-largest city in Austria after Vienna."
    },
    {
      "url": "https://www.graz.at/",
      "title": "City of Graz",
      "sentence": "Official website of the city of Graz."
    },
    {
      "url": "https://www.graztourismus.at/en",
      "title": "Graz Tourism",
      "sentence": "Official tourism website of Graz."
    },
    {
      "url": "https://www.britannica.com/place/Graz",
      "title": "Graz - Encyclopedia Britannica",
      "sentence": "Graz is the second largest city in Austria and the capital of the Bundesland (federal state) of Styria."
    },
    {
      "url": "https://www.lonelyplanet.com/austria/styria/graz",
      "title": "Graz - Lonely Planet",
      "sentence": "Graz is a vibrant city in the heart of Styria, known for its rich history and beautiful architecture."
    },
    {
      "url": "https://www.tripadvisor.com/Tourism-g190432-Graz_Styria-Vacations.html",
      "title": "Graz - TripAdvisor",
      "sentence": "Plan your visit to Graz, Austria with the help of TripAdvisor's traveler reviews and photos."
    },
    {
      "url": "https://www.austria.info/us/where-to-go/cities/graz",
      "title": "Graz - Austria.info",
      "sentence": "Discover the city of Graz, a UNESCO World Heritage site, with its historic center and Schloss Eggenberg."
    },
    {
      "url": "https://www.graz-airport.com/",
      "title": "Graz Airport",
      "sentence": "Official website of Graz Airport, providing information on flights, services, and facilities."
    },
    {
      "url": "https://www.grazmuseum.at/en/",
      "title": "Graz Museum",
      "sentence": "Explore the history and culture of Graz at the Graz Museum."
    },
    {
      "url": "https://www.graz-kulturjahr2020.at/en/",
      "title": "Graz Cultural Capital 2020",
      "sentence": "Learn about the cultural events and activities happening in Graz as the Cultural Capital of Europe in 2020."
    }
  ],
  "image_url": [
    "https://upload.wikimedia.org/wikipedia/commons/thumb/4/4f/Innere_Stadt%2C_8010_Graz%2C_Austria_-_panoramio_%2822%29.jpg/800px-Innere_Stadt%2C_8010_Graz%2C_Austria_-_panoramio_%2822%29.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/2/2e/Innere_Stadt%2C_8010_Graz%2C_Austria_-_panoramio_%2814%29.jpg/800px-Innere_Stadt%2C_8010_Graz%2C_Austria_-_panoramio_%2814%29.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/4/4f/19-06-14-Graz-Murinsel-Schlo%C3%9Fberg-RalfR.jpg/800px-19-06-14-Graz-Murinsel-Schlo%C3%9Fberg-RalfR.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/4/4f/Catedrala_Sf._Egidiu_din_Graz3.jpg/800px-Catedrala_Sf._Egidiu_din_Graz3.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/4/4f/Katharinenkirche%2C_Graz.jpg/800px-Katharinenkirche%2C_Graz.jpg"
  ]
}
```



{'results': [{'url': 'https://en.wikipedia.org/wiki/Graz',
   'title': 'Graz - Wikipedia',
   'sentence': 'Graz is the capital city of the Austrian state of Styria and second-largest city in Austria after Vienna.'},
  {'url': 'https://www.graz.at/',
   'title': 'City of Graz',
   'sentence': 'Official website of the city of Graz.'},
  {'url': 'https://www.graztourismus.at/en',
   'title': 'Graz Tourism',
   'sentence': 'Official tourism website of Graz.'},
  {'url': 'https://www.britannica.com/place/Graz',
   'title': 'Graz - Encyclopedia Britannica',
   'sentence': 'Graz is the second largest city in Austria and the capital of the Bundesland (federal state) of Styria.'},
  {'url': 'https://www.lonelyplanet.com/austria/styria/graz',
   'title': 'Graz - Lonely Planet',
   'sentence': 'Graz is a vibrant city in the heart of Styria, known for its rich history and beautiful architecture.'},
  {'url': 'https://www.tripadvisor.com/Tourism-g190432-Graz_Styria-Vacations.html',
   'title': 'Gra

In [9]:
data = query("Graz airport")
founded_webpages =[]
final_results = []
for r in data:
    if r["url"] not in founded_webpages:
        founded_webpages.append(r["url"])
        final_results.append(r)

markdown_content = ""
for item in final_results:
    markdown_content += f"[{item['title']}]({item['url']})\n- {item['sentence']}\n\n"

display(Markdown(markdown_content))

JSONDecodeError: Unterminated string starting at: line 8 column 20 (char 271)

In [155]:
import requests

url = "http://localhost:8000/search?q=graz&index=demo-graz&lang=en&ranking=asc&limit=10"
response = requests.get(url)

# Ensure the request was successful
if response.status_code == 200:
    data2 = response.json()['results']
else:
    print(f"Request failed with status code {response.status_code}")

for item in data2:
    item['sentence'] = item.pop('textSnippet')
    item['title'] = 'No Tittle'

data2

[{'url': 'https://www.graztourismus.at/en/sightseeing-culture/sights/stiegenkirche_shg_6987',
  'id': '<urn:uuid:516fbad4-ab1b-4b0d-b159-4c4a46d7c6e8>',
  'title': 'No Tittle',
  'language': 'en',
  'warcDate': '1684768605000000',
  'wordCount': 184,
  'sentence': 'The oldest parish church in Graz was first mentioned in a historical document in 1343. Known chiefly as a church for students, it lies in the historic centre of Graz close up against the southern side of the Schlossberg,concealed behind the heavy walls of a former Augustinian monastery.'},
 {'url': 'https://www.graztourismus.at/en/sightseeing-culture/sights/generalihof_shg_6958',
  'id': '<urn:uuid:9907615a-d4be-491c-8c5d-ab60fc8bee1d>',
  'title': 'No Tittle',
  'language': 'en',
  'warcDate': '1684768609000000',
  'wordCount': 195,
  'sentence': 'What would the old town of Graz be without the wonderful inner courtyards? What would a summer in Graz be without jazz concerts in the Generalihof? Every year in summer, the Gener

In [6]:

from openai import OpenAI

client = OpenAI(api_key=api_key, base_url=base_url)

completion = client.completions.create(model='mistral', 
    prompt=f"Q: who is the president of USA. Send result in json format\nA:",
    temperature=0.3,
    max_tokens=100,
    top_p=1,
    frequency_penalty=0.5,
    stop=["\n"])
print(completion.choices[0].text)

{
  "president": "Joe Biden"
}



<iframe src="https://en.wikipedia.org/wiki/File:Innere_Stadt,_8010_Graz,_Austria_-_panoramio_(22).jpg" width="700" height="450"></iframe>

# With local models

In [13]:
from llama_index.llms import Ollama
from llama_index.llms import ChatMessage
from llama_index import ServiceContext
from llama_index import LLMPredictor, ServiceContext

LLM = Ollama(model="mistral")
service_context2 = ServiceContext.from_defaults(llm=LLM,embed_model="local")

# messages = [
#     ChatMessage(
#         role="system", content="You are a cat with a colorful personality"
#     ),
#     ChatMessage(role="user", content="What is your name"),
# ]
callback_handler = OpenInferenceCallbackHandler()
callback_manager = CallbackManager([callback_handler])
llm_predictor = LLMPredictor(llm=LLM)

In [14]:
parser = SimpleNodeParser.from_defaults(chunk_size=1024)
nodes = parser.get_nodes_from_documents(docs,show_progress=True)
print("Building index...")
index = VectorStoreIndex(
    nodes, service_context=service_context2,show_progress=True
)

Parsing nodes:   0%|          | 0/59 [00:00<?, ?it/s]

Building index...


Generating embeddings:   0%|          | 0/153 [00:00<?, ?it/s]

In [19]:

from llama_index import (
    Prompt,
    get_response_synthesizer,
)
from llama_index.prompts import PromptTemplate


response_template = """
## Question

{question}


## Answer
```
{response}
```

"""

template = (
    "We have provided context information below. \n"
    "---------------------\n"
    "{context_str}"
    "\n---------------------\n"
    "Given this information, please answer the question: {query_str}\n"
    "Find information from different urls. \n"
    "Find at least one photo url with related query. \n"
    "Find top 10 distinct related url which related to query. Rename field as results. \n"
    "Reduce each document to url and title and one small sentence and if photo url exits put photo url to the json. Rename snippet to sentence. \n"
    "Result should includes url, tittle, sentence."
    "Return the response in json format.\n"
    "Do not return in markdown format, return in json format.\n"
)
qa_template = PromptTemplate(template)
response_synthesizer = get_response_synthesizer(text_qa_template= qa_template,service_context=service_context2)
query_engine2 = index.as_query_engine(similarity_top_k = 3,response_synthesizer=response_synthesizer)

def ask(query: str, top_k: int = 12):
    
    response_md = query_engine2.query(query)
    display(Markdown(response_template.format(
        question=query,
        response=response_md,
    
    )))
    return response_md

In [20]:
ask("Graz")

KeyboardInterrupt: 

# Mistral embeddings

In [7]:
from langchain.embeddings import OllamaEmbeddings
from langchain.vectorstores import Chroma
from llama_index.llms import Ollama
from llama_index.llms import ChatMessage
from llama_index import ServiceContext
from llama_index import LLMPredictor, ServiceContext

LLM = Ollama(model="mistral")
oembed = OllamaEmbeddings(base_url="http://localhost:11434", model="mistral")

In [3]:
r1 = oembed.embed_documents(
            [
                "Alpha is the first letter of Greek alphabet",
                "Beta is the second letter of Greek alphabet",
            ]
        )

In [8]:
from llama_index.schema import Document
ds = [
    Document(doc_id= "1", text="Alpha is the first letter of Greek alphabet",metadata={"name":"doc1"} ),
    Document(doc_id= "2", text="Beta is the second letter of Greek alphabet",metadata={"name":"doc2"} )
    
]
service_context3 = ServiceContext.from_defaults(llm=LLM,embed_model=oembed)
parser = SimpleNodeParser.from_defaults(chunk_size=1024)
nodes = parser.get_nodes_from_documents(ds,show_progress=True)
print("Building index...")
index = VectorStoreIndex(
    nodes, service_context=service_context3, show_progress=True
)

Parsing nodes:   0%|          | 0/2 [00:00<?, ?it/s]

Building index...


Generating embeddings:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
str(index.as_query_engine().query("Delta"))

'\nDelta is the fourth letter of Greek alphabet.'

Graz, city in Austria <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/0/04/17-05-16-Graz_Rathaus-aDSC_1282.jpg/2560px-17-05-16-Graz_Rathaus-aDSC_1282.jpg" alt="drawing" width="200"/>