In [1]:
import os
import warnings

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
# Pull key=value pairs from a local .env file into the process environment.
load_dotenv()

# Connection settings and API tokens read by later cells; os.getenv yields
# None for any variable that is not set.
ASTRA_DB_API_ENDPOINT = os.getenv("ASTRA_DB_API_ENDPOINT")
ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_KEYSPACE = os.getenv("ASTRA_DB_KEYSPACE")
HF_TOKEN = os.getenv("HF_TOKEN")
GROQ_API = os.getenv("GROQ_API")

# python-dotenv reports unparseable .env lines as log messages rather than
# exceptions, so surface any gaps here instead of as an authentication error
# in a later cell.
_missing_settings = [
    name
    for name in (
        "ASTRA_DB_API_ENDPOINT",
        "ASTRA_DB_APPLICATION_TOKEN",
        "ASTRA_DB_KEYSPACE",
        "HF_TOKEN",
        "GROQ_API",
    )
    if not os.getenv(name)
]
if _missing_settings:
    warnings.warn(
        "Missing or empty environment variables: " + ", ".join(_missing_settings),
        stacklevel=2,
    )

Python-dotenv could not parse statement starting at line 7
Python-dotenv could not parse statement starting at line 8
Python-dotenv could not parse statement starting at line 9


In [2]:
def extract_info(soup):
    """Collect one record per dataset card found in a parsed listing page.

    Parameters
    ----------
    soup
        Parsed HTML tree exposing ``find_all`` / ``find`` (a BeautifulSoup
        object in this notebook).

    Returns
    -------
    list of dict
        One dict per ``div.dataset-content`` element with the keys
        ``title``, ``link``, ``datasetOrganiz`` and ``description``.
        A field whose element is absent from the card is returned as ``""``
        so that one incomplete card does not abort the whole page.
    """
    def _text_of(node):
        # `find` returns None when nothing matches; map that to an empty string.
        return node.text if node is not None else ""

    records = []
    for card in soup.find_all("div", {"class": "dataset-content"}):
        # Resolve each sub-element once instead of repeating the lookup per field.
        heading = card.find("h3", {"class": "dataset-heading"})
        anchor = heading.find('a') if heading is not None else None

        notes = card.find("div", {"class": "notes"})
        if notes is not None:
            organization = notes.find('p', {"class": "dataset-organization"})
            summary = notes.find('div')
        else:
            organization = summary = None

        records.append({
            'title': _text_of(anchor),
            'link': anchor.attrs.get('href', "") if anchor is not None else "",
            'datasetOrganiz': _text_of(organization),
            'description': _text_of(summary),
        })
    return records

In [3]:
# Listing pages to scrape. Only the first entry is active; the others are
# kept as disabled candidates.
url_list = [
"https://catalog.data.gov/dataset?res_format=XML",
# "http://aiweb.cs.washington.edu/research/projects/xmltk/xmldata/",
# "https://data.gov.ie/dataset?res_format=JSON",
# "https://catalog.data.gov/dataset?res_format=JSON",
# "https://data.worldbank.org/",
]

# Upper bound (seconds) on each HTTP request so a stalled server cannot hang the cell.
REQUEST_TIMEOUT_SECONDS = 30

final_doc = []
for eachPage in url_list:
    r = requests.get(eachPage, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail loudly on 4xx/5xx instead of parsing an error page into zero records.
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')
    # extend() appends in place rather than rebuilding the list on every page.
    final_doc.extend(extract_info(soup))

In [4]:
# Inspect the scraped records (one dict per dataset card).
final_doc

[{'title': 'Electric Vehicle Population Data',
  'link': '/dataset/electric-vehicle-population-data',
  'datasetOrganiz': 'State of Washington —',
  'description': 'This dataset shows the Battery Electric Vehicles (BEVs) and Plug-in Hybrid Electric Vehicles (PHEVs) that are currently registered through Washington State Department...'},
 {'title': 'Crime Data from 2020 to Present',
  'link': '/dataset/crime-data-from-2020-to-present',
  'datasetOrganiz': 'City of Los Angeles —',
  'description': 'Starting on March 7th, 2024, the Los Angeles Police Department (LAPD) will adopt a new Records Management System for reporting crimes and arrests. This new system is...'},
 {'title': 'Air Quality',
  'link': '/dataset/air-quality',
  'datasetOrganiz': 'City of New York —',
  'description': 'Dataset contains information on New York City air quality surveillance data. Air pollution is one of the most important environmental threats to urban populations...'},
 {'title': 'Lottery Mega Millions Winn

In [5]:
import pandas as pd

# Tabulate the scraped records: one row per record, one column per dict key.
df = pd.DataFrame(data=final_doc)

In [6]:
# Preview the first rows of the scraped table.
df.head()

Unnamed: 0,title,link,datasetOrganiz,description
0,Electric Vehicle Population Data,/dataset/electric-vehicle-population-data,State of Washington —,This dataset shows the Battery Electric Vehicl...
1,Crime Data from 2020 to Present,/dataset/crime-data-from-2020-to-present,City of Los Angeles —,"Starting on March 7th, 2024, the Los Angeles P..."
2,Air Quality,/dataset/air-quality,City of New York —,Dataset contains information on New York City ...
3,Lottery Mega Millions Winning Numbers: Beginni...,/dataset/lottery-mega-millions-winning-numbers...,State of New York —,Go to http://on.ny.gov/1J8tPSN on the New York...
4,"Death rates for suicide, by sex, race, Hispani...",/dataset/death-rates-for-suicide-by-sex-race-h...,U.S. Department of Health & Human Services —,"Data on death rates for suicide, by selected p..."


In [7]:
from langchain_core.documents import Document

# Wrap every scraped record as a Document: the description is the text to be
# embedded and the title travels along as metadata.
docs = list(
    Document(
        page_content=record['description'],
        metadata={'title': record['title']},
    )
    for record in final_doc
)
docs

[Document(metadata={'title': 'Electric Vehicle Population Data'}, page_content='This dataset shows the Battery Electric Vehicles (BEVs) and Plug-in Hybrid Electric Vehicles (PHEVs) that are currently registered through Washington State Department...'),
 Document(metadata={'title': 'Crime Data from 2020 to Present'}, page_content='Starting on March 7th, 2024, the Los Angeles Police Department (LAPD) will adopt a new Records Management System for reporting crimes and arrests. This new system is...'),
 Document(metadata={'title': 'Air Quality'}, page_content='Dataset contains information on New York City air quality surveillance data. Air pollution is one of the most important environmental threats to urban populations...'),
 Document(metadata={'title': 'Lottery Mega Millions Winning Numbers: Beginning 2002'}, page_content='Go to http://on.ny.gov/1J8tPSN on the New York Lottery website for past Mega Millions results and payouts.'),
 Document(metadata={'title': 'Death rates for suicide, by

In [8]:
from langchain_astradb import AstraDBVectorStore
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings

# Embedding client for the BAAI/bge-base-en-v1.5 model, authenticated with HF_TOKEN.
embeddings = HuggingFaceInferenceAPIEmbeddings(
    api_key=HF_TOKEN,
    model_name="BAAI/bge-base-en-v1.5",
)

# Vector store over the "datasetaggregator" collection, configured from the
# Astra DB endpoint, token and keyspace read from the environment.
vstore = AstraDBVectorStore(
    embedding=embeddings,
    collection_name="datasetaggregator",
    api_endpoint=ASTRA_DB_API_ENDPOINT,
    token=ASTRA_DB_APPLICATION_TOKEN,
    namespace=ASTRA_DB_KEYSPACE,
)

DataAPIHttpException: Client error '401 Unauthorized' for url 'https://e02ce0af-b2bb-40d7-bb49-7085f29fb0ba-us-east-2.apps.astra.datastax.com/api/json/v1/default_keyspace'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/401

In [95]:
# Embed and store the scraped documents; the returned ids are kept in insert_ids.
# NOTE(review): no explicit ids are passed, so re-running this cell presumably
# inserts the same documents again — confirm before re-executing.
insert_ids = vstore.add_documents(docs)

In [96]:
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import MessagesPlaceholder
from langchain_groq import ChatGroq

# Chat model used both to rewrite follow-up questions and to produce answers.
model = ChatGroq(
    groq_api_key=GROQ_API,
    model="llama-3.1-70b-versatile",
    temperature=0.5,
)

In [97]:
from langchain_core.prompts import ChatPromptTemplate

# Instruction for the question-rewriting step. Adjacent string literals are
# joined verbatim, so every fragment but the last ends with a space to keep
# the sentences from running together.
retriever_prompt = (
    "Given a chat history and the latest user question which might reference context in the chat history, "
    "formulate a standalone question which can be understood without the chat history. "
    "Do NOT answer the question, just reformulate it if needed and otherwise return it as is."
)

# Retrieve the 3 best-matching documents for each query.
retriever = vstore.as_retriever(search_kwargs={"k": 3})

# Prompt layout: instruction, then the prior conversation, then the new question.
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", retriever_prompt),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{input}"),
    ]
)

# Retriever that first lets the model rewrite the question using chat history.
history_aware_retriever = create_history_aware_retriever(model, retriever, contextualize_q_prompt)

In [98]:
# System prompt for the answering step; {context} and {input} are template
# placeholders filled in when the prompt is formatted.
DATASET_BOT_TEMPLATE = """
    You are an expert bot for providing insights about datasets.
    Your role is to analyze dataset titles and descriptions to help users find relevant information and answer their queries accurately.
    Ensure your responses are concise, informative, and remain focused on the context of the dataset.
    Avoid providing irrelevant or off-topic information.

    CONTEXT:
    {context}

    QUESTION: {input}

    YOUR ANSWER:
    """
# Prompt layout: system instructions, then the prior conversation, then the
# user's question.
# NOTE(review): {input} appears both inside the system template and as the
# final human message, so the question is presumably sent twice — confirm
# this is intended.
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", DATASET_BOT_TEMPLATE),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{input}")
    ]
)

In [99]:
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

# Answering step built from the chat model and the QA prompt.
question_answer_chain = create_stuff_documents_chain(model, qa_prompt)

# Full pipeline: history-aware retrieval feeding the answering step.
chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

chat_history = list()

# Per-session chat histories, keyed by session id.
store = dict()

In [100]:
def get_session_history(session_id: str)-> BaseChatMessageHistory:
    """Return the chat history for ``session_id``, creating it on first use.

    Histories live in the module-level ``store`` dict, so repeated calls with
    the same id return the same object.
    """
    try:
        return store[session_id]
    except KeyError:
        # First time this session is seen: register a fresh, empty history.
        history = ChatMessageHistory()
        store[session_id] = history
        return history

In [101]:
# Wrap the retrieval chain with per-session message history: the user's text
# is read from "input", prior turns are supplied as "chat_history", and the
# chain's "answer" field is what gets recorded as the reply.
chain_with_memmory = RunnableWithMessageHistory(
    chain,
    get_session_history,
    output_messages_key="answer",
    history_messages_key="chat_history",
    input_messages_key="input",
)

In [102]:
# Ask one question under the "shuvo" session and show only the answer text.
chain_with_memmory.invoke(
    input={"input": "can you tell me the dataset to choose relate to electric?"},
    config={"configurable": {"session_id": "shuvo"}},
)["answer"]

'No, this dataset does not appear to be related to electric information, but rather focuses on air quality surveillance data in New York City.'

In [5]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Single source for the checkpoint name so model and tokenizer cannot drift apart.
NLLB_CHECKPOINT = "facebook/nllb-200-distilled-600M"

# Use the Apple-silicon GPU backend when present, otherwise fall back to CPU.
device = "mps" if torch.backends.mps.is_available() else "cpu"

# Half precision is only requested on the accelerator; on the CPU fallback the
# weights stay in float32, where float16 kernels may be missing or slow.
model_dtype = torch.float16 if device == "mps" else torch.float32

# NOTE(review): `model` rebinds the name that an earlier cell uses for the
# ChatGroq chat model; re-running the chain cells after this one would pick
# up the translation model instead.
model = AutoModelForSeq2SeqLM.from_pretrained(
    NLLB_CHECKPOINT,
    torch_dtype=model_dtype,
)
model = model.to(device).eval()
tokenizer = AutoTokenizer.from_pretrained(NLLB_CHECKPOINT)

english_sentence = "did you sleep?"
inputs = tokenizer(english_sentence, return_tensors="pt").to(device)

# Forcing the first generated token to the "mal_Mlym" language code selects
# Malayalam as the output language.
translated_tokens = model.generate(
    **inputs,
    forced_bos_token_id=tokenizer.convert_tokens_to_ids("mal_Mlym"),
    max_length=50,
)
translation = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
print("Translated text in Malayalam:", translation)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Translated text in Malayalam: നീ ഉറങ്ങിയിരുന്നോ?
