In [16]:
!pip install langchain chromadb sentence-transformers huggingface_hub transformers torch==2.6.0 torchvision==0.21.0 zipfile36 langchain-community langchain-huggingface gradio -U



In [17]:
import os
import zipfile
import requests
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain_huggingface  import HuggingFacePipeline
from langchain.chains import RetrievalQA
from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
from getpass import getpass

In [18]:
# Set Hugging Face API key
os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Enter your Hugging Face API key: ")

Enter your Hugging Face API key: ··········


In [19]:
# Download the ZIP file
url = "https://www.dropbox.com/scl/fi/p7d0zjnrefjne94j3ec2b/new_articles.zip?rlkey=izy5z44eofgx5fmjn8tif4y99&dl=1"
response = requests.get(url)
with open("/content/new_articles.zip", "wb") as f:
    f.write(response.content)

# Extract ZIP file
with zipfile.ZipFile("/content/new_articles.zip", "r") as zip_ref:
    zip_ref.extractall("/content/articles")

# Read and preprocess text files
articles = []
for file_name in os.listdir("/content/articles"):
    if file_name.endswith(".txt"):
        with open(f"/content/articles/{file_name}", "r", encoding="utf-8") as f:
            content = f.read().strip()
            articles.append({"content": content, "filename": file_name})

In [20]:
articles[1]

{'content': 'Slack has evolved from a pure communications platform to one that enables companies to link directly to enterprise applications without having to resort to dreaded task switching. Today, at the Salesforce World Tour event in NYC, the company announced the next step in its platform’s evolution where it will be putting AI at the forefront of the user experience, making it easier to get information and build workflows.\n\nIt’s important to note that these are announcements, and many of these features are not available yet.\n\nRob Seaman says that rather than slapping on an AI cover, they are working to incorporate it in a variety of ways across the platform. That started last month with a small step, a partnership with OpenAI to bring a ChatGPT app into Slack, the first piece of a much broader vision for AI on the platform. That part is in beta at the moment.\n\nToday’s announcement involves several new integrations, including SlackGPT, the company’s own flavor of generative 

In [21]:
# Chunk the articles
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = []
for article in articles:
    split_texts = splitter.split_text(article["content"])
    for i, text in enumerate(split_texts):
        chunks.append({
            "text": text,
            "metadata": {"filename": article["filename"], "chunk_id": i}
        })

In [22]:
chunks[1]

{'text': 'I won’t encumber the reader with a lengthy summary of this perfectly readable and interesting piece, but the gist is that while GPT-4 and other proprietary models have obtained the lion’s share of attention and indeed income, the head start they’ve gained with funding and infrastructure is looking slimmer by the day.\n\nWhile the pace of OpenAI’s releases may seem blistering by the standards of ordinary major software releases, GPT-3, ChatGPT and GPT-4 were certainly hot on each other’s heels if you compare them to versions of iOS or Photoshop. But they are still occurring on the scale of months and years.',
 'metadata': {'filename': '05-05-google-and-openai-are-walmarts-besieged-by-fruit-stands.txt',
  'chunk_id': 1}}

In [23]:
# Set up embeddings and ChromaDB
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = Chroma.from_texts(
    texts=[chunk["text"] for chunk in chunks],
    embedding=embeddings,
    metadatas=[chunk["metadata"] for chunk in chunks],
    persist_directory="/content/chroma_db"
)

In [24]:
# Set up FLAN-T5 model
model_name = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
print("Model loaded")
pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=1024,
    device= 0  # Use GPU
)
llm = HuggingFacePipeline(pipeline=pipe)

Device set to use cpu


Model loaded


In [25]:
# Create RetrievalQA chain
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True
)

In [26]:
# Test the chatbot
query1 = "What is the main topic of the articles?"
print("Query 1:", query1)
result1 = qa_chain.invoke({"query": query1})
print("Answer 1:", result1["result"])

query2 = "What did Pando company do?"
print("Query 2:", query2)
result2 = qa_chain.invoke({"query": query2})
print("Answer 2:", result2["result"])

query3 = "Who did Databricks acquire?"
print("Query 3:", query3)
result3= qa_chain.invoke({"query": query3})
print("Answer 3:", result3["result"])

# print("\nSources:")
# for doc in result["source_documents"]:
#     print(f"- {doc.metadata['filename']} (Chunk {doc.metadata['chunk_id']})")

Token indices sequence length is longer than the specified maximum sequence length for this model (974 > 512). Running this sequence through the model will result in indexing errors


Query 1: What is the main topic of the articles?
Answer 1: Artificial intelligence
Query 2: What did Pando company do?
Answer 2: solve for global logistics through a software-as-a-service platform offering
Query 3: Who did Databricks acquire?
Answer 3: Okera


In [27]:
import gradio as gr

# Define the QA function using your qa_chain
def answer_question(question):
    if not question.strip():
        return "Please enter a question."
    try:
        response = qa_chain.invoke({"query": question})
        return response["result"]
    except Exception as e:
        return f"Error: {str(e)}"

# Launch the Gradio interface
gr.Interface(
    fn=answer_question,
    inputs=gr.Textbox(label="Question", placeholder="Ask your question here...", lines=1),
    outputs=gr.Textbox(label="Answer"),
    title="News Article QA Chatbot",
    description="Ask questions based on the article collection.",
    theme="default",
).launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://4c324f5a5a7ed1bc7f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


