In [46]:
%pip install python-dotenv xmltodict requests beautifulsoup4 tqdm faiss-cpu -q

Note: you may need to restart the kernel to use updated packages.


In [61]:
import xmltodict
import requests
from bs4 import BeautifulSoup
from tqdm.autonotebook import tqdm
import pinecone
import openai
from dotenv import load_dotenv
import os

import textwrap

from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings

from langchain.memory import ConversationBufferMemory
from langchain import PromptTemplate

from langchain.chains import RetrievalQAWithSourcesChain
from langchain.llms import OpenAI

In [25]:
# Load variables from a local .env file into the process environment so the
# API key can be read with os.getenv in the next cell.
load_dotenv()

True

In [40]:
# Read the OpenAI key from the environment; os.getenv yields None when the
# variable is not set, so nothing fails at this point if the key is missing.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [5]:
# Download the posts sitemap and collect every article URL listed in it.
# The timeout keeps a stalled connection from hanging the cell indefinitely.
r = requests.get("https://news.itsfoss.com/sitemap-posts.xml", timeout=30)
# Fail here on an HTTP error instead of handing an error page to the XML parser.
r.raise_for_status()
xml = r.text
# force_list keeps "url" a list even when the sitemap holds a single entry;
# without it xmltodict returns a bare dict for a lone element and the
# comprehension below would iterate over that dict's keys.
rss = xmltodict.parse(xml, force_list=("url",))

article_links = [entry["loc"] for entry in rss["urlset"]["url"]]
print(f"Found {len(article_links)} articles")

Found 978 articles


In [12]:
# Preview only the first few links; the full list has hundreds of entries and
# dumping all of them bloats the saved notebook.
article_links[:20]

['https://news.itsfoss.com/zorin-os-16-3-release/',
 'https://news.itsfoss.com/gnome-window-management-plan/',
 'https://news.itsfoss.com/ansible-creator-new-rust-platform/',
 'https://news.itsfoss.com/rare/',
 'https://news.itsfoss.com/inkscape-1-3-release/',
 'https://news.itsfoss.com/black-box-release/',
 'https://news.itsfoss.com/gyroflow/',
 'https://news.itsfoss.com/anytype-open-beta/',
 'https://news.itsfoss.com/linux-mint-21-2/',
 'https://news.itsfoss.com/crystal-linux-dev/',
 'https://news.itsfoss.com/filiming-with-foss-tech/',
 'https://news.itsfoss.com/skiff-mail-review/',
 'https://news.itsfoss.com/almalinux-os-future/',
 'https://news.itsfoss.com/thunderbird-115/',
 'https://news.itsfoss.com/suse-rhel-fork/',
 'https://news.itsfoss.com/geary-44-release/',
 'https://news.itsfoss.com/solus-4-4-released/',
 'https://news.itsfoss.com/blendos-v3-released/',
 'https://news.itsfoss.com/fedora-40-privacy/',
 'https://news.itsfoss.com/openkylin-linux-os/',
 'https://news.itsfoss.c

In [14]:
def extract_content(url, timeout=30):
    """Download an article page and return the text of its main sections.

    The headline, standfirst and body are located by CSS class and their text
    is concatenated in that order.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A ``(text, success)`` tuple. On success ``text`` is the concatenated
        section text and ``success`` is True. If the request fails or any of
        the three sections is missing from the page, returns ``("", False)``.
    """
    selectors = (
        ".c-topper__headline",
        ".c-topper__standfirst",
        ".c-content",
    )

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException:
        # A network or HTTP error on one page is reported as a failed
        # extraction so a long crawl is not aborted half-way through.
        return "", False

    soup = BeautifulSoup(response.text, features="html.parser")
    elements = [soup.select_one(selector) for selector in selectors]

    # select_one yields None for a selector that matches nothing. Check for
    # that explicitly rather than relying on a bare ``except`` around
    # ``None.get_text()``, which would also swallow KeyboardInterrupt.
    if any(element is None for element in elements):
        return "", False

    return "".join(element.get_text() for element in elements), True


In [16]:
# Fetch every article. Each URL gets an entry in `articles` (with empty content
# when extraction fails), and URLs that could not be extracted are also
# collected in `failed`.
articles, failed = [], []
for url in tqdm(article_links, desc="Extracting information from articles"):
    content, success = extract_content(url)
    articles.append(dict(source=url, content=content))
    if success:
        continue
    failed.append(url)

print(f"Failed to extract content from {len(failed)} articles")

Extracting information from articles:   0%|          | 0/978 [00:00<?, ?it/s]

Failed to extract content from 646 articles


In [39]:
# Split each article into overlapping chunks and record, for every chunk, the
# URL of the article it came from.
rec_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
web_docs, meta = [], []

for article in tqdm(articles, desc= "Splitting articles into chunk"):
    # Failed extractions are stored with empty content; skip those and any
    # other article with fewer than 10 characters of text.
    if len(article["content"]) < 10:
        continue
    splits = rec_splitter.split_text(article["content"])
    web_docs.extend(splits)
    # Build one fresh dict per chunk. `[{...}] * len(splits)` would make every
    # chunk of an article share the same dict object, so mutating one chunk's
    # metadata would silently change all of them.
    meta.extend({"source": article["source"]} for _ in splits)

Splitting articles into chunk:   0%|          | 0/978 [00:00<?, ?it/s]

In [47]:
# Embeddings client configured with the key loaded from the environment.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Reuse the configured client. The original built a second OpenAIEmbeddings()
# here without the explicit key and left `embeddings` unused.
article_store = FAISS.from_texts(
    texts=web_docs, embedding=embeddings, metadatas=meta
)

In [50]:
# Conversation memory handed to the QA chain further down.
memory = ConversationBufferMemory(
    return_messages=True,
    # Matches the {chat_history} placeholder used in the prompt template.
    memory_key="chat_history",
    # The chain returns both "answer" and "sources", so the keys to record
    # for each turn are named explicitly.
    input_key="question",
    output_key="answer",
)

In [52]:
# Prompt text for the QA chain. It has three placeholders:
#   {context}      - document text supplied by the chain
#   {chat_history} - earlier conversation turns (same name as the memory_key
#                    configured on ConversationBufferMemory)
#   {question}     - the current user question
template = """You are a chatbot having a conversation with a human.
Given the following extracted parts of a long document and a question, 
create a final answer.
{context}
{chat_history}
Human: {question}
Chatbot:"""

In [53]:
# Wrap the raw template string and declare the three placeholders it expects.
question_prompt = PromptTemplate(
    template=template,
    input_variables=["chat_history", "question", "context"],
)

In [82]:
def print_result(result):
    """Print every entry of ``result``: the key on one line, its value on the next."""
    for name in result:
        # sep="\n" puts the key and the value on consecutive lines.
        print(name, result[name], sep="\n")

In [72]:
# Build the question-answering chain over the FAISS store, with conversation
# memory and the custom question prompt.
article_chain = RetrievalQAWithSourcesChain.from_llm(
    llm=OpenAI(temperature=0.0),
    # The number of chunks to retrieve is configured through search_kwargs;
    # a bare `k=4` keyword on as_retriever is not the documented way to set it.
    retriever=article_store.as_retriever(search_kwargs={"k": 4}),
    memory=memory,
    question_prompt=question_prompt,
)

# First turn of the conversation; only the chain outputs are kept.
result = article_chain({"question": "What is Skiff?"},
                        return_only_outputs=True)

In [83]:
# Show the answer and the sources returned for the first question.
print_result(result)

answer
 Skiff is an open-source, secure email alternative to Gmail and Proton Mail that provides end-to-end encryption, cloud storage, documents, calendar, private documents, wikis, note-taking, real-time collaboration, and the option to publish documents to the public using a generated link. It also supports encrypted cloud storage with IPFS support.

sources
https://news.itsfoss.com/skiff-mail-review/


In [85]:
# Follow-up turn: "its" only makes sense given the previous question, so this
# exercises the conversation memory.
# NOTE(review): in the recorded run the answer is about Skiff but the listed
# sources are unrelated articles - presumably retrieval uses only the bare
# follow-up question, not the chat history; confirm.
follow_up = article_chain(
    {"question": "What are its functionalities?"}, return_only_outputs=True
)
print_result(follow_up)

answer
 Skiff offers a variety of functionalities, including end-to-end encryption, cloud storage, documents, calendar, private documents, wikis, note-taking, real-time collaboration, and the option to publish documents to the public using a generated link. It also supports encrypted cloud storage with IPFS support.

sources
https://news.itsfoss.com/anytype-open-beta/, https://news.itsfoss.com/clipboard/, https://news.itsfoss.com/design-2d-cad/, https://news.itsfoss.com/framework-laptop-open-source-module/
