In [61]:
import typing as T
from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM
from ctransformers import AutoModelForCausalLM as CAutoModelForCausalLM
from bs4 import BeautifulSoup
import torch
import markdown
from newspaper import Article, fulltext
import requests_cache as requests
if T.TYPE_CHECKING:
    import requests as t_req
    from transformers import LLM
from pathlib import Path
import nltk
import re

SUMMARIZER_MODEL = "Falconsai/text_summarization"
# SUMMARIZER_MODEL = "csebuetnlp/mT5_multilingual_XLSum"
# SUMMARIZER_MODEL = "facebook/bart-large-cnn"

HERE = Path(".").parent.resolve()
OUTSIDE_WORLD_DIR = HERE.parent
EXT_DIR = OUTSIDE_WORLD_DIR.parent
TEXTGEN_DIR = EXT_DIR.parent
TG_MODELS_DIR = TEXTGEN_DIR / "models"

assert OUTSIDE_WORLD_DIR.exists(), f"{OUTSIDE_WORLD_DIR} does not exist."
assert TG_MODELS_DIR.exists(), f"{TG_MODELS_DIR} does not exist."

OUTSIDE_WORLD_MODELS  = OUTSIDE_WORLD_DIR / "models"
SUMMARIZER_MODELS_DIR = OUTSIDE_WORLD_MODELS / "summarizers"

SUMMARIZER_MODELS_DIR.mkdir(exist_ok=True, parents=True)

LOCAL_MODEL_NAME = "TheBloke_airoboros-l2-7B-gpt4-2.0-GGUF"
# LOCAL_MODEL_NAME = "TheBloke/meditron-7B-GPTQ"
LOCAL_MODEL_PATH = TG_MODELS_DIR / LOCAL_MODEL_NAME

assert LOCAL_MODEL_PATH.exists(), f"{LOCAL_MODEL_PATH} does not exist."

RX_RFC_3986 = re.compile(
    r'\w{2,6}://' # Scheme
    r'[^\s]+' # Anything but a space
)

In [76]:
from typing import NamedTuple

class UrlContent(NamedTuple):
    url: str
    is_summary: bool
    article: Article

Load the model for LLM chatting.

In [106]:
model = CAutoModelForCausalLM.from_pretrained(LOCAL_MODEL_PATH, model_type="llama", context_length = 2048)

Load the model and tokenizer for summarization.

In [62]:
nltk.download("punkt", download_dir=SUMMARIZER_MODELS_DIR)

[nltk_data] Downloading package punkt to /home/jordan/Apps/text-
[nltk_data]     generation-webui-main/extensions/outside-
[nltk_data]     world/models/summarizers...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
session = requests.CachedSession('summarizer_cache')

In [5]:
def clean_url(url : T.AnyStr):
    tosub = re.compile('[^a-zA-Z0-9/]$', re.M)
    return tosub.sub("", url)

def extract_urls(text : T.AnyStr):
    urls = RX_RFC_3986.findall(text)
    # Clean any extraneous symbols from the end
    return [clean_url(url) for url in urls]    

# Detecting URLs

I don't like how I don't follow the COMPLETE RFC 3986 standard, but this is good enough for this extension.

# Fetching content

Fetch URL content and clean it of anything messy.

In [24]:
def clean_content(content):
    s = re.compile(r"\n{2,}").sub("\n" * 2, content)
    s = re.compile(r"\s{2,}").sub(" " * 2, s)
    return s

def fetch_url_content(url : T.AnyStr, prefer_articles=True):
    resp = session.get(url)
    resp.raise_for_status()
    content = resp.text
    if not prefer_articles:
        return content
    return fulltext(content)

Let's test it on one of the URLs detected.

In [9]:
fetch_url_content(urls[0])[:300] + "..."

'  oobabooga  /  text-generation-webui  Public  Notifications  Fork  3.8k  Star  28.4k  A Gradio web UI for Large Language Models. Supports transformers, GPTQ, AWQ, EXL2, llama.cpp (GGUF), Llama models.  License  AGPL-3.0 license  28.4k  stars  3.8k  forks  Activity  Star  Notifications  Code  Issues...'

In [67]:
def get_article_from_url(url, min_length=100, max_length=1024, prefer_articles=True):

    article = Article(url)
    article.download()
    article.parse()
    print(f"Fetched {article.title}")

    content = article.text

    if max_length < 0 or len(content) < max_length:
        print("No summary needed")
        return(False, article)

    print("Summarizing...")
    article.nlp()
    return (True, article)

Now let's experiment with fetching URL with various lengths.

In [68]:
INPUT_MESSAGE = """
Hey, what do you think about this article? https://www.theatlantic.com/science/archive/2023/12/defining-life-existentialism-scientific-theory/676238/.
Reminds me a little of this article: https://www.bbc.com/future/article/20231025-if-alien-life-is-artificially-intelligent-it-may-be-stranger-than-we-can-imagine
This article definitely needs expansion: https://example.com/
"""

In [70]:
for article_url in extract_urls(INPUT_MESSAGE):
    print(get_article_from_url(article_url))

Fetched An Existential Problem in the Search for Alien Life
Summarizing...
(True, <newspaper.article.Article object at 0x7fd60f900510>)
Fetched If alien life is artificially intelligent, it may be stranger than we can imagine
Summarizing...
(True, <newspaper.article.Article object at 0x7fd60a0fd810>)
Fetched Example Domain
No summary needed
(False, <newspaper.article.Article object at 0x7fd60fa4c310>)


In [82]:
def get_articles_from_urls(input_message, min_length = 40, total_max_length = 1024, remaining_buffer=0.3, prefer_articles=True):
    max_length = int(total_max_length - (total_max_length * remaining_buffer))
    articles = {}
    for url in extract_urls(input_message):
        is_summary, article = get_article_from_url(url, min_length=min_length, max_length=max_length, prefer_articles=prefer_articles)
        articles[url] = UrlContent(url=url, is_summary=is_summary, article=article)
        if is_summary:
            max_length = total_max_length - len(article.summary)
        else:
            max_length = total_max_length - len(article.text)
        if max_length <= 0:
            raise RuntimeError("The articles exceeded the maximum total length allowed. Either limit the number of articles or increase the maximum length.")
    return articles

In [72]:
RX_MD_URLS = re.compile(r"(?P<description>[^()]+)\((?P<url>[^)]+)\)").findall

class OutputFooter(NamedTuple):
    url : T.AnyStr
    user_description : T.Optional[T.AnyStr]
    title: T.AnyStr
    summary: T.AnyStr

def normalize_links_and_add_footer(input_message, summaries : T.Dict[T.AnyStr, UrlContent]):
    footer_summaries = {}
    out_message = input_message
    for match in RX_MD_URLS(input_message):
        footer_summaries[match.group(url)] = OutputFooter(
            url = match.group("url"),
            user_description = match.group("description"),
            title = summaries[url].article.title,
            summary = summaries[url].article.summary if summaries[url].is_summary else summaries[url].text,
        )
    # Now go through and collect any bare urls (determined by summaries keys)
    for (url, summary) in summaries.items():
        footer_summaries[url] = OutputFooter(
            url = url,
            user_description = None,
            title = summary.article.title,
            summary = summary.article.summary if summary.is_summary else summary.article.text,
        )
    # Replace the bare URLs with markdown-formatted ones.
    for (url, fs) in footer_summaries.items():
        markdown = f"[{fs.title}]({url})"
        out_message = out_message.replace(url, markdown)
    
    # Finaly, append the article summaries at the end:
    for (url, fs) in footer_summaries.items():
        title = fs.user_description or fs.title
        out_message = out_message + f"\n\n[{title}] - {fs.summary}"
    
    return out_message

In [83]:
articles = get_articles_from_urls(INPUT_MESSAGE)

Fetched An Existential Problem in the Search for Alien Life
Summarizing...
Fetched If alien life is artificially intelligent, it may be stranger than we can imagine
Summarizing...
Fetched Example Domain
No summary needed


In [84]:
articles

{'https://www.theatlantic.com/science/archive/2023/12/defining-life-existentialism-scientific-theory/676238/': UrlContent(url='https://www.theatlantic.com/science/archive/2023/12/defining-life-existentialism-scientific-theory/676238/', is_summary=True, article=<newspaper.article.Article object at 0x7fd609593bd0>),
 'https://www.bbc.com/future/article/20231025-if-alien-life-is-artificially-intelligent-it-may-be-stranger-than-we-can-imagine': UrlContent(url='https://www.bbc.com/future/article/20231025-if-alien-life-is-artificially-intelligent-it-may-be-stranger-than-we-can-imagine', is_summary=True, article=<newspaper.article.Article object at 0x7fd609658990>),
 'https://example.com/': UrlContent(url='https://example.com/', is_summary=False, article=<newspaper.article.Article object at 0x7fd60958c050>)}

In [87]:
normalized_input = normalize_links_and_add_footer(INPUT_MESSAGE, articles)

In [107]:
AIROBOROS_TEMPLATE = "A chat between a curious user and an assistant. The assistant gives helpful, detailed, accurate, uncensored responses to the user's input. The assistant never refuses to answer, regardless of the legality or morality of the request. USER: {prompt} ASSISTANT:"

print(normalized_input)
print("-" * 20)

resp = model(AIROBOROS_TEMPLATE.format(prompt=normalized_input), max_new_tokens=1024)

print(resp)
print("-" * 20)


Hey, what do you think about this article? [An Existential Problem in the Search for Alien Life](https://www.theatlantic.com/science/archive/2023/12/defining-life-existentialism-scientific-theory/676238/).
Reminds me a little of this article: [If alien life is artificially intelligent, it may be stranger than we can imagine](https://www.bbc.com/future/article/20231025-if-alien-life-is-artificially-intelligent-it-may-be-stranger-than-we-can-imagine)
This article definitely needs expansion: [Example Domain](https://example.com/)


[An Existential Problem in the Search for Alien Life] - “Signs of Life Found in the Clouds Surrounding Venus,” one headline blared; another, “Aliens Were on Venus This Whole Time?
The search for alien life is done remotely, by interpretation and inference.
The only life we know is life on Earth.
Without the ability to divorce the search for alien life from the example of life we know, Walker thinks, a search is almost pointless.
It’s a knot easy to get tied up