In [1]:
from bs4 import BeautifulSoup
import requests

In [34]:
def extract_title(soup):
    try:
        # Custom tag 'shreddit-title'
        title_tag = soup.find('shreddit-title')
        if title_tag and 'title' in title_tag.attrs:
            print('')
            return title_tag['title'].strip()

        # Standard HTML title tag
        title_tag = soup.find('title')
        if title_tag:
            return title_tag.get_text(strip=True)

        # Open Graph title
        og_title = soup.find('meta', attrs={'property': 'og:title', 'content': True})
        if og_title:
            return og_title['content'].strip()

        # Twitter-specific title
        twitter_title = soup.find('meta', attrs={'name': 'twitter:title', 'content': True})
        if twitter_title:
            return twitter_title['content'].strip()

        # Schema.org title
        schema_title = soup.find('meta', attrs={'itemprop': 'name', 'content': True})
        if schema_title:
            return schema_title['content'].strip()

        # HTML5 microdata
        microdata_title = soup.find(attrs={'itemprop': 'headline'})
        if microdata_title:
            return microdata_title.get_text(strip=True)

        # Dublin Core title
        dc_title = soup.find('meta', attrs={'name': 'DC.title', 'content': True})
        if dc_title:
            return dc_title['content'].strip()

        # h1 tags
        h1_title = soup.find('h1')
        if h1_title:
            return h1_title.get_text(strip=True)

        # Apple-specific title
        apple_title = soup.find('meta', attrs={'name': 'apple-mobile-web-app-title', 'content': True})
        if apple_title:
            return apple_title['content'].strip()

        # Facebook-specific title
        fb_title = soup.find('meta', attrs={'property': 'fb:title', 'content': True})
        if fb_title:
            return fb_title['content'].strip()

        # h2 tags
        h2_title = soup.find('h2')
        if h2_title:
            return h2_title.get_text(strip=True)

        # Alternative meta tag for title
        alt_meta_title = soup.find('meta', attrs={'name': 'title', 'content': True})
        if alt_meta_title:
            return alt_meta_title['content'].strip()

        # Reddit-specific title
        reddit_title = soup.find('meta', attrs={'property': 'og:title', 'content': True, 'data-reddit': True})
        if reddit_title:
            return reddit_title['content'].strip()

        # Fallback to None if no title is found
        return None
    except Exception as e:
        print(e)
        return None
    
def extract_description(soup):
    try:
        # Standard meta tag with name='description'
        description_tag = soup.find('meta', attrs={'name': 'description'})
        if description_tag and 'content' in description_tag.attrs:
            print('Standard meta tag')
            return description_tag['content'].strip()

        # Open Graph description
        og_description = soup.find('meta', attrs={'property': 'og:description'})
        if og_description and 'content' in og_description.attrs:
            print('Open Graph description')
            return og_description['content'].strip()

        # Twitter card description
        twitter_description = soup.find('meta', attrs={'name': 'twitter:description'})
        if twitter_description and 'content' in twitter_description.attrs:
            print('Twitter card description')
            return twitter_description['content'].strip()

        # Additional meta tag checks
        additional_meta_tags = ['og:title', 'twitter:title', 'keywords']
        for tag in additional_meta_tags:
            result = soup.find('meta', attrs={'property': tag, 'name': tag})
            if result and 'content' in result.attrs:
                print(f'Additional meta tag checks - {tag}')
                return result['content'].strip()

        # HTML tags like p or div with specific IDs or classes (this is highly specific to the website)
        html_tags = [('p', 'description'), ('div', 'description'), ('section', 'description')]
        for tag, class_name in html_tags:
            result = soup.find(tag, {'class': class_name})
            if result:
                print(f'HTML tags like p or div with specific IDs or classes - {tag, class_name}')
                return result.get_text().strip()

        return None
    except Exception as e:
        return None


def extract_keywords(url):
    """
    Extract keywords from the meta tags of a webpage.

    Parameters:
    url (str): The URL of the webpage to extract keywords from.

    Returns:
    list: A list of keywords found in the meta tags.
    """
    try:
        # Fetch the content of the webpage
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for HTTP errors

        # Parse the webpage content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Initialize an empty set to store unique keywords
        keywords = set()

        # Look for meta tags that commonly contain keywords
        for meta in soup.find_all('meta'):
            if 'name' in meta.attrs and meta.attrs['name'].lower() in ['keywords']:
                keywords.update(meta.attrs['content'].split(','))
            elif 'property' in meta.attrs and meta.attrs['property'].lower() in ['og:keywords']:
                keywords.update(meta.attrs['content'].split(','))

        return list(keywords)
    except requests.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return []

# Example usage
url = "https://www.youtube.com/watch?v=9A2ljmmUT_I"
print(extract_keywords(url))

[' SQUARE ENIX', ' MMO', ' ファイナルファンタジー', ' Final Fantasy XIV (Video Game)', 'FF14', ' FFXIV', ' スクウェア・エニックス', ' スクエニ']


In [29]:
url = 'https://github.com/every-day-things/citadel'
response = requests.get(url)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
extract_description(soup)

Standard meta tag


'Manage your ebook library without frustrations. Calibre compatible. - GitHub - every-day-things/citadel: Manage your ebook library without frustrations. Calibre compatible.'

In [6]:
soup.find('div')

True

In [65]:
description_tag = soup.find('meta', attrs={'name': 'description'})
if description_tag and 'content' in description_tag.attrs:
    description = description_tag['content'].strip()

In [75]:
soup.find('meta')

<meta charset="utf-8"/>

In [13]:
from llama_index import SummaryIndex
from llama_index.readers import SimpleWebPageReader
from IPython.display import Markdown, display
import os

In [14]:
documents = SimpleWebPageReader(html_to_text=True).load_data(
    ["https://www.apple.com/iphone-15-pro"]
)

In [15]:
index = SummaryIndex.from_documents(documents)
query_engine = index.as_query_engine()

In [None]:
query_engine.query("what's new in iPhone 15 Pro?")

In [3]:
from llama_index import download_loader

TrafilaturaWebReader = download_loader("TrafilaturaWebReader")

loader = TrafilaturaWebReader()
documents = loader.load_data(urls=['https://www.apple.com/iphone-15-pro'])

Collecting trafilatura~=1.4 (from -r /home/rli/src/pinre/venv/lib/python3.12/site-packages/llama_index/download/llamahub_modules/requirements.txt (line 1))
  Downloading trafilatura-1.6.4-py3-none-any.whl.metadata (16 kB)
Collecting courlan>=0.9.5 (from trafilatura~=1.4->-r /home/rli/src/pinre/venv/lib/python3.12/site-packages/llama_index/download/llamahub_modules/requirements.txt (line 1))
  Downloading courlan-0.9.5-py3-none-any.whl.metadata (18 kB)
Collecting htmldate>=1.6.1 (from trafilatura~=1.4->-r /home/rli/src/pinre/venv/lib/python3.12/site-packages/llama_index/download/llamahub_modules/requirements.txt (line 1))
  Downloading htmldate-1.6.1-py3-none-any.whl.metadata (11 kB)
Collecting justext>=3.0.0 (from trafilatura~=1.4->-r /home/rli/src/pinre/venv/lib/python3.12/site-packages/llama_index/download/llamahub_modules/requirements.txt (line 1))
  Downloading jusText-3.0.0-py2.py3-none-any.whl (837 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m837.8/837.8 kB[

In [9]:
documents[0]

Document(id_='https://www.apple.com/iphone-15-pro', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='c6e306189e0879005ca66c2e2f807ea4cd8cafe54ecd592a222a525527dfe822', text='\n\n  * [Apple](/)\n  *     * [Store](/us/shop/goto/store)\n\n    * [Mac](/mac/)\n\n    * [iPad](/ipad/)\n\n    * [iPhone](/iphone/)\n\n    * [Watch](/watch/)\n\n    * [Vision](/apple-vision-pro/)\n\n    * [AirPods](/airpods/)\n\n    * [TV & Home](/tv-home/)\n\n    * [Entertainment](/entertainment/)\n\n    * [Accessories](/us/shop/goto/buy_accessories)\n\n    * [Support](https://support.apple.com/?cid=gn-ols-home-hp-tab)\n\n  * [ ](/us/search)\n\n  * [](/us/shop/goto/bag)0+\n\n[ iPhone 15 Pro ](/iphone-15-pro/)\n\nLocal Nav Open Menu Local Nav Close Menu\n\n  * Overview\n  * [Switch from Android to iPhone](/iphone-15-pro/switch/)\n  * [Tech Specs](/iphone-15-pro/specs/)\n\n[Buy iPhone 15 Pro](/us/shop/goto/buy_iphone/iphone_15_pro)\n\n# iPhone 15 P

In [16]:
from openai import OpenAI

In [17]:
client = OpenAI()

In [18]:
response = client.chat.completions.create(
  model="gpt-3.5-turbo",
  messages=[
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Who won the world series in 2020?"},
    {"role": "assistant", "content": "The Los Angeles Dodgers won the World Series in 2020."},
    {"role": "user", "content": "Where was it played?"}
  ]
)
response.choices[0].message

ChatCompletionMessage(content='The 2020 World Series was played at Globe Life Field in Arlington, Texas.', role='assistant', function_call=None, tool_calls=None)

In [19]:
import openai
import os

openai.api_key  = os.getenv('OPENAI_API_KEY')

In [20]:
def get_completion(prompt, model="gpt-3.5-turbo"):
    messages = [{"role": "user", "content": prompt}]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0, # this is the degree of randomness of the model's output
    )
    return response.choices[0].message.content

In [105]:
subreddit_keywords = [
    "Humor",             # /r/funny
    "Q&A",   # /r/AskReddit
    "Gaming",            # /r/gaming
    "Cute Content",      # /r/aww
    "Music",             # /r/Music
    "Photography",       # /r/pics
    "World News",        # /r/worldnews
    "Science",           # /r/science
    "Movies",            # /r/movies
    "Interesting Facts", # /r/todayilearned
    "Video Content",     # /r/videos
    "News",              # /r/news
    "Thoughtful Insights", # /r/Showerthoughts
    "Comedy",            # /r/Jokes
    "Food",              # /r/food
    "Science Questions", # /r/askscience
    "Personal Stories",  # /r/IAmA
    "Nature Photography",# /r/EarthPorn
    "GIFs",              # /r/gifs
    "Odd News",          # /r/nottheonion
    "Book Recommendation",             # /r/books
    "DIY Projects",      # /r/DIY
    "Simple Explanations", # /r/explainlikeimfive
    "Life Tips",         # /r/LifeProTips
    "Social Media",      # Additional subreddits start here
    "Car Advice",
    "Car Selection",
    "Safety Insights",
    "Positive Memes",
    "Videos",
    "Memes",
    "GIFs",
    "Entertainment News",
    "Film Discussion",
    "Music Discovery",
    "Music",
    "Future Tech",
    "Science Q&A",
    "Nature",
    "Environment",
    "Hair Care",
    "Beauty Tips",
    "Fashion",
    "Inclusive Beauty",
    "Culinary Delights",
    "Physical Fitness",
    "Nutrition",
    "Motivation",
    "Cosmetics"
]
', '.join(subreddit_keywords)

'Humor, Q&A, Gaming, Cute Content, Music, Photography, World News, Science, Movies, Interesting Facts, Video Content, News, Thoughtful Insights, Comedy, Food, Science Questions, Personal Stories, Nature Photography, GIFs, Odd News, Book Recommendation, DIY Projects, Simple Explanations, Life Tips, Social Media, Car Advice, Car Selection, Safety Insights, Positive Memes, Videos, Memes, GIFs, Entertainment News, Film Discussion, Music Discovery, Music, Future Tech, Science Q&A, Nature, Environment, Hair Care, Beauty Tips, Fashion, Inclusive Beauty, Culinary Delights, Physical Fitness, Nutrition, Motivation, Cosmetics'

In [162]:
url = 'https://games.slashdot.org/story/24/01/14/0257246/atari-will-release-a-mini-edition-of-its-1979-atari-400-which-had-an-8-bit-mos-6502-cpu'
response = requests.get(url)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
title = extract_title(soup)
description = extract_description(soup)

prompt = f'''
List up to 3 Reddit subreddits or online communities the below link in brackets most likely to appear in, output in JSON format the community prediction with probability, leave out the prefix "r/" if present
(
URL: {url}
Title: {title}
Description: {description}
)
'''

print(prompt)
response = get_completion(prompt)
print(response)

Standard meta tag

List up to 3 Reddit subreddits or online communities the below link in brackets most likely to appear in, output in JSON format the community prediction with probability, leave out the prefix "r/" if present
(
URL: https://games.slashdot.org/story/24/01/14/0257246/atari-will-release-a-mini-edition-of-its-1979-atari-400-which-had-an-8-bit-mos-6502-cpu
Title: Atari Will Release a Mini Edition of Its 1979 Atari 400 (Which Had An 8-Bit MOS 6502 CPU) - Slashdot
Description: An 1979 Atari 8-bit system re-released in a tiny form factor?  Yep.  
 Retro Games Ltd. is releasing a "half-sized" version of its very first home computer, the Atari 400, "emulating the whole 8-bit Atari range, including the 400/800, XL and XE series, and the 5200 home console.   ("In 1979 Atari br...
)
{
  "community_predictions": [
    {
      "subreddit": "gaming",
      "probability": 0.8
    },
    {
      "subreddit": "retrogaming",
      "probability": 0.6
    },
    {
      "subreddit": "atari

In [107]:
subreddit_keywords.index('Software Development')

ValueError: 'Software Development' is not in list