In [1]:
import os
from dotenv import load_dotenv
from IPython.display import Markdown, display
from openai import OpenAI
from bs4 import BeautifulSoup
import requests
import json

In [2]:
# Initialize and constants

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

MODEL = 'gpt-5-nano'
openai = OpenAI()

# Step1 : The "Fact Extractor" 

In [None]:
def get_clickbait_titles(subreddit , n_top = 20):
    print(f"Scraping headlines from subreddit: {subreddit}...")

    url = f"https://old.reddit.com/r/{subreddit}/"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            print(f"Failed to retrieve content. Status code: {response.status_code}")
            return []

        soup = BeautifulSoup(response.text, 'html.parser')
        
        title_tags = soup.find_all('a', class_='title')
        
        titles = []
        for tag in title_tags:
            titles.append(tag.text)

        print(f"Completed Scraping headlines from subreddit: {subreddit}...")    
            
        return titles[:n_top]

    except Exception as e:
        print(f"An error occurred: {e}")
        return []


In [4]:
titles = get_clickbait_titles('nottheonion', n_top = 20)

Scraping headlines from subreddit: nottheonion...


In [5]:
system_prompt_to_extract_title = """
You are provided with a list of subreddit post titles. Your job is to read these titles and identify the click baity words used.
Clean the title off the click baity words, to get facts. 
And also assign a clickbait score from 0-10 (0- being the lowest clickbaity title and 10- being the most clickbaity). 

please respond in json 
{ "titles": 
[
            {"title":"title","facts":"[fact_1, fact_2]","cb_words":"[amazing , believe]" , "score":10}
            {"title":"title","facts":"[fact_1, fact_2]","cb_words":"[none]" , "score":0}
]
}
"""

In [6]:
def get_non_political_titles_from_subreddit(subreddit):
    user_prompt_to_generate = f"""
Here is all the scrapped data from {subreddit} - 
Please look at these titles and get all titles that are non political. 
respond in json. 
"""

    text = get_clickbait_titles(subreddit)
    user_prompt_to_generate += "\n".join(text)
    return user_prompt_to_generate
    

In [None]:

def get_non_political_titles_clickbait_scores(subreddit):

    response = openai.chat.completions.create(
        model = MODEL, 
        messages = [
            {"role" : "system" , "content": system_prompt_to_extract_title},
            {"role" : "user" , "content" :get_non_political_titles_from_subreddit(subreddit) }
        ], 
        response_format={"type":"json_object"}
    )

    result = response.choices[0].message.content

    print('Completed getting the clickbait scores and facts for the titles.')

    titles = json.loads(result)

    return titles



In [9]:
result = get_non_political_titles_clickbait_scores('savedyouaclick')

Scraping headlines from subreddit: savedyouaclick...


In [20]:
for titles in result['titles'] :
        facts  = titles['facts']
        print(facts)

[Cause of death: pulmonary embolism, Underlying cancer being treated since March]
[Wegovy pill introduced, advertisement]
[TomTom maps article appears to be paid content, not clearly disclosed]
[Adult joke in Disney's Muppets Show, Kermit says we're still working out a few kinks; Sabrina Carpenter comments that she likes a kink too]
[New mom hires Amish nanny, Amish nanny plays hymns on mom's electric keyboard]
[Vispa Limited will disconnect service, UK broadband shutdown]
[Robbie Williams fans warned before Liverpool concert, cold weather could affect queuing]
[Target policy requires employees to smile and make eye contact within 10 feet of customers, policy aims to be welcoming]
[Severance Season 3 release date has bad news, release likely late 2027]
[Amazon pulled Melania from Oregon theater, marquees reference Does Melania wear Prada? and To defeat your enemy, you must know them]
[Disney Retconning The Last Jedi after fan backlash, later works fill gaps in Luke Skywalker's story]
[

# Step 2:  The "Factual jornalist"

In [None]:
system_prompt_to_make_factual_summary = """
You are provided with set of facts. Your job is to make a neutral sounding one or two line summary out of these facts.
Please make the summary as least clickbaity as possible.
"""

def get_user_prompt_to_make_factual_summary(facts):
    user_prompt_to_make_factual_summary = f"""
        Here are some facts - {facts}. Please generate summary for each set of facts. 
        """
    return user_prompt_to_make_factual_summary


def make_factual_summary(subreddit):
    titles_result = get_non_political_titles_clickbait_scores(subreddit)
    for titles in titles_result['titles'] :
        facts  = titles['facts']

        print('Generating factual summary')

        response = openai.chat.completions.create(
            model = MODEL , 
            messages=[
                {"role":"system" , "content":system_prompt_to_make_factual_summary},
                {"role":"user" , "content":get_user_prompt_to_make_factual_summary(facts)}
            ], 

        )

        titles['anti_hype_summary'] = response.choices[0].message.content
        
        print('Completed generating factual summary')

    return titles_result
        



In [None]:
clean = make_factual_summary('savedyouaclick')

print(clean)

Scraping headlines from subreddit: savedyouaclick...


# Step 3: LLM as judge and display as a newspaper

In [None]:
newspaper_system_prompt = """
You are a news paper editor assistant that analyzes several titles, facts , 
and creates a short brochure about the company for prospective customers, investors and recruits.
Respond in markdown without code blocks.
Include details of company culture, customers and careers/jobs if you have the information.
"""