In [1]:
## IMPORTS
import pandas as pd
import requests
import praw
import datetime
import time
import logging
import json
import re
from prawcore.exceptions import RequestException, ResponseException, ServerError
from newspaper import Article
from newsapi import NewsApiClient
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Download necessary NLTK data
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/roryoflynn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/roryoflynn/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/roryoflynn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# NewsAPI initialization
newsKey = '992b1247b5b448e3935baac5af4a841c'  # coryFlynn
newsAPI = NewsApiClient(api_key=newsKey)

# Example search queries (uncomment to test different patterns)
# searchQuery = '"AI art" OR "AI Image" OR "AI Artists" OR "AI-generated art" OR "AI generated images" OR "AI generation" OR "generative AI art" OR "algorithmic art" -OLED -Pendlebury -Pypi.org -The-next-web -byteDance -samsung'
# searchQuery = '"art" OR "painting" OR "drawings" OR "illustration" OR "illustrator" OR "graphic design" OR "animation" OR "fine arts" OR "mural" OR "creative work" OR "art exhibition" OR "museum" OR "gallery" -"AI art" -"AI-generated" -"AI Images" -"machine learning art" -OLED -The-next-web -byteDance -samsung'

# Logging configuration
logging.basicConfig(
    level=logging.INFO,  # see all logs
    format="%(asctime)s - %(levelname)s - %(message)s",
    force=True  # rest config
)

# INIT reddit PRAW API
reddit = praw.Reddit(
    client_id='JBh-NQdNIpzU5_QWija3LQ',
    client_secret='AjBxNAXhL4hPvdWf6xYUpA4sg9ATLg',
    user_agent='script:top_posts_scraper:v1.0 (by u/bingabanggg)'
)

In [3]:
testUrl = "https://www.androidpolice.com/ai-art-in-project-zomboids-update-sparks-community-outrage/"


headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
}

try:
    response = requests.get(testUrl, headers=headers, timeout=10)
    response.raise_for_status()  # yell when theres a bad responses
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract main article content (i skimmed, and sites use <article> or <p>)
    paragraphs = soup.find_all("p")
    full_text = "\n".join([p.get_text() for p in paragraphs])

    print("\n--- Extracted Full Article ---\n")
    print(full_text[:1000])  # Print first 1000 characters

except requests.exceptions.RequestException as e:
    print(f"Error fetching article: {e}")


--- Extracted Full Article ---

Project Zomboid's anticipated Build 42 introduced more than gameplay enhancements. The indie game's developer, The Indie Stone, faced backlash after players suspected generative AI was used in the game's new title and loading screen artwork. Before any proof emerged, accusations escalated to harassment, conspiracy theories, and hostility.
This controversy highlights the growing debates surrounding AI creativity for artists using generative tools and developers dealing with those opposed to their use. The community's reaction to Build 42 artwork sheds light on broader issues within the anti-AI art movement. It underscores the importance of redirecting efforts toward advocating for regulations on corporations while supporting small independent creators rather than unfairly targeting them.
Stop passing off machine learning tricks as real AI improvements
Players on Steam and Reddit claimed inconsistencies in the new artwork were telltale signs of generative

In [4]:
def fetchNewsArticles(searchQuery, pageSize=20, sortBy="relevancy", testMode=False, fromDate=None):
    """
    request to newsAPI and save as JSON file with all data.
    testmode limits the call to one page
    """
    allArticles = []
    maxPages = 1 if testMode else 5  # Test mode limits to 1 page for quick testing
    if not fromDate:
        fromDate = (datetime.datetime.now() - datetime.timedelta(days=30)).strftime('%Y-%m-%d')  # Default to last 30 days

    for page in range(1, maxPages + 1):  # get max amnt of pages (API thing)
        try:
            # tell what page we're on, make api request
            print(f"Fetching page {page}...")

            response = newsAPI.get_everything(
                q=searchQuery,
                language="en",
                sort_by=sortBy,
                page=page,
                page_size=pageSize,
                from_param=fromDate,
            )

            if "articles" in response:  # return count of articles retreived per page
                articlesCount = len(response["articles"])
                print(f"Page {page} returned {articlesCount} articles.")

                # Extract relevant fields
                for article in response["articles"]:
                    # each of these has a value for if it doesnt exist - this helps flagging articles for data cleaning review later
                    structured_article = {
                        "source": article["source"]["name"] if article.get("source") else "Unknown", # conditional: if source exists, get name, else "unknown"
                        "author": article.get("author", "Unknown"),
                        "title": article.get("title", "No Title"),
                        "description": article.get("description", "No Description"),
                        "content": article.get("content", "No Content Available"),
                        "url": article.get("url", "No URL"),
                        "publishedAt": article.get("publishedAt", "No Date"),
                    }
                    allArticles.append(structured_article)
            else:
                print(f"Page {page} returned no articles.")

        except Exception as e:
            print(f"Error fetching page {page}: {e}")
            break

        # Ratelimit safety
        time.sleep(2)

    # timestamp of file (for file management craziness)
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    jsonFileName = f"newsAPI_{timestamp}.json"

    # save and return as JSON
    with open(jsonFileName, "w", encoding="utf-8") as jsonFile:
        json.dump(allArticles, jsonFile, indent=4, ensure_ascii=False)

    print(f"Articles saved to {jsonFileName}")
    return jsonFileName


In [5]:
def loadArticlesFromJSON(jsonFile):
    """
    Reads output JSON from fetchNewsArticles articles and converts it to a df
    """
    with open(jsonFile, "r", encoding="utf-8") as file:
        articles = json.load(file)

    df = pd.DataFrame(articles)

    # make sure all columns are as they should be (otherwise add them with None values to be flagged later)
    expectedColumns = ["source", "author", "title", "description", "content", "url", "publishedAt"]
    for col in expectedColumns:
        if col not in df:
            df[col] = None

        if col not in df:
            df[col] = None

    # lambda to each element in source column of DF - checks if element is dict with key 'name' - if yes, return name if not, return unknown.
    # if isinstance(source, dict) and "name" in source:
    #     return source["name"]
    # else:
    #     return "Unknown"
    df["source"] = df["source"].apply(lambda x: x["name"] if isinstance(x, dict) and "name" in x else "Unknown")

    return df

In [6]:
def checkExactKeyword(text):
    if not isinstance(text, str):
        return False
    # pad with spaces *as-is* (case preserved)
    padded = f" {text} "
    # tests for lower-case keywords inside THE EXACT padded text
    return any(f" {kw} " in padded for kw in Keywords)


def filterAndProcessArticles(df):
    """
    goes article by article to check for keywords actually IN the article content/body.
    then flags for bad-data and has you check them one by one (usually only a dozen will need checking.)
    """
    # Precompile regex pattern for exact keyword matching
    keyword_pattern = re.compile(
        r'\b(' + '|'.join(re.escape(k) for k in Keywords) + r')\b',
        flags=re.IGNORECASE
    )

    # drop any rows missing content
    df.dropna(subset=['title', 'description', 'content', 'url'], inplace=True)

    # keyword checks on all text columns
    df["RelevantTitle"] = df["title"].apply(
        lambda text: bool(keyword_pattern.search(text)) if isinstance(text, str) else False
    )
    df["RelevantContent"] = df["content"].apply(
        lambda text: bool(keyword_pattern.search(text)) if isinstance(text, str) else False
    )
    df["RelevantDescription"] = df["description"].apply(
        lambda text: bool(keyword_pattern.search(text)) if isinstance(text, str) else False
    )

    # articles that dont match any keywords are flagged (new columns)
    df["KeywordMatch"] = df[["RelevantTitle", "RelevantContent", "RelevantDescription"]].any(axis=1)

    # all articles from keyword process that got flagged are grabbed for manual review
    # all articles that have less than 50 chars are also flagged for review
    df["ContentLength"] = df["content"].str.len().fillna(0) + df["description"].str.len().fillna(0)
    df["NeedsManualReview"] = (df["ContentLength"] < 50) | (~df["KeywordMatch"])

    return df

In [7]:

def fetchFullArticle(url):
    """
    given url will use newspaper3k first then bs4 to try and get full article text.
    """
    try:
        article = Article(url)
        article.download()
        article.parse()

        # check if the len grabbed is significant len
        if len(article.text) > 100:
            return article.text.strip()

        # if newspaper3k falls thru then use bs4
        else:
            print(f"Newspaper3k extracted too little content for {url}. Trying BS4 fallback...")
    except Exception as e:
        print(f"Newspaper3k failed for {url}: {e}. Trying BS4 fallback...")

    # using bs4 instead
    try:
        response = requests.get(url, headers=headers, timeout=10)

        # skip if blocked
        if response.status_code in [403, 429] or "consent" in response.url:
            print(f" Skipping (Blocked/Consent Page): {url}")
            return None

        response.raise_for_status()

        # find p tags and extract text.
        soup = BeautifulSoup(response.text, 'html.parser')
        paragraphs = soup.find_all("p")
        fullText = "\n".join([p.get_text() for p in paragraphs]).strip()

        return fullText if len(fullText) > 100 else None  # minimum length for results so that bad articles automatically get cut

    except Exception as e:
        print(f"BS4 also failed for {url}: {e}")
        return None

In [8]:
def saveProcessedData(df, cleanFileName, rejectedFileName):
    """
    Takes a DataFrame with a 'NeedsManualReview' column and writes two CSVs:
      - approvedRows (NeedsManualReview == False) into cleanFileName
      - rejectedRows (NeedsManualReview == True) into rejectedFileName
    """
    approvedRows = df[df["NeedsManualReview"] == False]
    rejectedRows = df[df["NeedsManualReview"] == True]

    approvedRows.to_csv(cleanFileName, index=False)
    rejectedRows.to_csv(rejectedFileName, index=False)

    print(f"Clean articles saved to {cleanFileName}")
    print(f"Rejected articles saved to {rejectedFileName}")


In [9]:
#master function to run everything
def processAllArticles(jsonFile):
    """
    Master function that loads, processes, and saves articles from a NewsAPI JSON file.
    Ensures every article gets full-length content.
    """
    print("Loading JSON data...")
    df = loadArticlesFromJSON(jsonFile)

    print("Filtering and processing articles...")
    df = filterAndProcessArticles(df)

    print("Fetch full article content for given articles")
    df["FullArticleContent"] = df["url"].apply(fetchFullArticle)

    # get output filenames from the JSON input
    base = jsonFile.rsplit('.', 1)[0]
    cleanFileName = f"{base}_clean.csv"
    rejectedFileName = f"{base}_rejected.csv"

    print("saving processed data...")
    saveProcessedData(df, cleanFileName, rejectedFileName)

    print("file complete.")

In [11]:
##FIRST GET ARTICLES ABOUT ART ONLY TO ADD TO THE OLD ARTICLES DATA
searchQuery = (
    '"art" OR "painting" OR "drawings" OR "drawing" OR "Mural" OR "artwork" OR "illustration" OR "illustrator" OR "graphic design" '
    'OR "animation" OR "fine arts" OR "mural" OR "creative work" OR "art exhibition" '
    'OR "museum" OR "gallery" '
    '-"AI art" -"AI-generated" -"AI Images" -"machine learning art" '
    '-"OLED" -"The-next-web" -"byteDance" -"samsung"'
)

json_file = fetchNewsArticles(
    searchQuery=searchQuery,
    pageSize=20,# 20 articles per page
    sortBy="relevancy",
    testMode=False,
    fromDate=None,  # None means last 30 days
)

Fetching page 1...
Page 1 returned 18 articles.
Fetching page 2...
Page 2 returned 19 articles.
Fetching page 3...
Page 3 returned 16 articles.
Fetching page 4...
Page 4 returned 18 articles.
Fetching page 5...
Page 5 returned 18 articles.
Articles saved to newsAPI_2025-04-26_13-08-27.json


In [16]:
#the words from the query are the keywords now
Keywords = [
    "art", "painting", "drawings", "illustration", "illustrator",
    "graphic design", "animation", "fine arts", "mural", "creative work",
    "art exhibition", "museum", "gallery"
]

newArtArticles = processAllArticles('newsAPI_2025-04-26_13-08-27.json')

Loading JSON data...
Filtering and processing articles...
Fetch full article content for given articles
Newspaper3k failed for https://www.forbes.com/sites/chaddscott/2025/04/15/diego-rivera-frida-kahlo-and-paris-in-two-exhibitions/: Article `download()` failed with 403 Client Error: Max restarts limit reached for url: https://www.forbes.com/sites/chaddscott/2025/04/15/diego-rivera-frida-kahlo-and-paris-in-two-exhibitions/ on URL https://www.forbes.com/sites/chaddscott/2025/04/15/diego-rivera-frida-kahlo-and-paris-in-two-exhibitions/. Trying BS4 fallback...
Newspaper3k failed for https://www.forbes.com/sites/robinraven/2025/04/06/msc-cruises-reveals-worlds-biggest-cruise-terminal-and-innovative-art-exhibition/: Article `download()` failed with 403 Client Error: Max restarts limit reached for url: https://www.forbes.com/sites/robinraven/2025/04/06/msc-cruises-reveals-worlds-biggest-cruise-terminal-and-innovative-art-exhibition/ on URL https://www.forbes.com/sites/robinraven/2025/04/06/m

In [28]:
df3 = pd.read_csv('newsAPI_2025-04-26_13-08-27_clean.csv')
df3['LABEL'] = 'art'

df3.sample(5)  # Print first 5 rows of the DataFrame




Unnamed: 0,source,author,title,description,content,url,publishedAt,RelevantTitle,RelevantContent,RelevantDescription,KeywordMatch,ContentLength,NeedsManualReview,FullArticleContent,LABEL
29,Unknown,Debbie Elliott,Rosie the Riveters honored for service in WWII,The National World War Two Museum and the Gary...,NEW ORLEANS A hero's welcome greets 18 women a...,https://www.npr.org/2025/03/30/nx-s1-5332291/r...,2025-03-30T09:00:00Z,False,True,True,True,420,False,Rosie the Riveters honored for service in WWII...,art
22,Unknown,info@hypebeast.com (Hypebeast),Wassily Kandinsky's Sketchbooks Offer New Insi...,"Wassily Kandinsky, best known as the pioneer o...","Wassily Kandinsky, best known as the pioneer o...",https://hypebeast.com/2025/4/wassily-kandinsky...,2025-04-07T21:38:38Z,True,True,True,True,473,False,"Wassily Kandinsky, best known as the pioneer o...",art
6,Unknown,,Rare Tudor paintings show 'fantastical beasts',The art is found behind plasterwork at a build...,"Catherine LeeBBC News, North East and Cumbria\...",https://www.bbc.com/news/articles/cd6jv5jdp6zo,2025-04-04T06:31:24Z,False,False,True,True,311,False,Rare Tudor paintings show 'fantastical beasts'...,art
16,Unknown,Josh Rubin,St. Lucia’s Sugar Beach Is an Art Lover’s Para...,Art meets luxury at a Viceroy Hotels’ newest l...,Art meets luxury at a Viceroy Hotels’ newest l...,https://coolhunting.com/travel/st-lucias-sugar...,2025-04-01T12:49:27Z,True,True,True,True,405,False,Read Travel St. Lucia’s Sugar Beach Is an Art ...,art
60,Unknown,news@appleinsider.com (Malcolm Owen),Inside Apple Via del Corso -- Rome's store tha...,A visit to the marble-covered Apple Via del Co...,A visit to the marble-covered Apple Via del Co...,https://appleinsider.com/articles/25/04/10/ins...,2025-04-10T16:39:13Z,True,False,False,True,474,False,"The outside of Apple Via del Corso in Rome, It...",art


In [None]:
df3 = df3[['LABEL', 'content', 'FullArticleContent']].copy()
df3.rename(columns={
    'content': 'newsApiContent',
    'FullArticleContent': 'fullArticleContent'
}, inplace=True)

df3.sample(20)

In [33]:
df3.to_csv('VIDEOartnewsAPI.csv', index=False)

In [None]:
#combined ebery single query from the first pass to make the boradest possible thing.

searchQuery = (
    '"AI art" OR "AI-generated art" OR "generative art" OR "machine learning art" '
    'OR "artificial intelligence art" OR "algorithmic art" OR "Midjourney" '
    'OR "Stable Diffusion" OR "DALL-E" OR "Artbreeder" '
    '-"OLED" -"Pendlebury" -"Pypi.org" -"The-next-web" -"byteDance" -"samsung"'
)


json_file = fetchNewsArticles(
    searchQuery=searchQuery,
    pageSize=20,# 20 articles per page
    sortBy="relevancy",
    testMode=False
)


In [None]:

Keywords = [
    "AI art", "AI-generated art", "generative art", "machine learning art", "artificial intelligence art",
    "algorithmic art","Midjourney", "Stable Diffusion", "DALL-E", "Artbreeder"
]

newArtArticles = processAllArticles('NewsArticles-AIArt-042225.json')
pd.read_csv("newsAPI_2025-04-26_13-08-27_clean.csv")

In [18]:
#add the new data into the old data for combo articles df
df_ai  = pd.read_csv('newsAPI_2025-04-26_13-08-27_clean.csv')
df_art = pd.read_csv('newsAPI_2025-04-26_13-08-27_clean.csv')

df_ai['LABEL']  = 'AIart'
df_art['LABEL'] = 'art'

df_all = pd.concat([df_ai, df_art], ignore_index=True)

cols = ['LABEL'] + [c for c in df_all.columns if c != 'LABEL']
df_all = df_all[cols]
print(df_all.columns)

Index(['LABEL', 'source', 'author', 'title', 'description', 'content', 'url',
       'publishedAt', 'RelevantTitle', 'RelevantContent',
       'RelevantDescription', 'KeywordMatch', 'ContentLength',
       'NeedsManualReview', 'FullArticleContent'],
      dtype='object')


In [None]:
#these are the original cleaned articles I got from the mod 1 I wanted to add more data to this hence the old code refactoring
labeledArtArticles = pd.read_csv('LabeledArtArticles.csv')
print(labeledArtArticles.columns)

In [None]:
df_mod3 = df_all[['LABEL', 'content', 'FullArticleContent']].copy()
df_mod3.rename(columns={
    'content': 'newsApiContent',
    'FullArticleContent': 'fullArticleContent'
}, inplace=True)

combined = pd.concat([labeledArtArticles, df_mod3], ignore_index=True)


In [None]:

print(len(combined))

In [None]:
ai_keywords = [
    "artificial intelligence", "machine learning", "deep learning",
    "neural network", "generative", "algorithmic", "gpt", "dall[- ]e",
    "stable diffusion", "midjourney", "artbreeder", "clip", "biggan",
    "bert", "transformer"
]
art_keywords = [
    "art", "painting", "drawing", "sculpture", "illustration",
    "photograph", "photography", "gallery", "exhibition",
    "digital art", "fine art", "graphic design", "mixed media",
    "contemporary art", "abstract art"
]


#  - matches AI as a whole word (\bAI\b)
#  - OR AI when immediately followed by a hyphen (\bAI(?=-))
#  - OR any of the other keywords as whole words (\b(...)\b)
pattern = re.compile(
    r'(?:\bAI\b|\bAI(?=-)|\b(?:' +
      '|'.join(map(re.escape, ai_keywords + art_keywords)) +
    r')\b)',
    flags=re.IGNORECASE
)


#any article that mentions any keyword ONCE will be flagged as relevant
initial_count = len(df)
keep_mask = df.apply(
    lambda row: bool(
        pattern.search(str(row.get('cleanedContent',''))) or
        pattern.search(str(row.get('title',''))) or
        pattern.search(str(row.get('description','')))
    ),
    axis=1
)


removed_count   = initial_count - keep_mask.sum()
remaining_count = keep_mask.sum()

print(f"Removed: {removed_count}, Remaining: {remaining_count}")


In [None]:
mod3NewsDataFINAL = df[keep_mask].copy()
mod3NewsDataFINAL.to_csv('mod3NewsDataFINAL.csv', index=False)

In [None]:
len(df)

In [None]:
#the words from the query are the keywords now
Keywords = [
    "art", "painting", "drawings", "illustration", "illustrator",
    "graphic design", "animation", "fine arts", "mural", "creative work",
    "art exhibition", "museum", "gallery"
]

newArtArticles = processAllArticles('NewsArticles-ART-042225.json')

In [None]:
def has_word(s):
    return bool(re.search(r'\w', str(s)))

redditAIPosts = pd.read_csv('AIposts-noKW.csv')

In [None]:
redditArtPosts = pd.read_csv('ArtPosts-noKW.csv')

# any reddit post that doesnt have any words in the title, description or content is dropped
def has_word(s):
    return bool(re.search(r'\w', str(s)))

mask_desc = redditArtPosts['description'].fillna('').astype(str).apply(has_word)
mask_cont = redditArtPosts['content'].   fillna('').astype(str).apply(has_word)
keep_mask = mask_desc | mask_cont

print(f"Rows before: {len(redditArtPosts)}, removed: {len(redditArtPosts)-keep_mask.sum()}, remaining: {keep_mask.sum()}")

# Filter without losing any columns
redditArtPosts1 = redditArtPosts.loc[keep_mask].reset_index(drop=True)

redditArtPosts1['content'] = redditArtPosts1['content'].fillna('').astype(str)

redditArtPosts1.sample(30)


In [None]:

redditAiPosts = pd.read_csv('AIposts-noKW.csv')

#one of the two text columns needs to have text
def has_word(s):
    return bool(re.search(r'\w', str(s)))

mask_desc = redditAiPosts['description'].fillna('').astype(str).apply(has_word)
mask_cont = redditAiPosts['content'].   fillna('').astype(str).apply(has_word)
keep_mask = mask_desc | mask_cont

print(f"Rows before: {len(redditAiPosts)}, removed: {len(redditAiPosts)-keep_mask.sum()}, remaining: {keep_mask.sum()}")

redditAiPosts1 = redditAiPosts.loc[keep_mask].reset_index(drop=True)

redditAiPosts1['content'] = redditAiPosts1['content'].fillna('').astype(str)

redditAiPosts1.sample(30)


In [None]:
print(len(redditAiPosts1))
print(len(redditArtPosts1))


In [None]:
mod3RedditDataFINAL = pd.concat([redditAiPosts1, redditArtPosts1], ignore_index=True)


In [None]:
#omg the labels
mod3RedditDataFINAL['LABEL'] = mod3RedditDataFINAL['LABEL'].replace({
    'ART': 'art',
    'Art': 'art',
    'AIArt': 'AIart',
})

print(mod3RedditDataFINAL['LABEL'].value_counts())

In [None]:
mod3RedditDataFINAL.to_csv('mod3RedditDataFINAL.csv', index=False)

In [None]:
len (redditAiPosts1)

In [17]:
pd.read_csv("newsAPI_2025-04-26_13-08-27_clean.csv")

Unnamed: 0,source,author,title,description,content,url,publishedAt,RelevantTitle,RelevantContent,RelevantDescription,KeywordMatch,ContentLength,NeedsManualReview,FullArticleContent
0,Unknown,Elliot Williams,Contagious Ideas,We ran a story about a wall-mounted plotter bo...,We ran a story about a wall-mounted plotter bo...,https://hackaday.com/2025/03/29/contagious-ideas/,2025-03-29T14:00:47Z,False,True,True,True,412,False,We ran a story about a wall-mounted plotter bo...
1,Unknown,Sarah Larson,David Byrne Takes the Stairs,The Talking Heads front man brought his acryli...,"At the Pace gallery in Chelsea in early April,...",https://www.newyorker.com/magazine/2025/04/21/...,2025-04-14T10:00:00Z,False,True,True,True,392,False,"At the Pace gallery in Chelsea in early April,..."
2,Unknown,,The ex-city trader who sold Malala painting fo...,Alexandra Johnson describes how she tackled bo...,Shivani Chaudhari\r\nAlexandra Johnson has sol...,https://www.bbc.com/news/articles/cvgn50rqrlgo,2025-04-20T05:35:06Z,True,False,False,True,299,False,The ex-city trader who sold Malala painting fo...
3,Unknown,Sabina Graves,Bid on Animation History and Rarities to Raise...,"DreamWorks Animation, Sony Pictures Animation,...",The animation industry in partnership with ASI...,https://gizmodo.com/bid-on-animation-history-a...,2025-04-24T23:45:22Z,True,True,True,True,358,False,The animation industry in partnership with ASI...
4,Unknown,Talia Lakritz,"See inside Ned's Club, an elite private club i...","A spinoff of Soho House, Ned's Club is a luxe ...",The Gallery at Ned's Club.Frank Frances\r\n<ul...,https://www.businessinsider.com/private-member...,2025-04-11T11:47:01Z,False,True,False,True,339,False,This story is available exclusively to Busines...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,Unknown,Benjamin Zhang,I visited a rare Boeing 747 that Delta saved f...,I toured the Boeing 747 Experience at the Delt...,The 747 Experience at the Delta Flight Museum ...,https://www.businessinsider.com/boeing-747-jum...,2025-04-04T15:01:46Z,False,True,True,True,360,False,The 747 Experience at the Delta Flight Museum ...
60,Unknown,news@appleinsider.com (Malcolm Owen),Inside Apple Via del Corso -- Rome's store tha...,A visit to the marble-covered Apple Via del Co...,A visit to the marble-covered Apple Via del Co...,https://appleinsider.com/articles/25/04/10/ins...,2025-04-10T16:39:13Z,True,False,False,True,474,False,"The outside of Apple Via del Corso in Rome, It..."
61,Unknown,,Typographic Pictures Composed Entirely of Bras...,The strange art of arranging typographic rule ...,Typographic Portrait of Jean Sibelius Composed...,https://blog.glyphdrawing.club/typographic-pic...,2025-04-13T05:04:01Z,False,True,True,True,289,False,Typographic Portrait of Jean Sibelius Composed...
62,Unknown,Shannon Carroll,11 of the world’s most expensive and unique ar...,Some of these collections are worth over a bil...,A picture may be worth a thousand words but on...,https://qz.com/most-expensive-unique-art-colle...,2025-04-03T09:00:00Z,True,True,False,True,339,False,A picture may be worth a thousand words — but ...


In [None]:
## REDDIT HELPER FUNCTIONS
def isBotComment(commentText):
    botPhrases = ["i am a bot", "automated", "bot", "auto-mod"]
    return any(phrase in commentText for phrase in botPhrases)

def handleRateLimit(waitSeconds=60):
    logging.warning(f"Rate limit reached. Waiting for {waitSeconds} seconds...")
    time.sleep(waitSeconds)

def savePost(postData, filePath):
    """append to jsonl"""
    try:
        with open(filePath, 'a', encoding='utf-8') as f:
            f.write(json.dumps(postData, ensure_ascii=False) + '\n')
    except IOError as e:
        logging.error(f"Failed to write to file {filePath}: {e}")

def countJsonlLines(filePath):
    """returns the number of lines in the json file if it exists (filename issues lol)"""
    if os.path.exists(filePath):
        with open(filePath, 'r', encoding='utf-8') as f:
            return sum(1 for _ in f)
    return 0

def loadExistingData(filePath):
    """makes sure json file exists before loading"""
    if not os.path.exists(filePath):
        return []
    with open(filePath, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f]


In [None]:
# ## commented out bc i dont need it anymore
# def processPosts(posts, AIPostFile, nonAIPostFile, processedPostIDs, aiKeywords):
#     """
#     made to clean up the old reddit data i scraped before I cleaned up my process.
#     takes in a list of keywords to search for in a post's text content, and uses regex to be sure that word does or does not appear in the post.
#     searches the comments then sorts by content in comments
#     """
#     keywordPattern = re.compile(r'\b(' + '|'.join(re.escape(k) for k in aiKeywords) + r')\b', re.IGNORECASE)

#     for post in posts:
#         try:
#             if post.id in processedPostIDs:
#                 continue
#             processedPostIDs.add(post.id)

#             postText = f"{post.title} {post.selftext}".strip().lower()

#             # Initialize post data
#             postData = {
#                 "postId": post.id,
#                 "title": post.title,
#                 "selftext": post.selftext,
#                 "score": post.score,
#                 "numComments": post.num_comments,
#                 "subreddit": post.subreddit.display_name,
#                 "createdUtc": post.created_utc,
#                 "url": post.url,
#                 "comments": []
#             }

#             # Fetch comments and remove bot-generated ones
#             post.comments.replace_more(limit=0)
#             allComments = post.comments.list()

#             # Separate top and controversial comments
#             topComments = [c for c in allComments[:25] if not isBotComment(c.body.lower())]
#             controversialComments = [c for c in allComments[-10:] if not isBotComment(c.body.lower())]

#             # Filter comments: Must have at least 3 words & score > 3
#             filteredComments = [
#                 c for c in (topComments + controversialComments)
#                 if len(c.body.split()) >= 3 and c.score > 3
#             ][:15]  # Keep max 15 comments

#             for comment in filteredComments:
#                 postData["comments"].append({
#                     "commentId": comment.id,
#                     "body": comment.body,
#                     "score": comment.score,
#                     "createdUtc": comment.created_utc
#                 })

#             # Skip posts with no text and insufficient comments
#             if not post.selftext.strip() and len(postData["comments"]) < 5:
#                 logging.info(f"Skipping post {post.id}: No text and insufficient comments.")
#                 continue

#             # determine post related-ness
#             isAiRelated = bool(keywordPattern.search(postText)) or any(
#                 keywordPattern.search(comment["body"]) for comment in postData["comments"]
#             )

#             # Save to appropriate file
#             if isAiRelated:
#                 savePost(postData, AIPostFile)
#                 logging.info(f"AI-related post saved: {post.id}")
#             else:
#                 savePost(postData, nonAIPostFile)
#                 logging.info(f"Non-AI-related post saved: {post.id}")

#             time.sleep(2)  # Rate-limit API requests

#         except (RequestException, ResponseException, ServerError) as e:
#             logging.error(f"API Error: {e}. Retrying after delay.")
#             time.sleep(60)  # Prevent API block
#             continue
#         except Exception as e:
#             logging.error(f"Unexpected error processing post {post.id}: {e}")
#             continue

In [None]:
def scrapeSubreddit(subredditName):
    """
    goes through posts and sorts by ai related and nonAI related posts in a given subreddit.
    grabs from hot , controverisal, and top and stops when it has 150 total posts related to AI
    """
    logging.info(f"Starting scrape for subreddit: {subredditName}")

    AIPostFile = f"{subredditName}_AIposts.json"
    nonAIPostFile = f"{subredditName}_nonAIPosts.json"

    maxAIPosts = 150
    topLimit = 70  # ALL TIME
    controversialLimit = 60  # ALL-TIME
    hotLimit = 20  # RECENT

    processedPostIDs = set()
    maxRetries = 3
    retryCount = 0

    try:
        subreddit = reddit.subreddit(subredditName)
        totalCollectedAI = countJsonlLines(AIPostFile)

        # grab posts from TOP
        if totalCollectedAI < maxAIPosts:
            logging.info(f"Fetching {topLimit} posts from TOP...")
            processPosts(subreddit.top(time_filter="all", limit=topLimit),
                         AIPostFile, nonAIPostFile, processedPostIDs, aiKeywords)

        # grab posts from CONTROVERSIAL
        totalCollectedAI = len(loadExistingData(AIPostFile))
        if totalCollectedAI < maxAIPosts:
            logging.info(f"Fetching {controversialLimit} posts from CONTROVERSIAL...")
            processPosts(subreddit.controversial(time_filter="all", limit=controversialLimit),
                         AIPostFile, nonAIPostFile, processedPostIDs, aiKeywords)

        # grab posts from HOT
        totalCollectedAI = len(loadExistingData(AIPostFile))
        if totalCollectedAI < maxAIPosts:
            logging.info(f"Fetching {hotLimit} posts from HOT...")
            processPosts(subreddit.hot(limit=hotLimit),
                         AIPostFile, nonAIPostFile, processedPostIDs, aiKeywords)

        # backfill if needed
        while totalCollectedAI < maxAIPosts:
            neededPosts = maxAIPosts - totalCollectedAI
            logging.info(f"Filling gap with {neededPosts} additional TOP posts...")
            processPosts(subreddit.top(time_filter="all", limit=neededPosts),
                         AIPostFile, nonAIPostFile, processedPostIDs, aiKeywords)

            newTotal = len(loadExistingData(AIPostFile))
            if newTotal == totalCollectedAI:
                retryCount += 1
                logging.warning(f"No new AI posts found. Retry attempt {retryCount}/{maxRetries}.")
            else:
                totalCollectedAI = newTotal
                retryCount = 0

            if retryCount >= maxRetries:
                logging.warning("Max retries reached. Not enough AI posts in this subreddit. Stopping scrape.")
                break

    except (RequestException, ResponseException, ServerError) as e:
        logging.error(f"Reddit API error: {e}. Retrying...")
        handleRateLimit()
    except Exception as e:
        logging.error(f"Unexpected error: {e}")

    logging.info(f"Finished scraping {subredditName}. Total AI posts collected: {countJsonlLines(AIPostFile)}")