In [6]:
# Install dependencies
!pip install -q sentence-transformers tqdm pandas datasets matplotlib

# Imports
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import torch
import re


In [7]:

# Search terms that mark a post as AI-related (matched case-insensitively).
ai_keywords = [
    "artificial intelligence",
    "ai",
    "machine learning",
    "deep learning",
    "neural network",
    "openai",
    "chatgpt",
    "gpt",
    "large language model",
    "llm",
    "transformer model",
    "natural language processing",
    "nlp",
    "automation",
    "robots",
    "robotics",
    "autonomous",
    "reinforcement learning",
    "generative model",
    "text generation",
    "image generation",
]

# Build a single regex: every keyword is escaped, the keywords are OR-ed
# together inside one capturing group, and the group is wrapped in word
# boundaries so that e.g. "ai" does not match inside "said".
_alternation = "|".join(re.escape(keyword) for keyword in ai_keywords)
pattern = re.compile(rf"\b({_alternation})\b", flags=re.IGNORECASE)


# Stream the webis/tldr-17 training split and keep only the posts whose
# content or summary matches the AI keyword regex `pattern`. Streaming mode
# iterates over examples without loading the whole split first.
stream = load_dataset("webis/tldr-17", split="train", streaming=True)

# Fields copied from every matching example, in the final column order.
output_columns = ["normalizedBody", "subreddit", "subreddit_id", "content", "summary"]

filtered_rows = []
max_matches = 50000  # stop as soon as this many matching posts are collected
MAX = 1_400_000      # upper bound on the number of streamed rows to scan

print("Streaming dataset and filtering AI posts...")
# The scan is capped at MAX rows, so MAX is also used as the progress-bar total.
for i, ex in enumerate(tqdm(stream, desc="Filtering AI posts", total=MAX)):
    if i >= MAX:
        print("Hit maximum expected dataset size. Stopping.")
        break
    # Search content and summary together so a hit in either field counts.
    text = f"{ex['content']} {ex['summary']}"
    if pattern.search(text):
        filtered_rows.append({column: ex[column] for column in output_columns})
    if len(filtered_rows) >= max_matches:
        break

print("Converting dataset to pandas DataFrame...")
# Passing `columns=` fixes the column order and still yields a well-formed
# (empty, five-column) frame when nothing matched. Selecting the columns
# after construction would raise KeyError on a column-less empty frame.
df_ai = pd.DataFrame(filtered_rows, columns=output_columns)



Converting dataset to pandas DataFrame...


Filtering AI posts:   0%|          | 0/1327000 [00:00<?, ?it/s]

Hit maximum expected dataset size. Stopping.
Converting dataset to pandas DataFrame...


In [8]:

# Persist the filtered posts to a Parquet file; index=False leaves the
# DataFrame's row index out of the written file.
df_ai.to_parquet(path="ai_posts.parquet", index=False)


In [15]:
from collections import Counter

# Tally how often each AI keyword occurs in the post bodies.
# `pattern` matches case-insensitively, so every hit is lower-cased before
# counting; otherwise "AI", "Ai" and "ai" would be tallied as separate keys
# and the ranking below would be split across casings.
word_counts = Counter()

for text in df_ai["content"]:
    # findall returns the text captured by the pattern's single group.
    matches = [hit.lower() for hit in pattern.findall(str(text))]
    # Print every post that hit the "llm" keyword (any casing) so the
    # matches can be inspected by eye for usages unrelated to language models.
    if "llm" in matches:
        print(f"Found 'LLM' in text: {text}")
    word_counts.update(matches)

print(word_counts.most_common(50))

Found 'LLM' in text: Little Lion Man," Mumford & Sons 
 Storytime: this fall I was driving home on I-85 listening to LLM when I check my speedometer & realize I'm going 95mph in a 70 mph zone. I slow down & glance up into my rearview mirror. BAM, I see flashing blue lights! Ohshitohshitohshit, I get into the right lane, slow down & prepare to pull over when I notice that the cop passes me! A few minutes later I see an eighteen-wheeler pulled over off to the side of the interstate.
Found 'LLM' in text: Yeah, I don't think we have the "genius" investor here. McCaul's biggest reported income comes from Linda McCaul Descendants Trusts, LLM Partners, LLM Family Investments and a few more variations on that. LLM Partners is named in Clear Channel's annual reports; and it's probably not a stretch to guess that LLM stands for "Lester Lowry Mays.
Found 'LLM' in text: Well, there's a lot in there, and I don't personally do enema practice myself, so I hesitate to speak "with authority", as LLM as

In [12]:
# Reload the saved AI posts from the Parquet file and preview the first ten rows.
df_ai = pd.read_parquet(path="ai_posts.parquet")
df_ai.head(n=10)


Unnamed: 0,normalizedBody,subreddit,subreddit_id,content,summary
0,i was thinking about exactly this just yesterd...,science,t5_mouw,i was thinking about exactly this just yesterd...,why are the fishfarm industry so small scale?
1,Upsampling effectively does nothing. It just a...,WeAreTheMusicMakers,t5_2qmah,Upsampling effectively does nothing. It just a...,"upsampling does nothing, downsampling creates ..."
2,The schedule is advanced by 2 weeks at a time....,EA_NHL,t5_2ry2z,The schedule is advanced by 2 weeks at a time....,"we weed out people who disappear, most GMs in ..."
3,"This all day long, i was on with my brother la...",leagueoflegends,t5_2rfxx,"This all day long, i was on with my brother la...",People on Smurf accounts or who CLAIM to be on...
4,My point is that as an engineer and roboticist...,Anarchism,t5_2qh5j,My point is that as an engineer and roboticist...,"no, economics of scale will not always exist."
5,"To me, a lot of mecha anime tends to be too se...",anime,t5_2qh22,"To me, a lot of mecha anime tends to be too se...",90% of mecha shows are too serious to take ser...
6,"Great story, but there was something that shou...",whowouldwin,t5_2s599,"Great story, but there was something that shou...","in a first encounter scenario, i'd give xenomo..."
7,"This is an awesome movie, and was an SF ground...",explainlikeimfive,t5_2sokd,"This is an awesome movie, and was an SF ground...",It's about intelligence and evolution. Watch i...
8,Let me put it this way: I'm not saying we shou...,HPMOR,t5_2t5xf,Let me put it this way: I'm not saying we shou...,there probably won't be a Singularity driven b...
9,Game Developer here. \n An object in C++ game ...,gaming,t5_2qh03,Game Developer here. \n An object in C++ game ...,"they're really there, it's just programming wi..."
