In [20]:
# Install dependencies
!pip install -q sentence-transformers tqdm pandas datasets matplotlib

# Imports
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import torch
import re


In [21]:

ai_keywords = [
    "artificial intelligence", "ai", "machine learning", "deep learning", "neural network",
    "openai", "large language model", "llm", "transformer model",
    "natural language processing", "nlp", "automation", "robots", "robotics", "autonomous",
    "reinforcement learning", "generative model", "text generation", "image generation", "mlp"
]
pattern = re.compile(r'\b(' + '|'.join(map(re.escape, ai_keywords)) + r')\b', re.IGNORECASE)


stream = load_dataset("webis/tldr-17", split="train", streaming=True)
filtered_rows = []
max_matches = 50000
MAX = 1_400_000
print("Converting dataset to pandas DataFrame...")
for i, ex in enumerate(tqdm(stream, desc="Filtering AI posts")):
    # if i >= MAX:
    #     print("Hit maximum expected dataset size. Stopping.")
    #     break
    text = ex['normalizedBody']
    if pattern.search(text):
        filtered_rows.append({
            "normalizedBody": ex["normalizedBody"],
            "subreddit": ex["subreddit"],
            "subreddit_id": ex["subreddit_id"],
            "content": ex["content"],
            "summary": ex["summary"]
        })
    if len(filtered_rows) >= max_matches:
        print(f"Reached maximum matches: {max_matches}. Stopping.")
        break
print("Converting dataset to pandas DataFrame...")
# Convert to pandas DataFrame
df_ai = pd.DataFrame(filtered_rows)

df_ai = df_ai[["normalizedBody", "subreddit", "subreddit_id", "content", "summary"]]


Converting dataset to pandas DataFrame...


Filtering AI posts: 0it [00:00, ?it/s]

Converting dataset to pandas DataFrame...


In [22]:

df_ai.to_parquet("ai_posts.parquet", index=False)


In [23]:
from collections import Counter

word_counts = Counter()

for text in df_ai["content"]:
    matches = pattern.findall(str(text))
    word_counts.update(matches)

print(word_counts.most_common(50))

[('AI', 17446), ('robots', 5536), ('ai', 2860), ('automation', 2854), ('MLP', 2687), ('robotics', 1666), ('autonomous', 1528), ('artificial intelligence', 578), ('machine learning', 503), ('Robots', 501), ('Robotics', 466), ('Ai', 367), ('NLP', 325), ('Automation', 319), ('mlp', 300), ('neural network', 211), ('Artificial Intelligence', 185), ('Autonomous', 117), ('Machine Learning', 112), ('LLM', 88), ('deep learning', 57), ('natural language processing', 41), ('ROBOTS', 40), ('Artificial intelligence', 30), ('Machine learning', 30), ('reinforcement learning', 19), ('nlp', 16), ('Deep Learning', 16), ('aI', 13), ('Deep learning', 11), ('Natural Language Processing', 10), ('Reinforcement Learning', 6), ('Mlp', 6), ('Neural Network', 6), ('AUTOMATION', 5), ('artificial Intelligence', 4), ('image generation', 4), ('Natural language processing', 4), ('Neural network', 4), ('llm', 3), ('text generation', 2), ('AUTONOMOUS', 2), ('generative model', 2), ('Reinforcement learning', 1), ('ARTIF

In [24]:
df_ai = pd.read_parquet("ai_posts.parquet")
df_ai.head(10)
len(df_ai)


23987