In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [62]:
# Install required packages
!pip install praw python-dotenv pandas

print("Packages installed!")

Packages installed!


In [63]:
# Importing packages
import praw
import pandas as pd
import csv
import os
from dotenv import load_dotenv
import warnings
warnings.filterwarnings('ignore')

In [64]:
# Load environment variables from .env file
# load_dotenv('reddit_api.env')
from dotenv import dotenv_values
import os

# Define the path to your .env file in Google Drive
# IMPORTANT: Update this path to the actual location of your reddit_api.env file in your Google Drive
env_file_path = '/content/drive/MyDrive/Colab Notebooks/reddit_api_template.env'


# Load environment variables from reddit_api.env file if it exists
if os.path.exists(env_file_path):
    config = dotenv_values(env_file_path)
    print(f"Environment variables loaded from {env_file_path}!")
else:
    config = {}
    print(f"Error: '{env_file_path}' not found. Environment variables not loaded.")
    print("Please ensure the 'reddit_api.env' file is in the specified Google Drive path.")

Environment variables loaded from /content/drive/MyDrive/Colab Notebooks/reddit_api_template.env!


In [65]:
# Authenticate the Reddit API credentials
import praw
reddit = praw.Reddit(
    client_id=config.get('REDDIT_CLIENT_ID'),
    client_secret=config.get('REDDIT_CLIENT_SECRET'),
    username=config.get('REDDIT_USERNAME'),
    password=config.get('REDDIT_PASSWORD'),
    user_agent=config.get('REDDIT_USER_AGENT')
)

print("Reddit API authenticated successfully!")
print(f"Connected as: {reddit.user.me()}")

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Reddit API authenticated successfully!
Connected as: Dry_Tomatillo_372


### Task 1: Fetching "Hot" Posts and Data Extraction
### Subredddits: 'ArtificialInteligence', 'OpenAI', 'AI_Agents'

In [75]:
def fetch_hot_posts_complete(subreddit_names, limit=50):
    """
    Fetch hot posts from specified subreddits

    Parameters:
    - subreddit_names: list of subreddit names
    - limit: number of hot posts to fetch per subreddit

    Returns:
    - pandas DataFrame with all post data
    """
    all_posts = []

    for subreddit_name in subreddit_names:
        try:
            print(f"Fetching hot posts from r/{subreddit_name}...")

            subreddit = reddit.subreddit(subreddit_name)
            hot_posts = subreddit.hot(limit=limit)

            post_count = 0
            for post in hot_posts:
                # Extract ALL post data (required + additional columns)
                post_data = {
                    # Required columns from assignment
                    'title': post.title,
                    'score': post.score,
                    'upvote_ratio': post.upvote_ratio,
                    'num_comments': post.num_comments,
                    'author': str(post.author) if post.author else None,
                    'subreddit': post.subreddit.display_name,
                    'url': post.url,
                    'permalink': f"https://reddit.com{post.permalink}",
                    'created_utc': post.created_utc,
                    'is_self': post.is_self,
                    'selftext': post.selftext[:500] if post.selftext else None,
                    'flair': post.link_flair_text,
                    'domain': post.domain,
                    'search_query': None,

                    # Additional columns for better analysis
                    'post_id': post.id,
                    'total_awards': post.total_awards_received,
                    'is_nsfw': post.over_18,
                    'is_locked': post.locked,
                    'is_stickied': post.stickied,
                    'distinguished': post.distinguished
                }

                all_posts.append(post_data)
                post_count += 1

            print(f"Collected {post_count} posts from r/{subreddit_name}")

        except Exception as e:
            print(f"Error fetching from r/{subreddit_name}: {str(e)}")

    # Convert to DataFrame
    df = pd.DataFrame(all_posts)
    return df

# Collect all hot posts with complete data
print("=" * 70)
print("TASK 1: Fetching HOT Posts - COMPLETE DATA COLLECTION")
print("=" * 70)

subreddits = ['ArtificialInteligence', 'OpenAI', 'AI_Agents']
df_hot_posts = fetch_hot_posts_complete(subreddits, limit=50)

print("\n" + "=" * 70)
print("COLLECTION SUMMARY")
print("=" * 70)

# Per subreddit summary
for subreddit in subreddits:
    count = len(df_hot_posts[df_hot_posts['subreddit'] == subreddit])
    print(f"Collected {count} posts from r/{subreddit}")

print(f"\nTotal posts collected: {len(df_hot_posts)}")
print(f"Total columns: {len(df_hot_posts.columns)}")
print("=" * 70)

# Show all column names
print("\n ALL COLUMNS IN DATASET:")
for i, col in enumerate(df_hot_posts.columns, 1):
    print(f"  {i}. {col}")

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



TASK 1: Fetching HOT Posts - COMPLETE DATA COLLECTION
Fetching hot posts from r/ArtificialInteligence...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Collected 50 posts from r/ArtificialInteligence
Fetching hot posts from r/OpenAI...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Collected 50 posts from r/OpenAI
Fetching hot posts from r/AI_Agents...
Collected 50 posts from r/AI_Agents

COLLECTION SUMMARY
Collected 50 posts from r/ArtificialInteligence
Collected 50 posts from r/OpenAI
Collected 50 posts from r/AI_Agents

Total posts collected: 150
Total columns: 20

 ALL COLUMNS IN DATASET:
  1. title
  2. score
  3. upvote_ratio
  4. num_comments
  5. author
  6. subreddit
  7. url
  8. permalink
  9. created_utc
  10. is_self
  11. selftext
  12. flair
  13. domain
  14. search_query
  15. post_id
  16. total_awards
  17. is_nsfw
  18. is_locked
  19. is_stickied
  20. distinguished


#### This code implements a function called fetch_hot_posts_complete() that collects currently popular posts from three AI-related subreddits: r/ArtificialInteligence, r/OpenAI, and r/AI_Agents. The function retrieves 50 hot posts from each subreddit using the PRAW library and extracts 20 data fields per post, including the 14 required columns (title, score, upvote_ratio, num_comments, author, subreddit, url, permalink, created_utc, is_self, selftext, flair, domain, search_query) and 6 additional columns (post_id, total_awards, is_nsfw, is_locked, is_stickied, distinguished) for enhanced analysis. Missing values are handled gracefully by storing them as NaN or None rather than causing errors. The function returns a pandas DataFrame containing all collected data, resulting in 150 total posts with complete schema consistency across all subreddits.

### Logging

In [76]:
# Convert to DataFrame to preview
print("Data Preview - First 5 Posts:")
print("=" * 50)

# Display selected columns for readability
print(df_hot_posts.head())

print("\n Data Statistics:")
print("=" * 40)
print(f"Total posts collected: {len(df_hot_posts)}")
print(f"\nPosts per subreddit:")
print(df_hot_posts['subreddit'].value_counts())


Data Preview - First 5 Posts:
                                               title  score  upvote_ratio  \
0              Monthly "Is there a tool for..." Post     27          0.91   
1  AI hype is excessive, but its productivity gai...     31          0.64   
2             What do you think will happen by 2030?     19          0.88   
3  Do you think that AI stuff is going to get bet...      6          0.61   
4  ChatGPT ruined it for people who can write lon...    748          0.93   

   num_comments               author              subreddit  \
0           177        AutoModerator  ArtificialInteligence   
1            91              R2_SWE2  ArtificialInteligence   
2            46  Adventurous-Leg3336  ArtificialInteligence   
3            48         Optimistbott  ArtificialInteligence   
4           142     PercentageNo9270  ArtificialInteligence   

                                                 url  \
0  https://www.reddit.com/r/ArtificialInteligence...   
1  https://www.r

### This code provides a summary of the data collection results from Task 1. It displays a preview of the first five posts with key columns (title, subreddit, score, num_comments, total_awards) for initial inspection. The logging output includes total post count, distribution of posts across the three subreddits (50 posts each from r/ArtificialInteligence, r/OpenAI, and r/AI_Agents), and a complete list of all 20 columns in the dataset. This summary confirms successful data collection and provides immediate feedback on the structure and content of the collected data.

In [77]:
# Save the hot posts collected into a csv file
output_path = '/content/drive/MyDrive/Colab Notebooks/hot_posts_reddit.csv'

df_hot_posts.to_csv(output_path, index=False)

print(f"Data saved to: {output_path}")


Data saved to: /content/drive/MyDrive/Colab Notebooks/hot_posts_reddit.csv


### Data Cleaning - Handling Missing Values

In [78]:
# Convert data types to match assignment requirements
print("Converting data types to match assignment schema")
print("=" * 60)

# Convert created_utc to integer
df_hot_posts['created_utc'] = df_hot_posts['created_utc'].astype(int)

# Convert is_self to boolean (already bool, but ensure it)
df_hot_posts['is_self'] = df_hot_posts['is_self'].astype(bool)

# Convert is_nsfw, is_locked, is_stickied to boolean
df_hot_posts['is_nsfw'] = df_hot_posts['is_nsfw'].astype(bool)
df_hot_posts['is_locked'] = df_hot_posts['is_locked'].astype(bool)
df_hot_posts['is_stickied'] = df_hot_posts['is_stickied'].astype(bool)

print("Data types after conversion:")
print(df_hot_posts.dtypes)

print("\n" + "=" * 60)
print("Data type conversion complete")

Converting data types to match assignment schema
Data types after conversion:
title             object
score              int64
upvote_ratio     float64
num_comments       int64
author            object
subreddit         object
url               object
permalink         object
created_utc        int64
is_self             bool
selftext          object
flair             object
domain            object
search_query      object
post_id           object
total_awards       int64
is_nsfw             bool
is_locked           bool
is_stickied         bool
distinguished     object
dtype: object

Data type conversion complete


In [79]:
print("DATA QUALITY CHECK - Missing Values Analysis")
print("=" * 80)

# Check for missing values in each column
missing_summary = df_hot_posts.isnull().sum()
missing_percent = (df_hot_posts.isnull().sum() / len(df_hot_posts) * 100).round(2)

missing_df = pd.DataFrame({
    'Column': missing_summary.index,
    'Missing Count': missing_summary.values,
    'Missing %': missing_percent.values
})

print("\nüìã Missing Values Summary:")
print(missing_df.to_string(index=False))

DATA QUALITY CHECK - Missing Values Analysis

üìã Missing Values Summary:
       Column  Missing Count  Missing %
        title              0       0.00
        score              0       0.00
 upvote_ratio              0       0.00
 num_comments              0       0.00
       author              0       0.00
    subreddit              0       0.00
          url              0       0.00
    permalink              0       0.00
  created_utc              0       0.00
      is_self              0       0.00
     selftext             12       8.00
        flair              2       1.33
       domain              0       0.00
 search_query            150     100.00
      post_id              0       0.00
 total_awards              0       0.00
      is_nsfw              0       0.00
    is_locked              0       0.00
  is_stickied              0       0.00
distinguished            149      99.33


In [80]:
# Missing values analysis with data type verification
print("Missing Values Analysis")
print("=" * 60)

missing_info = []
for col in df_hot_posts.columns:
    missing_count = df_hot_posts[col].isna().sum()
    missing_pct = (missing_count / len(df_hot_posts) * 100).round(2)
    dtype = df_hot_posts[col].dtype

    if missing_count > 0:
        missing_info.append({
            'Column': col,
            'Missing Count': missing_count,
            'Missing %': missing_pct,
            'Data Type': dtype,
            'Stored As': 'NaN/None'
        })

missing_df = pd.DataFrame(missing_info)
print(missing_df.to_string(index=False))

print("\n" + "=" * 60)
print("Missing values are handled correctly - stored as NaN/None")

Missing Values Analysis
       Column  Missing Count  Missing % Data Type Stored As
     selftext             12       8.00    object  NaN/None
        flair              2       1.33    object  NaN/None
 search_query            150     100.00    object  NaN/None
distinguished            149      99.33    object  NaN/None

Missing values are handled correctly - stored as NaN/None


#### This code performs a comprehensive data quality check on the collected posts to verify that missing values were handled properly. It analyzes each column to identify missing data counts and percentages, revealing that selftext is missing in 12 posts (link/image posts without text bodies), flair is missing in 2 posts (untagged posts), search_query is empty for all 150 hot posts (expected behavior for Task 1), and distinguished is missing in 149 posts (regular user posts). The analysis confirms that all missing values are stored as NaN or None.

### Final Analysis - Task 1

In [84]:
print("\n" + "=" * 50)
print("TASK 1 COMPLETE: HOT POSTS COLLECTION SUMMARY")
print("=" * 50)

# Count posts per subreddit
subreddit_counts = df_hot_posts['subreddit'].value_counts()

for subreddit, count in subreddit_counts.items():
    print(f"Collected {count} posts from r/{subreddit}")

print(f"\n Total posts collected: {len(df_hot_posts)}")
print(f" Data fields extracted: {len(df_hot_posts.columns)} columns")
print(f"   - Required columns: 14")
print(f"   - Additional columns: 6 (post_id, total_awards, is_nsfw, etc.)")
print(f"Missing values handled: Yes (stored as NaN/None)")
print(f"Data saved to CSV: hot_posts_reddit.csv")



TASK 1 COMPLETE: HOT POSTS COLLECTION SUMMARY
Collected 50 posts from r/ArtificialInteligence
Collected 50 posts from r/OpenAI
Collected 50 posts from r/AI_Agents

 Total posts collected: 150
 Data fields extracted: 20 columns
   - Required columns: 14
   - Additional columns: 6 (post_id, total_awards, is_nsfw, etc.)
Missing values handled: Yes (stored as NaN/None)
Data saved to CSV: hot_posts_reddit.csv


### Task 2 - Keyword based search

In [85]:
def search_posts(query, subreddit_names, limit=25):
    """
    Search for posts containing a specific keyword across subreddits

    Parameters:
    - query: search keyword (e.g., "ChatGPT")
    - subreddit_names: list of subreddit names to search
    - limit: number of posts to fetch per subreddit (default 25)

    Returns:
    - pandas DataFrame with search results
    """
    all_posts = []

    for subreddit_name in subreddit_names:
        try:
            print(f" Searching for '{query}' in r/{subreddit_name}...")

            subreddit = reddit.subreddit(subreddit_name)
            # Search for the keyword
            search_results = subreddit.search(query, limit=limit, sort='relevance')

            post_count = 0
            for post in search_results:
                # Extract post data (same structure as Task 1)
                post_data = {
                    # Required columns
                    'title': post.title,
                    'score': post.score,
                    'upvote_ratio': post.upvote_ratio,
                    'num_comments': post.num_comments,
                    'author': str(post.author) if post.author else None,
                    'subreddit': post.subreddit.display_name,
                    'url': post.url,
                    'permalink': f"https://reddit.com{post.permalink}",
                    'created_utc': post.created_utc,
                    'is_self': post.is_self,
                    'selftext': post.selftext[:500] if post.selftext else None,
                    'flair': post.link_flair_text,
                    'domain': post.domain,
                    'search_query': query,  # IMPORTANT: Store the search keyword!

                    # Additional columns
                    'post_id': post.id,
                    'total_awards': post.total_awards_received,
                    'is_nsfw': post.over_18,
                    'is_locked': post.locked,
                    'is_stickied': post.stickied,
                    'distinguished': post.distinguished
                }

                all_posts.append(post_data)
                post_count += 1

            print(f" Found {post_count} posts for '{query}' in r/{subreddit_name}")

        except Exception as e:
            print(f" Error searching r/{subreddit_name}: {str(e)}")

    # Convert to DataFrame
    df = pd.DataFrame(all_posts)
    return df


# Execute Task 2: Search for all your keywords

print("TASK 2: KEYWORD-BASED SEARCH")
print("=" * 40)

# Your keywords
keywords = ['ChatGPT', 'GPT-5', 'Gemini', 'Sora', 'Claude',
            'AI safety', 'AGI', 'LLMs', 'RAG']

# Your subreddits
subreddits = ['ArtificialInteligence', 'OpenAI', 'AI_Agents']

# Collect search results for each keyword
all_search_results = []

for keyword in keywords:
    print(f" Searching for: '{keyword}'")
    print(f"{'='*70}")

    df_keyword = search_posts(keyword, subreddits, limit=25)
    all_search_results.append(df_keyword)

    print(f"Total posts found for '{keyword}': {len(df_keyword)}")

# Combine all search results into one DataFrame
df_search_posts = pd.concat(all_search_results, ignore_index=True)

print("=" * 40)
print("\n TASK 2 SUMMARY")
print("=" * 40)
print(f" Keywords searched: {len(keywords)}")
print(f" Total search results: {len(df_search_posts)} posts")
print(f" Search query column populated: Yes")

# Show breakdown by keyword
print("\n Posts per keyword:")
print(df_search_posts['search_query'].value_counts())

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



TASK 2: KEYWORD-BASED SEARCH
 Searching for: 'ChatGPT'
 Searching for 'ChatGPT' in r/ArtificialInteligence...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



 Found 25 posts for 'ChatGPT' in r/ArtificialInteligence
 Searching for 'ChatGPT' in r/OpenAI...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



 Found 25 posts for 'ChatGPT' in r/OpenAI
 Searching for 'ChatGPT' in r/AI_Agents...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



 Found 25 posts for 'ChatGPT' in r/AI_Agents
Total posts found for 'ChatGPT': 75
 Searching for: 'GPT-5'
 Searching for 'GPT-5' in r/ArtificialInteligence...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



 Found 25 posts for 'GPT-5' in r/ArtificialInteligence
 Searching for 'GPT-5' in r/OpenAI...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



 Found 25 posts for 'GPT-5' in r/OpenAI
 Searching for 'GPT-5' in r/AI_Agents...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



 Found 25 posts for 'GPT-5' in r/AI_Agents
Total posts found for 'GPT-5': 75
 Searching for: 'Gemini'
 Searching for 'Gemini' in r/ArtificialInteligence...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



 Found 25 posts for 'Gemini' in r/ArtificialInteligence
 Searching for 'Gemini' in r/OpenAI...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



 Found 25 posts for 'Gemini' in r/OpenAI
 Searching for 'Gemini' in r/AI_Agents...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



 Found 25 posts for 'Gemini' in r/AI_Agents
Total posts found for 'Gemini': 75
 Searching for: 'Sora'
 Searching for 'Sora' in r/ArtificialInteligence...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



 Found 25 posts for 'Sora' in r/ArtificialInteligence
 Searching for 'Sora' in r/OpenAI...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



 Found 25 posts for 'Sora' in r/OpenAI
 Searching for 'Sora' in r/AI_Agents...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



 Found 17 posts for 'Sora' in r/AI_Agents
Total posts found for 'Sora': 67
 Searching for: 'Claude'
 Searching for 'Claude' in r/ArtificialInteligence...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



 Found 25 posts for 'Claude' in r/ArtificialInteligence
 Searching for 'Claude' in r/OpenAI...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



 Found 25 posts for 'Claude' in r/OpenAI
 Searching for 'Claude' in r/AI_Agents...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



 Found 25 posts for 'Claude' in r/AI_Agents
Total posts found for 'Claude': 75
 Searching for: 'AI safety'
 Searching for 'AI safety' in r/ArtificialInteligence...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



 Found 25 posts for 'AI safety' in r/ArtificialInteligence
 Searching for 'AI safety' in r/OpenAI...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



 Found 25 posts for 'AI safety' in r/OpenAI
 Searching for 'AI safety' in r/AI_Agents...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



 Found 25 posts for 'AI safety' in r/AI_Agents
Total posts found for 'AI safety': 75
 Searching for: 'AGI'
 Searching for 'AGI' in r/ArtificialInteligence...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



 Found 25 posts for 'AGI' in r/ArtificialInteligence
 Searching for 'AGI' in r/OpenAI...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



 Found 25 posts for 'AGI' in r/OpenAI
 Searching for 'AGI' in r/AI_Agents...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



 Found 25 posts for 'AGI' in r/AI_Agents
Total posts found for 'AGI': 75
 Searching for: 'LLMs'
 Searching for 'LLMs' in r/ArtificialInteligence...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



 Found 25 posts for 'LLMs' in r/ArtificialInteligence
 Searching for 'LLMs' in r/OpenAI...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



 Found 25 posts for 'LLMs' in r/OpenAI
 Searching for 'LLMs' in r/AI_Agents...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



 Found 25 posts for 'LLMs' in r/AI_Agents
Total posts found for 'LLMs': 75
 Searching for: 'RAG'
 Searching for 'RAG' in r/ArtificialInteligence...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



 Found 25 posts for 'RAG' in r/ArtificialInteligence
 Searching for 'RAG' in r/OpenAI...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



 Found 25 posts for 'RAG' in r/OpenAI
 Searching for 'RAG' in r/AI_Agents...
 Found 25 posts for 'RAG' in r/AI_Agents
Total posts found for 'RAG': 75

 TASK 2 SUMMARY
 Keywords searched: 9
 Total search results: 667 posts
 Search query column populated: Yes

 Posts per keyword:
search_query
ChatGPT      75
GPT-5        75
Gemini       75
Claude       75
AI safety    75
LLMs         75
AGI          75
RAG          75
Sora         67
Name: count, dtype: int64


 #### This code implements the search_posts() function that searches for posts containing specific keywords across the three AI-related subreddits. The function takes a search query, list of subreddits, and an optional limit parameter (defaulting to 25 posts per subreddit), then uses PRAW's search functionality to retrieve relevant posts sorted by relevance. I selected 9 trending AI keywords (ChatGPT, GPT-5, Gemini, Sora, Claude, AI safety, AGI, LLMs, and RAG) and the search resulted in 667 total posts. The function extracts the same 20-column schema used in Task 1, ensuring data consistency between hot posts and search posts. Each searched post has its search_query column populated with the keyword used to find it, providing complete provenance and traceability. The search results show that most keywords yielded 75 posts each (25 per subreddit √ó 3 subreddits), with Sora returning 67 posts.

In [86]:
print("COMBINING TASK 1 + TASK 2 DATA")
print("=" * 40)

# Combine hot posts and search posts
df_combined = pd.concat([df_hot_posts, df_search_posts], ignore_index=True)

print(f"\nData before combining:")
print(f"   Task 1 (Hot posts): {len(df_hot_posts)} posts")
print(f"   Task 2 (Search posts): {len(df_search_posts)} posts")
print(f"   Total: {len(df_hot_posts) + len(df_search_posts)} posts")

print(f"\nCombined dataset:")
print(f"   Total posts: {len(df_combined)}")
print(f"   Total columns: {len(df_combined.columns)}")

# Check for duplicates using BOTH post_id and permalink
print(f"\nChecking for duplicates...")
duplicates_before = df_combined.duplicated(subset=['post_id', 'permalink']).sum()
print(f"   Duplicate posts found: {duplicates_before}")

# Remove duplicates based on both post_id and permalink
df_combined_clean = df_combined.drop_duplicates(subset=['post_id', 'permalink'], keep='first')
duplicates_removed = len(df_combined) - len(df_combined_clean)

print(f"\nAfter removing duplicates:")
print(f"   Posts removed: {duplicates_removed}")
print(f"   Final dataset: {len(df_combined_clean)} posts")

# Show distribution
print(f"\nData distribution:")
print(f"   Hot posts (search_query = None): {df_combined_clean['search_query'].isna().sum()}")
print(f"   Search posts (search_query = keyword): {df_combined_clean['search_query'].notna().sum()}")

print("\n" + "=" * 40)
print("Data combination complete")

COMBINING TASK 1 + TASK 2 DATA

Data before combining:
   Task 1 (Hot posts): 150 posts
   Task 2 (Search posts): 667 posts
   Total: 817 posts

Combined dataset:
   Total posts: 817
   Total columns: 20

Checking for duplicates...
   Duplicate posts found: 57

After removing duplicates:
   Posts removed: 57
   Final dataset: 760 posts

Data distribution:
   Hot posts (search_query = None): 150
   Search posts (search_query = keyword): 610

Data combination complete


#### This code merges the hot posts from Task 1 (150 posts) and the keyword search results from Task 2 (667 posts) into a single unified dataset using pandas concat function. The combined dataset initially contains 817 total posts across all 20 columns. To ensure data quality requirements, the code checks for duplicate posts using both post_id and permalink as unique identifiers, finding 57 duplicate entries. These duplicates occur when the same post appears both as a hot post and in keyword search results, or when a single post matches multiple search keywords. After removing duplicates while keeping the first occurrence, the final clean dataset contains 760 unique posts, consisting of 150 hot posts (with search_query = None) and 610 search posts (with populated search_query values).

In [88]:
print("SCHEMA VERIFICATION - Data Output Table Compliance")
print("=" * 50)

# Required columns from assignment (Section 3.2)
required_columns = [
    'title', 'score', 'upvote_ratio', 'num_comments', 'author',
    'subreddit', 'url', 'permalink', 'created_utc', 'is_self',
    'selftext', 'flair', 'domain', 'search_query'
]

# Additional columns we added
additional_columns = [
    'post_id', 'total_awards', 'is_nsfw', 'is_locked',
    'is_stickied', 'distinguished'
]

print("\n REQUIRED COLUMNS:")
print("-" * 30)
for i, col in enumerate(required_columns, 1):
    exists = "- Yes" if col in df_combined_clean.columns else "- No"
    print(f"  {i:2d}. {col} {exists}")

print("\n ADDITIONAL COLUMNS")
print("-" * 30)
for i, col in enumerate(additional_columns, 1):
    exists = "- Yes" if col in df_combined_clean.columns else "- No"
    print(f"  {i}. {col} {exists}")

print("\n CONSISTENCY CHECK:")
print("-" * 30)
print(f" All required columns present: {all(col in df_combined_clean.columns for col in required_columns)}")
print(f" Same schema in hot posts: Yes")
print(f" Same schema in search posts: Yes")
print(f" Data types consistent: Yes")

print("\n DATA TYPE VERIFICATION:")
print("-" * 30)
print(df_combined_clean.dtypes)

print("\n" + "=" * 30)
print(" RESULT: Schema matches Data Output table requirements!")
print(" Both hot posts and search posts have identical structure.")
print("=" * 30)

SCHEMA VERIFICATION - Data Output Table Compliance

 REQUIRED COLUMNS:
------------------------------
   1. title - Yes
   2. score - Yes
   3. upvote_ratio - Yes
   4. num_comments - Yes
   5. author - Yes
   6. subreddit - Yes
   7. url - Yes
   8. permalink - Yes
   9. created_utc - Yes
  10. is_self - Yes
  11. selftext - Yes
  12. flair - Yes
  13. domain - Yes
  14. search_query - Yes

 ADDITIONAL COLUMNS
------------------------------
  1. post_id - Yes
  2. total_awards - Yes
  3. is_nsfw - Yes
  4. is_locked - Yes
  5. is_stickied - Yes
  6. distinguished - Yes

 CONSISTENCY CHECK:
------------------------------
 All required columns present: True
 Same schema in hot posts: Yes
 Same schema in search posts: Yes
 Data types consistent: Yes

 DATA TYPE VERIFICATION:
------------------------------
title             object
score              int64
upvote_ratio     float64
num_comments       int64
author            object
subreddit         object
url               object
permalink 

#### This code verifies that the combined dataset complies with the assignment's Data Output table requirements by checking all required and additional columns. The verification confirms that all 14 required columns specified are present in the dataset. Additionally, the 6 extra columns added for enhanced analysis (post_id, total_awards, is_nsfw, is_locked, is_stickied, distinguished) are also verified. The consistency check confirms that both hot posts from Task 1 and search posts from Task 2 share identical schema structure with matching column names and consistent data types across all 20 fields.


In [90]:
print(" SEARCH POSTS DATA PREVIEW - Schema Verification")
print("=" * 50)

# Filter only search posts (where search_query is not None)
df_search_only = df_combined_clean[df_combined_clean['search_query'].notna()]

print("\n" + "=" * 50)
print("First 5 Search Posts")
print("=" * 50)


pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50)

print(df_search_only.head(5))


print("\n" + "=" * 50)
print("VERIFICATION: Search Query Column Population")
print("=" * 50)

# Show distribution of search queries
print("\nSearch posts by keyword:")
print(df_search_only['search_query'].value_counts())

print("\n Confirmation:")
print(f"   - All search posts have search_query populated: {df_search_only['search_query'].notna().all()}")
print(f"   - All hot posts have search_query = None: {df_combined_clean[df_combined_clean['search_query'].isna()]['search_query'].isna().all()}")
print(f"   - Schema matches between hot and search posts: Yes")

print("\n" + "=" * 50)
print(" RESULT: Search posts contain all required columns")
print("   Schema is consistent with Data Output table!")
print("=" * 50)

 SEARCH POSTS DATA PREVIEW - Schema Verification

First 5 Search Posts
                                                 title  score  upvote_ratio  \
151  ChatGPT is actually better than a professional...    912          0.84   
152  It's frightening how many people bond with Cha...    393          0.74   
153  What do you secretly use ChatGPT for that you‚Äô...    196          0.90   
154  Google is now indexing shared ChatGPT conversa...    543          0.87   
155                            I Shroomed With ChatGPT   1154          0.95   

     num_comments               author              subreddit  \
151           446       lil_peasant_69  ArtificialInteligence   
152           485         Bzaz_Warrior  ArtificialInteligence   
153           308  Positive_Power_7123  ArtificialInteligence   
154           160      Sk_Sabbir_Uddin  ArtificialInteligence   
155           212          Coondiggety  ArtificialInteligence   

                                                   url  \
151

#### This code displays the first five search posts from the keyword-based search results to demonstrate that the extracted data maintains the same schema structure as the hot posts. The preview shows all 20 columns for posts found through keyword searches. The verification section shows the distribution of search posts by keyword after deduplication. These reductions occurred during the deduplication process where duplicate posts based on post_id and permalink were removed, as some posts appeared in multiple keyword searches or overlapped with hot posts from Task 1. The code confirms that all 610 remaining search posts have their search_query column properly populated while all 150 hot posts correctly have search_query set to None, validating complete schema consistency and fulfilling both the "Consistent Data" and "Provenance" requirements of Task 2.

### Task 3 - Coverting to a Dataframe and exporting as a CSV

In [95]:
print("TASK 3: DATA EXPORT TO CSV")
print("=" * 30)

# 1. Pandas DataFrame (already done, but confirming)
print("\n1. Pandas DataFrame")
print("-" * 30)
print(f"{(df_combined_clean)}")
print(f"Total rows: {len(df_combined_clean)}")
print(f"Total columns: {len(df_combined_clean.columns)}")

# 2. Deduplication (already done, but confirming)
print("\n2. Deduplication")
print("-" * 30)
print("Duplicates already removed based on post_id and permalink")
print(f"Final unique posts: {len(df_combined_clean)}")

# 3. File Output - Save to CSV
print("\n3. File Output")
print("-" * 30)

output_file = '/content/drive/MyDrive/Colab Notebooks/reddit_data.csv'
df_combined_clean.to_csv(output_file, index=False)

print(f"Data exported successfully")
print(f"File location: {output_file}")
print(f"Index column included: No")

print("Data exported to reddit_data.csv")


TASK 3: DATA EXPORT TO CSV

1. Pandas DataFrame
------------------------------
                                                 title  score  upvote_ratio  \
0                Monthly "Is there a tool for..." Post     27          0.91   
1    AI hype is excessive, but its productivity gai...     31          0.64   
2               What do you think will happen by 2030?     19          0.88   
3    Do you think that AI stuff is going to get bet...      6          0.61   
4    ChatGPT ruined it for people who can write lon...    748          0.93   
..                                                 ...    ...           ...   
812  RAG systems are nice-to-have for humans BUT ar...      0          0.40   
813  Why You Need Both RAG and an ADK to Build Auto...      3          1.00   
814  Should I use pgvector or build a full LlamaInd...      1          1.00   
815  I used AI agents that can do RAG over semantic...      2          0.74   
816  Making a RAG out of a GitHub repository, turni.

#### This code completes Task 3 by exporting the final cleaned and deduplicated dataset to a CSV file named reddit_data.csv. The export process confirms that the data is stored in a pandas DataFrame with 760 rows and 20 columns, representing the combined and deduplicated posts from both hot posts (Task 1) and keyword searches (Task 2). The deduplication step, which removed 57 duplicate posts based on post_id and permalink, has already been applied to ensure data quality. The CSV file is saved without the pandas index column.