## 1. Setup

In [1]:
import os
import glob
import pandas as pd
import kaggle



## 2. Dataset

In [2]:
download_path = "../data/kaggle_dataset"
dataset_slug = "shahp7575/reddit-posts-with-keyword-coffee/"
os.system(f'kaggle datasets download -d {dataset_slug} -p {download_path} --unzip')

Dataset URL: https://www.kaggle.com/datasets/shahp7575/reddit-posts-with-keyword-coffee/versions/
License(s): MIT
Downloading reddit-posts-with-keyword-coffee.zip to ../data/kaggle_dataset


100%|██████████| 150M/150M [00:04<00:00, 38.0MB/s] 





0

In [3]:
kaggle_csv = glob.glob(download_path + "/*.csv")[0]
kaggle_csv

'../data/kaggle_dataset/reddit_coffee_scraper_till_1739679079.csv'

In [4]:
df_kaggle = pd.read_csv(kaggle_csv)
df_kaggle.shape

(239265, 10)

In [5]:
df_kaggle.sample(3)

Unnamed: 0,id,url,title,text,score,created_utc,subreddit,num_comments,upvote_ratio,over_18
56606,1gg9vcq,https://i.redd.it/t2xlam6a12yd1.jpeg,Just want to get my morning coffee,,8,1730364000.0,OneOrangeBraincell,0,1.0,False
153489,1hqvybc,https://www.reddit.com/r/Market76/comments/1hq...,"[PS4] H: Conductors x3, Scanners, Vipers, Prop...",,0,1735707000.0,Market76,1,0.5,False
196238,1hkb9pp,https://www.reddit.com/r/ausjdocs/comments/1hk...,Prince Charming Haematologist Trainee,Hi everyone!\n\nI have a story time. I'm a non...,15,1734912000.0,ausjdocs,3,0.89,False


In [6]:
df_kaggle.columns

Index(['id', 'url', 'title', 'text', 'score', 'created_utc', 'subreddit',
       'num_comments', 'upvote_ratio', 'over_18'],
      dtype='object')

## 3. Pre-processing

In [11]:
def preprocess(df):
    print("Before pre-processing shape: ", df.shape)
    # drop duplicate ids
    df = df.drop_duplicates(subset=['id'])
    
    # remove oct 2nd data
    df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')
    df = df[df.created_utc >= "2024-10-03"]
    
    # remove empty string texts
    df = df[df.text != ""]
    
    # remove duplicate texts
    df = df[~df.text.duplicated()]
    
    # remove nans from text
    df = df.dropna(subset=['text'])
    
    # remove user profile posts
    df = df[~df.subreddit.str.startswith('u_')]

    # reset index
    df.reset_index(inplace=True)
    
    print("Post pre-processing shape: ", df.shape)
    
    return df

In [12]:
df = preprocess(df_kaggle)

Before pre-processing shape:  (239265, 10)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')


Post pre-processing shape:  (170161, 11)


In [13]:
df.shape

(170161, 11)

In [15]:
df.sample(3)

Unnamed: 0,index,id,url,title,text,score,created_utc,subreddit,num_comments,upvote_ratio,over_18
114453,154877,1ic9z0c,https://www.reddit.com/r/fasting/comments/1ic9...,Completed my first 24 hour fast,Just wanted to tell you guys that I'm 24 hours...,5,2025-01-28 19:30:31,fasting,1,1.0,False
110433,149401,1h4con2,https://www.reddit.com/r/offmychest/comments/1...,Partner didn't pick me up from the airport a w...,(M37) had said they'd pick me (29f) from the l...,1,2024-12-01 20:29:40,offmychest,5,0.99,False
56288,76423,1i9h5x5,https://www.reddit.com/r/SeveranceAppleTVPlus/...,Pineapples!,Haven’t seen this posted anywhere so forgive m...,3,2025-01-25 06:42:40,SeveranceAppleTVPlus,1,1.0,False
