## 1. Setup

In [2]:
import os
import glob
import pandas as pd
import kaggle

## 2. Dataset

In [3]:
download_path = "../data/kaggle_dataset"
dataset_slug = "shahp7575/reddit-posts-with-keyword-coffee/"
os.system(f'kaggle datasets download -d {dataset_slug} -p {download_path} --unzip')

Dataset URL: https://www.kaggle.com/datasets/shahp7575/reddit-posts-with-keyword-coffee/versions/
License(s): MIT
Downloading reddit-posts-with-keyword-coffee.zip to ../data/kaggle_dataset


100%|██████████| 131M/131M [00:02<00:00, 47.4MB/s] 





0

In [4]:
kaggle_csv = glob.glob(download_path + "/*.csv")[0]
kaggle_csv

'../data/kaggle_dataset/reddit_coffee_scraper_till_2025-02-04 000241.csv'

In [5]:
df_kaggle = pd.read_csv(kaggle_csv)
df_kaggle.shape

(210779, 10)

In [8]:
df_kaggle.sample(3)

Unnamed: 0,id,url,title,text,score,created_utc,subreddit,num_comments,upvote_ratio,over_18
158269,1gztdl5,https://www.reddit.com/r/villainscode/comments...,Coffee,"Hello all,\n\nIn my most recent re-listen, I f...",4,1732568000.0,villainscode,0,1.0,False
120962,1ia14qd,https://www.reddit.com/r/woodworking/comments/...,Hand rubbed poly with General Finishes,Has anyone had good luck applying General Fini...,1,1737851000.0,woodworking,0,1.0,False
67344,1gtfhyd,https://www.reddit.com/gallery/1gtfhyd,Monkeypod acacia cutoffs,Ok…I know some don’t have this issue since the...,4,1731856000.0,woodworking,0,1.0,False


## 3. Pre-processing

In [10]:
def preprocess(df):
    print("Before pre-processing shape: ", df.shape)
    # drop duplicate ids
    df = df.drop_duplicates(subset=['id'])
    
    # remove oct 2nd data
    df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')
    df = df[df.created_utc >= "2024-10-03"]
    
    # remove empty string texts
    df = df[df.text != ""]
    
    # remove duplicate texts
    df = df[~df.text.duplicated()]
    
    # remove nans from text
    df = df.dropna(subset=['text'])
    
    # remove user profile posts
    df = df[~df.subreddit.str.startswith('u_')]
    
    print("Post pre-processing shape: ", df.shape)
    
    return df

In [11]:
df = preprocess(df_kaggle)

Before pre-processing shape:  (210779, 10)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')


Post pre-processing shape:  (153923, 10)


In [12]:
df.shape

(153923, 10)