In [1]:
import pandas as pd
import os
import glob
import kaggle
from datetime import datetime



## Download dataset from kaggle

In [2]:
download_path = "kaggle_dataset/"
dataset_slug = "shahp7575/reddit-posts-with-keyword-coffee/"
os.system(f'kaggle datasets download -d {dataset_slug} -p {download_path} --unzip')

Dataset URL: https://www.kaggle.com/datasets/shahp7575/reddit-posts-with-keyword-coffee/versions/
License(s): MIT
Downloading reddit-posts-with-keyword-coffee.zip to kaggle_dataset


100%|██████████| 5.39M/5.39M [00:00<00:00, 29.1MB/s]





0

In [4]:
# remove the file present in kaggle_dataset directory first, then run the below command

In [5]:
kaggle_csv = glob.glob(download_path + "*.csv")[0]

In [6]:
df_kaggle = pd.read_csv(kaggle_csv)

In [7]:
df_kaggle.shape

(8414, 10)

## Read Scraped data

In [8]:
DATA_PATH = "../data/"
CSV_FILES = glob.glob(DATA_PATH + "*.csv")

In [9]:
CSV_FILES

['../data/reddit_coffee_posts_1728353592_1728360792.csv',
 '../data/reddit_coffee_posts_1728547888_1728555088.csv',
 '../data/reddit_coffee_posts_1728576832_1728584032.csv',
 '../data/reddit_coffee_posts_1728483162_1728490362.csv',
 '../data/reddit_coffee_posts_1728634255_1728641455.csv',
 '../data/reddit_coffee_posts_1728360924_1728368124.csv',
 '../data/reddit_coffee_posts_1728677368_1728684568.csv',
 '../data/reddit_coffee_posts_1728562219_1728569419.csv',
 '../data/reddit_coffee_posts_1728410973_1728418173.csv',
 '../data/reddit_coffee_posts_1728461483_1728468683.csv',
 '../data/reddit_coffee_posts_1728434654_1728441854.csv',
 '../data/reddit_coffee_posts_1728454443_1728461643.csv',
 '../data/reddit_coffee_posts_1728504609_1728511809.csv',
 '../data/reddit_coffee_posts_1728490411_1728497611.csv',
 '../data/reddit_coffee_posts_1728447308_1728454508.csv',
 '../data/reddit_coffee_posts_1728569550_1728576750.csv',
 '../data/reddit_coffee_posts_1728514112_1728521312.csv',
 '../data/redd

In [10]:
df_list = []

In [11]:
for f in CSV_FILES:
    df = pd.read_csv(f)
    df_list.append(df)

In [12]:
combined_df = pd.concat(df_list, ignore_index=True)

In [13]:
combined_df.shape

(6490, 10)

In [14]:
combined_df

Unnamed: 0,id,url,title,text,score,created_utc,subreddit,num_comments,upvote_ratio,over_18
0,1fyrd3v,https://www.reddit.com/gallery/1fyrd3v,Kitchen supplies sale (discount if buy multipl...,(PM for more pictures and info) \n- Instapot 7...,1,1.728361e+09,UCSDclassifieds,1,1.00,False
1,1fyrcxa,https://www.reddit.com/r/offmychest/comments/1...,Worst birthday ever(rant),I've been thinking about writing in here for a...,1,1.728361e+09,offmychest,0,1.00,False
2,1fyrag5,https://www.reddit.com/r/Beading/comments/1fyr...,My pets are conspiring against me.,I left a tube of beads on the coffee table and...,1,1.728360e+09,Beading,1,1.00,False
3,1fyr9y8,https://www.reddit.com/r/tharookhaulersnark/co...,omg i just laughed out loud when he mentioned ...,bruhhhhh i forgot this man is a self declared ...,1,1.728360e+09,tharookhaulersnark,0,1.00,False
4,1fyr925,https://i.redd.it/wc9gb4mjigtd1.jpeg,coloring book!,"i’ve posted a few of my drawings, and i wanted...",2,1.728360e+09,gamegrumps,0,1.00,False
...,...,...,...,...,...,...,...,...,...,...
6485,1g1atmu,https://i.redd.it/wkcz8gd7y4ud1.jpeg,Coffee on the go today,,1,1.728656e+09,SFWAmIHot,1,1.00,False
6486,1g1asy7,https://www.reddit.com/r/austinfood/comments/1...,First-Time Austin Visitor - Feedback on My Res...,Hey y’all! My wife and I are visiting Austin f...,6,1.728656e+09,austinfood,71,0.64,False
6487,1g1asf1,https://www.reddit.com/r/RhodeIsland/comments/...,Where to find Del's Lemonade/Coffee Milk in We...,We're visiting Westerly this October and I saw...,1,1.728656e+09,RhodeIsland,2,1.00,False
6488,1g1arfg,https://www.reddit.com/r/snapchat/comments/1g1...,"36 [M4A] - Happy Friday, would love to find so...",Morning all!\n\nWell it’s Friday (Thank goodne...,2,1.728656e+09,snapchat,1,1.00,False


## Merge Scraped data with Kaggle data

In [15]:
df_final = pd.concat([df_kaggle, combined_df], ignore_index=True)

In [16]:
df_final.shape

(14904, 10)

In [17]:
max_utc = df_final['created_utc'].max()
max_datetime = datetime.strftime(datetime.utcfromtimestamp(max_utc), "%Y-%m-%d %H:%m:%S")
print(max_datetime)

2024-10-11 22:10:50


  max_datetime = datetime.strftime(datetime.utcfromtimestamp(max_utc), "%Y-%m-%d %H:%m:%S")


In [18]:
df_final.drop_duplicates(inplace=True)

In [19]:
df_final.shape

(14765, 10)

## Upload to kaggle

In [20]:
version_name = f"reddit_coffee_scraper_till_{max_datetime}"
print(version_name)

reddit_coffee_scraper_till_2024-10-11 22:10:50


In [21]:
df_final.to_csv(f"kaggle_dataset/{version_name}.csv", index=False)

In [22]:
os.system(f'kaggle datasets version -p kaggle_dataset/ -m "New data upload - {version_name}" --dir-mode zip')

Starting upload for file reddit_coffee_scraper_till_2024-10-11 22:10:50.csv


100%|██████████| 26.3M/26.3M [00:02<00:00, 10.5MB/s]
  0%|          | 0.00/14.8M [00:00<?, ?B/s]

Upload successful: reddit_coffee_scraper_till_2024-10-11 22:10:50.csv (26MB)
Starting upload for file reddit_coffee_scraper_till_2024-10-07 221018.csv


100%|██████████| 14.8M/14.8M [00:00<00:00, 21.8MB/s]
  0%|          | 0.00/285 [00:00<?, ?B/s]

Upload successful: reddit_coffee_scraper_till_2024-10-07 221018.csv (15MB)
Starting upload for file .ipynb_checkpoints.zip


100%|██████████| 285/285 [00:00<00:00, 759B/s]


Upload successful: .ipynb_checkpoints.zip (285B)
Dataset version is being created. Please check progress at https://www.kaggle.com/shahp7575/reddit-posts-with-keyword-coffee


0