In [1]:
import pandas as pd
import os
import glob
import kaggle
from datetime import datetime



## Download dataset from kaggle

In [3]:
download_path = "kaggle_dataset/"
dataset_slug = "shahp7575/reddit-posts-with-keyword-coffee/"
os.system(f'kaggle datasets download -d {dataset_slug} -p {download_path} --unzip')

Dataset URL: https://www.kaggle.com/datasets/shahp7575/reddit-posts-with-keyword-coffee/versions/
License(s): MIT
Downloading reddit-posts-with-keyword-coffee.zip to kaggle_dataset


100%|██████████| 41.7M/41.7M [00:01<00:00, 25.7MB/s]





0

In [23]:
# remove the file present in kaggle_dataset directory first, then run the below command

In [None]:
import os
import glob
import shutil
import pandas as pd

def pull_push_kaggle(dataset_slug: str, download_kaggle_path: str, download_scraped_path: str):

    # Ensure the download path is fresh
    if os.path.exists(download_kaggle_path):
        shutil.rmtree(download_kaggle_path)
    os.makedirs(download_kaggle_path, exist_ok=True)
    
    # download existing dataset
    os.system(f'kaggle datasets download -d {dataset_slug} -p {download_kaggle_path} --unzip')

    # read downloaded dataset, if exists
    kaggle_csv = glob.glob(download_kaggle_path + "*.csv")[0]
    print(kaggle_csv)

    df_kaggle = pd.read_csv(kaggle_csv)
    print("Kaggle CSV shape: ", df_kaggle.shape)

    # read scraped dataset
    CSV_FILES = glob.glob(download_scraped_path + "*.csv")
    print("Total CSV files scraped: ", len(CSV_FILES))
    df_list = []
    for f in CSV_FILES:
        df = pd.read_csv(f)
        df_list.append(df)
    combined_df = pd.concat(df_list, ignore_index=True)
    print("Scraped CSV files shape: ", combined_df.shape)

    # append scraped data to downloaded kaggle data
    df_final = pd.concat([df_kaggle, combined_df], ignore_index=True)
    df_final.drop_duplicates(inplace=True)
    print("Appended data shape: ", df_final.shape)
    

if __name__ == "__main__":
    DATASET_SLUG = "shahp7575/reddit-posts-with-keyword-coffee/"
    DOWNLOAD_KAGGLE_PATH = "kaggle_downloaded_dataset/"
    DOWNLOAD_SCRAPED_PATH = "data/"

    pull_push_kaggle(dataset_slug=DATASET_SLUG, 
                     download_kaggle_path=DOWNLOAD_KAGGLE_PATH,
                     download_scraped_path=DOWNLOAD_SCRAPED_PATH)

In [4]:
kaggle_csv = glob.glob(download_path + "*.csv")[0]

In [5]:
kaggle_csv

'kaggle_dataset/reddit_coffee_scraper_till_2024-11-13 041137.csv'

In [6]:
df_kaggle = pd.read_csv(kaggle_csv)

In [7]:
df_kaggle.shape

(66826, 10)

## Read Scraped data

In [8]:
DATA_PATH = "../data/"
CSV_FILES = glob.glob(DATA_PATH + "*.csv")

In [9]:
CSV_FILES[:10]

['../data/reddit_coffee_posts_1737792601_1737799801.csv',
 '../data/reddit_coffee_posts_1733408060_1733415260.csv',
 '../data/reddit_coffee_posts_1734776286_1734783486.csv',
 '../data/reddit_coffee_posts_1731852703_1731859903.csv',
 '../data/reddit_coffee_posts_1733826123_1733833323.csv',
 '../data/reddit_coffee_posts_1737900670_1737907870.csv',
 '../data/reddit_coffee_posts_1735070987_1735078187.csv',
 '../data/reddit_coffee_posts_1737987221_1737994421.csv',
 '../data/reddit_coffee_posts_1736636142_1736643342.csv',
 '../data/reddit_coffee_posts_1738131253_1738138453.csv']

In [10]:
len(CSV_FILES)

976

In [11]:
df_list = []

In [12]:
for f in CSV_FILES:
    df = pd.read_csv(f)
    df_list.append(df)

In [13]:
combined_df = pd.concat(df_list, ignore_index=True)

In [14]:
combined_df.shape

(145237, 10)

In [15]:
combined_df.sample(2)

Unnamed: 0,id,url,title,text,score,created_utc,subreddit,num_comments,upvote_ratio,over_18
140520,1hvss15,https://www.reddit.com/r/Dhaka/comments/1hvss1...,What do i do now?,During my exam time i have been facing many pr...,1,1736261000.0,Dhaka,0,0.99,False
86424,1hyyopm,https://www.reddit.com/r/lithuania/comments/1h...,Thank you Lithuania,"Hi, I am a German officer currently stationed ...",121,1736611000.0,lithuania,8,0.98,False


## Merge Scraped data with Kaggle data

In [16]:
df_final = pd.concat([df_kaggle, combined_df], ignore_index=True)

In [17]:
df_final.shape

(212063, 10)

In [18]:
max_utc = df_final['created_utc'].max()
max_datetime = datetime.strftime(datetime.utcfromtimestamp(max_utc), "%Y-%m-%d %H:%m:%S")
print(max_datetime)

2025-02-04 00:02:41


  max_datetime = datetime.strftime(datetime.utcfromtimestamp(max_utc), "%Y-%m-%d %H:%m:%S")


In [19]:
df_final.drop_duplicates(inplace=True)

In [20]:
df_final.shape

(210779, 10)

## Upload to kaggle

In [21]:
version_name = f"reddit_coffee_scraper_till_{max_datetime}"
print(version_name)

reddit_coffee_scraper_till_2025-02-04 00:02:41


In [22]:
df_final.to_csv(f"kaggle_dataset/{version_name}.csv", index=False)

In [47]:
# delete the old versions from the directory kaggle_dataset/ first, then run below command

In [28]:
os.system(f'kaggle datasets version -p kaggle_dataset/ -m "New data upload - {version_name}" --dir-mode zip')

Starting upload for file reddit_coffee_scraper_till_2025-02-04 00:02:41.csv


100%|██████████| 352M/352M [01:35<00:00, 3.89MB/s] 
  0%|          | 0.00/285 [00:00<?, ?B/s]

Upload successful: reddit_coffee_scraper_till_2025-02-04 00:02:41.csv (352MB)
Starting upload for file .ipynb_checkpoints.zip


100%|██████████| 285/285 [00:00<00:00, 515B/s]


Upload successful: .ipynb_checkpoints.zip (285B)
Dataset version is being created. Please check progress at https://www.kaggle.com/shahp7575/reddit-posts-with-keyword-coffee


0

In [29]:
# check if data successfully uploaded on kaggle.
# next, delete the file from kaggle_dataset
# delete all files from data/, except one.

In [33]:
!rm -r ../data/*.csv