In [11]:
# import packages
import os

import pandas as pd

In [None]:
# overview of function
# inputs: name of the subreddit (used to build the raw/ and tidied/ csv paths)
# returns: new file path, original shape (num posts), final shape (num posts)

In [42]:
def subreddit_tidier(subreddit_name, cutoff='2024-09-01 00:00:00'):
    """Tidy one subreddit's raw csv and save the result under ``tidied/``.

    Steps, in order (the row count is printed after each filter):
      1. read ``raw/r_<subreddit_name>_dataset.csv``
      2. drop posts whose 'Post Text' is NaN
      3. drop posts whose text carries an edit marker
         ('EDIT:', 'Edit:' or 'edit:')
      4. keep only posts whose 'Created On' is before ``cutoff``
      5. sort by 'Created On' and rename 'Unnamed: 0' to 'Popularity Index'
      6. write ``tidied/r_<subreddit_name>_tidied.csv`` (no index column)

    Parameters
    ----------
    subreddit_name : str
        Subreddit name as it appears in the raw csv filename.
    cutoff : str, optional
        Exclusive upper bound compared against 'Created On'.
        Defaults to '2024-09-01 00:00:00'.

    Returns
    -------
    tuple of (str, int, int)
        Path of the tidied csv, number of posts in the raw csv, and number
        of posts written to the tidied csv.
    """
    # read in raw csv
    path = 'raw/r_' + subreddit_name + '_dataset.csv'
    df = pd.read_csv(path)
    original_count = df.shape[0]
    print('original size of df:', original_count)

    # remove any posts that are NaN for column 'Post Text'
    df = df[df['Post Text'].notna()]
    print('size of df after removing NaN:', df.shape[0])

    # discard any posts that include an edit marker
    # messes with the focus on evolution / timing of the post
    # - astype(str): if every post was NaN the column is read as float and the
    #   .str accessor would raise on the (now empty) frame
    # - the lookbehind stops words that merely end in "edit" (e.g. "credit:")
    #   from being treated as edit markers
    post_text = df['Post Text'].astype(str)
    is_edited = post_text.str.contains(r'(?<![A-Za-z])(?:EDIT|Edit|edit):')
    df = df[~is_edited]
    print('size of df after removing edited posts:', df.shape[0])

    # apply cutoff date
    # NOTE(review): this is a plain string comparison, so it assumes
    # 'Created On' holds 'YYYY-MM-DD HH:MM:SS' text — confirm against raw data
    df = df[df['Created On'] < cutoff]
    print('size of df after tidying:', df.shape[0])

    # sort by column 'Created On'
    df = df.sort_values(by=['Created On'], ignore_index=True)
    df = df.rename(columns={'Unnamed: 0': 'Popularity Index'})

    # save new tidied dataset (create the output folder on first use)
    output_file_path = 'tidied/r_' + subreddit_name + '_tidied.csv'
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
    df.to_csv(output_file_path, index=False)
    print("\ncleaned dataset saved to:", output_file_path)

    return output_file_path, original_count, df.shape[0]

In [44]:
# run the tidier on a single subreddit; the per-step row counts are printed below
subreddit_tidier('printedcircuitboard')

original size of df: 801
size of df after removing NaN: 329
size of df after removing edited posts: 296
size of df after tidying: 293

cleaned dataset saved to: tidied/r_printedcircuitboard_tidied.csv


In [46]:
# NOTE(review): the saved output under the loop cell lists more subreddits
# than this list contains, so the list was probably edited after the last
# run — re-run the notebook to refresh the output
subreddits = ['adops'] # list of subreddits to tidy (the raw csvs must already exist under raw/)

In [48]:
# tidy every subreddit in the list, one csv at a time
# names are lower-cased before building the file path — presumably the raw
# csv filenames are all lower-case; TODO confirm
for subreddit in subreddits: 
    name = subreddit.lower()
    subreddit_tidier(name)

original size of df: 566
size of df after removing NaN: 266
size of df after removing edited posts: 255
size of df after tidying: 254

cleaned dataset saved to: tidied/r_adops_tidied.csv
original size of df: 996
size of df after removing NaN: 170
size of df after removing edited posts: 169
size of df after tidying: 164

cleaned dataset saved to: tidied/r_adsb_tidied.csv
original size of df: 999
size of df after removing NaN: 677
size of df after removing edited posts: 660
size of df after tidying: 655

cleaned dataset saved to: tidied/r_airreps_tidied.csv
original size of df: 771
size of df after removing NaN: 54
size of df after removing edited posts: 54
size of df after tidying: 53

cleaned dataset saved to: tidied/r_augmentedreality_tidied.csv
original size of df: 798
size of df after removing NaN: 374
size of df after removing edited posts: 352
size of df after tidying: 346

cleaned dataset saved to: tidied/r_bose_tidied.csv
original size of df: 757
size of df after removing NaN: 2