In [2]:
#load in dependencies
import pandas as pd
import numpy as np
import bz2
import json

In [2]:
#lists of words to narrow down how many results I get back
blacklist = ['Fluxbox', 'fluxbox', 'ps2pdf', 'cps3', 'mps3', 'Trixbox', 'rps25', 'aps2', 'gps2', 'lps3', 'pps25', r'a-zA-Z'+'ps', r'a-zA-Z'+'xbox']
ps_list = ['playstation', 'ps1', 'ps2', 'ps3', 'ps4']
xbox_list = ['xbox']

#ps/xbox_mention scan an individual post to see if it contains info about the corresponding console
def ps_mention(row):
    mention = False
    if not any(entry in row for entry in blacklist):
        if any(x in row for x in ps_list):
            mention = True
    return mention

def xbox_mention(row):
    mention = False
    if not any(entry in row for entry in blacklist):
        if any(x in row for x in xbox_list):
            mention = True
    return mention

#breaks a dataframe down into the useful columns and filters out rows that don't contain ps/xbox mentions
def cleaning(uncleaned_df):
    cleaned = uncleaned_df[['id', 'score', 'controversiality', 'subreddit', 'body', 'month', 'year']]
    original_size = len(uncleaned_df.index)
    cleaned['original_size'] = original_size
    cleaned['PS'] = cleaned['body'].apply(ps_mention)
    cleaned['XBOX'] = cleaned['body'].apply(xbox_mention)
    df = cleaned[(cleaned['PS'] == True) | (cleaned['XBOX'] == True)]
    df.set_index('id', inplace=True, drop=True) 
    return(df)

In [11]:
#year and month iterators for running through the bz2 files
#note that I ran each year independently due to memory issues with later years that had much larger files

#years = ['2006', '2007', '2008', '2009']
years = ['2009']
months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']

#runs through the bz2 files containing the raw Reddit data and passes them to the cleaning function above
#then saves the result as a json file one month at a time
for y in years:
    for m in months:
        df = pd.DataFrame()
        try:
            filename = "data/" + y + "/RC_" + y + "-" + m + ".bz2"
            parsed_file = pd.read_json(filename, compression='bz2', lines=True)
            parsed_file['month'] = m
            parsed_file['year'] = y
            cleaned_monthly = cleaning(parsed_file)
            df = df.append(cleaned_monthly)
            print(filename + " parsed")
            df.to_json('dataframes/df'+y+'-'+m+'.json')
            print(df.shape)
            
        except:
            print("file named " + filename + " does not exist")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned['original_size'] = original_size
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned['PS'] = cleaned['body'].apply(ps_mention)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned['XBOX'] = cleaned['body'].apply(xbox_mention)


data/2009/RC_2009-01.bz2 parsed
(470, 9)
data/2009/RC_2009-02.bz2 parsed
(441, 9)
data/2009/RC_2009-03.bz2 parsed
(492, 9)
data/2009/RC_2009-04.bz2 parsed
(447, 9)
data/2009/RC_2009-05.bz2 parsed
(547, 9)
data/2009/RC_2009-06.bz2 parsed
(754, 9)
data/2009/RC_2009-07.bz2 parsed
(750, 9)
data/2009/RC_2009-08.bz2 parsed
(1053, 9)
data/2009/RC_2009-09.bz2 parsed
(1132, 9)
data/2009/RC_2009-10.bz2 parsed
(1406, 9)
data/2009/RC_2009-11.bz2 parsed
(1957, 9)
data/2009/RC_2009-12.bz2 parsed
(2096, 9)


In [80]:
#compiles a set of year-month jsons from the same year into one larger dataframe
years = ['2006', '2007', '2008', '2009']
months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
for y in years:
    compiled_df = pd.DataFrame()
    for m in months:
        url = 'dataframes/df'+y+'-'+m+'.json'
        df = pd.read_json(url)
        compiled_df = compiled_df.append(df)
    compiled_df.to_json('dataframes/df'+y+'.json')
    print(compiled_df.shape)

(82, 9)
(431, 9)
(2712, 9)
(11545, 9)


In [84]:
#compile all year dataframes into one
df06 = pd.read_json('dataframes/df2006.json')
df07 = pd.read_json('dataframes/df2007.json')
df08 = pd.read_json('dataframes/df2008.json')
df09 = pd.read_json('dataframes/df2009.json')

final_df = df06.append(df07.append(df08.append(df09)))
final_df.to_json('dataframes/reddit_data.json')
print(final_df.shape)

(14770, 9)
