In [1]:
import json
import csv
import os 
from datetime import datetime
import time
import re
import nltk
from nltk.util import ngrams
from nltk import word_tokenize
import pandas as pd
import textblob

In [2]:
# Matches, in order of alternation: @mentions, any single character that is not
# an ASCII letter, digit, space or tab, and URL-like tokens (scheme://rest).
# Written as a raw string so the regex escapes (\w, \/, \S) are not treated as
# invalid Python string escapes.
_CLEAN_TEXT_PATTERN = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")


def clean_text(text):
    """Return *text* stripped of mentions, URLs and non-alphanumeric characters.

    Every match of the clean-up pattern is replaced by a space, then the
    result is split on whitespace and re-joined with single spaces, so the
    returned string has no leading, trailing or repeated whitespace.
    """
    return ' '.join(_CLEAN_TEXT_PATTERN.sub(" ", text).split())

def get_character_count(text):
    """Return the length of *text* after it has been passed through ``clean_text``."""
    cleaned = clean_text(text)
    return len(cleaned)

# Lazily-loaded English stop-word set, shared by every get_word_counts call so
# the NLTK corpus is read only once instead of once per text.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return NLTK's English stop words as a set, loading them on first use."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(nltk.corpus.stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def get_word_counts(text):
    """Return word statistics for *text* after cleaning and tokenizing it.

    The text is normalised with ``clean_text`` and split with
    ``word_tokenize``.

    Returns:
        A 4-tuple ``(words, uniq_words, stop_words, non_stop_words)``:
        the total number of tokens, the number of distinct tokens, the number
        of distinct tokens that are in the English stop-word list, and the
        number of distinct tokens that are not.

    Note that the last two values count *distinct* tokens, not occurrences.
    NOTE(review): tokens are compared to the stop-word list without case
    folding; if the list is all lowercase, capitalised tokens such as "The"
    count as non-stop words - confirm this is intended.
    """
    text = clean_text(text)
    words = word_tokenize(text)
    uniq_words = set(words)
    stop_words = _english_stop_words()

    return (
        len(words),
        len(uniq_words),
        len(uniq_words.intersection(stop_words)),
        len(uniq_words.difference(stop_words)),
    )

def get_day_of_week(date):
    """Return the weekday index of *date*, as reported by its ``weekday()`` method.

    For ``datetime`` objects this is 0 for Monday through 6 for Sunday.
    """
    weekday_index = date.weekday()
    return weekday_index

def get_text_sentiment(text):
    """Classify *text* by the sign of its TextBlob polarity score.

    Returns 1 when the polarity is above zero, 0 when it is exactly zero,
    and -1 otherwise.
    """
    polarity = textblob.TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 1
    if polarity == 0:
        return 0
    return -1

In [3]:
# Convert every <subreddit>.json dump in path_to_json into a <subreddit>.csv
# file with one row per post, enriched with text and timing features.
path_to_json = './data/'
json_files = sorted(name for name in os.listdir(path_to_json) if name.endswith('.json'))
# To process a single subreddit, override the list, e.g. json_files = ['knitting.json']

# Column order of the output CSV. Every name is a key of the enriched post
# dict, so rows are built directly from this list and cannot drift out of sync.
header = ['post_score', 'post_archived', 'post_distinguished', 'post_downs', 'post_edited', 'post_gilded', 'post_gilded_silver',
    'post_gilded_gold', 'post_gilded_platinum', 'post_is_original_content', 'post_is_video', 'post_likes',
    'post_num_comments', 'post_num_crossposts', 'post_num_reports', 'post_over_18', 'post_pinned', 'subreddit',
    'subreddit_subscribers', 'post_ups', 'posted_len_minutes', 'post_title_sentiment', 'post_title_characters',
    'post_title_words', 'post_title_uniq_words', 'post_title_stop_words', 'post_title_non_stop_words',
    'post_text_sentiment', 'post_text_characters', 'post_text_words', 'post_text_uniq_words',
    'post_text_stop_words', 'post_text_non_stop_words', 'day_of_week']


def _add_text_features(item, prefix, text):
    """Store sentiment and size features of cleaned *text* on *item*.

    Writes the keys '<prefix>_sentiment', '<prefix>_characters',
    '<prefix>_words', '<prefix>_uniq_words', '<prefix>_stop_words' and
    '<prefix>_non_stop_words'. All six are 0 when *text* is empty.
    """
    if text:
        sentiment = get_text_sentiment(text)
        characters = get_character_count(text)
        words, uniq_words, stop_words, non_stop_words = get_word_counts(text)
    else:
        sentiment = characters = words = uniq_words = stop_words = non_stop_words = 0

    item[prefix + '_sentiment'] = sentiment
    item[prefix + '_characters'] = characters
    item[prefix + '_words'] = words
    item[prefix + '_uniq_words'] = uniq_words
    item[prefix + '_stop_words'] = stop_words
    item[prefix + '_non_stop_words'] = non_stop_words


def _add_time_features(item):
    """Store 'posted_len_minutes' and 'day_of_week' on *item*.

    'posted_len_minutes' is the absolute gap, in whole minutes, between the
    post's creation timestamp and the time it was captured.
    """
    # Both timestamps are truncated to whole seconds before comparing.
    posted_at = datetime.utcfromtimestamp(item['post_created_utc']).replace(microsecond=0)
    # NOTE(review): the slice assumes 'captured_on_utc' ends with a 7-character
    # fractional-seconds suffix such as '.123456' - confirm against the scraper.
    captured_at = datetime.strptime(item['captured_on_utc'][:-7], '%Y-%m-%d %H:%M:%S')

    # Use the full duration: timedelta.days alone drops the sub-day part and
    # floors negative deltas, which made a 1-hour-old post look 1440 minutes old.
    elapsed_seconds = abs((posted_at - captured_at).total_seconds())
    item['posted_len_minutes'] = int(elapsed_seconds // 60)

    # NOTE(review): the weekday is taken from the capture time, not the time
    # the post was created - confirm this is the intended feature.
    item['day_of_week'] = get_day_of_week(captured_at)


for file in json_files:
    file = file[:-5]  # drop the '.json' extension; reused for the log line and CSV name

    print('Parsing subreddit', file)

    with open(os.path.join(path_to_json, file + '.json'), 'r', encoding='utf-8') as f:
        data = json.load(f)

    new_data = []

    for item in data:
        _add_text_features(item, 'post_title', clean_text(item['post_title']))
        _add_text_features(item, 'post_text', clean_text(item['post_selftext']))
        _add_time_features(item)
        new_data.append(item)

    # Build CSV data: the header row followed by one row per post.
    csv_data = [header]
    csv_data.extend([item[column] for column in header] for item in new_data)

    # Write CSV data to a file next to the source JSON; the with-block closes it.
    with open(os.path.join(path_to_json, file + '.csv'), 'w', newline='', encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerows(csv_data)

print('All done parsing files')

Parsing subreddit askmen
Parsing subreddit askwomen
Parsing subreddit aww-no-metadata
Parsing subreddit aww
Parsing subreddit conspiracy
Parsing subreddit fitness
Parsing subreddit knitting
All done parsing files
