In [1]:
import json
import csv
import os 
from datetime import datetime
import time
import re
import nltk
from nltk.util import ngrams
from nltk import word_tokenize
import pandas as pd
import textblob

In [2]:
def clean_text(text): 
    return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", text).split())

def get_character_count(text):
    return len(clean_text(text))

def get_word_counts(text):
    text = clean_text(text)
    words = word_tokenize(text)
    uniq_words = set(words)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    
    return len(words), len(uniq_words), len(uniq_words.intersection(stop_words)), len(uniq_words.difference(stop_words))

def get_day_of_week(date):
    return date.weekday()

def get_text_sentiment(text): 
    analysis = textblob.TextBlob(text) 
    if analysis.sentiment.polarity > 0: 
        return 1
    elif analysis.sentiment.polarity == 0: 
        return 0
    else: 
        return -1
    
def get_ngrams(text, n):
    text = clean_text(text)
    words = word_tokenize(text)
    tokens = set(words).difference(set(nltk.corpus.stopwords.words('english')))
    return list(ngrams(tokens, n))

In [None]:
path_to_json = './data/'
json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]
#json_files = ['knitting.json']

for file in json_files:
    file = file[:-5]
    
    print('Parsing subreddit', file)
    
    with open('./data/' + file + '.json', 'r') as f:
        data = json.load(f)
    
    new_data = []
    
    w_bigrams = []
    w_trigrams = []
    w_quadgrams = []

    for item in data:
        post_title = clean_text(item['post_title'].lower())
        post_text = clean_text(item['post_selftext'].lower())
        
        if len(post_title) > 0:
            item['post_title_sentiment'] = get_text_sentiment(post_title)
            item['post_title_characters'] = get_character_count(post_title)
            num_words, num_uniq_words, num_stop_words, num_non_stop_words = get_word_counts(post_title)
            item['post_title_words'] = num_words
            item['post_title_uniq_words'] = num_uniq_words
            item['post_title_stop_words'] = num_stop_words
            item['post_title_non_stop_words'] = num_non_stop_words
            
            # get ngrams
            wbg = get_ngrams(post_title, 2)
            wtg = get_ngrams(post_title, 3)
            wqg = get_ngrams(post_title, 4)
            
            w_bigrams = w_bigrams + wbg
            w_trigrams = w_trigrams + wtg
            w_quadgrams = w_quadgrams + wqg
            
            item['post_title_bigrams'] = wbg
            item['post_title_trigrams'] = wtg
            item['post_title_quadgrams'] = wqg

        else:
            item['post_title_sentiment'] = 0
            item['post_title_characters'] = 0
            item['post_title_words'] = 0
            item['post_title_uniq_words'] = 0
            item['post_title_stop_words'] = 0
            item['post_title_non_stop_words'] = 0
            item['post_title_bigrams'] = []
            item['post_title_trigrams'] = []
            item['post_title_quadgrams'] = []

        if len(post_text) > 0:
            item['post_text_sentiment'] = get_text_sentiment(post_text)
            item['post_text_characters'] = get_character_count(post_text)
            num_words, num_uniq_words, num_stop_words, num_non_stop_words = get_word_counts(post_text)
            item['post_text_words'] = num_words
            item['post_text_uniq_words'] = num_uniq_words
            item['post_text_stop_words'] = num_stop_words
            item['post_text_non_stop_words'] = num_non_stop_words
        else:
            item['post_text_sentiment'] = 0
            item['post_text_characters'] = 0
            item['post_text_words'] = 0
            item['post_text_uniq_words'] = 0
            item['post_text_stop_words'] = 0
            item['post_text_non_stop_words'] = 0
            
        
        #if 'metadata' in item.keys():
        #    metadata = metadata + item['metadata']
        #else:
        #    item['metadata'] = []
        

        # calculate time frames
        posted_on = item['post_created_utc']
        captured_on = item['captured_on_utc']
        d1 = datetime.strptime(datetime.utcfromtimestamp(posted_on).strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
        d2 = datetime.strptime(captured_on[:-7], '%Y-%m-%d %H:%M:%S')
        posted_len_minutes = abs((d1-d2).days * 24 * 60)
        day_of_week = get_day_of_week(d2)

        item['posted_len_minutes'] = posted_len_minutes
        item['day_of_week'] = day_of_week

        new_data.append(item)
    
    
    # summary data for each file
    w_bigrams_uniq = set(w_bigrams)
    w_trigrams_uniq = set(w_trigrams)
    w_quadgrams_uniq = set(w_quadgrams)

    print(' post title ngram totals:')
    print('  ', file)
    print('   Bigrams', len(w_bigrams))
    print('   Trigrams', len(w_trigrams))
    print('   Quadgrams', len(w_quadgrams))
    print()
    print(' post title unique ngram totals:')
    print('  ', file)
    print('   Bigrams', len(w_bigrams_uniq))
    print('   Trigrams', len(w_trigrams_uniq))
    print('   Quadgrams', len(w_quadgrams_uniq))

        

    # Build CSV header
    header = ['post_score', 'post_archived', 'post_distinguished', 'post_downs', 'post_edited', 'post_gilded', 'post_gilded_silver', 
        'post_gilded_gold', 'post_gilded_platinum', 'post_is_original_content', 'post_is_video', 'post_likes', 
        'post_num_comments', 'post_num_crossposts', 'post_num_reports', 'post_over_18', 'post_pinned', 'subreddit',
        'subreddit_subscribers', 'post_ups', 'posted_len_minutes', 'post_title_sentiment', 'post_title_characters', 
        'post_title_words', 'post_title_uniq_words', 'post_title_stop_words', 'post_title_non_stop_words', 
        'post_text_sentiment', 'post_text_characters', 'post_text_words', 'post_text_uniq_words', 
        'post_text_stop_words', 'post_text_non_stop_words', 'day_of_week']
    
    static_header_count = len(header)
    
    header = header + [x[0] + '*' + x[1] for x in list(w_bigrams_uniq)]
    header = header + [x[0] + '*' + x[1] + '*' + x[2] for x in list(w_trigrams_uniq)]
    header = header + [x[0] + '*' + x[1] + '*' + x[2] + '*' + x[3] for x in list(w_quadgrams_uniq)]


    
    
    # Build CSV data
    csv_data = [header]

    for item in new_data:
        row = []
        row.append(item['post_score'])
        row.append(item['post_archived'])
        row.append(item['post_distinguished'])
        row.append(item['post_downs'])
        row.append(item['post_edited'])
        row.append(item['post_gilded'])
        row.append(item['post_gilded_silver'])
        row.append(item['post_gilded_gold'])
        row.append(item['post_gilded_platinum'])
        row.append(item['post_is_original_content'])
        row.append(item['post_is_video'])
        row.append(item['post_likes'])
        row.append(item['post_num_comments'])
        row.append(item['post_num_crossposts'])
        row.append(item['post_num_reports'])
        row.append(item['post_over_18'])
        row.append(item['post_pinned'])
        row.append(item['subreddit'])
        row.append(item['subreddit_subscribers'])
        row.append(item['post_ups'])
        row.append(item['posted_len_minutes'])
        row.append(item['post_title_sentiment'])
        row.append(item['post_title_characters'])
        row.append(item['post_title_words'])
        row.append(item['post_title_uniq_words'])
        row.append(item['post_title_stop_words'])
        row.append(item['post_title_non_stop_words'])
        row.append(item['post_text_sentiment'])
        row.append(item['post_text_characters'])
        row.append(item['post_text_words'])
        row.append(item['post_text_uniq_words'])
        row.append(item['post_text_stop_words'])
        row.append(item['post_text_non_stop_words'])
        row.append(item['day_of_week'])
        
        for x in header[static_header_count:]:
            temp = x.split('*')
            contains_ngram = False

            if len(temp) == 2:
                for y in item['post_title_bigrams']:
                    if y[0] == temp[0] and y[1] == temp[1]:
                        contains_ngram = True
            if len(temp) == 3:
                for y in item['post_title_trigrams']:
                    if y[0] == temp[0] and y[1] == temp[1] and y[2] == temp[2]:
                        contains_ngram = True
            if len(temp) == 4:
                for y in item['post_title_quadgrams']:
                    if y[0] == temp[0] and y[1] == temp[1] and y[2] == temp[2] and y[3] == temp[3]:
                        contains_ngram = True
                        
            row.append(int(contains_ngram))
        
        csv_data.append(row)
    
    
    print(' Total records:', len(csv_data)-1)
    print(' Total features:', len(csv_data[0]))
    
    # Write CSV data to a file
    with open('./data/' + file + '.csv', 'w', newline='', encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerows(csv_data)
        f.close()
        
print('All done parsing files')

Parsing subreddit askmen




 post title ngram totals:
   askmen
   Bigrams 5082
   Trigrams 4096
   Quadgrams 3154

 post title unique ngram totals:
   askmen
   Bigrams 4858
   Trigrams 4091
   Quadgrams 3152
 Total records: 991
 Total features: 12135
Parsing subreddit askwomen
 post title ngram totals:
   askwomen
   Bigrams 4662
   Trigrams 3683
   Quadgrams 2765

 post title unique ngram totals:
   askwomen
   Bigrams 4438
   Trigrams 3663
   Quadgrams 2750
 Total records: 991
 Total features: 10885
Parsing subreddit aww-no-metadata
 post title ngram totals:
   aww-no-metadata
   Bigrams 3389
   Trigrams 2498
   Quadgrams 1824

 post title unique ngram totals:
   aww-no-metadata
   Bigrams 3266
   Trigrams 2491
   Quadgrams 1822
 Total records: 987
 Total features: 7613
Parsing subreddit aww
 post title ngram totals:
   aww
   Bigrams 3389
   Trigrams 2498
   Quadgrams 1824

 post title unique ngram totals:
   aww
   Bigrams 3266
   Trigrams 2491
   Quadgrams 1822
 Total records: 987
 Total features: 7613
Par