In [1]:
import json
import csv
import os 
from datetime import datetime
import time
import re
import nltk
from nltk.util import ngrams
from nltk import word_tokenize
import pandas as pd

In [8]:
path_to_json = './data/'
#json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]
json_files = ['knitting.json']

for file in json_files:
    file = file[:-5]
    with open('./data/' + file + '.json', 'r') as f:
        data = json.load(f)
    
    
    new_data = []

    w_bigrams = []
    w_trigrams = []
    w_quadgrams = []
    metadata = []

    for item in data:
        post_text = item['post_selftext']
        wbg = []
        wtg = []
        wqg = []

        # tokenize and ngramify post text if available
        if len(post_text) > 0:
            post_text = re.sub('https?:\/{2}[\d\w]+\.([\d\w]+)*(\/[^\s]*)*', '', post_text)
            post_text = re.sub(r'[^\w\s]', '', post_text)

            #emoji_pattern = re.compile("["
            #    u"\U0001F600-\U0001F64F"  # emoticons
            #    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            #    u"\U0001F680-\U0001F6FF"  # transport & map symbols
            #    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            #                       "]+", flags=re.UNICODE)
            #post_text = emoji_pattern.sub(r'', post_text)

            words = word_tokenize(post_text)
            words_minus_stop_words = set(words).difference(set(nltk.corpus.stopwords.words('english')))
            wnl = nltk.WordNetLemmatizer()
            tokens = [wnl.lemmatize(t) for t in words_minus_stop_words]

            wbg = list(ngrams(tokens, 2))
            wtg = list(ngrams(tokens, 3))
            wqg = list(ngrams(tokens, 4))

            w_bigrams = w_bigrams + wbg
            w_trigrams = w_trigrams + wtg
            w_quadgrams = w_quadgrams + wqg

        # collect all metadata tags
        if 'metadata' in item.keys():
            metadata = metadata + item['metadata']
        else:
            item['metadata'] = []

        # calculate time frames
        posted_on = item['post_created_utc']
        captured_on = item['captured_on_utc']
        d1 = datetime.strptime(datetime.utcfromtimestamp(posted_on).strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
        d2 = datetime.strptime(captured_on[:-7], '%Y-%m-%d %H:%M:%S')
        posted_len_minutes = abs((d1-d2).days * 24 * 60)

        item['posted_len_minutes'] = posted_len_minutes
        item['word_bigrams'] = wbg
        item['word_trigrams'] = wtg
        item['word_quadgrams'] = wqg

        new_data.append(item)

    uniq_metadata = set(metadata)
    uniq_bigrams = set(w_bigrams)
    uniq_trigrams = set(w_trigrams)
    uniq_quadgrams = set(w_quadgrams)
    
    print('Metadata', len(uniq_metadata))
    print('Bigrams', len(uniq_bigrams))
    print('Trigrams', len(uniq_trigrams))
    print('Quadgrams', len(uniq_quadgrams))
    
    
    

    # Build CSV header
    header = ['post_score', 'post_archived', 'post_distinguished', 'post_downs', 'post_edited', 'post_gilded', 'post_gilded_silver', 
        'post_gilded_gold', 'post_gilded_platinum', 'post_is_original_content', 'post_is_video', 'post_likes', 
        'post_num_comments', 'post_num_crossposts', 'post_num_reports', 'post_over_18', 'post_pinned', 'subreddit',
        'subreddit_subscribers', 'post_ups', 'posted_len_minutes']

    header = header + [x[0] + '*' + x[1] for x in list(uniq_bigrams)]
    header = header + [x[0] + '*' + x[1] + '*' + x[2] for x in list(uniq_trigrams)]
    header = header + [x[0] + '*' + x[1] + '*' + x[2] + '*' + x[3] for x in list(uniq_quadgrams)]
    header = header + [x for x in list(uniq_metadata)]


    # Build CSV data
    csv_data = [header]

    for item in new_data:
        row = []
        row.append(item['post_score'])
        row.append(item['post_archived'])
        row.append(item['post_distinguished'])
        row.append(item['post_downs'])
        row.append(item['post_edited'])
        row.append(item['post_gilded'])
        row.append(item['post_gilded_silver'])
        row.append(item['post_gilded_gold'])
        row.append(item['post_gilded_platinum'])
        row.append(item['post_is_original_content'])
        row.append(item['post_is_video'])
        row.append(item['post_likes'])
        row.append(item['post_num_comments'])
        row.append(item['post_num_crossposts'])
        row.append(item['post_num_reports'])
        row.append(item['post_over_18'])
        row.append(item['post_pinned'])
        row.append(item['subreddit'])
        row.append(item['subreddit_subscribers'])
        row.append(item['post_ups'])
        row.append(item['posted_len_minutes'])

        for x in header[21:]:
            temp = x.split('*')
            contains_ngram = False

            if len(temp) == 1:
                if temp[0] in item['metadata']:
                    contains_ngram = True
            if len(temp) == 2:
                for y in item['word_bigrams']:
                    if y[0] == temp[0] and y[1] == temp[1]:
                        contains_ngram = True
                        break
            if len(temp) == 3:
                for y in item['word_trigrams']:
                    if y[0] == temp[0] and y[1] == temp[1] and y[2] == temp[2]:
                        contains_ngram = True
                        break
            if len(temp) == 4:
                for y in item['word_quadgrams']:
                    if y[0] == temp[0] and y[1] == temp[1] and y[2] == temp[2] and y[3] == temp[3]:
                        contains_ngram = True
                        break

            row.append(int(contains_ngram))

        csv_data.append(row)
    
    # Write CSV data to a file
    with open('./data/' + file + '.csv', 'w', newline='', encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerows(csv_data)
        f.close()




Metadata 0
Bigrams 9773
Trigrams 10637
Quadgrams 10438
