In [1]:
import json
import csv
import os 
from datetime import datetime
import time
import re
import nltk
from nltk.util import ngrams
from nltk import word_tokenize
import pandas as pd
import textblob

In [13]:
def clean_text(text): 
    return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", text).split())

def get_character_count(text):
    return len(clean_text(text))

def get_word_counts(text):
    text = clean_text(text)
    words = word_tokenize(text)
    uniq_words = set(words)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    
    return len(words), len(uniq_words), len(uniq_words.intersection(stop_words)), len(uniq_words.difference(stop_words))

def get_num_words(text):
    text = clean_text(text)
    words = word_tokenize(text)
    return len(words)

def get_num_uniq_words(text):
    text = clean_text(text)
    words = word_tokenize(text)
    uniq_words = set(words)
    return len(uniq_words)

def get_num_stop_words(text):
    text = clean_text(text)
    words = word_tokenize(text)
    uniq_words = set(words)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    
    return len(uniq_words.intersection(stop_words))

def get_num_non_stop_words(text):
    text = clean_text(text)
    words = word_tokenize(text)
    uniq_words = set(words)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    
    return len(uniq_words.difference(stop_words))

def get_day_of_week(date):
    return date.weekday()

def get_text_sentiment(text): 
    analysis = textblob.TextBlob(text) 
    if analysis.sentiment.polarity > 0: 
        return 1
    elif analysis.sentiment.polarity == 0: 
        return 0
    else: 
        return -1
    
def get_ngrams(text, n):
    text = clean_text(text)
    words = word_tokenize(text)
    tokens = set(words).difference(set(nltk.corpus.stopwords.words('english')))
    return list(ngrams(tokens, n))

In [3]:
def format_simple_data(data, is_original):
    output = []

    for item in data:
        item['is_original'] = is_original
        post_title = clean_text(item['post_title'].lower())
        post_text = clean_text(item['post_selftext'].lower())
        
        if len(post_title) > 0:
            item['post_title_sentiment'] = get_text_sentiment(post_title)
            item['post_title_characters'] = get_character_count(post_title)
            num_words, num_uniq_words, num_stop_words, num_non_stop_words = get_word_counts(post_title)
            item['post_title_words'] = num_words
            item['post_title_uniq_words'] = num_uniq_words
            item['post_title_stop_words'] = num_stop_words
            item['post_title_non_stop_words'] = num_non_stop_words
        else:
            item['post_title_sentiment'] = 0
            item['post_title_characters'] = 0
            item['post_title_words'] = 0
            item['post_title_uniq_words'] = 0
            item['post_title_stop_words'] = 0
            item['post_title_non_stop_words'] = 0

        if len(post_text) > 0:
            item['post_text_sentiment'] = get_text_sentiment(post_text)
            item['post_text_characters'] = get_character_count(post_text)
            num_words, num_uniq_words, num_stop_words, num_non_stop_words = get_word_counts(post_text)
            item['post_text_words'] = num_words
            item['post_text_uniq_words'] = num_uniq_words
            item['post_text_stop_words'] = num_stop_words
            item['post_text_non_stop_words'] = num_non_stop_words
        else:
            item['post_text_sentiment'] = 0
            item['post_text_characters'] = 0
            item['post_text_words'] = 0
            item['post_text_uniq_words'] = 0
            item['post_text_stop_words'] = 0
            item['post_text_non_stop_words'] = 0
            
        # calculate time frames
        posted_on = item['post_created_utc']
        captured_on = item['captured_on_utc']
        d1 = datetime.strptime(datetime.utcfromtimestamp(posted_on).strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S')
        d2 = datetime.strptime(captured_on[:-7], '%Y-%m-%d %H:%M:%S')
        posted_len_minutes = abs((d1-d2).days * 24 * 60)
        day_of_week = get_day_of_week(d2)

        item['posted_len_minutes'] = posted_len_minutes
        item['day_of_week'] = day_of_week

        output.append(item)
    return output

In [4]:
def build_csv(data, include_header):
    if include_header:
        # Build CSV header
        header = ['post_score', 'is_original', 'post_title', 'post_archived', 'post_distinguished', 'post_downs', 'post_edited', 'post_gilded', 'post_gilded_silver', 
            'post_gilded_gold', 'post_gilded_platinum', 'post_is_original_content', 'post_is_video', 'post_likes', 
            'post_num_comments', 'post_num_crossposts', 'post_num_reports', 'post_over_18', 'post_pinned', 'subreddit',
            'subreddit_subscribers', 'post_ups', 'posted_len_minutes', 'post_title_sentiment', 'post_title_characters', 
            'post_title_words', 'post_title_uniq_words', 'post_title_stop_words', 'post_title_non_stop_words', 
            'post_text_sentiment', 'post_text_characters', 'post_text_words', 'post_text_uniq_words', 
            'post_text_stop_words', 'post_text_non_stop_words', 'day_of_week']

        # Build CSV data
        csv_data = [header]
    else:
        csv_data = []

    for item in data:
        row = []
        row.append(item['post_score'])
        row.append(item['is_original'])
        row.append(item['post_title'])
        row.append(item['post_archived'])
        row.append(item['post_distinguished'])
        row.append(item['post_downs'])
        row.append(item['post_edited'])
        row.append(item['post_gilded'])
        row.append(item['post_gilded_silver'])
        row.append(item['post_gilded_gold'])
        row.append(item['post_gilded_platinum'])
        row.append(item['post_is_original_content'])
        row.append(item['post_is_video'])
        row.append(item['post_likes'])
        row.append(item['post_num_comments'])
        row.append(item['post_num_crossposts'])
        row.append(item['post_num_reports'])
        row.append(item['post_over_18'])
        row.append(item['post_pinned'])
        row.append(item['subreddit'])
        row.append(item['subreddit_subscribers'])
        row.append(item['post_ups'])
        row.append(item['posted_len_minutes'])
        row.append(item['post_title_sentiment'])
        row.append(item['post_title_characters'])
        row.append(item['post_title_words'])
        row.append(item['post_title_uniq_words'])
        row.append(item['post_title_stop_words'])
        row.append(item['post_title_non_stop_words'])
        row.append(item['post_text_sentiment'])
        row.append(item['post_text_characters'])
        row.append(item['post_text_words'])
        row.append(item['post_text_uniq_words'])
        row.append(item['post_text_stop_words'])
        row.append(item['post_text_non_stop_words'])
        row.append(item['day_of_week'])
        csv_data.append(row)
    
    
    print(' Total records:', len(csv_data)-1)
    print(' Total features:', len(csv_data[0]))
    
    return csv_data

In [5]:
path_to_json = './data/'
json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json') and not pos_json.endswith('-top.json')]
#json_files = ['knitting.json']

for file in json_files:
    file = file[:-5]
    
    print('Parsing subreddit', file)
    
    with open('./data/' + file + '.json', 'r') as f:
        data = json.load(f)
        new_data = format_simple_data(data, True)
        
    csv_data = build_csv(new_data, True)
    with open('./data/' + file + '-simple.csv', 'w', newline='', encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerows(csv_data)
        f.close()
        
print('All done parsing files')

Parsing subreddit askmen
 Total records: 991
 Total features: 36
Parsing subreddit askwomen
 Total records: 991
 Total features: 36
Parsing subreddit aww
 Total records: 987
 Total features: 36
Parsing subreddit conspiracy
 Total records: 987
 Total features: 36
Parsing subreddit fitness
 Total records: 955
 Total features: 36
Parsing subreddit knitting
 Total records: 995
 Total features: 36
All done parsing files


In [6]:
path_to_csv = './data/'
csvs = [pos_csv for pos_csv in os.listdir(path_to_csv) if pos_csv.endswith('-simple.csv')]
csv_dict = dict()
for page in csvs:
    df = pd.read_csv(path_to_csv + page, low_memory=False)
    csv_dict[page] = df
    print(page)

askmen-simple.csv
askwomen-simple.csv
aww-simple.csv
conspiracy-simple.csv
fitness-simple.csv
knitting-simple.csv


In [7]:
for key in csv_dict.keys():
    df = csv_dict[key]
    cutoff_score = csv_dict[key]['post_score'].describe()[6]
    popular = len(csv_dict[key].loc[csv_dict[key]['post_score'] >= cutoff_score])
    unpopular = len(csv_dict[key].loc[csv_dict[key]['post_score'] < cutoff_score])
    
    print(key)
    #print('Top 25% cutoff', cutoff_score)
    #print('Total Popular', popular)
    #print('Total Not Popular', unpopular)
    #print('New Popular posts needed', unpopular-popular)
    #print()
    
    # fill data with top posts to make it 50-50
    file = key[:-11]
    with open('./data/' + file + '-top.json', 'r') as f:
        data = json.load(f)
        new_data = format_simple_data(data, False)
        
    # get up to 480 posts that exceed the cutoff
    top_data = []
    counter = 0
    for item in new_data:
        if item['post_score'] >= cutoff_score:
            top_data.append(item)
            counter = counter+1
            
        if counter > 480:
            break
    
    csv_data = build_csv(top_data, False)
    with open('./data/' + file + '-simple.csv', 'a', newline='', encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerows(csv_data)
        f.close()

askmen-simple.csv
 Total records: 480
 Total features: 36
askwomen-simple.csv
 Total records: 480
 Total features: 36
aww-simple.csv
 Total records: 480
 Total features: 36
conspiracy-simple.csv
 Total records: 480
 Total features: 36
fitness-simple.csv
 Total records: 480
 Total features: 36
knitting-simple.csv
 Total records: 480
 Total features: 36


In [8]:
path_to_csv = './data/'
csvs = [pos_csv for pos_csv in os.listdir(path_to_csv) if pos_csv.endswith('-simple.csv')]
csv_dict = dict()
for page in csvs:
    df = pd.read_csv(path_to_csv + page, low_memory=False)
    csv_dict[page] = df
    
    print(page)
    df = csv_dict[page]
    cutoff_score = df.loc[df['is_original'] == True]['post_score'].describe()[6]
    
    popular = len(df.loc[df['post_score'] >= cutoff_score])
    unpopular = len(df.loc[df['post_score'] < cutoff_score])
    
    print('cutoff', cutoff_score)
    print('Total Popular', popular)
    print('Total Not Popular', unpopular)
    print()

askmen-simple.csv
cutoff 19.0
Total Popular 734
Total Not Popular 738

askwomen-simple.csv
cutoff 21.0
Total Popular 732
Total Not Popular 740

aww-simple.csv
cutoff 36.0
Total Popular 731
Total Not Popular 737

conspiracy-simple.csv
cutoff 39.0
Total Popular 730
Total Not Popular 738

fitness-simple.csv
cutoff 14.0
Total Popular 723
Total Not Popular 713

knitting-simple.csv
cutoff 112.0
Total Popular 732
Total Not Popular 744



In [32]:
def append_ngram_data(data):
    w_bigrams = []
    w_trigrams = []
    
    for index, row in data.iterrows():
        wbg = get_ngrams(row['post_title'], 2)
        wtg = get_ngrams(row['post_title'], 3)
    
        w_bigrams = w_bigrams + wbg
        w_trigrams = w_trigrams + wtg
        
    w_bigrams_uniq = set(w_bigrams)
    w_trigrams_uniq = set(w_trigrams)
    
    # add all columns and set to 0
    for item in w_bigrams_uniq:
        data[item] = 0
        
    for item in w_trigrams_uniq:
        data[item] = 0
        
    # loop through all records again to set to 1 if applicable
    for index, row in data.iterrows():
        wbg = get_ngrams(row['post_title'], 2)
        wtg = get_ngrams(row['post_title'], 3)
        
        for item in w_bigrams_uniq:
            if item in wbg:
                data.loc[index, item] = 1
                
        for item in w_trigrams_uniq:
            if item in wtg:
                data.loc[index, item] = 1
    
    return data

In [34]:
# calculate ngrams and add to csvs
path_to_csv = './data/'
csvs = [pos_csv for pos_csv in os.listdir(path_to_csv) if pos_csv.endswith('-simple.csv')]

csv_dict = dict()
for key in csvs:
    df = pd.read_csv(path_to_csv + key, low_memory=False)
    print(key)
    print('Generating ngrams')
    csv_dict[key] = append_ngram_data(df)
    
    print('Saving file')
    save_path = './data/' + key[:-11] + '-full.csv'
    csv_dict[key].to_csv(path_or_buf=save_path, index=False)
    print('done saving', save_path)
    print()

askmen-simple.csv
Generating ngrams
Saving file
done saving ./data/askmen-full.csv

askwomen-simple.csv
Generating ngrams




Saving file
done saving ./data/askwomen-full.csv

aww-simple.csv
Generating ngrams
Saving file
done saving ./data/aww-full.csv

conspiracy-simple.csv
Generating ngrams
Saving file
done saving ./data/conspiracy-full.csv

fitness-simple.csv
Generating ngrams
Saving file
done saving ./data/fitness-full.csv

knitting-simple.csv
Generating ngrams
Saving file
done saving ./data/knitting-full.csv

