# Process Data
This notebook runs through obtaining, filtering, and refining the raw data. It begins with a data dump of Reddit post submissions and extracts posts of the relevant subreddit: AskReddit. It thereby filters out insufficient entries (deleted posts, etc.) and extracts features of interest. Among these features is the sentiment of the post title, obtained through training a sentiment classifier on Twitter NLTK data (due to availability of manually classified datasets) with hopes that it will generalize to Reddit posts.

In [17]:
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import word_tokenize
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
from datetime import datetime
import numpy as np
import pandas as pd
import string
import json
import time
import os
import re

In [18]:
DUMP_DIRECTORY = 'https://files.pushshift.io/reddit/submissions/'
DUMP_BASENAME = 'RS_2018-Y.xz'
DATA_FILEPATH = '/home/ashuaibi7/project/data'
ALL_POSTS = os.path.join(DATA_FILEPATH, 'ask_reddit_posts')
GLOVE_HOME = os.path.join(DATA_FILEPATH, 'glove')
SUBREDDIT_ID = 't5_2qh1i' # subreddit ID for AskReddit used in filtering
FILE_COUNT = 6 # to fetch data dumps from months 1 - 6 of 2018
RANDOM_SEED = 229

## Fetching Data

In [3]:
# First, I fetch the data dump files of reddit post submissions.
# We fetch from pushshift.io, unzip, filter out posts from AskReddit,
# then move the data to the respective directory. Finally, I concatenate
# all AskReddit submissions into a single file.
def fetch_data():
    for i in range(1, FILE_COUNT + 1):
        basename = DUMP_BASENAME.replace('Y', '{:02d}'.format(i))
        base_name_split = os.path.splitext(basename)
        dump_filename = os.path.join(DUMP_DIRECTORY, basename)
        os.system('curl -LO {}'.format(dump_filename))
        os.system('unxz {}'.format(basename))
        os.system('grep \"{}\" {}> {}_ask_reddit'.format(SUBREDDIT_ID, base_name_split[0], base_name_split[0]))
        os.system('mv -t {} {} {}_ask_reddit'.format(DATA_FILEPATH, base_name_split[0], base_name_split[0]))
    all_files_basename = os.path.join(DATA_FILEPATH, '{}_ask_reddit'.format(os.path.splitext(basename)))
    os.system('cat {}* > {}'.format(all_files_basename, ALL_POSTS))

In [4]:
fetch_data()

In [20]:
# Next we open and read the data. The file is contained of lines of 
# individual json entries, which we will parse using json.loads()
# We parse columns of interested, including 'title', 'num_comments', and 'score'
def process_posts():
    data = []
    with open(ALL_POSTS) as f:
        for line in f:
            data.append((json.loads(line)))
    return pd.DataFrame(data)[['title', 'num_comments', 'created_utc', 'gilded', 'over_18', 'score']]

In [21]:
posts_df = process_posts()

## Classify Post Title Sentiment
We will utilize NLTK Twitter data to develop a sentiment classifier. We thereby train on twitter data to the existence of manually classified sentiment datasets. We use the results of the model/weights to classify the sentiment of a reddit post title.

In [22]:
# Read in the positive and negative tweets datasets and construct the 
# training and testing data
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
test_examples = positive_tweets[4000:] + negative_tweets[4000:]
y = np.append(np.ones(1000), np.zeros(1000))
positive_tweets = positive_tweets[:4000]
negative_tweets = negative_tweets[:4000]

In [23]:
# Create the tweet tokenizer and stemmer and read in the english stopwords to utilize
# in processing a single tweet.
tweet_tokenizer = TweetTokenizer(reduce_len=True, preserve_case=False, strip_handles=True)
stopwords_english = stopwords.words('english')
stemmer = PorterStemmer()

In [24]:
def process_single_tweet(tweet):
    # remove the hashtags from word starts/ hyperlinks, and retweet sign before tokenizing
    tweet = re.sub(r'#', '', tweet)
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tokens = tokenizer.tokenize(tweet)
    tweet_split = []
    for word in tokens:
        if (word not in stopwords_english): 
            tweet_split.append(stemmer.stem(word))
    return tweet_split

In [25]:
# Get count of words that occur in positive and negative tweets respectively.
def get_word_counts(positive_tweets, negative_tweets):
    positive_counts, negative_counts, vocab = {}, {}, set()
    for tweet in positive_tweets:
        clean_tweet = process_single_tweet(tweet)
        for word in clean_tweet:
            vocab.add(word)
            if word not in positive_counts.keys():
                positive_counts[word] = 0
            positive_counts[word] += 1
    for tweet in negative_tweets:
        clean_tweet = process_single_tweet(tweet)
        for word in clean_tweet:
            vocab.add(word)
            if word not in negative_counts.keys():
                negative_counts[word] = 0
            negative_counts[word] += 1
    return positive_counts, negative_counts, vocab

In [26]:
positive_counts, negative_counts, vocab = get_word_counts(positive_tweets, negative_tweets)

In [27]:
def extract_twitter_features(pos_counts, neg_counts, tweets):
    X = np.zeros((len(tweets),2))
    for i in range(0,len(tweets)):
        tweet = tweets[i]
        tweet = process_single_tweet(tweet) 
        pos_count = 0
        neg_count = 0
        for word in tweet: 
            if word in pos_counts.keys():
                pos_count += pos_counts[word]
            if word in neg_counts.keys():
                neg_count += neg_counts[word]
        X[i,0], X[i,1] = pos_count, neg_count
    return X

In [28]:
total_tweets = positive_tweets + negative_tweets
Y = np.append(np.ones(len(positive_tweets)), np.zeros(len(negative_tweets)))
X = extract_twitter_features(positive_counts, negative_counts, total_tweets)

In [29]:
# Define and train with gradient descent
def sigmoid(z): 
    return 1/(1 + np.exp(-z))
def gradientDescent(x, y, theta, alpha, num_iters):
    m = x.shape[0]
    for i in range(0,num_iters):
        h = sigmoid(np.dot(x,theta))
        J = -1 / m * (np.dot(y.T, np.log(h)) + np.dot((1 - y).T, np.log(1 - h)))
        theta = theta - alpha* np.dot(x.T, h - y)
    return J, theta

In [30]:
theta, alpha = np.zeros(3), 5e-15
new_input = np.c_[np.ones(X.shape[0]), X]
J, theta = gradientDescent(new_input, Y,theta, alpha, 1000)

In [31]:
new_x = extract_twitter_features(positive_counts, negative_counts, test_examples)
new_x = np.c_[np.ones(new_x.shape[0]), new_x]
pred = sigmoid(np.dot(new_x,theta)) > 0.5
accuracy = 1 - np.abs(np.mean(y-pred))
accuracy

0.9925

## Build Final Dataset
We extract and engineer features for each AskReddit post and create the final dataset, dividing into train/dev/test splits.

In [32]:
# First add a few features given the existing columns. Of these are the 
# length of the post 'title', the 'weekday' that a post was submitted,
# and the 'hour' of the day the post was submitted.
posts_df['title_length'] = posts_df['title'].str.len()
posts_df['datetime'] = pd.to_datetime(posts_df['created_utc'], unit='s')
posts_df['weekday'] = posts_df['datetime'].apply(lambda x: x.weekday())
posts_df['hour'] = posts_df['datetime'].apply(lambda x: x.hour)

In [33]:
# Next, we classify the sentiment of a title and add that as a binary feature with 1 indicating
# positive and 0 indicating negative. First extract features of the post titles
def process_single_title(title):
    tokens = word_tokenize(title)
    title_split = []
    for word in tokens:
        if (word not in stopwords_english): 
            title_split.append(word)
    return title_split

def extract_reddit_features(pos_counts, neg_counts, titles):
    classifications = []
    for title in titles:
        processed_title = process_single_title(title) 
        pos_count, neg_count = 0, 0
        for word in processed_title: 
            if word in pos_counts.keys():
                pos_count += pos_counts[word]
            if word in neg_counts.keys():
                neg_count += neg_counts[word]
        new_x = np.array([1, pos_count, neg_count])
        p = sigmoid(np.dot(theta.T, new_x))
        classifications.append(1) if p > 0.5 else classifications.append(0)
    return pd.Series(classifications)

In [34]:
sentiments = extract_reddit_features(positive_counts, negative_counts, posts_df['title'])

In [35]:
posts_df['sentiment'] = sentiments

In [36]:
# We will use the 300D glove representations of the words in the title stacked together as another feature
def glove_to_dict(filename):
    data = {}
    with open(filename) as f:
        while True:
            try:
                line = next(f)
                line = line.strip().split()
                data[line[0]] = np.array(line[1: ], dtype = np.float)
            except StopIteration: break
            except UnicodeDecodeError: pass
    return data

In [37]:
glove_lookup = glove_to_dict(os.path.join(GLOVE_HOME, 'glove.6B.300d.txt'))

In [38]:
def vsm_title_phi(title, lookup, np_func=np.sum):
    allvecs = np.array([lookup[w] for w in process_single_title(title) if w in lookup])    
    if len(allvecs) == 0:
        dim = len(next(iter(lookup.values())))
        feats = np.zeros(dim)
    else:       
        feats = np_func(allvecs, axis=0)      
    return feats
def glove_title_phi(title, np_func=np.sum):
    return vsm_title_phi(title, glove_lookup, np_func=np_func)

In [39]:
def extract_stacked_gloves(titles):
    representations = []
    for title in titles:
        representations.append(glove_title_phi(title))
    return pd.Series(representations)

In [40]:
glove_representations = extract_stacked_gloves(posts_df['title'])

In [41]:
posts_df['stacked_glove_representations'] = glove_representations

In [42]:
# Finally, we reproducibly separate the data set randomly into train/dev/test
def build_dataset(df):
    df.to_csv(os.path.join(DATA_FILEPATH, 'all.csv'), index=False)
    train = df.sample(frac=0.8, random_state=RANDOM_SEED)
    remainder = df.drop(train.index)
    test = remainder.sample(frac=0.5, random_state=RANDOM_SEED)
    dev = remainder.drop(test.index)
    train.to_csv(os.path.join(DATA_FILEPATH, 'train.csv'), index=False)
    dev.to_csv(os.path.join(DATA_FILEPATH, 'dev.csv'), index=False)
    test.to_csv(os.path.join(DATA_FILEPATH, 'test.csv'), index=False)

In [43]:
# We will concern ourselves with a subset of the available features, namely:
columns = ['num_comments', 
           'title_length', 
           'sentiment', 
           'weekday', 
           'hour', 
           'gilded',
           'over_18',
           'score', 
#            'stacked_glove_representations',
           'title']

In [44]:
build_dataset(posts_df[columns])