In [17]:
import sys
import os
import pandas
import numpy as np
import nltk
import multiprocessing as mp
import threading_jobs as tj
# uncomment to download nltk library
#nltk.download()
# Raw string on purpose: in a normal string literal the "\n" inside "D:\nltk_data"
# is a newline escape, which silently corrupts the path.
# NOTE(review): NLTK appears to build its data search path when it is imported, so
# setting NLTK_DATA after `import nltk` may have no effect in this kernel — confirm,
# or append the directory to nltk.data.path instead.
os.environ["NLTK_DATA"] = r"D:\nltk_data"

In [3]:
# Install nltk from the anaconda channel into the environment located at sys.prefix;
# --yes answers the confirmation prompt automatically.
!conda install --yes --prefix {sys.prefix} -c anaconda nltk

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [3]:
# Load the Sentiment140 data into pandas and keep only the label and text columns.

# The full training file (about 1.6 million rows) took days to process, and Jupyter
# crashed twice part-way through, so it is left disabled here:
# sentiment140dataset = "training.1600000.processed.noemoticon.csv"
# The manually labelled test file is used instead.
sentiment140dataset = "testdata.manual.2009.06.14.csv"

s140tweets = pandas.read_csv(sentiment140dataset, encoding='utf-8')

# Remove the columns that are not needed; this leaves only "emotion" and "post",
# which are exactly what the classifier cares about.
s140tweets.drop(["id", "date", "topic", "account"], axis=1, inplace=True)

s140tweets.head()

Unnamed: 0,emotion,post
0,4,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,Reading my kindle2... Love it... Lee childs i...
2,4,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,@kenburbary You'll love your Kindle2. I've had...
4,4,@mikefish Fair enough. But i have the Kindle2...


In [4]:
# Earlier plan: train on a balanced subset of the 1.6-million-row file
# (the first 30,000 posts with emotion 0 plus the first 30,000 with emotion 4),
# because 1.6 million posts was too much to process.

#zero_tweets = s140tweets[s140tweets.emotion == 0]
#four_tweets = s140tweets[s140tweets.emotion == 4]
#final_tweet_df = pandas.concat([zero_tweets[:30000], four_tweets[:30000]])

# That subset is no longer used: the smaller dataset is now loaded instead, so every row is kept.
# Note this is an alias of s140tweets, not a copy.
final_tweet_df = s140tweets
final_tweet_df.head()

Unnamed: 0,emotion,post
0,4,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,Reading my kindle2... Love it... Lee childs i...
2,4,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,@kenburbary You'll love your Kindle2. I've had...
4,4,@mikefish Fair enough. But i have the Kindle2...


In [5]:
# this section gets the data prepared to be put through the classifier

# this gives the (post,emotion) tuple
def translate_dataframe_to_post_format(frame):
    """Convert a labelled DataFrame into ``(tokens, emotion)`` tuples.

    Each post is split on whitespace; tokens shorter than three characters
    are discarded and the remaining tokens are lower-cased.

    Args:
        frame: DataFrame with a 'post' column (text) and an 'emotion' column (label).

    Returns:
        A list of ``(list_of_tokens, emotion)`` tuples in row order. Rows whose
        post is not a string (e.g. NaN from an empty CSV cell) are skipped
        rather than raising AttributeError on ``.split()``.
    """
    posts = []
    # Walk the two columns in lockstep instead of using iterrows(), which
    # builds a Series object for every row and is very slow on large frames.
    for text, emotion in zip(frame['post'], frame['emotion']):
        if not isinstance(text, str):
            continue
        tokens = [token.lower() for token in text.split() if len(token) >= 3]
        posts.append((tokens, emotion))
    return posts

def get_words_in_tweets(posts):
    """Flatten the token lists of every post into a single list.

    ``posts`` is an iterable of ``(tokens, sentiment)`` pairs; the sentiment
    is ignored and the tokens are concatenated in order, duplicates included.
    """
    return [token for tokens, _sentiment in posts for token in tokens]

def get_word_features(wordlist):
    """Return the distinct words of ``wordlist`` as the keys view of an ``nltk.FreqDist``."""
    return nltk.FreqDist(wordlist).keys()

# Turn the tweet frame into (tokens, emotion) tuples.
posts = translate_dataframe_to_post_format(final_tweet_df)

# Gather every token from every post, then reduce them to the set of distinct
# words; extract_features() checks each post against this vocabulary.
words = get_words_in_tweets(posts)
word_features = get_word_features(words)

# print(word_features)

In [6]:
# This section builds the classifier.
# The full Sentiment140 training set has about 1.6 million posts, but processing all of them
# took more than a full day, so I stopped that run.
# I then cut it down to 30k positive and 30k negative posts; the notebook now uses the
# smaller manually labelled dataset loaded earlier instead.

def extract_features(document):
    """Build the feature dict for one tokenised post.

    For every word in the module-level ``word_features`` vocabulary the result
    maps ``'contains(<word>)'`` to True/False depending on whether the word
    occurs in ``document``.
    """
    present = set(document)  # set gives constant-time membership tests
    return {'contains(%s)' % candidate: (candidate in present)
            for candidate in word_features}


In [7]:
# Pair each post's feature dict (from extract_features) with its emotion label.
# NOTE(review): apply_features is documented as returning a lazy list-like, so the
# feature dicts would be built on demand rather than all at once — confirm.
training_set = nltk.classify.apply_features(extract_features, posts)

In [8]:
# Train NLTK's Naive Bayes classifier on the (features, emotion) training set.
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [9]:
# this section shows some information about the trained classifier

# show_most_informative_features() prints the table itself and returns None,
# so wrapping it in print() would only add a stray "None" after the table.
classifier.show_most_informative_features(32)

Most Informative Features
          contains(hate) = True                0 : 4      =     10.6 : 1.0
        contains(jquery) = True                2 : 0      =     10.6 : 1.0
          contains(time) = True                0 : 4      =      9.8 : 1.0
           contains(not) = True                0 : 2      =      9.2 : 1.0
          contains(love) = True                4 : 0      =      8.8 : 1.0
        contains(warner) = True                0 : 2      =      7.7 : 1.0
         contains(great) = True                4 : 2      =      6.4 : 1.0
        contains(lebron) = True                4 : 2      =      6.4 : 1.0
          contains(good) = True                4 : 2      =      5.7 : 1.0
           contains(his) = True                4 : 0      =      5.5 : 1.0
           contains(are) = True                0 : 2      =      5.5 : 1.0
           contains(one) = True                4 : 0      =      4.9 : 1.0
           contains(san) = True                2 : 4      =      4.8 : 1.0

In [19]:
# classify the submissions and comments retieved from Reddit
# write it to a csv

def getFileName(subreddit, year):
    """Return the Windows path of the submissions-and-comments CSV for one subreddit and year.

    Args:
        subreddit: subreddit name; used both as the directory name and inside the file name.
        year: year as a string (e.g. "2010"); used as the file-name prefix.

    Returns:
        The full path string, using backslash separators under the D: drive.
    """
    # Raw string: the original literal relied on invalid escape sequences, which
    # Python only tolerates with a deprecation warning. The resulting path is
    # character-for-character the same.
    return r'D:\social_media_analytics\reddit_content\%s\%s_%s_submissions_and_comments.csv' % (subreddit, year, subreddit)


years = ["2010", "2011", "2012", "2013", "2014"]
subreddits = ["nyc", "chicago", "houston", "phoenix", "losangeles"]

# Raw string so the Windows backslashes are literal rather than (invalid) escape sequences.
output_csv = r"D:\social_media_analytics\reddit_content\collection\sentiment_data_2010-2014.csv"

if __name__ == '__main__':
    # NOTE: this rebinds final_tweet_df, which earlier cells use for the tweet
    # training data; re-running those cells afterwards needs a reload of that frame.
    final_tweet_df = None
    # Classified chunks in processing order; combined once per subreddit instead of
    # re-concatenating the whole result after every chunk.
    processed_frames = []

    # The context manager shuts the worker pool down when the block exits, even on error.
    with mp.Pool(processes=mp.cpu_count()) as p:
        for subreddit in subreddits:
            for year in years:
                file_name = getFileName(subreddit, year)

                # NOTE(review): 'ANSI' is only known to resolve as a codec name on Windows —
                # confirm the intended encoding before running elsewhere.
                reader = pandas.read_csv(file_name, chunksize=5000, encoding='ANSI')

                funclist = []
                for frame in reader:
                    # Drop deleted bodies and add the two extra columns on a new frame;
                    # assigning columns onto the filtered slice triggers SettingWithCopyWarning.
                    # emotion starts at -1; presumably tj.process_frame fills it in — verify.
                    frame = frame[frame.body != '[deleted]'].assign(emotion=-1, subreddit=subreddit)
                    # process each data frame
                    funclist.append(p.apply_async(tj.process_frame, [frame, classifier]))
                    print('Rows added for processing for %s %s' %(subreddit, year))

                # .get() blocks until the chunk is done and re-raises any worker exception.
                processed_frames.extend(f.get() for f in funclist)
                print('Rows processed for %s %s' %(subreddit, year))

            # run the CSV update per subreddit, just in case
            # this file will either be created or wipe the existing and make a new one
            if processed_frames:
                # Newest chunk first, matching the row order produced by prepending each result.
                final_tweet_df = pandas.concat(processed_frames[::-1])
                final_tweet_df.to_csv(path_or_buf=output_csv)

Rows added for processing for nyc 2010
Rows added for processing for nyc 2010
Rows added for processing for nyc 2010
Rows added for processing for nyc 2010
Rows added for processing for nyc 2010
Rows added for processing for nyc 2010
Rows added for processing for nyc 2010
Rows added for processing for nyc 2010
Rows processed for nyc 2010
Rows added for processing for nyc 2011
Rows added for processing for nyc 2011
Rows added for processing for nyc 2011
Rows added for processing for nyc 2011
Rows added for processing for nyc 2011
Rows added for processing for nyc 2011
Rows added for processing for nyc 2011
Rows added for processing for nyc 2011
Rows added for processing for nyc 2011
Rows added for processing for nyc 2011
Rows added for processing for nyc 2011
Rows added for processing for nyc 2011
Rows added for processing for nyc 2011
Rows added for processing for nyc 2011
Rows added for processing for nyc 2011
Rows added for processing for nyc 2011
Rows processed for nyc 2011
Rows add

Rows processed for losangeles 2011
Rows added for processing for losangeles 2012
Rows added for processing for losangeles 2012
Rows added for processing for losangeles 2012
Rows added for processing for losangeles 2012
Rows added for processing for losangeles 2012
Rows added for processing for losangeles 2012
Rows added for processing for losangeles 2012
Rows added for processing for losangeles 2012
Rows added for processing for losangeles 2012
Rows added for processing for losangeles 2012
Rows added for processing for losangeles 2012
Rows added for processing for losangeles 2012
Rows added for processing for losangeles 2012
Rows processed for losangeles 2012
Rows added for processing for losangeles 2013
Rows added for processing for losangeles 2013
Rows added for processing for losangeles 2013
Rows added for processing for losangeles 2013
Rows added for processing for losangeles 2013
Rows added for processing for losangeles 2013
Rows added for processing for losangeles 2013
Rows added

In [20]:
# Peek at the first rows of the combined result. Each newly classified chunk is placed
# ahead of the earlier ones, so the head shows the most recently processed data.
print(final_tweet_df.head())

       created_utc               author  \
55000   1418286349            LaunchGap   
55001   1418286640  theultimateusername   
55002   1418288214             kneemahp   
55003   1418288816          PlasticGirl   
55004   1418293495         Front_Street   

                                                    body     type  emotion  \
55000  Haven't been on 405 during rush hour in a whil...  comment        4   
55001                                          Stairs...  comment        4   
55002    Holy shit wth is rosedale?! Looks really nice.   comment        0   
55003     There's a $2.89 in Pico Union and Bunker Hill.  comment        4   
55004  I hear that place by the staples center is leg...  comment        0   

        subreddit  
55000  losangeles  
55001  losangeles  
55002  losangeles  
55003  losangeles  
55004  losangeles  


In [22]:
# Non-null count per column; a lower number for "body" than for the other columns
# means some rows have a missing body.
print(final_tweet_df.count())

created_utc    948480
author         948480
body           948477
type           948480
emotion        948480
subreddit      948480
dtype: int64
