In [1]:
import urllib
import json
import pandas as pd
import base64
import numpy as np
import random
import re, nltk
from sklearn.feature_extraction.text import CountVectorizer        
from nltk.stem.porter import PorterStemmer
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
import csv




# Step 1 - Tweet Sentiment Analysis

We have a dump of tweets you may use here: https://s3.amazonaws.com/aiq-assets/tweets.txt

In this step, you’ll conduct sentiment analysis to determine the relative positivity and negativity of the tweets in the file. There are many ways to classify tweets as positive or negative, and the exact methods are left up to you. You’ll need to justify your choices as well as explain any tradeoffs with the method chosen. You are welcome to use any libraries you’d like, as long as your code handles the training and classification. You are also welcome to use pre-classified training sets as a means to train your model. The output of your algorithm should be two files with two columns, one containing a sorted list of positive tweets and their scores with the most positive tweets appearing first (positive.txt), and the second containing a sorted list of negative tweets and their scores with the most negative tweets appearing first (negative.txt).



# Training Data

In [2]:
def json_numpy_obj_hook(dct):
    """Decodes a previously encoded numpy ndarray with proper shape and dtype.
    :param dct: (dict) json encoded ndarray
    :return: (ndarray) if input was an encoded ndarray
    """
    if isinstance(dct, dict) and '__ndarray__' in dct:
        data = base64.b64decode(dct['__ndarray__'])
        return np.frombuffer(data, dct['dtype']).reshape(dct['shape'])
    return dct

In [3]:
with open('beer_1000.json') as data_file:    
    data = json.load(data_file)

In [4]:
# Json to list
beer_text_sentiment = []
for beer in data:
    for review in data[beer]:
            beer_text_sentiment.append((beer,review['text'],review['overall']))

In [5]:
def training_data(beer_input):
    random.shuffle(beer_text_sentiment)
    training_number = int(len(beer_text_sentiment)*.2)
    training_text = beer_text_sentiment[:training_number]
    train_data_df = pd.DataFrame(training_text).convert_objects(convert_numeric=True)
    train_data_df.columns = ["Beer", "Text", "Sentiment"]
    num = train_data_df._get_numeric_data()
    num[num <= 3] = 0
    num[num > 3] = 1
    return train_data_df

In [6]:
train_data_df = training_data(beer_text_sentiment).drop('Beer', 1)



In [7]:
train_data_df.head()

Unnamed: 0,Text,Sentiment
0,Poured from a 12 oz can to a pint glass. A - ...,1.0
1,A: Poured a nice golden color with almost no b...,1.0
2,Served room temp in a Trois Pistoles snifter. ...,1.0
3,A: It is a great inky black with a brown thin ...,1.0
4,"Wow, ABC must have changed their recipe on thi...",1.0


# Testing Data

In [8]:
with open('tweets.txt') as f:
    content = f.readlines()
#remove whitespace characters like `\n` at the end of each line
tweetstxt = [x.strip() for x in content] 
test_data_df = pd.DataFrame(tweetstxt)
test_data_df.columns = ['Text']

# Preparing a corpus

In [9]:
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
            stemmed.append(item)
    return stemmed

def tokenize(text):
    # remove non letters
    text = re.sub("[^a-zA-Z]", " ", text)
    # tokenize
    tokens = nltk.word_tokenize(text)
    # stem
    stems = stem_tokens(tokens, stemmer)
    return stems

vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = True,
    stop_words = 'english',
    max_features = 85
)

In [10]:
corpus_data_features = vectorizer.fit_transform(train_data_df.Text.tolist() + test_data_df.Text.tolist())
corpus_data_features_nd = corpus_data_features.toarray()
vocab = vectorizer.get_feature_names()
# Sum up the counts of each vocabulary word
dist = np.sum(corpus_data_features_nd, axis=0)

# A bag-of-words linear classifier

In [11]:
def bag_of_words_linear_classifier(training_data):
    # remember that corpus_data_features_nd contains all of our 
    # original train and test data, so we need to exclude
    # the unlabeled test entries
    X_train, X_test, y_train, y_test  = train_test_split(
            corpus_data_features_nd[0:len(train_data_df)], 
            train_data_df.Sentiment,
            train_size=0.80, 
            random_state=1234)
    #Now we are ready to train our classifier.
    log_model = LogisticRegression()
    log_model = log_model.fit(X=X_train, y=y_train)
    #Now we use the classifier to label our evaluation set. 
    #We can use either predict for classes or predict_proba for probabilities.
    y_pred = log_model.predict(X_test)
    #Finally, we can re-train our model with all the training data and use it for sentiment 
    #classification with the original (unlabeled) test set.
    # train classifier
    log_model = LogisticRegression()
    log_model = log_model.fit(X=corpus_data_features_nd[0:len(train_data_df)], y=train_data_df.Sentiment)
    # get predictions
    test_pred = log_model.predict(corpus_data_features_nd[len(train_data_df):])
    # get probability of predictions
    test_prob = log_model.predict_proba(corpus_data_features_nd[len(train_data_df):])
    return test_pred, test_prob

In [12]:
test_pred, test_prob = bag_of_words_linear_classifier(train_data_df)

# Sample and Print

In [13]:
tweet_sentiment  = dict.fromkeys(test_data_df.Text)
for x in xrange(len(test_data_df.Text)):
    tweet_sentiment[test_data_df.Text[x]] = test_pred[x]

In [14]:
tweet_index  = dict.fromkeys(test_data_df.index)
for x in xrange(len(test_data_df.index)):
    tweet_index[test_data_df.index[x]] = test_pred[x]

# Create Output

In [15]:
tweet_prob  = dict.fromkeys(test_data_df.Text)
for x in xrange(len(test_data_df.Text)):
    tweet_prob[test_data_df.Text[x]] = test_prob[x]

In [17]:
negative_tweets = []
positive_tweets = []

for key, value in tweet_prob.iteritems():
    if value[0]<value[1]:
        positive_tweets.append((key, value[0]))
    else:
        negative_tweets.append((key, value[1]))
        
sorted_neg = sorted(negative_tweets, reverse=True, key=lambda x: x[1])
sorted_pos = sorted(positive_tweets, reverse=True, key=lambda x: x[1])

with open("negative_output.csv", "wb") as f:
    writer = csv.writer(f)
    writer.writerows(sorted_neg)
    
with open("positive_output.csv", "wb") as f:
    writer = csv.writer(f)
    writer.writerows(sorted_pos)

# Step 2 - Summarization of Methods and Results

In this final step, summarize the results you found in Step 2 however you see appropriate (summary statistics, visualization, etc..). Include a description of assumptions made that could affect results and possibilities for improving the accuracy of your model.

To submit your results, zip your code, tweet data, any additional training data, and result files into a single directory. Include a simple README that describes how to run your classifier so that the results you send us can be duplicated

In [18]:
len(tweet_index)

57377

In [19]:
sum(tweet_index.values())

54913.0

In [20]:
57377-53459

3918

# Out of 57377 tweets, 53459.0 were positive and 3918 were negative. This means the training data was skewed or the classifier simply wasn't good. Let's check the training data first. 

In [21]:
train_data_df.Sentiment.sum()

1807.0

In [22]:
len(train_data_df.Sentiment)

2318

In [23]:
2318-1793.0

525.0

# The training data was not that skewed compared to our final result. It had 1793 positive beer reviews and 525 negative reviews. However, it's possible the positive reviews were much longer in total and many more words altogether compared to the negative reviews. This may somewhat make sense because people often write longwinded rave reviews for items they love as opposed to simply stating an item was bad.

# Assumptions

1. Positive and negative beer reviews are similar and use similar words to positive and negative tweets - bad assumption?
2. LR assumes the dependent variable to be binary.
3. LR assumes the observations are independent.
3. I sorted the list from most positive to least positive, similarly for negative, using the probability of the outcome, which isn't the best method to do this. I did this because I originally thought I was classifying just positive and negative, and this was a quick hacky fix method I came up with.


# Future Work

How would I fix this?

1. Use the right training set.
2. Compare results of logistic regression to results of a Naive Bayes Classifier and a Support Vector Machine. 
3. Use a different method of defining positive and negative words and tweets including weights.