In [1]:
# CS 410 Course Project
# Sentiment Analysis of twitter samples using NLTK
# This will use the downloaded "twitter_samples" corpus of NLTK. The twitter samples contain labeled positive and
# negative tweets, which will be used to train a model. Once the model is trained, the sentiment analysis task will
# be performed on the test data.

In [2]:
# NLTK, twitter_samples and other imports
import nltk
from nltk.corpus import twitter_samples
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import classify
from nltk import NaiveBayesClassifier
import re, string
import random

In [3]:
# Utility function to remove puntuation, urls and twitter handles from the tokenized tweet text
# After the tweet tokens are cleaned, they are lemmatized and the lemmatized list is returned
def clean_and_lemmatize_tokens(tweet_tokens, word_lemmatizer, stop_words = ()):
    lemmatized_tokens = []
    # Use pos_tag function to attach part of speech to the tokens.
    # For each token and POS tag...
    for token, tag in pos_tag(tweet_tokens):
        # Lowercase all the letters
        token = token.lower()
        # Remove twitter handles
        token = re.sub("@[A-Za-z0-9_]+","", token)
        # Remove hashtags
        token = re.sub("#[A-Za-z0-9_]+","", token)
        # Remove urls
        token = re.sub(r"http\S+", "", token)
        token = re.sub(r"www.\S+", "", token)
        # Remove puntuations
        token = re.sub('[()!?]', "", token)
        token = re.sub('\[.*?\]', "", token)
        # Remove alpha-numeric characters
        token = re.sub("[^a-z0-9]", "", token)
        # Remove stop words
        if len(token) > 0 and token not in stop_words:
            # Lemmatize the cleaned token
            if tag.startswith("NN"):
                pos = 'n'
            elif tag.startswith('VB'):
                pos = 'v'
            else:
                pos = 'a'
            # Use the passed in word lemmatizer
            token = word_lemmatizer.lemmatize(token, pos)
            # Append to the lemmatized token list
            lemmatized_tokens.append(token)
    return lemmatized_tokens
        

In [4]:
# This function takes a list of cleaned and lemmatized tokens and adds "True" for each
# feature to be included in the feature list. It also labels the data as "Positive" or
# "Negative"
def prep_model_data(lemmatized_tokens_list, label):
    # Initialize a list of training or testing data. This will be a list
    # of tuples where first element is a dictionary containing featues and
    # second element is the label, whether the tweet sentiment is "Positive" or
    # "Negative". Each element of the list will look as follows:
    # ({feature1: True, feature2: True,....}, "Positive"). This is the format
    # in which the classifier takes the input data.
    classifier_input_data_list = []
    for tweet_tokens in lemmatized_tokens_list:
        # Initialize the dictionary that will have features and include flag
        input_features = {}
        for token in tweet_tokens:
            # We will include all tokens as input_features and mark them True
            input_features[token] = "True"
        # Create a tuple of the above dictionary and the given label
        input_features_with_label = (input_features, label)
        # Add this input data to the list
        classifier_input_data_list.append(input_features_with_label)
    # Return the classifier input data list
    return classifier_input_data_list

In [5]:
# Setup variables for positive, negative and neutral tweets
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

In [6]:
# Print tweets data
print(positive_tweets[0])
print(negative_tweets[0])

#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
hopeless for tmr :(


In [7]:
# Tokenize positive and negative tweets
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

In [8]:
# Print tokenized tweet tokens
print(positive_tweet_tokens[0])
print(negative_tweet_tokens[0])

['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)']
['hopeless', 'for', 'tmr', ':(']


In [9]:
# Print tweet tokens with parts of speech attached
print(pos_tag(positive_tweet_tokens[0]))
print(pos_tag(negative_tweet_tokens[0]))

[('#FollowFriday', 'JJ'), ('@France_Inte', 'NNP'), ('@PKuchly57', 'NNP'), ('@Milipol_Paris', 'NNP'), ('for', 'IN'), ('being', 'VBG'), ('top', 'JJ'), ('engaged', 'VBN'), ('members', 'NNS'), ('in', 'IN'), ('my', 'PRP$'), ('community', 'NN'), ('this', 'DT'), ('week', 'NN'), (':)', 'NN')]
[('hopeless', 'NN'), ('for', 'IN'), ('tmr', 'NN'), (':(', 'NN')]


In [10]:
# Remove links, twitter handles, hashtags, puntuations etc. and lemmatize
# Get stopwords list
stop_words = stopwords.words('english')
# Get word lemmatizer
word_lemmatizer = WordNetLemmatizer()
# Initialize output lists
positive_lemmatized_tokens_list = []
negative_lemmatized_tokens_list = []

for tokens in positive_tweet_tokens:
    positive_lemmatized_tokens_list.append(clean_and_lemmatize_tokens(tokens, word_lemmatizer, stop_words))

for tokens in negative_tweet_tokens:
    negative_lemmatized_tokens_list.append(clean_and_lemmatize_tokens(tokens, word_lemmatizer, stop_words))

In [11]:
# Test the cleaned and lemmatized outputs
print(positive_lemmatized_tokens_list[0])
print(negative_lemmatized_tokens_list[0])

['top', 'engage', 'member', 'community', 'week']
['hopeless', 'tmr']


In [12]:
# This step creates a list of tuple where each tuple contains a dictionary of
# input features and a label. We create training data for a set of positive
# and negative tweets. Each dictionary contains features as key and a include
# flag (True, False to include feature or not). The label indicates if the
# tweet sentiment is positive or negative (labeled data).
positive_labeled_dataset = prep_model_data(positive_lemmatized_tokens_list, "Positive")
negative_labeled_dataset = prep_model_data(negative_lemmatized_tokens_list, "Negative")

In [13]:
# Test the input data set for model
print(positive_labeled_dataset[0])
print(negative_labeled_dataset[0])

({'top': 'True', 'engage': 'True', 'member': 'True', 'community': 'True', 'week': 'True'}, 'Positive')
({'hopeless': 'True', 'tmr': 'True'}, 'Negative')


In [14]:
# We will use the first 3750 tweets data from each set to total of 7500 items for training data
training_dataset = positive_labeled_dataset[:3750] + negative_labeled_dataset[:3750]
# Shuffle training data to remove any bias
random.shuffle(training_dataset)
# Use remaining 1250 items from each set as testing dataset
testing_dataset = positive_labeled_dataset[3750:] + negative_labeled_dataset[3750:]
# Shuffle testing data to remove any bias
random.shuffle(testing_dataset)

In [15]:
# Test the model training data
print(training_dataset[0])
print(testing_dataset[0])

({'ate': 'True', 'menille': 'True', 'need': 'True', 'youuuu': 'True'}, 'Negative')
({'fave': 'True', 'unfollows': 'True'}, 'Negative')


In [16]:
# Train the Naive Bayes Classifier
nb_classifier = NaiveBayesClassifier.train(training_dataset)

# Now test the trained model accuracy on testing dataset
testing_accuracy = classify.accuracy(nb_classifier, testing_dataset)
print("Testing accuracy is:", testing_accuracy)

# Top 20 most informative features
nb_classifier.show_most_informative_features(20)

Testing accuracy is: 0.7368
Most Informative Features
                followed = 'True'         Negati : Positi =     34.3 : 1.0
                follower = 'True'         Positi : Negati =     26.2 : 1.0
                    glad = 'True'         Positi : Negati =     25.7 : 1.0
                     x15 = 'True'         Negati : Positi =     23.7 : 1.0
                  arrive = 'True'         Positi : Negati =     22.2 : 1.0
                     sad = 'True'         Negati : Positi =     21.7 : 1.0
                       p = 'True'         Positi : Negati =     21.4 : 1.0
                    sick = 'True'         Negati : Positi =     19.7 : 1.0
               community = 'True'         Positi : Negati =     16.3 : 1.0
                  justin = 'True'         Negati : Positi =     15.0 : 1.0
                     ugh = 'True'         Negati : Positi =     13.7 : 1.0
                    miss = 'True'         Negati : Positi =     13.3 : 1.0
                      aw = 'True'         Nega