# Twitter Sentiment Analysis with NLTK

## Import Tweet Samples

The twitter_samples corpus contains 3 files.
+ negative_tweets.json: contains 5k negative tweets
+ positive_tweets.json: contains 5k positive tweets
+ tweets.20150430-223406.json: contains 20k positive and negative tweets

In [1]:
from nltk.corpus import twitter_samples
# List the files that make up the twitter_samples corpus.
print(twitter_samples.fileids())

['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']


In [2]:
# Text of every tweet in the positive sample file.
pos_tweets = twitter_samples.strings('positive_tweets.json')

print(len(pos_tweets))

5000


In [3]:
# Text of every tweet in the negative sample file.
neg_tweets = twitter_samples.strings('negative_tweets.json')

print(len(neg_tweets))

5000


In [4]:
# Text of every tweet in the third, larger sample file.
all_tweets = twitter_samples.strings('tweets.20150430-223406.json')

print(len(all_tweets))

20000


In [6]:
# Peek at the first five positive tweets before any preprocessing.
for tweet in pos_tweets[:5]:
    print(tweet)

#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!
@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!
@97sides CONGRATS :)
yeaaaah yippppy!!!  my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days


## Tokenize Tweets

Three optional parameters can be passed when constructing a TweetTokenizer:
+ preserve_case: if False, the tweet is converted to lowercase; if True, the original case is kept.
+ strip_handles: if True, Twitter handles (e.g. @user) are removed from the tweet; if False, they are kept.
+ reduce_len: if True, runs of repeated characters in a word are shortened (e.g. yeaaaah becomes yeaaah); if False, words are left as written.

In [7]:
from nltk.tokenize import TweetTokenizer

# Tokenizer configured for tweets: lowercase the text, drop @handles and
# shorten over-long character runs (e.g. "yeaaaah" -> "yeaaah").
tweet_tokenizer = TweetTokenizer(
    preserve_case=False,
    strip_handles=True,
    reduce_len=True,
)

# Show the token list produced for each of the first five positive tweets.
for tweet in pos_tweets[:5]:
    print(tweet_tokenizer.tokenize(tweet))

['#followfriday', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)']
['hey', 'james', '!', 'how', 'odd', ':/', 'please', 'call', 'our', 'contact', 'centre', 'on', '02392441234', 'and', 'we', 'will', 'be', 'able', 'to', 'assist', 'you', ':)', 'many', 'thanks', '!']
['we', 'had', 'a', 'listen', 'last', 'night', ':)', 'as', 'you', 'bleed', 'is', 'an', 'amazing', 'track', '.', 'when', 'are', 'you', 'in', 'scotland', '?', '!']
['congrats', ':)']
['yeaaah', 'yipppy', '!', '!', '!', 'my', 'accnt', 'verified', 'rqst', 'has', 'succeed', 'got', 'a', 'blue', 'tick', 'mark', 'on', 'my', 'fb', 'profile', ':)', 'in', '15', 'days']


## Cleaning Tweets

+ Remove stock market tickers like $GE
+ Remove retweet text “RT”
+ Remove hyperlinks
+ Remove hashtags (only the hashtag # and not the word)
+ Remove stop words like a, and, the, is, are, etc.
+ Remove emoticons like :), :D, :(, :-), etc.
+ Remove punctuation like full-stop, comma, exclamation sign, etc.
+ Convert words to Stem/Base words using Porter Stemming Algorithm. E.g. words like ‘working’, ‘works’, and ‘worked’ will be converted to their base/stem word “work”.

In [9]:
import string
import re
 
from nltk.corpus import stopwords 
stopwords_english = stopwords.words('english')
 
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
 
from nltk.tokenize import TweetTokenizer
 
# Emoticons that express a positive mood.
emoticons_happy = {
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3',
}

# Emoticons that express a negative mood.
emoticons_sad = {
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';(',
}

# Every known emoticon (happy and sad combined).
emoticons = emoticons_happy | emoticons_sad
 
def clean_tweets(tweet):
    """Turn one raw tweet into a list of stemmed word tokens.

    Steps, in order: strip stock tickers (``$GE``), a leading old-style
    ``RT`` marker, hyperlinks and ``#`` signs; tokenize with
    ``TweetTokenizer`` (lowercasing, dropping @handles, shortening
    elongated words); discard stop words, emoticons and punctuation; stem
    whatever is left.

    Args:
        tweet: Raw tweet text.

    Returns:
        The stemmed tokens that survive filtering, in their original order.
    """
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)

    # remove old style retweet text "RT" (only when it starts the tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)

    # remove hyperlinks: match just the URL (up to the next whitespace).
    # The previous pattern used a greedy `.*` and therefore also deleted
    # every word that followed a link on the same line.
    tweet = re.sub(r'https?://\S+', '', tweet)

    # remove hashtags: only the '#' sign goes, the word itself is kept
    tweet = re.sub(r'#', '', tweet)

    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    return [
        stemmer.stem(word)  # reduce each kept word to its stem
        for word in tweet_tokens
        if word not in stopwords_english  # remove stopwords
        and word not in emoticons  # remove emoticons
        # string.punctuation is a str, so this is a substring test: it drops
        # single punctuation characters (and the empty string).
        and word not in string.punctuation
    ]

In [10]:
# Hand-written tweet containing a retweet marker, handles, an emoticon,
# hashtags and a link, to show what the cleaner strips.
custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"

print(clean_tweets(custom_tweet))

['hello', 'great', 'day', 'good', 'morn']


In [11]:
# The sixth positive tweet, shown raw.
print(pos_tweets[5])

@BhaktisBanter @PallaviRuhail This one is irresistible :)
#FlipkartFashionFriday http://t.co/EbZ0L2VENM


In [12]:
# The sixth positive tweet after cleaning.
print(clean_tweets(pos_tweets[5]))

['one', 'irresist', 'flipkartfashionfriday']


## Feature Extraction

We define a simple bag_of_words function that extracts unigram features from the tweets.

In [14]:
# feature extractor function
def bag_of_words(tweet):
    """Build the unigram presence features for one tweet.

    The tweet is passed through ``clean_tweets``; every resulting token
    becomes a key mapped to ``True``.
    """
    return {token: True for token in clean_tweets(tweet)}

In [15]:
# Same hand-written tweet as before, this time shown as a feature dict.
custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"

print(bag_of_words(custom_tweet))

{'hello': True, 'great': True, 'day': True, 'good': True, 'morn': True}


In [17]:
# Pair each positive tweet's feature dict with the label 'pos'.
pos_tweets_set = [(bag_of_words(tweet), 'pos') for tweet in pos_tweets]

# Pair each negative tweet's feature dict with the label 'neg'.
neg_tweets_set = [(bag_of_words(tweet), 'neg') for tweet in neg_tweets]

print("Length of positive: ", len(pos_tweets_set))
print("Length of negative: ", len(neg_tweets_set))

Length of positive:  5000
Length of negative:  5000


## Create Train and Test Set

There are 5000 positive and 5000 negative tweet feature sets. We take 20% (i.e. 1000) of the positive tweets and 20% (i.e. 1000) of the negative tweets as the test set. The remaining positive and negative tweets form the training set.

In [18]:
# Randomize pos_tweets_set and neg_tweets_set in place before splitting.
# No seed is set, so the split -- and with it the reported accuracy --
# differs every time the program runs.
from random import shuffle

shuffle(pos_tweets_set)
shuffle(neg_tweets_set)

# Hold out 20% of each class for testing. Deriving the size from the data
# (instead of hard-coding 1000) keeps the split proportional if the number
# of tweets changes.
pos_test_size = len(pos_tweets_set) // 5
neg_test_size = len(neg_tweets_set) // 5

test_set = pos_tweets_set[:pos_test_size] + neg_tweets_set[:neg_test_size]
train_set = pos_tweets_set[pos_test_size:] + neg_tweets_set[neg_test_size:]

print(len(test_set), len(train_set))

2000 8000


In [19]:
from nltk import classify
from nltk import NaiveBayesClassifier
 
# Fit a Naive Bayes model on the labelled training feature dicts.
classifier = NaiveBayesClassifier.train(train_set)

# Fraction of held-out tweets that receive the correct label. The exact
# value changes between runs because the split is shuffled without a seed.
accuracy = classify.accuracy(classifier, test_set)
print(accuracy)

# show_most_informative_features() prints its table itself and returns
# None, so it must not be wrapped in print() (that printed a stray "None").
classifier.show_most_informative_features(10)

0.7365
Most Informative Features
                     via = True              pos : neg    =     36.3 : 1.0
                     bam = True              pos : neg    =     25.0 : 1.0
                      aw = True              neg : pos    =     20.3 : 1.0
                 appreci = True              pos : neg    =     18.3 : 1.0
                     sad = True              neg : pos    =     18.1 : 1.0
                     x15 = True              neg : pos    =     17.0 : 1.0
                      ff = True              pos : neg    =     17.0 : 1.0
                     ugh = True              neg : pos    =     15.7 : 1.0
                    sick = True              neg : pos    =     13.8 : 1.0
                  friday = True              pos : neg    =     13.5 : 1.0
None


## Testing Classifier with Custom Tweet

We provide custom tweets and check the classification output of the trained classifier. The classifier correctly predicts both the negative and the positive tweet provided.

### Test with custom tweet 1

In [22]:
# First probe: a critical tweet, converted to the feature dict the classifier expects.
custom_tweet = "Is that why you're so busy slashing Medicaid, giving tax cuts to the wealthy, put sexual predators on the bench, and allow kids to die?"
custom_tweet_set = bag_of_words(custom_tweet)

### Classify custom tweet 1

In [23]:
# Hard decision: the single label the classifier assigns to the features.
print("Classificate as: ", classifier.classify(custom_tweet_set))
 
# Soft decision: a probability distribution over the labels.
prob_result = classifier.prob_classify(custom_tweet_set)

# Printing the distribution object itself only shows its repr, not the
# individual probabilities; those are read with prob() below.
print("Classified as: ", prob_result)
# max() is the most probable label.
print("Classification category: ", prob_result.max())
print("Negative probability : ", prob_result.prob("neg"))
print("Positive probability : ", prob_result.prob("pos"))

Classificate as:  neg
Classified as:  <ProbDist with 2 samples>
Classification category:  neg
Negative probability :  0.9988870824411937
Positive probability :  0.0011129175588098107


### Test with custom tweet 2

In [16]:
# Second probe: an enthusiastic text, converted to the feature dict the classifier expects.
custom_tweet = "It was a wonderful and amazing movie. I loved it. Best direction, good acting."
custom_tweet_set = bag_of_words(custom_tweet)

### Classify custom tweet 2

In [17]:
# Hard decision: the single label the classifier assigns to the features.
print("Classificate as: ", classifier.classify(custom_tweet_set))
 
# Soft decision: a probability distribution over the labels.
prob_result = classifier.prob_classify(custom_tweet_set)

# Printing the distribution object itself only shows its repr, not the
# individual probabilities; those are read with prob() below.
print("Classified as: ", prob_result)
# max() is the most probable label.
print("Classification category: ", prob_result.max())
# NOTE(review): these two captions have no space before the colon, unlike
# the recorded output for this cell -- the output looks like it came from
# an earlier version of the code.
print("Negative probability: ", prob_result.prob("neg"))
print("Positive probability: ", prob_result.prob("pos"))

Classificate as:  pos
Classified as:  <ProbDist with 2 samples>
Classification category:  pos
Negative probability :  0.0004941345148866521
Positive probability :  0.9995058654851143


In [22]:
from collections import defaultdict

from nltk.metrics import precision, recall, f_measure, ConfusionMatrix

# label -> set of test-item indices carrying that label
actual_set = defaultdict(set)      # according to the reference labels
predicted_set = defaultdict(set)   # according to the classifier

# Parallel label sequences in test_set order, kept for the confusion matrix.
actual_set_cm = []
predicted_set_cm = []

for index, (feature, actual_label) in enumerate(test_set):
    predicted_label = classifier.classify(feature)

    actual_set[actual_label].add(index)
    predicted_set[predicted_label].add(index)

    actual_set_cm.append(actual_label)
    predicted_set_cm.append(predicted_label)

# Per-class scores: each compares the reference index set with the predicted one.
print('pos precision :', precision(actual_set['pos'], predicted_set['pos']))
print('pos recall    :', recall(actual_set['pos'], predicted_set['pos']))
print('pos F-measure :', f_measure(actual_set['pos'], predicted_set['pos']))

print('neg precision :', precision(actual_set['neg'], predicted_set['neg']))
print('neg recall    :', recall(actual_set['neg'], predicted_set['neg']))
print('neg F-measure :', f_measure(actual_set['neg'], predicted_set['neg']))

pos precision : 0.7345385347288297
pos recall    : 0.772
pos F-measure : 0.7528035104826915
neg precision : 0.7597471022128557
neg recall    : 0.721
neg F-measure : 0.7398665982555156


In [23]:
# Cross-tabulate the reference labels (rows) against the predicted labels (columns).
cm = ConfusionMatrix(actual_set_cm, predicted_set_cm)
print(cm)

    |   n   p |
    |   e   o |
    |   g   s |
----+---------+
neg |<721>279 |
pos | 228<772>|
----+---------+
(row = reference; col = test)



In [24]:
# The same confusion matrix rendered with percentages instead of counts.
print(cm.pretty_format(sort_by_count=True, show_percents=True, truncate=9))

    |      n      p |
    |      e      o |
    |      g      s |
----+---------------+
neg | <36.0%> 13.9% |
pos |  11.4% <38.6%>|
----+---------------+
(row = reference; col = test)

