Problem Statement: Classify a piece of text as expressing positive or negative sentiment.

In [170]:
#Reference: http://www.laurentluce.com/posts/twitter-sentiment-analysis-using-python-and-nltk/
import nltk
import pandas as pd
import numpy as np
import scipy

# Let's specify some positive tweets, as (text, label) pairs, for evaluation purposes
pos_tweets = [
    ('I love this car', 'positive'),
    ('This view is amazing', 'positive'),
    ('I feel great this morning', 'positive'),
    ('I am so excited about the concert', 'positive'),
    ('He is my best friend', 'positive'),
    ('The beer is good', 'positive'),
    ('I do love ice-cream', 'positive'),
    ('morning is good', 'positive'),
    ('welcome morning', 'positive'),
]

# Similarly, some negative tweets in the same (text, label) form
neg_tweets = [
    ('I do not like this car', 'negative'),
    ('This view is horrible', 'negative'),
    ('I am not looking forward to the concert', 'negative'),
    ('He is my enemy', 'negative'),
]  # ('very annoying', 'negative')

In [171]:
# Turn every labelled tweet into a (tokens, sentiment) pair.
# Tokens are the whitespace-separated words, lower-cased, with words shorter
# than 3 characters dropped as a crude stop-word filter.
tweets = [
    ([token.lower() for token in text.split() if len(token) >= 3], label)
    for (text, label) in pos_tweets + neg_tweets
]
print(pd.DataFrame(tweets))

                                        0         1
0                       [love, this, car]  positive
1                   [this, view, amazing]  positive
2            [feel, great, this, morning]  positive
3          [excited, about, the, concert]  positive
4                          [best, friend]  positive
5                       [the, beer, good]  positive
6                       [love, ice-cream]  positive
7                         [morning, good]  positive
8                      [welcome, morning]  positive
9                  [not, like, this, car]  negative
10                 [this, view, horrible]  negative
11  [not, looking, forward, the, concert]  negative
12                                [enemy]  negative


In [172]:
def get_words_in_tweets(tweets):
    """Flatten the token lists of all labelled tweets into a single list.

    `tweets` is an iterable of (words, sentiment) pairs. The sentiment is
    ignored and the words are concatenated in their original order.
    """
    return [word for (words, _sentiment) in tweets for word in words]

def get_word_features(wordlist):
    """Return an nltk.FreqDist built from `wordlist`."""
    # FreqDist tallies how often each word occurs (raw term frequency)
    return nltk.FreqDist(wordlist)

In [182]:
# Frequency distribution over every token in the processed tweets; its keys are
# the vocabulary that extract_features() iterates over.
word_features = get_word_features(get_words_in_tweets(tweets))

def extract_features(document, vocabulary=None):
    """Build the boolean bag-of-words feature dict for one tokenised document.

    Parameters
    ----------
    document : iterable of str
        Tokens of the text to featurise.
    vocabulary : iterable of str, optional
        Words to generate features for. Defaults to the notebook-level
        `word_features`.

    Returns
    -------
    dict
        Maps 'contains(<word>)' to True/False for each vocabulary word.
    """
    if vocabulary is None:
        vocabulary = word_features
    # The vocabulary is built from lower-cased tokens, so lower-case the
    # document tokens too; otherwise e.g. 'Love' would never match 'love'.
    document_words = {token.lower() for token in document}
    return {'contains(%s)' % word: (word in document_words) for word in vocabulary}

In [179]:
# np.unique gives the sorted, de-duplicated vocabulary as a NumPy array
unique_word_list = np.unique(get_words_in_tweets(tweets))
print('Unique words in the corpus: {}'.format(unique_word_list))
print('Count of unique words in corpus: {}'.format(len(unique_word_list)))
# (word, count) pairs, most frequent first; 50 is more than the vocabulary
# size reported above, so every word is listed
print(pd.DataFrame(word_features.most_common(50)))

Unique words in the corpus: ['about' 'amazing' 'beer' 'best' 'car' 'concert' 'enemy' 'excited' 'feel'
 'forward' 'friend' 'good' 'great' 'horrible' 'ice-cream' 'like' 'looking'
 'love' 'morning' 'not' 'the' 'this' 'view' 'welcome']
Count of unique words in corpus: 24
            0  1
0        this  5
1     morning  3
2         the  3
3        love  2
4     concert  2
5        good  2
6         not  2
7         car  2
8        view  2
9        feel  1
10  ice-cream  1
11       best  1
12    amazing  1
13    looking  1
14       beer  1
15   horrible  1
16    forward  1
17    excited  1
18     friend  1
19    welcome  1
20      about  1
21      enemy  1
22      great  1
23       like  1


In [180]:
# Apply extract_features to each tweet's token list, keeping its sentiment
# label alongside, to form the classifier's training data
training_set = nltk.classify.apply_features(extract_features, tweets)

In [181]:
# NOTE(review): the saved output below shows bare booleans where the feature
# dicts returned by extract_features would be expected — it may predate the
# current definition; re-run on a fresh kernel to confirm.
print(training_set)

[(False, 'positive'), (True, 'positive'), ...]


In [176]:
# Fit a Naive Bayes classifier on the training set built from the labelled tweets
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [177]:
# show_most_informative_features() prints the table itself and returns None,
# so call it directly; wrapping it in print() would emit a stray "None".
classifier.show_most_informative_features(25)

Most Informative Features
       contains(concert) = True           negati : positi =      2.0 : 1.0
           contains(car) = True           negati : positi =      2.0 : 1.0
          contains(view) = True           negati : positi =      2.0 : 1.0
           contains(not) = False          positi : negati =      1.9 : 1.0
          contains(this) = True           negati : positi =      1.4 : 1.0
       contains(morning) = False          negati : positi =      1.4 : 1.0
          contains(like) = False          positi : negati =      1.4 : 1.0
       contains(looking) = False          positi : negati =      1.4 : 1.0
         contains(enemy) = False          positi : negati =      1.4 : 1.0
       contains(forward) = False          positi : negati =      1.4 : 1.0
      contains(horrible) = False          positi : negati =      1.4 : 1.0
          contains(this) = False          positi : negati =      1.3 : 1.0
       contains(concert) = False          positi : negati =      1.2 : 1.0

In [126]:
# A positive example
tweet = 'Larry is my friend'
transformed_features = extract_features(tweet.split())
# list(...) so the (feature, value) pairs form a two-column frame on Python 3
# as well, where dict.items() is a view rather than a list
print(pd.DataFrame(list(transformed_features.items())))
# print() as a function: the bare print statement is a SyntaxError on Python 3.
# The features computed above are reused rather than extracted a second time.
print(classifier.classify(transformed_features))

                      0      1
0     contains(looking)  False
1         contains(not)  False
2     contains(excited)  False
3        contains(view)  False
4     contains(forward)  False
5   contains(ice-cream)  False
6        contains(love)  False
7       contains(enemy)  False
8    contains(horrible)  False
9        contains(beer)  False
10      contains(about)  False
11    contains(concert)  False
12       contains(feel)  False
13       contains(like)  False
14   contains(annoying)  False
15      contains(great)  False
16        contains(the)  False
17     contains(friend)   True
18      contains(tired)  False
19    contains(morning)  False
20       contains(best)  False
21       contains(good)  False
22       contains(this)  False
23        contains(car)  False
24    contains(amazing)  False
positive


In [127]:
# A failed example
tweet = 'Your song is annoying'
# print() as a function: the bare print statement is a SyntaxError on Python 3
print(classifier.classify(extract_features(tweet.split())))

negative


In [128]:
# Add the word 'annoying' to the training tweets and repeat
tweet = 'Your song is annoying'
# print() as a function: the bare print statement is a SyntaxError on Python 3
print(classifier.classify(extract_features(tweet.split())))

negative


In [129]:
tweet = 'Your song is annoying'
# print() as a function: the bare print statement is a SyntaxError on Python 3
print(classifier.classify(extract_features(tweet.split())))

negative


In [130]:
tweet = 'love the summers'
# print() as a function: the bare print statement is a SyntaxError on Python 3
print(classifier.classify(extract_features(tweet.split())))

positive


In [131]:
tweet = 'hate the winters'
# print() as a function: the bare print statement is a SyntaxError on Python 3
print(classifier.classify(extract_features(tweet.split())))

positive


In [136]:
tweet = 'review on Black Mirror'
# print() as a function: the bare print statement is a SyntaxError on Python 3
print(classifier.classify(extract_features(tweet.split())))

positive
