In [1]:
import sys
sys.version_info

sys.version_info(major=3, minor=6, micro=5, releaselevel='final', serial=0)

In [2]:
import nltk
nltk.__version__

'3.3'

In [3]:
import matplotlib
matplotlib.__version__

'2.2.2'

## Step 1 --- Pre-Processing

### Explore the Data

In [4]:
from nltk.corpus import twitter_samples
twitter_samples.fileids()

['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']

In [5]:
# Load the raw tweet text (one string per tweet) from each file of the corpus.
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
# Third file reported by fileids(); inspected in the cells that follow.
text = twitter_samples.strings('tweets.20150430-223406.json')

In [6]:
len(text)

20000

In [7]:
text[:3]

['RT @KirkKus: Indirect cost of the UK being in the EU is estimated to be costing Britain £170 billion per year! #BetterOffOut #UKIP',
 'VIDEO: Sturgeon on post-election deals http://t.co/BTJwrpbmOY',
 'RT @LabourEoin: The economy was growing 3 times faster on the day David Cameron became Prime Minister than it is today.. #BBCqt http://t.co…']

### Tokenize Data

In [8]:
twitter_samples.tokenized('positive_tweets.json')[0]

['#FollowFriday',
 '@France_Inte',
 '@PKuchly57',
 '@Milipol_Paris',
 'for',
 'being',
 'top',
 'engaged',
 'members',
 'in',
 'my',
 'community',
 'this',
 'week',
 ':)']

In [9]:
from nltk.tokenize import word_tokenize
word_tokenize("Hi, my name is Bob.")

['Hi', ',', 'my', 'name', 'is', 'Bob', '.']

### Word Normalization

In [10]:
from nltk.stem.porter import PorterStemmer 
stem = PorterStemmer()
stem.stem('running')

'run'

In [11]:
stem.stem('ran')

'ran'

In [12]:
from nltk.stem.wordnet import WordNetLemmatizer 
lem = WordNetLemmatizer()
lem.lemmatize('ran', 'v')

'run'

In [13]:
from nltk.tag import pos_tag

sample = "The quick brown fox ran over the lazy dog"
pos_tag(word_tokenize(sample))

[('The', 'DT'),
 ('quick', 'JJ'),
 ('brown', 'NN'),
 ('fox', 'NN'),
 ('ran', 'VBD'),
 ('over', 'IN'),
 ('the', 'DT'),
 ('lazy', 'JJ'),
 ('dog', 'NN')]

In [14]:
def lemmatize_sentence(sentence):
    """Lemmatize every token of an already-tokenized sentence.

    The tokens are POS-tagged with ``pos_tag``; tags starting with 'NN' are
    lemmatized as nouns, tags starting with 'VB' as verbs, and every other
    tag as adjectives.

    Args:
        sentence: list of token strings.

    Returns:
        List of lemmas, in the same order as the input tokens.
    """
    def wordnet_pos(tag):
        # Collapse the tagger's tag into the POS code passed to the lemmatizer.
        if tag.startswith('NN'):
            return 'n'
        if tag.startswith('VB'):
            return 'v'
        return 'a'

    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(word, wordnet_pos(tag))
            for word, tag in pos_tag(sentence)]

In [15]:
lemmatize_sentence(word_tokenize(sample))

['The', 'quick', 'brown', 'fox', 'run', 'over', 'the', 'lazy', 'dog']

### Remove Noise

In [16]:
import re
sample_text = 'Go to https://mail.google.com/mail/u/0/?zx=4t03cx13uoha#inbox for Gmail'
# Raw strings: the regex escapes \( and \) are not valid Python string escapes,
# so a plain string literal triggers an invalid-escape-sequence warning.
url_pattern = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
               r'(?:%[0-9a-fA-F][0-9a-fA-F]))+')
re.sub(url_pattern, '', sample_text)

'Go to  for Gmail'

In [17]:
import string
sample_text = "I can't believe it; This is awesome!!!"
# Translation table that deletes every character listed in string.punctuation.
strip_punctuation = str.maketrans("", "", string.punctuation)
sample_text.translate(strip_punctuation)
# 'I cant believe it This is awesome'

'I cant believe it This is awesome'

In [18]:
def remove_noise(tweet_tokens, stop_words = ()):
    '''Remove @ mentions, hyperlinks, punctuation, and stop words.

    Every token is POS-tagged, stripped of hyperlinks and Twitter handles,
    lemmatized according to its tag (noun for 'NN*', verb for 'VB*',
    adjective otherwise) and lowercased.

    Args:
        tweet_tokens: list of token strings for a single tweet.
        stop_words: collection of lowercase words to drop (default: none).

    Returns:
        List of cleaned, lowercased tokens in their original order.
    '''
    # Raw strings: \( and \) are regex escapes, not valid Python string escapes.
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
        r'(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    handle_pattern = re.compile(r'(@[A-Za-z0-9_]+)')
    # Built once per call instead of once per token.
    lemmatizer = WordNetLemmatizer()

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        # Remove hyperlinks
        token = url_pattern.sub('', token)
        # Remove twitter handles
        token = handle_pattern.sub('', token)

        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        # Normalize the token
        token = lemmatizer.lemmatize(token, pos)
        lowered = token.lower()

        # `token not in string.punctuation` is a substring test: it drops bare
        # punctuation marks while keeping emoticons such as ':)'.
        if len(token) > 0 and token not in string.punctuation and lowered not in stop_words:
            cleaned_tokens.append(lowered)
    return cleaned_tokens

In [19]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

# Clean every tokenized tweet, dropping English stop words along the way.
positive_cleaned_tokens_list = [remove_noise(tweet, stop_words)
                                for tweet in positive_tweet_tokens]
negative_cleaned_tokens_list = [remove_noise(tweet, stop_words)
                                for tweet in negative_tweet_tokens]

In [20]:
print(positive_tweet_tokens[500])
print(positive_cleaned_tokens_list[500])

['Dang', 'that', 'is', 'some', 'rad', '@AbzuGame', '#fanart', '!', ':D', 'https://t.co/bI8k8tb9ht']
['dang', 'rad', '#fanart', ':d']


### Frequency Distribution

In [21]:
def get_all_words(cleaned_tokens_list):
    """Lazily yield every token from every token list, in order."""
    for tweet_tokens in cleaned_tokens_list:
        yield from tweet_tokens
# Materialize the generator: it can only be consumed once, so re-running the
# FreqDist cell against a bare generator would silently count nothing.
all_pos_words = list(get_all_words(positive_cleaned_tokens_list))

In [22]:
freq_dist_pos = nltk.FreqDist(all_pos_words)

In [23]:
freq_dist_pos.most_common(10)

[(':)', 3691),
 (':-)', 701),
 (':d', 658),
 ('thanks', 388),
 ('follow', 357),
 ('love', 333),
 ('...', 290),
 ('good', 283),
 ('get', 263),
 ('thank', 253)]

In [24]:
import matplotlib.pyplot as plt

items = freq_dist_pos.most_common(10)

# Split the (token, count) pairs into parallel sequences for plotting.
labels, values = zip(*items)
width = 0.75

plt.bar(labels, values, width, align='center')
# Label the figure so it can be read on its own.
plt.title('10 most common tokens in positive tweets')
plt.xlabel('Token')
plt.ylabel('Count')

plt.show()

<Figure size 640x480 with 1 Axes>

## Step 2 --- Prepare data for the Model

In [25]:
def get_tweets_for_model(cleaned_tokens_list):
    """Yield one feature dict per tweet, mapping each of its tokens to True."""
    for cleaned_tokens in cleaned_tokens_list:
        yield dict.fromkeys(cleaned_tokens, True)

# Materialize as lists: a generator is exhausted after one pass, so re-running
# the dataset-building cell would otherwise silently produce empty datasets.
positive_tokens_for_model = list(get_tweets_for_model(positive_cleaned_tokens_list))
negative_tokens_for_model = list(get_tweets_for_model(negative_cleaned_tokens_list))

In [26]:
import random

# Label every feature dict and collect the pairs in lists so the combined
# dataset can be shuffled in place.
positive_dataset = [(tweet_dict, "Positive")
                     for tweet_dict in positive_tokens_for_model]

negative_dataset = [(tweet_dict, "Negative")
                     for tweet_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset

# Seed the RNG so the shuffle, and therefore the split and the reported
# accuracy, are the same on every run.
random.seed(42)
random.shuffle(dataset)

# 70-30 split, derived from the dataset size instead of a hard-coded index
split_index = len(dataset) * 7 // 10
train_data = dataset[:split_index]
test_data = dataset[split_index:]

## Step 3 --- Build Model

In [27]:
from nltk import classify
from nltk import NaiveBayesClassifier

classifier = NaiveBayesClassifier.train(train_data)
 
classify.accuracy(classifier, test_data)

0.9953333333333333

In [28]:
classifier.show_most_informative_features(10)

Most Informative Features
                      :( = True           Negati : Positi =   2053.3 : 1.0
                      :) = True           Positi : Negati =   1679.1 : 1.0
                follower = True           Positi : Negati =     34.7 : 1.0
                     sad = True           Negati : Positi =     31.9 : 1.0
                 welcome = True           Positi : Negati =     22.4 : 1.0
                    glad = True           Positi : Negati =     19.2 : 1.0
                     bam = True           Positi : Negati =     19.2 : 1.0
                     x15 = True           Negati : Positi =     17.5 : 1.0
               community = True           Positi : Negati =     15.2 : 1.0
                    poor = True           Negati : Positi =     14.8 : 1.0


### Model Inferences and Testing

In [29]:
from nltk.tokenize import TweetTokenizer

custom_tweet = "I ordered just once from foodpanda, they screwed up, never used the app again."
# Use the tweet-aware tokenizer rather than word_tokenize: word_tokenize splits
# emoticons, @handles and #hashtags into separate punctuation tokens, which
# would not match the token features the classifier was trained on.
custom_tokens = remove_noise(TweetTokenizer().tokenize(custom_tweet))
classifier.classify({token: True for token in custom_tokens})

'Negative'

In [30]:
from nltk.tokenize import TweetTokenizer

custom_tweet = 'Congrats @MoSalah on your 7th best goal from last '\
                   'season winning goal of the year :) #Baller #Topbin #oneofmanyworldies'
# Use the tweet-aware tokenizer rather than word_tokenize: word_tokenize would
# break ':)', '@MoSalah' and the hashtags into punctuation pieces, so the
# emoticon feature would be lost and the handle would not be removed.
custom_tokens = remove_noise(TweetTokenizer().tokenize(custom_tweet))
classifier.classify({token: True for token in custom_tokens})

'Positive'

In [31]:
# https://twitter.com/roshansxc/status/858574333270265856
from nltk.tokenize import TweetTokenizer

custom_tweet = 'Thank you for sending my baggage to Hyd and flying me to Calcutta at the'\
               ' same time. Brilliant service. #DieIndigo'
# Use the tweet-aware tokenizer rather than word_tokenize: word_tokenize splits
# emoticons, @handles and #hashtags into separate punctuation tokens, which
# would not match the token features the classifier was trained on.
custom_tokens = remove_noise(TweetTokenizer().tokenize(custom_tweet))
classifier.classify({token: True for token in custom_tokens})

'Positive'