In [1]:
# Fetch the NLTK resources this notebook relies on
import nltk

nltk.download('punkt')        # Contains a pre-trained model to help tokenize sentences into single words
nltk.download('wordnet')      # Lexical database that will be used during normalization
nltk.download('averaged_perceptron_tagger')    # Tagger to find nature of words (verb, noun, ...)
nltk.download('stopwords')    # Stop-word lists; the English one is used later to filter out noise

[nltk_data] Downloading package punkt to /Users/rach/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /Users/rach/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/rach/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /Users/rach/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
# Download the `twitter_samples` corpus and store it in the local NLTK data directory
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/rach/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


True

In [3]:
from nltk.corpus import twitter_samples

# List the files available in the corpus
twitter_samples.fileids()

['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']

In [4]:
# Load the labeled tweets as raw strings, one list per polarity
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

# Load the remaining file of the corpus.
# NOTE(review): `text` is not referenced by any other visible cell; the model is evaluated
# on a hold-out split of the labeled tweets instead — confirm whether this is still needed.
text = twitter_samples.strings('tweets.20150430-223406.json')

In [5]:
from nltk.tokenize import TweetTokenizer

# Build a tweet tokenizer that keeps every token as written:
# no case folding, no length reduction, no handle stripping.
tweet_tokenizer = TweetTokenizer(preserve_case=True, reduce_len=False, strip_handles=False)

# One list of tokens per tweet, in the same order as the source lists
tokens_positive = list(map(tweet_tokenizer.tokenize, positive_tweets))
tokens_negative = list(map(tweet_tokenizer.tokenize, negative_tweets))

first_positive_tweet = positive_tweets[0]
first_positive_tokens = tokens_positive[0]
print("Example of a positive tweet:\n{}\n".format(first_positive_tweet))
print("Tokens:\n{}".format(first_positive_tokens))

Example of a positive tweet:
#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)

Tokens:
['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)']


In [6]:
from nltk.tag import pos_tag, pos_tag_sents    # Part-of-speech taggers (single sentence / batch)

# Tag all tweets of a class in one batch call. `pos_tag_sents` is NLTK's batch
# counterpart of `pos_tag`: it returns one [(token, tag), ...] list per input tweet,
# without going through the per-call setup of `pos_tag` for each of the tweets.
tags_positive = pos_tag_sents(tokens_positive)
tags_negative = pos_tag_sents(tokens_negative)

# Show the (token, tag) pairs of the first positive tweet
tags_positive[0]

[('#FollowFriday', 'JJ'),
 ('@France_Inte', 'NNP'),
 ('@PKuchly57', 'NNP'),
 ('@Milipol_Paris', 'NNP'),
 ('for', 'IN'),
 ('being', 'VBG'),
 ('top', 'JJ'),
 ('engaged', 'VBN'),
 ('members', 'NNS'),
 ('in', 'IN'),
 ('my', 'PRP$'),
 ('community', 'NN'),
 ('this', 'DT'),
 ('week', 'NN'),
 (':)', 'NN')]

In [7]:
from nltk.stem.wordnet import WordNetLemmatizer

# All we need is to know the type (Noun, Verb, or others) of each word
def _tag2type(tag):
    '''
    Take a tag and return a type.
    return 'n' for noun, 'v' for verb, and 'a' for any
    '''
    if tag.startswith('NN'):
        return 'n'
    elif tag.startswith('VB'):
        return 'v'
    else:
        return 'a'

lemmatizer = WordNetLemmatizer()


def _lemmatize_tagged(tagged_tokens):
    '''
    Lemmatize one tweet.

    @arg tagged_tokens: List of (word, tag) pairs for a single tweet
    @return List of lemmas, in the same order as the input pairs
    '''
    return [lemmatizer.lemmatize(word, _tag2type(tag)) for word, tag in tagged_tokens]


lemma_positive = [_lemmatize_tagged(tagged_tweet) for tagged_tweet in tags_positive]
lemma_negative = [_lemmatize_tagged(tagged_tweet) for tagged_tweet in tags_negative]

first_positive_tweet = positive_tweets[0]
first_positive_lemmas = lemma_positive[0]
print("Example of a positive tweet:\n{}\n".format(first_positive_tweet))
print("Lemmatized:\n{}".format(first_positive_lemmas))

Example of a positive tweet:
#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)

Lemmatized:
['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'be', 'top', 'engage', 'member', 'in', 'my', 'community', 'this', 'week', ':)']


In [8]:
from nltk.corpus import stopwords
# NOTE: this rebinds `stopwords` from the corpus reader imported on the previous line
# to the list of English stop words; `_is_noise` looks tokens up in this list.
stopwords = stopwords.words('english')

# Preview the first ten stop words
stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [9]:
import re
from string import punctuation

def _is_noise(word):
    pattern = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|(@[A-Za-z0-9_]+)'
    return word in punctuation \
        or word.lower() in stopwords \
        or re.search(pattern, word, re.IGNORECASE) != None

# Drop noisy lemmas and lower-case the ones that are kept, tweet by tweet
denoised_positive = [
    [lemma.lower() for lemma in tweet_lemmas if not _is_noise(lemma)]
    for tweet_lemmas in lemma_positive
]
denoised_negative = [
    [lemma.lower() for lemma in tweet_lemmas if not _is_noise(lemma)]
    for tweet_lemmas in lemma_negative
]

first_positive_tweet = positive_tweets[0]
first_positive_denoised = denoised_positive[0]
print("Example of a positive tweet:\n{}\n".format(first_positive_tweet))
print("Denoised:\n{}".format(first_positive_denoised))

Example of a positive tweet:
#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)

Denoised:
['#followfriday', 'top', 'engage', 'member', 'community', 'week', ':)']


In [10]:
from nltk import FreqDist

def get_all_words(tokens_list):
    '''
    Lazily flatten a dataset of tokenized tweets into a single stream of words.

    @arg tokens_list: A 2-D list of (preferably cleaned) tokens
    @return A generator yielding every token, tweet after tweet, in order
    '''
    for tweet_tokens in tokens_list:
        yield from tweet_tokens

# Count how often each cleaned token occurs within each class
all_pos_words = get_all_words(denoised_positive)
freq_dist_pos = FreqDist(all_pos_words)

all_neg_words = get_all_words(denoised_negative)
freq_dist_neg = FreqDist(all_neg_words)

top_positive_words = freq_dist_pos.most_common(10)
top_negative_words = freq_dist_neg.most_common(10)
print("The 10 most common words in a set of positive tweets:\n{}\n".format(top_positive_words))
print("The 10 most common words in a set of negative tweets:\n{}".format(top_negative_words))


The 10 most common words in a set of positive tweets:
[(':)', 3691), (':-)', 701), (':d', 658), ('thanks', 388), ('follow', 357), ('love', 333), ('...', 290), ('good', 283), ('get', 263), ('thank', 253)]

The 10 most common words in a set of negative tweets:
[(':(', 4585), (':-(', 501), ("i'm", 343), ('...', 332), ('get', 325), ('miss', 291), ('go', 275), ('please', 275), ('want', 246), ('like', 218)]


In [11]:
def get_tweets_for_model(tokens_list):
    '''
    Generator function turning each tokenized tweet into a feature dictionary
    that maps every token of the tweet to True.
    This is the input shape used with the NLTK classifier trained below:
    - Documentation: https://www.nltk.org/book/ch06.html

    @arg tokens_list a 2-D list of (preferably cleaned) tokens
    @return A generator yielding one dict {token: True, ...} per tweet
    '''
    for tweet_tokens in tokens_list:
        # Repeated tokens collapse into a single key
        yield dict.fromkeys(tweet_tokens, True)

# Materialize the feature dicts into lists. A bare generator is exhausted after its
# first iteration, so re-running a later cell that loops over these names would
# silently see no data at all.
positive_tokens_for_model = list(get_tweets_for_model(denoised_positive))
negative_tokens_for_model = list(get_tweets_for_model(denoised_negative))

In [12]:
import random

TRAIN_SIZE_RATIO = 0.7    # We use 70% as a training set
RANDOM_SEED = 42          # Fixed seed so the shuffle, and therefore the split, is reproducible

# Attach the class label to every feature dict
positive_dataset = [(tweet_dict, "Positive") for tweet_dict in positive_tokens_for_model]
negative_dataset = [(tweet_dict, "Negative") for tweet_dict in negative_tokens_for_model]

# Fail loudly on empty inputs (e.g. a single-use generator already consumed by a
# previous run of this cell) instead of training on nothing.
assert positive_dataset and negative_dataset, \
    "Empty dataset: re-run the cell that builds the *_tokens_for_model inputs"

# Merge the positive and negative sets, then shuffle to avoid any bias
# that could come from the arrangement of tweets.
dataset = positive_dataset + negative_dataset
random.Random(RANDOM_SEED).shuffle(dataset)    # Dedicated RNG: leaves the global random state untouched

# Compute the boundary once so both slices are guaranteed to use the same index
split_index = round(len(dataset) * TRAIN_SIZE_RATIO)
train_data = dataset[:split_index]
test_data = dataset[split_index:]

In [13]:
from nltk import classify
from nltk import NaiveBayesClassifier

# Train a Naive Bayes classifier on the labeled feature dicts
classifier = NaiveBayesClassifier.train(train_data)

print("Training accuracy is:{}\n".format(classify.accuracy(classifier, train_data)))
print("Testing accuracy is:{}\n".format(classify.accuracy(classifier, test_data)))
# show_most_informative_features() prints its table itself and returns None,
# so wrapping it in print() only appended a stray "None" to the output.
classifier.show_most_informative_features(10)

Training accuracy is:0.9994285714285714

Testing accuracy is:0.995

Most Informative Features
                      :( = True           Negati : Positi =   2071.0 : 1.0
                      :) = True           Positi : Negati =   1661.0 : 1.0
                     sad = True           Negati : Positi =     24.4 : 1.0
                    glad = True           Positi : Negati =     21.4 : 1.0
                follower = True           Positi : Negati =     21.1 : 1.0
                  arrive = True           Positi : Negati =     18.4 : 1.0
                     x15 = True           Negati : Positi =     16.5 : 1.0
              appreciate = True           Positi : Negati =     14.8 : 1.0
                      aw = True           Negati : Positi =     14.5 : 1.0
                 perfect = True           Positi : Negati =     11.5 : 1.0
None


In [14]:
def classify(tweet):
    '''
    Wrapper function for the pre-processing and classification steps previously performed.

    The steps run in the same order as for the training data: tokenize, tag,
    lemmatize, drop noise (decided on the lemma), lower-case, classify.

    NOTE: this function reuses the name `classify`, which the training cell imports
    from nltk as a module. Once this cell has run, `classify.accuracy` is no longer
    reachable under that name until that import is executed again.

    @arg tweet: String representing a tweet
    @return A tuple (tokens, polarity): the list of cleaned tokens fed to the model
            and the predicted label returned by the classifier
    '''
    tagged_tokens = pos_tag(tweet_tokenizer.tokenize(tweet))

    # Lemmatize first, then filter on the lemma, exactly as done for the training set;
    # filtering on the raw word could keep or drop different tokens than training did.
    lemmas = (lemmatizer.lemmatize(word, _tag2type(tag)) for word, tag in tagged_tokens)
    tokens = [lemma.lower() for lemma in lemmas if not _is_noise(lemma)]

    return tokens, classifier.classify(dict.fromkeys(tokens, True))

In [15]:
# Sanity check on a plainly positive tweet (contains a handle, an emoticon and hashtags)
positive_tweet = "@bakery_brothers Thanks for the Pie! Really appreciate it :) #yummy #pie_day"
tokens, polarity = classify(positive_tweet)

print("Denoised tokens: {}\nPolarity: {}\n".format(tokens, polarity))

Denoised tokens: ['thanks', 'pie', 'really', 'appreciate', ':)', '#yummy', '#pie_day']
Polarity: Positive



In [16]:
# Sanity check on a plainly negative tweet that contains no emoticon
negative_tweet = "@raptors really sad that you lost the qualifications to the final. #no_luck"
tokens, polarity = classify(negative_tweet)

print("Denoised tokens: {}\nPolarity: {}\n".format(tokens, polarity))

Denoised tokens: ['really', 'sad', 'lose', 'qualification', 'final', '#no_luck']
Polarity: Negative



In [17]:
# Sarcastic tweet: the wording is polite but the intent is a complaint.
# In the recorded run the model labels it Positive, which illustrates that a
# token-presence model cannot detect sarcasm.
sarcasme_tweet = "@police thank you so much for closing half the roads to the city in the middle of the day! #traffic"
tokens, polarity = classify(sarcasme_tweet)

print("Denoised tokens: {}\nPolarity: {}\n".format(tokens, polarity))

Denoised tokens: ['thank', 'much', 'close', 'half', 'road', 'city', 'middle', 'day', '#traffic']
Polarity: Positive



In [18]:
import pickle

# Serialize the trained classifier to a file in the working directory
with open('./model.pickle', 'wb') as model_file:
    pickle.dump(classifier, model_file)