In [23]:
import random
import re
import string
import pickle
import nltk
import pandas as pd
from nltk import FreqDist, NaiveBayesClassifier, classify
from nltk.corpus import stopwords, twitter_samples
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

In [2]:
# NLTK resources fetched before the rest of the notebook runs, in download order.
NLTK_PACKAGES = (
    "punkt",
    "wordnet",
    "averaged_perceptron_tagger",
    "omw-1.4",
    "stopwords",
    "twitter_samples",
)

# nltk.download returns a success flag. A list (not a short-circuiting
# generator) is used so every package is attempted even if one fails.
download_results = [nltk.download(package) for package in NLTK_PACKAGES]

# Cell output: True only when every download succeeded (previously the cell
# showed just the status of the last package).
all(download_results)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\srudloff\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\srudloff\AppData\Roaming\nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\srudloff\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\srudloff\AppData\Roaming\nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\srudloff\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\srudloff\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\twitter_samples.zip.


True

In [3]:
def lemmatize_sentence(tokens):
    """Lemmatize every token of a tokenized sentence.

    The tokens are tagged with ``pos_tag`` and each tag is reduced to the
    part-of-speech code handed to ``WordNetLemmatizer.lemmatize``:
    tags starting with "NN" become "n", tags starting with "VB" become "v",
    and everything else becomes "a".

    Args:
        tokens: sequence of token strings forming one sentence.

    Returns:
        list: the lemmatized tokens, in the same order as the input.
    """
    wordnet = WordNetLemmatizer()

    def _pos_for(tag):
        # Collapse the tagger's tag into the code expected by lemmatize().
        if tag.startswith("NN"):
            return "n"
        return "v" if tag.startswith("VB") else "a"

    return [
        wordnet.lemmatize(word, _pos_for(tag)) for word, tag in pos_tag(tokens)
    ]


def remove_noise(tweet_tokens, stop_words=()):
    """Strip noise from one tokenized tweet and return normalized tokens.

    Every token is tagged with ``pos_tag``; URLs and ``@handle`` mentions are
    removed from it; what remains is lemmatized with a part of speech derived
    from the tag ("NN*" -> "n", "VB*" -> "v", anything else -> "a") and
    finally lowercased.

    Args:
        tweet_tokens: sequence of token strings for a single tweet.
        stop_words: container of lowercase words to drop; defaults to an
            empty tuple, i.e. nothing is treated as a stop word.

    Returns:
        list: lowercased, lemmatized tokens. Tokens that are empty after
        cleaning, that occur inside ``string.punctuation``, or whose
        lowercase form is in ``stop_words`` are left out.
    """
    # Raw strings keep the backslashes in the pattern from being read as
    # (invalid) string escape sequences; the regex itself is unchanged.
    url_pattern = re.compile(
        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|"
        r"(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    )
    mention_pattern = re.compile(r"(@[A-Za-z0-9_]+)")

    # Loop-invariant: one lemmatizer serves every token of the tweet.
    lemmatizer = WordNetLemmatizer()

    cleaned_tokens = []
    for token, tag in pos_tag(tweet_tokens):
        token = url_pattern.sub("", token)
        token = mention_pattern.sub("", token)

        if tag.startswith("NN"):
            pos = "n"
        elif tag.startswith("VB"):
            pos = "v"
        else:
            pos = "a"

        token = lemmatizer.lemmatize(token, pos)

        # `in string.punctuation` is a substring test against the punctuation
        # string, so it rejects single punctuation marks (and any run of
        # characters that happens to appear contiguously in that string).
        if (
            len(token) > 0
            and token not in string.punctuation
            and token.lower() not in stop_words
        ):
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [4]:
def get_all_words(cleaned_tokens_list):
    """Lazily flatten a list of token lists into a single stream of tokens.

    Args:
        cleaned_tokens_list: iterable whose items are iterables of tokens.

    Yields:
        Each token in turn, in the order the inner iterables provide them.
    """
    for tweet in cleaned_tokens_list:
        yield from tweet

In [5]:
def get_tweets_for_model(cleaned_tokens_list):
    """Turn token lists into feature dictionaries, one per tweet.

    Args:
        cleaned_tokens_list: iterable of token iterables (one per tweet).

    Yields:
        dict: maps every token of the tweet to ``True``; a token that occurs
        more than once in a tweet appears as a single key.
    """
    for words in cleaned_tokens_list:
        yield {word: True for word in words}

In [6]:
# Raw tweet strings from the three files of the twitter_samples corpus.
positive_tweets = twitter_samples.strings("positive_tweets.json")
negative_tweets = twitter_samples.strings("negative_tweets.json")
text = twitter_samples.strings("tweets.20150430-223406.json")

# Tokenized positive tweets. The earlier assignment that tokenized the whole
# file only to keep element [0] was overwritten immediately, so it is dropped.
tweet_tokens = twitter_samples.tokenized("positive_tweets.json")

In [7]:
# English stop words handed to remove_noise so they are dropped from tweets.
stop_words = stopwords.words("english")

positive_tweet_tokens = twitter_samples.tokenized("positive_tweets.json")
negative_tweet_tokens = twitter_samples.tokenized("negative_tweets.json")

# One cleaned token list per tweet, kept in corpus order.
positive_cleaned_tokens_list = [
    remove_noise(tokens, stop_words) for tokens in positive_tweet_tokens
]
negative_cleaned_tokens_list = [
    remove_noise(tokens, stop_words) for tokens in negative_tweet_tokens
]

In [8]:
# Tally how often each cleaned token occurs across the positive tweets.
# get_all_words hands back a one-shot generator, so all_pos_words is consumed
# by the time the counts have been accumulated.
all_pos_words = get_all_words(positive_cleaned_tokens_list)
freq_dist_pos = FreqDist()
freq_dist_pos.update(all_pos_words)

In [17]:
# Feature dictionaries (token -> True) for every cleaned tweet.
positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

# Pair each feature dictionary with its class label.
positive_dataset = [
    (tweet_dict, "Positive") for tweet_dict in positive_tokens_for_model
]

negative_dataset = [
    (tweet_dict, "Negative") for tweet_dict in negative_tokens_for_model
]

dataset = positive_dataset + negative_dataset

# Shuffle with a dedicated, seeded generator so the dataset order is the same
# on every run and the global `random` state is left untouched.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(dataset)

# Train on the full dataset (no hold-out split is made in this cell) and
# persist the fitted classifier next to the notebook.
classifier = NaiveBayesClassifier.train(dataset)
with open('naive_bayes_classifier.pkl', 'wb') as f:
    pickle.dump(classifier, f)

In [18]:
def Analysis(t):
    """Classify a piece of text with the trained ``classifier``.

    The text is tokenized with ``word_tokenize``, cleaned with
    ``remove_noise`` and converted to a token -> True feature dictionary
    before being passed to ``classifier.classify``.

    Args:
        t: the text to classify.

    Returns:
        The label chosen by ``classifier.classify`` (the training data above
        uses the labels "Positive" and "Negative").
    """
    # Filter stop words exactly as was done when the training set was built,
    # so inference features come out of the same cleaning pipeline.
    custom_tokens = remove_noise(word_tokenize(t), stop_words)
    features = {token: True for token in custom_tokens}
    return classifier.classify(features)

In [19]:
# Spot-check: classify a single sentence and display the predicted label.
Analysis("Hey guys i just got back from vacation!")

'Positive'

In [20]:
# Second spot-check with a different sentence; the cell displays its label.
Analysis("I just got back from the hospital.")

'Positive'

In [24]:
# Sample sentences, one per line of the literal below, loaded into a
# single-column DataFrame ("text") for batch classification.
lines = pd.DataFrame(
    {
        "text": """
Hey guys i just got back from vacation!
I just got back from the hospital. dad is not doing well.
I just got back from the hospital. the baby is so cute.
When people ask me stupid questions, it is my legal obligation to give a sarcastic remark.
I’m not saying I hate you, what I'm saying is that you are literally the Monday of my life.
Silence is golden. Duct tape is silver.
I am busy right now, can I ignore you some other time?
Find your patience before" I lose mine.
It's okay if you don’t like me. Not everyone has good taste.
Do you think God gets stoned? I think so… look at the platypus.
Light travels faster than sound. This is why some people appear bright until they speak.
If you find me offensive. Then I suggest you quit finding me.
Sarcasm is the body’s natural defense against stupidity.""".strip().splitlines()
    }
)
lines

Unnamed: 0,text
0,Hey guys i just got back from vacation!
1,I just got back from the hospital. dad is not ...
2,I just got back from the hospital. the baby is...
3,"When people ask me stupid questions, it is my ..."
4,"I’m not saying I hate you, what I'm saying is ..."
5,Silence is golden. Duct tape is silver.
6,"I am busy right now, can I ignore you some oth..."
7,"Find your patience before"" I lose mine."
8,It's okay if you don’t like me. Not everyone h...
9,Do you think God gets stoned? I think so… look...


In [26]:
def do_analysis(row):
    """Return a copy of ``row`` extended with its predicted sentiment.

    Written for row-wise use with ``DataFrame.apply(..., axis=1)``.

    Args:
        row: a row exposing a 'text' entry (a pandas Series when called
            through ``DataFrame.apply``).

    Returns:
        A copy of ``row`` with an added 'sentiment' entry holding the label
        returned by ``Analysis`` for ``row['text']``.
    """
    # Work on a copy: pandas does not support functions that mutate the
    # object handed to them by apply().
    result = row.copy()
    result['sentiment'] = Analysis(row['text'])
    return result

# Run do_analysis once per row ("columns" is the named form of axis=1) and
# show the resulting frame with its new sentiment column.
lines = lines.apply(do_analysis, axis="columns")
lines

Unnamed: 0,text,sentiment
0,Hey guys i just got back from vacation!,Positive
1,I just got back from the hospital. dad is not ...,Negative
2,I just got back from the hospital. the baby is...,Negative
3,"When people ask me stupid questions, it is my ...",Positive
4,"I’m not saying I hate you, what I'm saying is ...",Negative
5,Silence is golden. Duct tape is silver.,Negative
6,"I am busy right now, can I ignore you some oth...",Negative
7,"Find your patience before"" I lose mine.",Negative
8,It's okay if you don’t like me. Not everyone h...,Positive
9,Do you think God gets stoned? I think so… look...,Positive


In [27]:
# Write the classified sentences to CSV without the index column.
# NOTE(review): the file name reads 'nktl' — possibly a typo for 'nltk';
# confirm with any consumer of this file before renaming.
lines.to_csv('nktl_method.csv', index=False)
