In [1]:
import nltk
import re
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import random
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
# Seed NumPy's global random generator.
np.random.seed(1)
# Fetch the NLTK stop-word corpus (reports "already up-to-date" when it is installed).
nltk.download('stopwords')
# Set of English stop words, consulted when filtering tweet tokens.
stop_words = set(stopwords.words('english'))
import string                              # for string operations

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Riniperencsik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
# Location of the airline-tweets CSV (an absolute path on the author's machine).
DATA_PATH = "/Users/Riniperencsik/Desktop/Projects/Twitter Sentiment Analysis/twitter_airlines_data.csv"
data = pd.read_csv(DATA_PATH, encoding='unicode_escape')

In [26]:
# Keep only the sentiment label and the tweet text, then collect the tweet
# texts of each sentiment class into its own list.
data = data[['airline_sentiment', 'text']]
positive_tweets = data.loc[data['airline_sentiment'] == 'positive', 'text'].tolist()
negative_tweets = data.loc[data['airline_sentiment'] == 'negative', 'text'].tolist()
neutral_tweets = data.loc[data['airline_sentiment'] == 'neutral', 'text'].tolist()

In [27]:
# Create the training set.
# The first 80% of every sentiment class goes to training, so the training and
# test sets keep the same distribution of classes.

neutral_train = neutral_tweets[:2480]    # 2480 is 80% of the neutral tweets
negative_train = negative_tweets[:7343]  # 7343 is 80% of the negative tweets
positive_train = positive_tweets[:1891]  # 1891 is 80% of the positive tweets

train = positive_train + negative_train + neutral_train

# Label encoding: 0 = neutral, 1 = negative, 2 = positive.
# The labels are built in the same order as the concatenation above
# (positive, negative, neutral) so that labels_train[i] describes train[i].
labels_train = (
    [2] * len(positive_train)
    + [1] * len(negative_train)
    + [0] * len(neutral_train)
)
    

In [28]:
# Create the test set from the tweets that follow the training slice of each class.
neutral_test = neutral_tweets[2480:]    # 20% of the neutral tweets
negative_test = negative_tweets[7343:]  # 20% of the negative tweets
positive_test = positive_tweets[1891:]  # 20% of the positive tweets

test = positive_test + negative_test + neutral_test

# Label encoding: 0 = neutral, 1 = negative, 2 = positive.
# The labels are built in the same order as the concatenation above
# (positive, negative, neutral) so that labels_test[i] describes test[i].
labels_test = (
    [2] * len(positive_test)
    + [1] * len(negative_test)
    + [0] * len(neutral_test)
)

In [29]:
def remove_unnecessary_text(tweet0):
    '''
    Strip retweet markers, hyperlinks and hash signs from a raw tweet.

    Input:
        tweet0: the raw tweet text (a string)
    Output:
        tweet1: the tweet without a leading "RT", without http(s) links and
                with every '#' removed (the hashtag word itself is kept)
    '''
    # remove old style retweet text "RT" at the start of the tweet
    tweet1 = re.sub(r'^RT[\s]+', '', tweet0)

    # remove hyperlinks; \S* stops at the first whitespace, so the words that
    # follow a link are kept instead of being deleted up to the end of the line
    tweet1 = re.sub(r'https?://\S*', '', tweet1)

    # remove hashtags
    # only removing the hash # sign from the word
    tweet1 = re.sub(r'#', '', tweet1)
    return tweet1

In [30]:
def tokenize(tweet0):
    '''
    Split a tweet into tokens with NLTK's TweetTokenizer.

    The tokenizer is configured to not preserve case, to strip twitter
    handles (@mentions) and to reduce the length of repeated character runs.

    Input:
        tweet0: the tweet text (a string)
    Output:
        the list of tokens produced by the tokenizer
    '''
    return TweetTokenizer(
        strip_handles=True,
        reduce_len=True,
        preserve_case=False,
    ).tokenize(tweet0)

In [31]:
def remove_stopwords_punc(tweet_tokens0):
    '''
    Filter stop words and punctuation out of a list of tokens.

    A token is dropped when it is in the module-level `stop_words` set or when
    it occurs inside `string.punctuation`; every other token is kept, in order.

    Input:
        tweet_tokens0: a list of tokens
    Output:
        a new list with the remaining tokens
    '''
    return [
        token
        for token in tweet_tokens0
        if not (token in stop_words or token in string.punctuation)
    ]
        

In [32]:
stemmer = PorterStemmer()
def stem(clean_tweet0):
    '''
    Stem every word of a tokenized tweet with the shared PorterStemmer.

    Input:
        clean_tweet0: a list of words
    Output:
        a new list holding the stem of each word, in the same order
    '''
    return [stemmer.stem(token) for token in clean_tweet0]

In [33]:
def tweet_preprocess(tweet0):
    '''
    Run the full cleaning pipeline on one raw tweet.

    Steps, in order: remove_unnecessary_text, tokenize,
    remove_stopwords_punc, stem.

    Input:
        tweet0: the raw tweet text (a string)
    Output:
        the list of stemmed tokens returned by the last step
    '''
    stripped_text = remove_unnecessary_text(tweet0)
    tokens = tokenize(stripped_text)
    kept_tokens = remove_stopwords_punc(tokens)
    return stem(kept_tokens)

In [34]:
def lookup(freqs, word, label):
    '''
    Input:
        freqs: a dictionary with the frequency of each (word, label) pair
        word: the word to look up
        label: the label corresponding to the word
    Output:
        the number of times the word appears with that label,
        or 0 when the pair is not in the dictionary
    '''
    return freqs.get((word, label), 0)

In [35]:
# One label per tweet: 2363 zeros, then 9178 ones, then 3099 twos.
# NOTE(review): 2363 / 9178 / 3099 match the upper bounds used for the
# positive / negative / neutral classes in the test-split cell, where those
# classes are coded 2 / 1 / 0 — confirm the intended coding before using `labels`.
labels = [0] * 2363 + [1] * 9178 + [2] * 3099
    

In [36]:
# The function below preprocesses the tweets, then counts how often each word
# occurs under each class. Results are stored in a dictionary keyed by (word, label).

def count_tweets(result, tweets, ys):
    '''
    Input:
        result: a dictionary that will be used to map each (word, label) pair to
                its frequency; it is updated in place, so counts that are already
                in it are kept and added to
        tweets: a list of tweets
        ys: a list with the sentiment label of each tweet
            (0 neutral, 1 negative, 2 positive)
    Output:
        result: the same dictionary, mapping each (word, label) pair to its frequency
    '''
    # a separate loop name keeps the `ys` parameter from being shadowed
    for label, tweet in zip(ys, tweets):
        for word in tweet_preprocess(tweet):
            pair = (word, label)
            result[pair] = result.get(pair, 0) + 1
    return result


In [37]:
# Build one (word, label) -> count table for the training split and one for
# the test split, each starting from an empty dictionary.
freqs_train, freqs_test = (
    count_tweets({}, split_tweets, split_labels)
    for split_tweets, split_labels in ((train, labels_train), (test, labels_test))
)

In [38]:
# The function below trains a three-class Naive Bayes model: it returns one log prior
# and one table of per-word log likelihoods for each sentiment class.
def train_naive_bayes(freqs, train_x, train_y):
    '''
    Input:
        freqs: dictionary from (word, label) to how often the word appears
        train_x: a list of tweets (not used by the computation; kept so that
                 existing calls keep working)
        train_y: a list of labels corresponding to the tweets
                 (0 neutral, 1 negative, 2 positive)
    Output:
        logprior_pos, logprior_neg, logprior_neu:
            log(D_class / D) for each class, where D_class is the number of
            tweets with that label and D the number of all tweets
            (-inf for a class without any tweet)
        loglikelihood_pos, loglikelihood_neg, loglikelihood_neu:
            dictionaries mapping every vocabulary word to
            log((freq(word, class) + 1) / (N_class + V)), the Laplace-smoothed
            log probability of the word given the class
    Raises:
        ValueError: if train_y is empty
    '''
    NEU, NEG, POS = 0, 1, 2

    # D, the number of tweets
    D = len(train_y)
    if D == 0:
        raise ValueError("train_y must contain at least one label")

    # V, the number of unique words in the vocabulary
    vocab = {word for word, _ in freqs}
    V = len(vocab)

    # N_class, the total number of word occurrences seen with each class
    # (pairs carrying any other label are ignored)
    n_words = {NEU: 0, NEG: 0, POS: 0}
    for (_, label), count in freqs.items():
        if label in n_words:
            n_words[label] += count

    # D_class, the number of tweets of each class
    n_docs = {NEU: 0, NEG: 0, POS: 0}
    for label in train_y:
        if label in n_docs:
            n_docs[label] += 1

    # log prior of each class; a class without tweets gets -inf instead of log(0)
    logprior = {
        label: float(np.log(n_docs[label] / D)) if n_docs[label] else float('-inf')
        for label in n_docs
    }

    # Laplace-smoothed log likelihood of every vocabulary word under each class
    loglikelihood = {NEU: {}, NEG: {}, POS: {}}
    for word in vocab:
        for label, table in loglikelihood.items():
            p_w_class = (freqs.get((word, label), 0) + 1) / (n_words[label] + V)
            table[word] = float(np.log(p_w_class))

    return (logprior[POS], logprior[NEG], logprior[NEU],
            loglikelihood[POS], loglikelihood[NEG], loglikelihood[NEU])

In [39]:
# for each class (positive, negative, and neutral), calculate the probability of every word given that class
# for the log prior, pos = (number of positive tweets) / (number of all tweets)
# calculate three different log likelihoods: one for positive, one for negative, and one for neutral

In [40]:
def naive_bayes_predict(tweet, logprior_pos, logprior_neu, logrprior_neg,
                        loglikelihood_pos, loglikelihood_neg, loglikelihood_neu):
    '''
    Input:
        tweet: a string
        logprior_pos, logprior_neu, logrprior_neg: the prior score of the positive,
            neutral and negative class, in that order (the spelling
            `logrprior_neg` is kept so that existing calls keep working)
        loglikelihood_pos, loglikelihood_neg, loglikelihood_neu: dictionaries
            mapping words to their score for the positive, negative and neutral class
    Output:
        prediction: the class whose total score (prior + the scores of all words
            of the tweet found in that class's dictionary) is highest:
            2 positive, 1 negative, 0 neutral.
            Ties go to positive, then negative, then neutral.
    '''
    # process the tweet to get a list of words
    word_l = tweet_preprocess(tweet)

    # start every class score from its prior; the negative prior is read from
    # the parameter rather than from a module-level variable
    pos = logprior_pos
    neg = logrprior_neg
    neu = logprior_neu

    # add the score of every word that is known to the respective dictionary
    for word in word_l:
        pos += loglikelihood_pos.get(word, 0)
        neg += loglikelihood_neg.get(word, 0)
        neu += loglikelihood_neu.get(word, 0)

    # pick the class with the highest final score; max() returns the first of
    # several equal maxima, which gives the positive > negative > neutral tie order
    scored_classes = ((pos, 2), (neg, 1), (neu, 0))
    _, prediction = max(scored_classes, key=lambda scored: scored[0])

    return prediction

In [20]:
def test_naive_bayes(test_x, test_y, logprior_pos, logprior_neu, logrprior_neg,
                        loglikelihood_pos, loglikelihood_neg, logliklihood_neu):
    """
    Input:
        test_x: A list of tweets
        test_y: the corresponding labels for the list of tweets
        logprior: the logprior
        loglikelihood: a dictionary with the loglikelihoods for each word
    Output:
        accuracy: (# of tweets classified correctly)/(total # of tweets)
    """
    accuracy = 0  # return this properly

    ### START CODE HERE (REPLACE INSTANCES OF 'None' with your code) ###
    y_hats = []
    for tweet11 in test_x:
        # if the prediction is > 0
        prediction =  naive_bayes_predict(tweet11, logprior_pos, logprior_neu, logrprior_neg,
                        loglikelihood_pos, loglikelihood_neg, logliklihood_neu) 

        # append the predicted class to the list y_hats
        y_hats.append(prediction)
    n = 0
    num_correct = 0
    # error is the average of the absolute values of the differences between y_hats and test_y
    for i in y_hats:
        actual = test_y[n]
        if i == actual:
            num_correct += 1
        n += 1
    accuracy = num_correct / len(y_hats)
    ### END CODE HERE ###

    return accuracy


In [46]:
# Fit the model on the training split, then score that same split (training accuracy).
(logprior_pos, logprior_neg, logprior_neu,
 loglikelihood_pos, loglikelihood_neg, logliklihood_neu) = train_naive_bayes(
    freqs_train, train, labels_train)

accuracy = test_naive_bayes(
    train, labels_train,
    logprior_pos, logprior_neu, logprior_neg,
    loglikelihood_pos, loglikelihood_neg, logliklihood_neu,
)
print(accuracy)

In [45]:
# Fit the model on the training split only, then measure accuracy on the held-out test split.
logprior_pos, logprior_neg, logprior_neu, loglikelihood_pos, loglikelihood_neg, logliklihood_neu = train_naive_bayes(freqs_train, train, labels_train)

accuracy = test_naive_bayes(test, labels_test, logprior_pos, logprior_neu, logprior_neg,
                        loglikelihood_pos, loglikelihood_neg, logliklihood_neu)
print(accuracy)

In [44]:
# Score the current model on one hand-written tweet whose expected label is 1 (negative).
accuracy = test_naive_bayes(
    ["i was cold"], [1],
    logprior_pos, logprior_neu, logprior_neg,
    loglikelihood_pos, loglikelihood_neg, logliklihood_neu,
)
print(accuracy)