In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import pickle, argparse, os, sys
from collections import Counter
from tqdm import tqdm
import numpy as np
import pandas as pd
import random
import pickle
import nltk
import math
import re

import torch.functional as F
from torch import optim
import torch.nn as nn
import torch

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

In [27]:
# model 2
# Hyperparameters and paths shared by every cell below.
EMBEDDING_DIM = 50        # width of each word vector; load_glove reads the 50-d GloVe file
HIDDEN_DIM = 16           # LSTM hidden state size
NUM_LAYERS = 1            # number of stacked LSTM layers
MAX_SEQ_LENGTH = 20       # every tweet is truncated / zero-padded to this many token ids
LEARNING_RATE = 0.005     # AdamW learning rate
EPOCH = 3                 # number of passes over the training set
BATCH_SIZE = 10           # DataLoader batch size; also the default LSTM state batch dimension
WEIGHT_DECAY = 0          # AdamW weight decay
GLOVE_PATH = "glove.6B 2" # directory holding glove.6B.50d.txt and the derived pickle caches
DROPOUT = 0               # probability passed to nn.Dropout in RNNTagger

### After manually checking the quality of the labels, rows with a sentiment-label confidence below 0.4 were removed. The removed rows account for 1.6 percent of the original data.

In [2]:
def read_airline_data(data_path = None):
    '''
        Read the airline tweets CSV, drop rows whose sentiment label has a
        confidence of 0.4 or lower, and add a VADER compound score per tweet.

        params: data_path, str path to the CSV file (required despite the
                None default, which is kept for backward compatibility)

        return: pandas dataframe with the columns airline_sentiment,
                airline_sentiment_confidence, airline, text and
                vader_sentiment, re-indexed from 0

        raises: ValueError when data_path is not given
    '''
    if data_path is None:
        raise ValueError("data_path must point to the airline tweets CSV file")

    # vader sentiment scorer
    analyzer = SentimentIntensityAnalyzer()

    columns = ["airline_sentiment", "airline_sentiment_confidence", "airline", "text"]
    data = pd.read_csv(data_path)[columns]

    # .copy() makes the filtered frame independent of the frame it was sliced
    # from, so the column assignment below never targets a view
    # (avoids pandas' SettingWithCopyWarning).
    data = data[data["airline_sentiment_confidence"] > .4].copy()
    data["vader_sentiment"] = data["text"].apply(lambda x: analyzer.polarity_scores(x)["compound"])

    return data.reset_index(drop = True)


In [13]:
def split_data(data):
    '''
        Split data into a training/validation part (0.9) and a testing
        part (0.1), stratified on the sentiment label, and save both parts
        as pickle files under airline_pkl/.

        params: pandas dataframe with an "airline_sentiment" column
        return: None; the two dataframes are written to
                airline_pkl/train.pkl and airline_pkl/test.pkl
    '''
    # split data into training_validation(0.9) and testing sets(0.1)
    train_valid, test_split = train_test_split(
        data,
        test_size=0.1,
        random_state=42,
        shuffle=True,
        stratify=data["airline_sentiment"],
    )

    # The output directory is not guaranteed to exist on a fresh checkout.
    os.makedirs("airline_pkl", exist_ok=True)

    # Context managers close the files even if pickling fails.
    with open("airline_pkl/train.pkl", "wb") as train_pkl:
        pickle.dump(train_valid, train_pkl)

    with open("airline_pkl/test.pkl", "wb") as test_pkl:
        pickle.dump(test_split, test_pkl)

    return

In [14]:
def prepare_data(data):
    '''
        Tokenize the tweets, drop stopwords, replace infrequent tokens
        (seen fewer than 3 times in this dataframe) with "UNKA", and build a
        token_to_idx dictionary that includes the special token <PAD> (index 0).

        Also converts the sentiment labels into integers.

        Side effects: adds the columns "filtered_text" (list of tokens per
        tweet) and "label_idx" (int label) to the dataframe passed in.

        param: pandas dataframe with "text" and "airline_sentiment" columns
        return: token_to_idx, dict mapping token -> integer index; always
                contains "<PAD>" and "UNKA"
    '''
    stop_words = set(stopwords.words('english'))

    # Raw strings keep the regex escapes (\@, \S) out of Python's own
    # string-escape handling; the patterns themselves are unchanged.
    mention_pattern = re.compile(r"\@[A-Za-z]+")
    url_pattern = re.compile(r"http\S+")
    symbol_pattern = re.compile(r"[.!,@#$%^&*]")

    # remove mentions / urls / symbols, tokenize, remove stopwords,
    # and count token frequencies
    filtered_text_list = []
    vocab_dict = Counter()
    for tweet in tqdm(data["text"]):
        tweet = mention_pattern.sub("", tweet)
        tweet = url_pattern.sub("", tweet)
        tweet = symbol_pattern.sub("", tweet)
        filtered_text = [w for w in word_tokenize(tweet) if w not in stop_words]
        filtered_text_list.append(filtered_text)
        vocab_dict.update(filtered_text)

    # replace infrequent words with "UNKA" and index tokens in order of
    # first appearance
    tokens_to_replace = {token for token, count in vocab_dict.items() if count < 3}
    token_to_idx = {"<PAD>": 0}
    replaced_text_list = []
    for tokens in tqdm(filtered_text_list):
        replaced = ["UNKA" if token in tokens_to_replace else token for token in tokens]
        replaced_text_list.append(replaced)
        for token in replaced:
            if token not in token_to_idx:
                token_to_idx[token] = len(token_to_idx)
    data["filtered_text"] = replaced_text_list

    # The encoder falls back to token_to_idx["UNKA"] for unseen tokens, so the
    # entry must exist even when no token in this dataframe was infrequent.
    if "UNKA" not in token_to_idx:
        token_to_idx["UNKA"] = len(token_to_idx)

    # create index for sentiment labels
    sentiment_to_idx = {"neutral":0, "negative":1, "positive":2}
    data["label_idx"] = data["airline_sentiment"].apply(lambda x: sentiment_to_idx[x])

    return token_to_idx


In [15]:
def ecnode_sequence(tweet, token_to_idx):
    '''
        Map every token of a tweet to its integer index.

        Tokens that are missing from token_to_idx are encoded with the
        index of the "UNKA" token.

        params: tweet, iterable of tokens
                token_to_idx, dict mapping token -> index
        return: 1-D torch.long tensor with one index per token
    '''
    indices = [
        token_to_idx[token] if token in token_to_idx else token_to_idx["UNKA"]
        for token in tweet
    ]
    return torch.tensor(indices, dtype=torch.long)

def pad_sequence(encoded_sequence, max_seq_length, vader_sentiment=None):
    '''
        Truncate the sequence to max_seq_length, or pad it with 0 if it is
        shorter, and optionally write the VADER score into the last slot.

        params: encoded_sequence, 1-D tensor of token indices
                max_seq_length, int length of the returned tensor
                vader_sentiment, optional score stored in the last position;
                    when None (default) the last position is left untouched
        return: torch.long tensor of length max_seq_length

        NOTE(review): the returned tensor has an integer dtype, so a
        fractional vader_sentiment is cast to an integer when it is assigned
        (e.g. 0.5 is stored as 0). The last token slot is overwritten either
        way. Feeding the score to the model as a real-valued feature would
        need a separate input instead of this slot - TODO confirm intent.
    '''
    padded_sentence = torch.zeros(max_seq_length, dtype = torch.long)
    value_to_pad = min(len(encoded_sequence), max_seq_length)
    padded_sentence[:value_to_pad] = encoded_sequence[:value_to_pad]

    # Only touch the last slot when a score was supplied and the slot exists.
    if vader_sentiment is not None and max_seq_length > 0:
        padded_sentence[-1] = vader_sentiment
    return padded_sentence


# def pad_sequence(encoded_sequence, max_seq_length):
#     '''
#         truncate or pad the sequence with 0 if the sequence is shorter 
#         than the number defined for training:max_seq_length
        
#         parmas: tensor of encoded sequence
#         return: tensor of padded sequence
        
#     '''
#     padded_sentence = torch.zeros(max_seq_length, dtype = torch.long)
#     value_to_pad = min(len(encoded_sequence), max_seq_length)
#     padded_sentence[:value_to_pad] = encoded_sequence[:value_to_pad]
#     return padded_sentence

In [16]:
def load_glove(glove_path):
    '''
        Open the GloVe pre-trained embeddings and process them.
        The word vectors are returned as a pytorch tensor; the word list and
        the word_to_idx dictionary are saved to pickle files in glove_path
        (6B.50_words.pkl and 6B.50_idx.pkl).

        params: glove_path, directory containing glove.6B.50d.txt
        return: torch.double tensor with one row per word in the file
        raises: ValueError when the file contains no vectors
    '''
    words = []
    word2idx = {}
    rows = []

    with open(f'{glove_path}/glove.6B.50d.txt', 'rb') as f:
        for raw_line in f:
            fields = raw_line.decode().split()
            if not fields:
                # tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            word = fields[0]
            word2idx[word] = len(words)
            words.append(word)
            # np.float was removed in NumPy 1.24; np.float64 is the same dtype.
            rows.append(np.array(fields[1:]).astype(np.float64))

    if not rows:
        raise ValueError(f"no word vectors found in {glove_path}/glove.6B.50d.txt")

    # Stack once instead of creating and concatenating one tensor per word.
    vectors = torch.tensor(np.stack(rows), dtype=torch.double)

    # Context managers make sure the cache files are flushed and closed.
    with open(f'{glove_path}/6B.50_words.pkl', 'wb') as words_file:
        pickle.dump(words, words_file)
    with open(f'{glove_path}/6B.50_idx.pkl', 'wb') as idx_file:
        pickle.dump(word2idx, idx_file)

    return vectors

In [17]:
def word_to_glove(vectors, glove_path, token_to_idx):
    '''
        Build the embedding weight matrix for the training vocabulary.

        Opens the word_to_index pickle written by load_glove and maps every
        token in token_to_idx to its GloVe vector. "UNKA", the padding token
        (index 0) and tokens without a GloVe entry get a random normal vector
        (scale 0.6).

        params: vectors, 2-D tensor of GloVe vectors (one row per GloVe word)
                glove_path, directory containing 6B.50_idx.pkl
                token_to_idx, dict mapping token -> row index
        return: torch.double tensor of shape
                (len(token_to_idx), vectors.shape[1])
    '''
    # load glove word -> row lookup (a cache file produced by load_glove;
    # pickle must only be used on files this notebook wrote itself)
    with open(f'{glove_path}/6B.50_idx.pkl', 'rb') as idx_file:
        word2idx = pickle.load(idx_file)

    # Look rows up on demand instead of materialising a dict with one tensor
    # per GloVe word; only the tokens of the training vocabulary are needed.
    glove_matrix = vectors.detach().cpu().numpy()
    embedding_dim = glove_matrix.shape[1]

    embedding_weights = np.zeros((len(token_to_idx), embedding_dim))

    for token, index in token_to_idx.items():
        if token == "UNKA" or index == 0:
            # unknown-word and padding rows have no GloVe counterpart
            glove_row = None
        else:
            glove_row = word2idx.get(token)

        if glove_row is None:
            embedding_weights[index, :] = np.random.normal(scale=0.6, size=(embedding_dim, ))
        else:
            embedding_weights[index, :] = glove_matrix[glove_row]

    return torch.tensor(embedding_weights, dtype=torch.double)

In [18]:
class RNNTagger(nn.Module):
    '''
        LSTM classifier: embeds a padded sequence of token indices, runs it
        through an LSTM and maps the output of the last time step to one
        score per sentiment class.

        Sizes come from the module-level constants EMBEDDING_DIM, HIDDEN_DIM,
        NUM_LAYERS and DROPOUT.
    '''
    def __init__(self, token_to_idx, sentiment_to_idx):
        '''
            params: token_to_idx, dict whose length is the vocabulary size
                    sentiment_to_idx, dict whose length is the number of classes
        '''
        super().__init__()
        self.embedding_dim = EMBEDDING_DIM
        self.hidden_dim = HIDDEN_DIM
        self.num_layers = NUM_LAYERS
        self.vocab_size = len(token_to_idx)
        self.tagset_size = len(sentiment_to_idx)
        self.bidirectional = False
        self.dropout = nn.Dropout(DROPOUT)

        self.word_embedding = nn.Embedding(
            num_embeddings=self.vocab_size, 
            embedding_dim=self.embedding_dim,
            padding_idx = 0
            )

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim, 
            hidden_size=self.hidden_dim,
            num_layers=self.num_layers,
            batch_first = True,
            bidirectional=self.bidirectional)

        # The linear layer that maps from hidden state space to tag space
        if self.bidirectional:
            self.hidden2tag = nn.Linear(2 * self.hidden_dim, self.tagset_size)
        else:
            self.hidden2tag = nn.Linear(self.hidden_dim, self.tagset_size)
            
        # Not used by forward(); kept so existing attribute access still works.
        self.sigmoid = nn.Sigmoid()

    def forward(self, sentence, prev_state):
        '''
            params: sentence, batch-first tensor of token indices
                    prev_state, (h, c) tuple passed to the LSTM
            return: (class scores for the last time step, new (h, c) state)
        '''
        sentence = sentence.long()
        embeds = self.word_embedding(sentence)
        lstm_out, state = self.lstm(embeds, prev_state)
        # only use the last output of the lstm layer for many to one
        last_step = lstm_out[:, -1, :]
        # Apply the configured dropout; it is a no-op when DROPOUT is 0 and
        # whenever the model is in eval mode.
        last_step = self.dropout(last_step)
        # fully connected layer converting the high level info into class scores
        return self.hidden2tag(last_step), state

    def load_embedding(self, embedding_weights):
        '''Load a pre-trained embedding weight matrix into the embedding layer.'''
        self.word_embedding.load_state_dict({'weight': embedding_weights})

    def init_state(self, batch_size=None):
        '''
            Create a zero (h, c) state for the LSTM.

            params: batch_size, batch dimension of the state; defaults to the
                    module-level BATCH_SIZE so existing callers are unaffected
            return: tuple of two zero tensors shaped
                    (num_directions * num_layers, batch_size, hidden_dim)
        '''
        if batch_size is None:
            batch_size = BATCH_SIZE
        num_directions = 2 if self.bidirectional else 1
        state_shape = (num_directions * self.num_layers, batch_size, self.hidden_dim)
        return (torch.zeros(state_shape), torch.zeros(state_shape))

class Dataset(torch.utils.data.Dataset):
    '''
        Map-style dataset over tokenized tweets.

        Each item is (padded index tensor, label); the tensor is built on
        access from the tweet's tokens and its VADER score.
    '''
    def __init__(self, seq_list, label_list, token_to_idx, vader_sentiment):
        # Parallel lists: entry i of each belongs to tweet i.
        self.seq_list, self.label_list = seq_list, label_list
        self.vader_sentiment = vader_sentiment
        self.token_to_idx = token_to_idx

    def __len__(self):
        # number of tweets
        return len(self.seq_list)

    def __getitem__(self, index):
        # fetch the sample together with its vader sentiment feature
        tokens = self.seq_list[index]
        label = self.label_list[index]
        score = self.vader_sentiment[index]

        # tokens -> indices -> fixed-length tensor of MAX_SEQ_LENGTH entries
        encoded = ecnode_sequence(tokens, self.token_to_idx)
        return pad_sequence(encoded, MAX_SEQ_LENGTH, score), label

#     def __getitem__(self, index):
#         # Select sample
#         seq = self.seq_list[index]
#         label_list = self.label_list[index]

#         sentence_input = ecnode_sequence(seq, self.token_to_idx)
#         padded_sentence = pad_sequence(sentence_input, MAX_SEQ_LENGTH)
# #         padded_targets = pad_sequence(targets, MAX_SEQ_LENGTH)
# #         seq_len = len(seq)

#         return padded_sentence, label_list

In [19]:
def calculate_f1(prediction, ground_truth):
    '''
        Calculate the f1 score for the positive and the negative labels and
        their average.

        Labels are 0 = neutral, 1 = negative, 2 = positive. The returned
        3x3 table has predictions as rows and ground truth as columns, both
        ordered positive, neutral, negative. Pairs containing any other
        label value are ignored.

        A precision, recall or f1 whose denominator is zero (for example
        when a class is never predicted) is reported as 0 instead of NaN.

        params: prediction, ground_truth: equally ordered iterables of labels
        return: f1_score_pos, f1_score_neg, f1_score (mean of the two) as
                0-dim tensors, and the 3x3 accuracy_table tensor
    '''
    # label value -> row/column position in the table
    label_to_slot = {2: 0, 0: 1, 1: 2}

    def _plain(value):
        # numpy / torch scalars -> plain Python number usable as a dict key
        return value.item() if hasattr(value, "item") else value

    def _ratio(numerator, denominator):
        # 0-dim tensor division that yields 0 rather than NaN on 0/0
        if denominator == 0:
            return torch.zeros(())
        return numerator / denominator

    accuracy_table = torch.zeros(3,3)
    for pred, truth in zip(prediction, ground_truth):
        row = label_to_slot.get(_plain(pred))
        col = label_to_slot.get(_plain(truth))
        if row is not None and col is not None:
            accuracy_table[row, col] += 1

    # row sums = how often a class was predicted,
    # column sums = how often a class is the ground truth
    precision_pos = _ratio(accuracy_table[0][0], accuracy_table[0].sum())
    recall_pos = _ratio(accuracy_table[0][0], accuracy_table[:, 0].sum())
    precision_neg = _ratio(accuracy_table[2][2], accuracy_table[2].sum())
    recall_neg = _ratio(accuracy_table[2][2], accuracy_table[:, 2].sum())

    f1_score_pos = _ratio(2 * precision_pos * recall_pos, precision_pos + recall_pos)
    f1_score_neg = _ratio(2 * precision_neg * recall_neg, precision_neg + recall_neg)
    f1_score = (f1_score_pos + f1_score_neg) / 2

    return f1_score_pos, f1_score_neg, f1_score, accuracy_table

In [20]:
def train(training_file):
    '''
        Train the RNN sentiment model on a pickled training dataframe.

        Writes the tokenizer (airline_pkl/tokenizer.pkl) and the embedding
        weight matrix (airline_pkl/embedding_weights.pt) as side effects and
        prints the confusion table, f1 scores and perplexity after every epoch.

        params: training_file, path to the pickled training dataframe
        return: the trained RNNTagger model
    '''
    # load training_file (a pickle written by split_data; pickle must only
    # be used on files this notebook wrote itself)
    with open(training_file, "rb") as train_file:
        data = pickle.load(train_file)

    # prepare data, remove stopwords, replace infrequent words and create a token
    # to index dictionary
    token_to_idx = prepare_data(data)

    X = data["filtered_text"].tolist()
    Y = data["label_idx"].tolist()
    vader_sentiment = data["vader_sentiment"].tolist()

    # cache the tokenizer so that test() can encode tweets with the same indices
    with open("airline_pkl/tokenizer.pkl", "wb") as tokenizer_file:
        pickle.dump({'token_to_idx': token_to_idx}, tokenizer_file)

    # create glove vectors
    vectors = load_glove(GLOVE_PATH)
    # create word embedding weight matrix for training and cache it
    embedding_weights = word_to_glove(vectors, GLOVE_PATH, token_to_idx)
    torch.save(embedding_weights, 'airline_pkl/embedding_weights.pt')

    # initialize the RNN model
    sentiment_to_idx = {"neutral":0, "negative":1, "positive":2}
    model = RNNTagger(token_to_idx, sentiment_to_idx)
    model.load_embedding(embedding_weights)
    model.train()
    # No ignore_index here: label 0 is the "neutral" class, not padding.
    # Ignoring it would exclude every neutral tweet from the loss and give a
    # NaN loss for a batch that contains only neutral tweets.
    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

    # dataset generator
    training_set = Dataset(X, Y, token_to_idx, vader_sentiment)
    training_generator = torch.utils.data.DataLoader(training_set, batch_size=BATCH_SIZE)

    print('Start training...')

    for epoch in range(EPOCH):
        print('Epoch %d:' % (epoch+1))
        state_h, state_c = model.init_state()

        prediction = []
        ground_truth = []
        losses = []
        for sequence, targets in tqdm(training_generator):
            # The last batch can be smaller than the others; the LSTM state
            # must have the same batch dimension as the input, so start from
            # a fresh zero state whenever the two differ.
            # NOTE(review): otherwise the state is carried over from the
            # previous batch, although the tweets are unrelated - confirm
            # that this is intended.
            current_batch_size = sequence.shape[0]
            if current_batch_size != state_h.shape[1]:
                state_h = torch.zeros(NUM_LAYERS, current_batch_size, HIDDEN_DIM)
                state_c = torch.zeros(NUM_LAYERS, current_batch_size, HIDDEN_DIM)

            # Step 1. Pytorch accumulates gradients, clear them before each batch
            model.zero_grad()

            # Step 2. Run our forward pass.
            tag_space, (state_h, state_c) = model(sequence, (state_h, state_c))

            # Step 3. Compute the loss, gradients, and update the parameters by
            #  calling optimizer.step()
            targets = targets.long()
            loss = loss_function(tag_space, targets)

            # .item() stores a plain float instead of keeping the autograd
            # graph of every batch alive for the whole epoch
            losses.append(loss.item())
            loss.backward()
            optimizer.step()

            # cut the graph so that the next batch does not backpropagate
            # into this one
            state_h = state_h.detach()
            state_c = state_c.detach()

            # Store history to calculate accuracy
            prediction.extend(torch.argmax(tag_space, dim=1).tolist())
            ground_truth.extend(targets.tolist())

        prediction = np.array(prediction)
        ground_truth = np.array(ground_truth)
        mean_loss = sum(losses) / len(losses) if losses else float('nan')
        train_perplexity = math.exp(mean_loss)
        print(prediction)
        print(ground_truth)
        f1_score_pos, f1_score_neg, f1_score, accuracy_table = calculate_f1(prediction, ground_truth)
        print(accuracy_table)
        print(f'The f1 score of the pos labels is {100*f1_score_pos:6.2f}%')
        print(f'The f1 score of the neg labels is {100*f1_score_neg:6.2f}%')
        print(f'The f1 score of the model is {100*f1_score:6.2f}%')
        print(f'The perplexity of the model is {train_perplexity}')

    return model


In [21]:
def test(model_file, test_file):
    '''
        Evaluate a saved model on a pickled test dataframe and print the
        predictions, the confusion table and the f1 scores.

        Tweets are encoded with the tokenizer saved by train()
        (airline_pkl/tokenizer.pkl).

        params: model_file, path to the saved model state dict
                test_file, path to the pickled test dataframe
        return: None
    '''
    assert os.path.isfile(model_file), 'Model file does not exist'
    assert os.path.isfile(test_file), 'Data file does not exist'

    # load test_file (a pickle written by split_data; pickle must only be
    # used on files this notebook wrote itself)
    with open(test_file, "rb") as test_pkl:
        data = pickle.load(test_pkl)

    # prepare data: called for its side effects, it adds the "filtered_text"
    # and "label_idx" columns. The vocabulary it returns is not used; the
    # training tokenizer is loaded below instead.
    # NOTE(review): prepare_data replaces words that are infrequent in the
    # dataframe it is given, i.e. here in the test set - confirm this is wanted.
    prepare_data(data)

    X = data["filtered_text"].tolist()
    Y = data["label_idx"].tolist()
    vader_sentiment = data["vader_sentiment"].tolist()

    # load the tokenizer saved during training
    with open("airline_pkl/tokenizer.pkl", "rb") as tokenizer_file:
        tokenizer = pickle.load(tokenizer_file)
    token_to_idx = tokenizer['token_to_idx']

    # initialize the RNN model
    sentiment_to_idx = {"neutral":0, "negative":1, "positive":2}
    model = RNNTagger(token_to_idx, sentiment_to_idx)
    model.load_state_dict(torch.load(model_file))
    model.eval()

    # dataset generator
    test_set = Dataset(X, Y, token_to_idx, vader_sentiment)
    test_generator = torch.utils.data.DataLoader(test_set, batch_size=BATCH_SIZE)

    prediction = []
    ground_truth = []
    state_h, state_c = model.init_state()
    for sequence, targets in tqdm(test_generator):
        # The last batch can be smaller than the others; the LSTM state must
        # have the same batch dimension as the input, so start from a fresh
        # zero state whenever the two differ.
        current_batch_size = sequence.shape[0]
        if current_batch_size != state_h.shape[1]:
            state_h = torch.zeros(NUM_LAYERS, current_batch_size, HIDDEN_DIM)
            state_c = torch.zeros(NUM_LAYERS, current_batch_size, HIDDEN_DIM)

        with torch.no_grad():
            tag_space, (state_h, state_c) = model(sequence, (state_h, state_c))

        # Store history to calculate accuracy
        prediction.extend(torch.argmax(tag_space, dim=1).tolist())
        ground_truth.extend(targets.tolist())

    prediction = np.array(prediction)
    ground_truth = np.array(ground_truth)
    print(prediction)
    print(ground_truth)
    f1_score_pos, f1_score_neg, f1_score, accuracy_table = calculate_f1(prediction, ground_truth)
    print(accuracy_table)
    print(f'The f1 score of the pos labels is {100*f1_score_pos:6.2f}%')
    print(f'The f1 score of the neg labels is {100*f1_score_neg:6.2f}%')
    print(f'The f1 score of the model is {100*f1_score:6.2f}%')
    

### Run the training and testing command

In [None]:
# Load the raw tweets, keep the confidently labelled rows and add the VADER score.
data = read_airline_data(data_path = "Tweets_airlines.csv")
# Write the stratified 90/10 split to airline_pkl/train.pkl and airline_pkl/test.pkl.
split_data(data)
# Train on the training split; this also writes the tokenizer and embedding caches.
model = train("airline_pkl/train.pkl")
# Persist only the weights; test() rebuilds the model and loads this state dict.
torch.save(model.state_dict(), "RNN_airline_model_no_exclamation_with_vader")

In [None]:
# Evaluate the saved weights on the held-out split written by split_data.
test("RNN_airline_model_no_exclamation_with_vader", "airline_pkl/test.pkl")