In [5]:
"""xgb.py: Copyright 2020, Sentiment Analysis on Movie Reviews using LSTM"""
__authors__ = "Xueru Ye, Ruoran Liu, Keith Herbert"
__copyright__ = "Copyright 2020, Sentiment Analysis on Movie Reviews"
__license__ = "GPL"
__version__ = "1.0.0"
__maintained_by__ = "Xueru Ye, Ruoran Liu, Keith Herbert"
__email__ = "xye85@uwo.ca@uwo.ca, rliu454@uwo.ca, kherbe@uwo.ca"
__status__ = "Production"

import numpy as np
import pandas as pd
from memory_profiler import profile
from time import perf_counter
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import cross_val_score
import sys
warnings.filterwarnings('ignore')

df_train = pd.read_csv('data.tsv', delimiter='\t')
df_train = df_train.sample(n = 2000)
df_train.fillna('null',inplace=True)

x_train, x_test, y_train, y_test = train_test_split(df_train['Phrase'], df_train['Sentiment'], test_size=0.25)

vectorizer = CountVectorizer(max_features=5000)
tf_idf_transformer = TfidfTransformer()
tf_idf = tf_idf_transformer.fit_transform(vectorizer.fit_transform(x_train))
x_train_weight = tf_idf.toarray()  # TF-IDF matrix for training set
tf_idf = tf_idf_transformer.transform(vectorizer.transform(x_test))
x_test_weight = tf_idf.toarray()  # TF-IDF matrix for testing set

random_seed = 8
kfold_scoring = 'accuracy'
kfold_n_splits = 5
kfold_result_output = "%s KFold Validation: Mean %f (STD %f)"
model_name = 'XGBoost Model'

xlf = xgb.XGBClassifier(max_depth=6,
                learning_rate=0.51,
                n_estimators=8,
                silent=True,
                objective='multi:softmax',
                num_class = 5,
                nthread=-1,
                gamma=0,
                min_child_weight=1,
                max_delta_step=0,
                subsample=0.85,
                colsample_bytree=0.7,
                colsample_bylevel=1,
                reg_alpha=0,
                reg_lambda=1,
                scale_pos_weight=1,
                seed=1440,
                missing=None)

sys.setrecursionlimit(10000)
#@profile
def fit_predict(xlf_temp):
    t1_start = perf_counter()
    xlf.fit(x_train_weight, y_train, eval_metric='merror', verbose=True, eval_set=[(x_test_weight,  y_test)], early_stopping_rounds=100)
    t1_stop = perf_counter()
    print("Elapsed training time: ", t1_stop - t1_start)
    
    t1_start = perf_counter()
    y_pred = xlf.predict(x_test_weight, ntree_limit=xlf.best_ntree_limit)
    t1_stop = perf_counter()
    print("Elapsed Predicting time: ", t1_stop - t1_start)
    
    return y_pred

y_pred = fit_predict(xlf)

print(cross_val_score(xlf, x_train_weight, y_train, cv=5))

label_all = ['0', '1','2','3','4']
confusion_mat = metrics.confusion_matrix(y_test, y_pred)
df = pd.DataFrame(confusion_mat, columns=label_all)
df.index = label_all
print('accuracy', metrics.accuracy_score(y_test, y_pred))
print('confusion_matrix:\n', df)
print('classification report:\n', metrics.classification_report(y_test, y_pred))

# Reference - https://towardsdatascience.com/distilling-bert-how-to-achieve-bert-performance-using-logistic-regression-69a7fc14249d
# Reference - https://huggingface.co/transformers/examples.html#glue
# Reference - https://jalammar.github.io/a-visual-guide-to-using-bert-for-the-first-time/

[0]	validation_0-merror:0.55
Will train until validation_0-merror hasn't improved in 100 rounds.
[1]	validation_0-merror:0.528
[2]	validation_0-merror:0.52
[3]	validation_0-merror:0.516
[4]	validation_0-merror:0.522
[5]	validation_0-merror:0.536
[6]	validation_0-merror:0.526
[7]	validation_0-merror:0.522
Elapsed training time:  0.9725926999999501
Elapsed Predicting time:  0.01631760000009308
[0.53       0.49333333 0.5        0.48       0.51      ]
accuracy 0.484
confusion_matrix:
    0   1    2   3  4
0  0   4   13   3  0
1  0  10   67  10  1
2  0  13  213  11  2
3  2   6   93  17  2
4  0   5   18   8  2
classification report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        20
           1       0.26      0.11      0.16        88
           2       0.53      0.89      0.66       239
           3       0.35      0.14      0.20       120
           4       0.29      0.06      0.10        33

    accuracy                          

In [6]:
"""xgb.py: Copyright 2020, Sentiment Analysis on Movie Reviews using LSTM"""
__authors__ = "Xueru Ye, Ruoran Liu, Keith Herbert"
__copyright__ = "Copyright 2020, Sentiment Analysis on Movie Reviews"
__license__ = "GPL"
__version__ = "1.0.0"
__maintained_by__ = "Xueru Ye, Ruoran Liu, Keith Herbert"
__email__ = "xye85@uwo.ca@uwo.ca, rliu454@uwo.ca, kherbe@uwo.ca"
__status__ = "Production"

import numpy as np
import pandas as pd
from memory_profiler import profile
from time import perf_counter
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import cross_val_score
import sys
warnings.filterwarnings('ignore')

df_train = pd.read_csv('data.tsv', delimiter='\t')
df_train = df_train.sample(n = 2000)
df_train.fillna('null',inplace=True)

x_train, x_test, y_train, y_test = train_test_split(df_train['Phrase'], df_train['Sentiment'], test_size=0.25)

vectorizer = CountVectorizer(max_features=5000)
tf_idf_transformer = TfidfTransformer()
tf_idf = tf_idf_transformer.fit_transform(vectorizer.fit_transform(x_train))
x_train_weight = tf_idf.toarray()  # TF-IDF matrix for training set
tf_idf = tf_idf_transformer.transform(vectorizer.transform(x_test))
x_test_weight = tf_idf.toarray()  # TF-IDF matrix for testing set

random_seed = 8
kfold_scoring = 'accuracy'
kfold_n_splits = 5
kfold_result_output = "%s KFold Validation: Mean %f (STD %f)"
model_name = 'XGBoost Model'

xlf = xgb.XGBClassifier(max_depth=6,
                learning_rate=0.51,
                n_estimators=8,
                silent=True,
                objective='multi:softmax',
                num_class = 5,
                nthread=-1,
                gamma=0,
                min_child_weight=1,
                max_delta_step=0,
                subsample=0.85,
                colsample_bytree=0.7,
                colsample_bylevel=1,
                reg_alpha=0,
                reg_lambda=1,
                scale_pos_weight=1,
                seed=1440,
                missing=None)

sys.setrecursionlimit(10000)
#@profile
def fit_predict(xlf_temp):
    t1_start = perf_counter()
    xlf.fit(x_train_weight, y_train, eval_metric='merror', verbose=True, eval_set=[(x_test_weight,  y_test)], early_stopping_rounds=100)
    t1_stop = perf_counter()
    print("Elapsed training time: ", t1_stop - t1_start)
    
    t1_start = perf_counter()
    y_pred = xlf.predict(x_test_weight, ntree_limit=xlf.best_ntree_limit)
    t1_stop = perf_counter()
    print("Elapsed Predicting time: ", t1_stop - t1_start)
    
    return y_pred

y_pred = fit_predict(xlf)

print(cross_val_score(xlf, x_train_weight, y_train, cv=5))

label_all = ['0', '1','2','3','4']
confusion_mat = metrics.confusion_matrix(y_test, y_pred)
df = pd.DataFrame(confusion_mat, columns=label_all)
df.index = label_all
print('accuracy', metrics.accuracy_score(y_test, y_pred))
print('confusion_matrix:\n', df)
print('classification report:\n', metrics.classification_report(y_test, y_pred))

[0]	validation_0-merror:0.526
Will train until validation_0-merror hasn't improved in 100 rounds.
[1]	validation_0-merror:0.514
[2]	validation_0-merror:0.518
[3]	validation_0-merror:0.526
[4]	validation_0-merror:0.516
[5]	validation_0-merror:0.518
[6]	validation_0-merror:0.512
[7]	validation_0-merror:0.514
Elapsed training time:  0.9649547999999868
Elapsed Predicting time:  0.020792200000187222
[0.54333333 0.53333333 0.53333333 0.53333333 0.50333333]
accuracy 0.488
confusion_matrix:
    0   1    2   3  4
0  0   3   19   3  1
1  1  14   71   5  0
2  2  11  222  11  0
3  0   6   96   7  1
4  0   2   21   3  1
classification report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        26
           1       0.39      0.15      0.22        91
           2       0.52      0.90      0.66       246
           3       0.24      0.06      0.10       110
           4       0.33      0.04      0.07        27

    accuracy                       

In [7]:
"""lstm.py: Copyright 2020, Sentiment Analysis on Movie Reviews using LSTM"""
__authors__ = "Xueru Ye, Ruoran Liu, Keith Herbert"
__copyright__ = "Copyright 2020, Sentiment Analysis on Movie Reviews"
__license__ = "GPL"
__version__ = "1.0.0"
__maintained_by__ = "Xueru Ye, Ruoran Liu, Keith Herbert"
__email__ = "kherbe@uwo.ca, xye85@uwo.ca@uwo.ca, rliu454@uwo.ca"
__status__ = "Production"

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn import model_selection

import unicodedata, re, string
import nltk
#nltk.download()

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

import seaborn as sns
sns.set(color_codes=True)

import matplotlib.pyplot as plt
import os
from memory_profiler import profile
from time import perf_counter

import sys
warnings.filterwarnings('ignore')

print(os.listdir("./"))

df_train = pd.read_csv('data.tsv', delimiter='\t')
df_train = df_train.sample(n = 2000)

# =========== Pre-processing =========== #
def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

def remove_numbers(words):
    """Remove all interger occurrences in list of tokenized words with textual representation"""
    new_words = []
    for word in words:
        new_word = re.sub("\d+", "", word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words

def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

def normalize(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    words = remove_numbers(words)
#    words = remove_stopwords(words)
    return words


# =========

# First step - tokenizing phrases

### nltk.download() # If you get an error here, uncomment this line and download the missing package
df_train['Words'] = df_train['Phrase'].apply(nltk.word_tokenize)

# Second step - passing through prep functions
df_train['Words'] = df_train['Words'].apply(normalize)
df_train['Words'].head()

# Third step - creating a list of unique words to be used as dictionary for encoding
word_set = set()
for l in df_train['Words']:
    for e in l:
        word_set.add(e)

word_to_int = {word: ii for ii, word in enumerate(word_set, 1)}

# Check if they are still the same lenght
print('len(word_set)',len(word_set))
print('len(word_to_int)',len(word_to_int))

# Now the dict to tokenize each phrase
df_train['Tokens'] = df_train['Words'].apply(lambda l: [word_to_int[word] for word in l])
df_train['Tokens'].head()


# Step four - get the len of longest phrase
max_len = df_train['Tokens'].str.len().max()
print('max_len',max_len)

# Pad each phrase representation with zeroes, starting from the beginning of sequence
# Will use a combined list of phrases as np array for further work. This is expected format for the Pytorch utils to be used later

all_tokens = np.array([t for t in df_train['Tokens']])
encoded_labels = np.array([l for l in df_train['Sentiment']])

# Create blank rows
features = np.zeros((len(all_tokens), max_len), dtype=int)
# for each phrase, add zeros at the end
for i, row in enumerate(all_tokens):
    features[i, :len(row)] = row

#print first 3 values of the feature matrix
print(features[:3])

# =======================================================
split_frac = 0.8

## split data into training, validation, and test data (features and labels, x and y)

split_idx = int(len(features)*0.8)
train_x, remaining_x = features[:split_idx], features[split_idx:]
train_y, remaining_y = encoded_labels[:split_idx], encoded_labels[split_idx:]

test_idx = int(len(remaining_x)*0.5)
val_x, test_x = remaining_x[:test_idx], remaining_x[test_idx:]
val_y, test_y = remaining_y[:test_idx], remaining_y[test_idx:]

## print out the shapes of  resultant feature data
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape),
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

# =======================================================
# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

# dataloaders
batch_size = 10

# make sure the SHUFFLE your training data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

# Check the size of the loaders (how many batches inside)
print(len(train_loader))
print(len(valid_loader))
print(len(test_loader))

# =======================================================
# First checking if GPU is available
train_on_gpu=torch.cuda.is_available()
if(train_on_gpu):
    print('Training on GPU.')
else:
    print('No GPU available, training on CPU.')

# =======================================================
class SentimentRNN(nn.Module):
    """
    The RNN model that will be used to perform Sentiment analysis.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.
        """
        super(SentimentRNN, self).__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            dropout=drop_prob, batch_first=True)

        # dropout layer
        self.dropout = nn.Dropout(0.3)

        # linear
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        batch_size = x.size(0)

        # embeddings and lstm_out
        embeds = self.embedding(x.to(torch.int64))

        lstm_out, hidden = self.lstm(embeds, hidden)

        # transform lstm output to input size of linear layers
        lstm_out = lstm_out.transpose(0, 1)
        lstm_out = lstm_out[-1]

        out = self.dropout(lstm_out)
        out = self.fc(out)

        return out, hidden

    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data

        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())

        return hidden



# ========================================================================
# Instantiate the model w/ hyperparams

# Override - Tuned
# epochs = 3 
# embedding_dim = 80
# hidden_dim = 50
# lr = 0.04

# Override - Untuned
# epochs = 3 
# embedding_dim = 40
# hidden_dim = 25
# lr = 0.02

vocab_size = len(word_to_int)+1 # +1 for the 0 padding
output_size = 5
embedding_dim = 80
hidden_dim = 50
n_layers = 2

net = SentimentRNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)

print(net)

# ========================================================================
# loss and optimization functions
lr=0.004

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

# ========================================================================

# training params
epochs = 3  # 3-4 is approx where I noticed the validation loss stop decreasing

print_every = 100
clip = 5  # gradient clipping

# move model to GPU, if available
if (train_on_gpu):
    net.cuda()

sys.setrecursionlimit(10000)
#@profile
def time_complexity():
    counter = 0
    t1_start = perf_counter()
    # ========================================================================
    net.train()
    # train for some number of epochs
    for e in range(epochs):
        # initialize hidden state
        h = net.init_hidden(batch_size)

        # batch loop
        for inputs, labels in train_loader:
            counter += 1
            if (train_on_gpu):
                inputs, labels = inputs.cuda(), labels.cuda()

            # Creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            h = tuple([each.data for each in h])
            # zero accumulated gradients
            net.zero_grad()

            # get the output from the model
            output, h = net(inputs, h)
            # calculate the loss and perform backprop
            loss = criterion(output, labels.to(torch.int64))
            loss.backward()
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.

            nn.utils.clip_grad_norm_(net.parameters(), clip)
            optimizer.step()

            # loss stats
            if counter % print_every == 0:
                # Get validation loss
                val_h = net.init_hidden(batch_size)
                val_losses = []
                net.eval()
                for inputs, labels in valid_loader:

                    # Creating new variables for the hidden state, otherwise
                    # we'd backprop through the entire training history
                    val_h = tuple([each.data for each in val_h])

                    if (train_on_gpu):
                        inputs, labels = inputs.cuda(), labels.cuda()

                    output, val_h = net(inputs, val_h)
                    val_loss = criterion(output, labels.to(torch.int64))

                    val_losses.append(val_loss.item())

                net.train()
                print("Epoch: {}/{}...".format(e + 1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.6f}...".format(loss.item()),
                      "Val Loss: {:.6f}".format(np.mean(val_losses)))


    t1_stop = perf_counter()
    print("Elapsed Training Time:", t1_stop - t1_start)
    t1_start = perf_counter()
    # ========================================================================
    # Get test data loss and accuracy
    predict_list = []
    label_list = []
    test_losses = []  # track loss
    num_correct = 0

    # init hidden state
    h = net.init_hidden(batch_size)

    net.eval()
    # iterate over test data
    for inputs, labels in test_loader:

        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

        if (train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()

        # get predicted outputs
        output, h = net(inputs, h)
        # calculate loss
        test_loss = criterion(output, labels.to(torch.int64))
        test_losses.append(test_loss.item())

        # convert output probabilities to predicted class
        _, pred = torch.max(output, 1)

        # compare predictions to true label
        correct_tensor = pred.eq(labels.view_as(pred))
        temp_label = labels.view_as(pred).tolist()
        temp_pred = pred.tolist()
        label_list.extend(temp_label)
        predict_list.extend(temp_pred)
        correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
        num_correct += np.sum(correct)

    # -- stats! -- ##
    # avg test loss
    print("Test loss: {:.3f}".format(np.mean(test_losses)))
    # accuracy over all test data
    test_acc = num_correct / len(test_loader.dataset)
    print("Test accuracy: {:.3f}".format(test_acc))

    t1_stop = perf_counter()
    print("Elapsed Predicting Time:", t1_stop - t1_start)

    # -- stats! -- ##
    print(accuracy_score(label_list, predict_list))
    print(confusion_matrix(label_list, predict_list))
    print(classification_report(label_list, predict_list))

time_complexity()

['.git', '.ipynb_checkpoints', 'archive', 'assignment1', 'assignment2', 'data.tsv', 'dbl_hybrid.py', 'lstm.py', 'lstm_kfold.py', 'project.ipynb', 'Untitled.ipynb', 'xgb.py', '__pycache__']
len(word_set) 4207
len(word_to_int) 4207
max_len 43
[[ 116   49 1094 1750 1517  319 1402 2584 1094    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0]
 [1591    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0]
 [4039 2199 3238 1737 4130 1750 2842 1401 1354 1237 1152    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0]]
			Feature Shapes:
Train set: 		(1600, 43) 
Validation set: 	(200, 43) 
Test set: 		(2