<a href="https://colab.research.google.com/github/srinath2022/enhanced-SLAM/blob/master/sentiment_analysis_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# define model.
import math
from typing import Tuple

import torch
from torch import nn, Tensor
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset
from gensim.models import Word2Vec
import csv

from torch.utils.data import Dataset, DataLoader
import os
import pandas as pd
import random
from torchvision import transforms, utils
import numpy as np


In [3]:
# Constants
root = '/content/drive/MyDrive/CS260CDeepLearning/DeepLearningProject'
preprocessOutputFolder="/preprocessOutput"
tokenizedTweetsFileTrain   = root+preprocessOutputFolder+"/tokenizedTweetsTrain.csv"
tokenizedTweetsLabelFileTrain  = root+preprocessOutputFolder+"/tokenizedTweetsLabelTrain.csv"
tokenizedTweetsFileTest   = root+preprocessOutputFolder+"/tokenizedTweetsTest.csv"
tokenizedTweetsLabelFileTest  = root+preprocessOutputFolder+"/tokenizedTweetsLabelTest.csv"
word2VecModelFile     = root+preprocessOutputFolder+"/word2VecModel"
embeddingVectorsFile  = root+preprocessOutputFolder+"/embeddingVectors.txt"

SEQUENCE_LENGTH=150
EMBEDDING_VECTOR_DIMENSION = 512

In [4]:
# dataFrame = pd.read_csv(tokenizedTweetsFile)
# print(dataFrame.head(5))

with open(tokenizedTweetsFileTrain) as f:
    my_csv_data = list(csv.reader(f))
print(my_csv_data[0])

['http://twitpic.com/2y1zl', '-', 'awww', ',', "that's", 'a', 'bummer', '.', 'you', 'shoulda', 'got', 'david', 'carr', 'of', 'third', 'day', 'to', 'do', 'it', '.', ';D']


In [5]:
# Load tokenized training data
# Train word embeddings
csvInput  = open(tokenizedTweetsFileTrain, 'r')
csvReader = csv.reader(csvInput)
wordModel = Word2Vec(sentences=csvReader, size=EMBEDDING_VECTOR_DIMENSION, window=5, min_count=5, workers=4) # why is vector_size not allowed here?
csvInput.close()
wordVectors = wordModel.wv


In [6]:
from torch._C import double
# Create TRAIN DataSet
# Use tokenized trining data to create train data of shape N*L*D
# N -> Number of training samples
# L -> Sequence length
# D -> Emvedding vector dimention

class TwitterDataset(Dataset):

    def __init__(self, data_csv_file, label_csv_file, wordVectors, SEQUENCE_LENGTH, EMBEDDING_VECTOR_DIMENSION, transform=None):
        # chunksize = 10 ** 6
        # self.train_data_frame = pd.read_csv(csv_file)
        with open(data_csv_file) as f:
          self.train_data_frame = list(csv.reader(f))

        with open(label_csv_file) as f:
          self.label_data_frame = list(csv.reader(f))

        self.transform = transform

        self.wordVectors = wordVectors
        self.SEQUENCE_LENGTH = SEQUENCE_LENGTH
        self.EMBEDDING_VECTOR_DIMENSION = EMBEDDING_VECTOR_DIMENSION

    def __len__(self):
        return len(self.train_data_frame)

    def __getitem__(self, idx):
        sentence = self.train_data_frame[idx]
        sentenceVectorEmbedding = self.__getSentenceVectorEmbedding__(sentence)
        if sentenceVectorEmbedding is None:
          return self.__getitem__(random.randint(0, len(self.train_data_frame)-1))

        label = self.label_data_frame[idx]
        label = [int(x) for x in label]
        label = np.array(label)
        sample = {'sentence': sentenceVectorEmbedding, 'label': label}

        if self.transform:
            sample = self.transform(sample)

        return sample

    def __getSentenceVectorEmbedding__(self, sentence):
        train_i=[]
        for word in sentence:
          if word in self.wordVectors:
            embeddingVector = self.wordVectors[word]
            train_i.append(embeddingVector)
          else:
            continue

        curr_word_count = len(train_i)
        if curr_word_count>self.SEQUENCE_LENGTH:
          print("Input contains sequence of length greater than SEQUENCE_LENGTH. Consider a larger sequence length")
          return None
        
        if curr_word_count<self.SEQUENCE_LENGTH:
          ZERO_VECTOR = [0]*self.EMBEDDING_VECTOR_DIMENSION
          while curr_word_count!=self.SEQUENCE_LENGTH:
            train_i.append(ZERO_VECTOR)
            curr_word_count += 1

        return np.array(train_i)

class ToTensor(object):
    """Convert ndarrays in sample to Tensors."""

    def __call__(self, sample):
        sentenceVectorEmbedding, label = sample['sentence'], sample['label']
        return {'sentence': torch.from_numpy(sentenceVectorEmbedding),
                'label': torch.from_numpy(label)}


In [7]:
batchSize=32
my_transforms = [ToTensor()]
train_dataset = TwitterDataset(
                                  data_csv_file=tokenizedTweetsFileTrain,
                                  label_csv_file=tokenizedTweetsLabelFileTrain,
                                  wordVectors=wordVectors, 
                                  SEQUENCE_LENGTH=SEQUENCE_LENGTH, 
                                  EMBEDDING_VECTOR_DIMENSION=EMBEDDING_VECTOR_DIMENSION,
                                  transform=transforms.Compose(my_transforms)
                              )

test_dataset = TwitterDataset(
                                  data_csv_file=tokenizedTweetsFileTest,
                                  label_csv_file=tokenizedTweetsLabelFileTest,
                                  wordVectors=wordVectors, 
                                  SEQUENCE_LENGTH=SEQUENCE_LENGTH, 
                                  EMBEDDING_VECTOR_DIMENSION=EMBEDDING_VECTOR_DIMENSION,
                                  transform=transforms.Compose(my_transforms)
                              )

train_dataloader  = DataLoader(train_dataset, batch_size=batchSize, shuffle=True, num_workers=2)
test_dataloader   = DataLoader(test_dataset,  batch_size=batchSize, shuffle=True, num_workers=2)

In [8]:
# Test a sample from dataloader
dataiter = iter(train_dataloader)
sample = dataiter.next()
sentenceEmbeddingVectors = sample['sentence']
labels = sample['label']
print(sentenceEmbeddingVectors.shape)
sentenceEmbeddingVectors = torch.swapaxes(sentenceEmbeddingVectors, 0, 1)
print(sentenceEmbeddingVectors.shape)
print(labels.shape)

torch.Size([32, 150, 512])
torch.Size([150, 32, 512])
torch.Size([32, 1])


In [9]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [10]:
class TransformerModel(nn.Module):

    def __init__(self, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, n_class: int = 2, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        #self.encoder = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        self.n_class = n_class
        #will implement a NN model here
        self.classifier = nn.Linear(d_model, n_class)

        self.init_weights()

    def init_weights(self) -> None:
        initrange = 0.1
        # self.encoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: Tensor, src_mask: Tensor) -> Tensor:
        """
        Args:
            src: Tensor, shape [seq_len, batch_size]
            src_mask: Tensor, shape [seq_len, seq_len]

        Returns:
            output Tensor of shape [seq_len, batch_size, ntoken]
        """
        #src = self.encoder(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, src_mask)
        output = self.classifier(output)
        return output


def generate_square_subsequent_mask(sz: int) -> Tensor:
    """Generates an upper-triangular matrix of -inf, with zeros on diag."""
    return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)

In [11]:
# create a model
emsize = 512  # embedding dimension
d_hid = 200  # dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2  # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 8  # number of heads in nn.MultiheadAttention
dropout = 0.2  # dropout probability
n_class = 2
model = TransformerModel(emsize, nhead, d_hid, nlayers, n_class, dropout)

In [20]:
# New Training code to accomodate dataloader
import copy
import time

criterion = nn.CrossEntropyLoss()
lr = 5  # learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

def train(model: nn.Module, dataloader: DataLoader) -> None:
    model.train()  # turn on train mode
    total_loss = 0.
    log_interval = 200
    start_time = time.time()
    src_mask = generate_square_subsequent_mask(SEQUENCE_LENGTH)

    # num_batches = len(train_data) // bptt
    total_batches = len(dataloader)
    for i_batch, sample_batch in enumerate(dataloader):
        data, targets = sample_batch['sentence'], sample_batch['label']
        data = torch.swapaxes(data, 0, 1) # To convert it to L*N*D

        output = model(data, src_mask)
        loss = criterion(output, targets)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        if i_batch % log_interval == 0 and i_batch > 0:
            lr = scheduler.get_last_lr()[0]
            ms_per_batch = (time.time() - start_time) * 1000 / log_interval
            cur_loss = total_loss / log_interval
            ppl = math.exp(cur_loss)
            print(f'| epoch {epoch:3d} | {i_batch:5d}/{total_batches:5d} batches | '
                  f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
                  f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
            total_loss = 0
            start_time = time.time()

def evaluate(model: nn.Module, dataloader: DataLoader) -> float:
    model.eval()  # turn on evaluation mode
    total_loss = 0.
    src_mask = generate_square_subsequent_mask(bptt)
    with torch.no_grad():
        for i_batch, sample_batch in enumerate(dataloader):
          data, targets = sample_batch['sentence'], sample_batch['label']
          batch_size = data.size(0)
          data = torch.swapaxes(data, 0, 1) # To convert it to L*N*D
          output = model(data, src_mask)
          total_loss += batch_size * criterion(output, targets).item()

    return total_loss / (len(dataloader) - 1)

In [21]:
best_val_loss = float('inf')
epochs = 3
best_model = None

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train(model, train_dataloader)
    val_loss = evaluate(model, test_dataloader)
    val_ppl = math.exp(val_loss)
    elapsed = time.time() - epoch_start_time
    print('-' * 89)
    print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
          f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}')
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = copy.deepcopy(model)

    scheduler.step()

RuntimeError: ignored

In [None]:
# Not necessary anymore: TO-DO: Delete this cell
# bptt = 35 #sequence length
# def get_batch(source: Tensor, i: int) -> Tuple[Tensor, Tensor]:
#     """
#     Args:
#         source: Tensor, shape [full_seq_len, batch_size]
#         i: int

#     Returns:
#         tuple (data, target), where data has shape [seq_len, batch_size] and
#         target has shape [seq_len * batch_size]
#     """
#     seq_len = min(bptt, len(source) - 1 - i)
#     data = source[i:i+seq_len]
#     target = source[i+1:i+1+seq_len].reshape(-1)
#     return data, target

In [None]:
# import copy
# import time

# criterion = nn.CrossEntropyLoss()
# lr = 5  # learning rate
# optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

# def train(model: nn.Module) -> None:
#     model.train()  # turn on train mode
#     total_loss = 0.
#     log_interval = 200
#     start_time = time.time()
#     src_mask = generate_square_subsequent_mask(bptt)

#     num_batches = len(train_data) // bptt
#     for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
#         data, targets = get_batch(train_data, i)
#         batch_size = data.size(0)
#         if batch_size != bptt:  # only on last batch
#             src_mask = src_mask[:batch_size, :batch_size]
#         output = model(data, src_mask)
#         loss = criterion(output, targets)

#         optimizer.zero_grad()
#         loss.backward()
#         torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
#         optimizer.step()

#         total_loss += loss.item()
#         if batch % log_interval == 0 and batch > 0:
#             lr = scheduler.get_last_lr()[0]
#             ms_per_batch = (time.time() - start_time) * 1000 / log_interval
#             cur_loss = total_loss / log_interval
#             ppl = math.exp(cur_loss)
#             print(f'| epoch {epoch:3d} | {batch:5d}/{num_batches:5d} batches | '
#                   f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
#                   f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
#             total_loss = 0
#             start_time = time.time()

# def evaluate(model: nn.Module, eval_data: Tensor) -> float:
#     model.eval()  # turn on evaluation mode
#     total_loss = 0.
#     src_mask = generate_square_subsequent_mask(bptt)
#     with torch.no_grad():
#         for i in range(0, eval_data.size(0) - 1, bptt):
#             data, targets = get_batch(eval_data, i)
#             batch_size = data.size(0)
#             if batch_size != bptt:
#                 src_mask = src_mask[:batch_size, :batch_size]
#             output = model(data, src_mask)
#             total_loss += batch_size * criterion(output, targets).item()
#     return total_loss / (len(eval_data) - 1)