<a href="https://colab.research.google.com/github/srinath2022/enhanced-SLAM/blob/master/sentiment_analysis_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
# define model.
import math
from typing import Tuple

import torch
from torch import nn, Tensor
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset
from gensim.models import Word2Vec
import csv

from torch.utils.data import Dataset, DataLoader
import os
import pandas as pd
import random
from torchvision import transforms, utils
import numpy as np


In [13]:
# Constants
root = '/content/drive/MyDrive/CS260CDeepLearning/DeepLearningProject'
preprocessOutputFolder="/preprocessOutput"
tokenizedTweetsFile   = root+preprocessOutputFolder+"/tokenizedTweets.csv"
tokenizedTweetsLabelFile  = root+preprocessOutputFolder+"/tokenizedTweetsLabel.csv"
word2VecModelFile     = root+preprocessOutputFolder+"/word2VecModel"
embeddingVectorsFile  = root+preprocessOutputFolder+"/embeddingVectors.txt"

SEQUENCE_LENGTH=150
EMBEDDING_VECTOR_DIMENSION = 512

In [15]:
dataFrame = pd.read_csv(tokenizedTweetsFile)
print(dataFrame.head(5))

ParserError: ignored

In [14]:
# Load tokenized training data
# Train word embeddings
csvInput  = open(tokenizedTweetsFile, 'r')
csvReader = csv.reader(csvInput)
wordModel = Word2Vec(sentences=csvReader, size=EMBEDDING_VECTOR_DIMENSION, window=5, min_count=5, workers=4) # why is vector_size not allowed here?
csvInput.close()
wordVectors = wordModel.wv


KeyboardInterrupt: ignored

In [10]:
# Create TRAIN DataSet
# Use tokenized trining data to create train data of shape N*L*D
# N -> Number of training samples
# L -> Sequence length
# D -> Emvedding vector dimention

# Pandas Large File Read CSV Error
# https://stackoverflow.com/questions/25962114/how-do-i-read-a-large-csv-file-with-pandas

class TwitterDataset(Dataset):

    def __init__(self, csv_file, wordVectors, SEQUENCE_LENGTH, EMBEDDING_VECTOR_DIMENSION, transform=None):
        # chunksize = 10 ** 6
        self.train_data_frame = pd.read_csv(csv_file)
        self.transform = transform

        self.wordVectors = wordVectors
        self.SEQUENCE_LENGTH = SEQUENCE_LENGTH
        self.EMBEDDING_VECTOR_DIMENSION = EMBEDDING_VECTOR_DIMENSION

    def __len__(self):
        return len(self.train_data_frame)

    def __getitem__(self, idx):
        label = self.train_data_frame.iloc[idx, 0]
        sentence = self.train_data_frame.iloc[idx, 1:]
        sentenceVectorEmbedding = self.__getSentenceVectorEmbedding__(sentence)
        if sentenceVectorEmbedding is None:
          return self.__getitem__(random.randint(0, len(self.train_data_frame)-1))

        sample = {'sentence': sentenceVectorEmbedding, 'label': label}

        if self.transform:
            sample = self.transform(sample)

        return sample

    def __getSentenceVectorEmbedding__(self, sentence):
        train_i=[]
        for word in sentence:
          if word in self.wordVectors:
            embeddingVector = self.wordVectors[word]
            train_i.append(embeddingVector)
          else:
            continue

        curr_word_count = len(train_i)
        if curr_word_count>self.SEQUENCE_LENGTH:
          print("Input contains sequence of length greater than SEQUENCE_LENGTH. Consider a larger sequence length")
          return None
        
        if curr_word_count<self.SEQUENCE_LENGTH:
          ZERO_VECTOR = [0]*self.EMBEDDING_VECTOR_DIMENSION
          while curr_word_count!=self.SEQUENCE_LENGTH:
            train_i.append(ZERO_VECTOR)
            curr_word_count += 1

        return np.array(train_i)

class ToTensor(object):
    """Convert ndarrays in sample to Tensors."""

    def __call__(self, sample):
        sentenceVectorEmbedding, label = sample['sentence'], sample['label']

        return {'sentence': torch.from_numpy(sentenceVectorEmbedding),
                'label': torch.from_numpy(label)}


In [8]:
batchSize=32
my_transforms = [ToTensor()]
train_dataset = TwitterDataset(
                                  csv_file=tokenizedTweetsFile,
                                  wordVectors=wordVectors, 
                                  SEQUENCE_LENGTH=SEQUENCE_LENGTH, 
                                  EMBEDDING_VECTOR_DIMENSION=EMBEDDING_VECTOR_DIMENSION,
                                  transform=transforms.Compose(my_transforms)
                              )
dataloader = DataLoader(train_dataset, batch_size=batchSize, shuffle=True, num_workers=4)

ParserError: ignored

In [None]:
# Test a sample from dataloader
dataiter = iter(dataloader)
sample = dataiter.next()
sentenceEmbeddingVectors = sample['sentence']
labels = sample['label']
print(sentenceEmbeddingVectors.shape)
print(labels.shape)

In [17]:
def batchify(data: Tensor, bsz: int) -> Tensor:
    """Divides the data into bsz separate sequences, removing extra elements
    that wouldn't cleanly fit.

    Args:
        data: Tensor, shape [N]
        bsz: int, batch size

    Returns:
        Tensor of shape [N // bsz, bsz]
    """
    seq_len = data.size(0) // bsz
    print(seq_len)
    data = data[:seq_len * bsz]
    data = data.view(bsz, seq_len).t().contiguous()
    return data

In [20]:
a=torch.randn(10)
print(a)
print("========")
data=batchify(a,4)
print(data)

tensor([-0.0990, -0.0761, -1.2792,  1.2983, -0.7822,  0.4490,  0.7400, -1.0694,
        -1.1487,  0.8761])
2
tensor([[-0.0990, -1.2792, -0.7822,  0.7400],
        [-0.0761,  1.2983,  0.4490, -1.0694]])


In [None]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [None]:
class TransformerModel(nn.Module):

    def __init__(self, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, n_class: int = 2, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        #self.encoder = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        self.n_class = n_class
        #will implement a NN model here
        self.classifier = nn.Linear(d_model, n_class)

        self.init_weights()

    def init_weights(self) -> None:
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: Tensor, src_mask: Tensor) -> Tensor:
        """
        Args:
            src: Tensor, shape [seq_len, batch_size]
            src_mask: Tensor, shape [seq_len, seq_len]

        Returns:
            output Tensor of shape [seq_len, batch_size, ntoken]
        """
        #src = self.encoder(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, src_mask)
        output = self.classifier(output)
        return output


def generate_square_subsequent_mask(sz: int) -> Tensor:
    """Generates an upper-triangular matrix of -inf, with zeros on diag."""
    return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)

In [None]:
# create a model
emsize = EMBEDDING_VECTOR_DIMENSION  # embedding dimension
d_hid = 200  # dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2  # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 5  # number of heads in nn.MultiheadAttention
dropout = 0.2  # dropout probability
n_class = 2
model = TransformerModel(emsize, nhead, d_hid, nlayers, n_class, dropout)

In [None]:
bptt = 35 #sequence length
def get_batch(source: Tensor, i: int) -> Tuple[Tensor, Tensor]:
    """
    Args:
        source: Tensor, shape [full_seq_len, batch_size]
        i: int

    Returns:
        tuple (data, target), where data has shape [seq_len, batch_size] and
        target has shape [seq_len * batch_size]
    """
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].reshape(-1)
    return data, target

In [None]:
import copy
import time

criterion = nn.CrossEntropyLoss()
lr = 5  # learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

def train(model: nn.Module, dataloader: DataLoader) -> None:
    model.train()  # turn on train mode
    total_loss = 0.
    log_interval = 200
    start_time = time.time()
    src_mask = generate_square_subsequent_mask(bptt)

    num_batches = len(train_data) // bptt
    for batch, i in enumerate(dataloader):
        data, targets = get_batch(train_data, i)
        batch_size = data.size(0)
        if batch_size != bptt:  # only on last batch
            src_mask = src_mask[:batch_size, :batch_size]
        output = model(data, src_mask)
        loss = criterion(output, targets)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        if batch % log_interval == 0 and batch > 0:
            lr = scheduler.get_last_lr()[0]
            ms_per_batch = (time.time() - start_time) * 1000 / log_interval
            cur_loss = total_loss / log_interval
            ppl = math.exp(cur_loss)
            print(f'| epoch {epoch:3d} | {batch:5d}/{num_batches:5d} batches | '
                  f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
                  f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
            total_loss = 0
            start_time = time.time()

def evaluate(model: nn.Module, eval_data: Tensor) -> float:
    model.eval()  # turn on evaluation mode
    total_loss = 0.
    src_mask = generate_square_subsequent_mask(bptt)
    with torch.no_grad():
        for i in range(0, eval_data.size(0) - 1, bptt):
            data, targets = get_batch(eval_data, i)
            batch_size = data.size(0)
            if batch_size != bptt:
                src_mask = src_mask[:batch_size, :batch_size]
            output = model(data, src_mask)
            total_loss += batch_size * criterion(output, targets).item()
    return total_loss / (len(eval_data) - 1)

In [None]:
best_val_loss = float('inf')
epochs = 3
best_model = None

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train(model)
    val_loss = evaluate(model, val_data)
    val_ppl = math.exp(val_loss)
    elapsed = time.time() - epoch_start_time
    print('-' * 89)
    print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
          f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}')
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = copy.deepcopy(model)

    scheduler.step()

NameError: ignored

In [42]:
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

train_iter = WikiText2(split='train')
tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=['<unk>'])
vocab.set_default_index(vocab['<unk>'])

def data_process(raw_text_iter: dataset.IterableDataset) -> Tensor:
    """Converts raw text into a flat Tensor."""
    # i=10
    # itm=None
    # while i>=0:
    #   itm=next(raw_text_iter)
    #   print(itm)
    #   print(tokenizer(itm))
    #   print(vocab(tokenizer(itm)))
    #   print("==============================")
    #   i = i-1
    # return torch.randn(10)
    data = [torch.tensor(vocab(tokenizer(item)), dtype=torch.long) for item in raw_text_iter]
    # print(len(data))
    smd = data[0:4]
    print(smd)
    print("==============================")
    print(torch.cat(tuple(filter(lambda t: t.numel() > 0, smd))))
    return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))

# train_iter was "consumed" by the process of building the vocab,
# so we have to create it again
train_iter, val_iter, test_iter = WikiText2()
train_data = data_process(train_iter)
# val_data = data_process(val_iter)
# test_data = data_process(test_iter)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def batchify(data: Tensor, bsz: int) -> Tensor:
    """Divides the data into bsz separate sequences, removing extra elements
    that wouldn't cleanly fit.

    Args:
        data: Tensor, shape [N]
        bsz: int, batch size

    Returns:
        Tensor of shape [N // bsz, bsz]
    """
    seq_len = data.size(0) // bsz
    data = data[:seq_len * bsz]
    data = data.view(bsz, seq_len).t().contiguous()
    return data.to(device)

batch_size = 20
eval_batch_size = 10
train_data = batchify(train_data, batch_size)  # shape [seq_len, batch_size]
# val_data = batchify(val_data, eval_batch_size)
# test_data = batchify(test_data, eval_batch_size)

[tensor([], dtype=torch.int64), tensor([   9, 3849, 3869,  881,    9]), tensor([], dtype=torch.int64), tensor([20000,    83,  3849,    88,     0,  3869,    21,   780, 28780,     2,
         6182,     3,  3849,     4,     1,  5023,    88,    20,     2,  1837,
         1018,     7,    14,  3849,  3869,   881,   629,   976,     2,    23,
            8,  5790,   299,    12,   575,   232,    67,   452,    19, 13722,
            5,   757,     3,  2500,    17,     1,  1767,  5637,     3,   155,
            6,   246,   354,     6,   976,     2,    24,    23,     1,   237,
           67,     6,     1,  3849,    93,     3,     0,     1,   156,  4419,
            4,  5790,     5,   729,    12,    58,  2096,    14,    43,  7075,
            2,     1,   333,  1085,  3218,     7,     1,    37,    67,     5,
         1694,     1, 11219,     2,     8, 19698,   313,  1063,  2082,     1,
         1702,     4, 19009,    56,     1,    95, 25357,   107,    52,  1938,
         1644,   288,   598,     5,    