<a href="https://colab.research.google.com/github/shreevigneshs/UCSC-243/blob/main/assignments/1_new.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from typing import List
import re
import string

import torch
import torchtext
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import numpy as np
import pandas as pd
from collections import Counter

from tqdm import tqdm

In [2]:
# df = pd.read_csv("/content/drive/MyDrive/NLP243/assignments/1/hw1_train-1.csv")
# df["split"] = "train"
# df.columns = ["id", "text", "labels", "split"]
# df.labels = df.labels.replace(np.nan, "none", regex=True)

In [3]:
# np.random.seed(20221002)
# train_proportion = 0.80
# val_proportion = 0.20

In [4]:
# item_list = list(row.to_dict() for _, row in df.iterrows())
# np.random.shuffle(item_list)

In [5]:
# # Create split data
# final_list = []

# n_total = len(item_list)
# n_train = int(train_proportion * n_total)
# n_val = int(val_proportion * n_total)

# # Give data point a split attribute
# for item in item_list[:n_train]:
#   item['split'] = 'train'
    
# for item in item_list[n_train:n_train+n_val+1]:
#   item['split'] = 'val'

# # Add to final list
# final_list.extend(item_list)

# final_df = pd.DataFrame(final_list)

In [6]:
# print(final_df)

In [7]:
# def preprocess_text(text):
#   text = text.lower()
#   text = re.sub(r"([.,!?])", r" \1 ", text)
#   text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
#   return text

# final_df.text = final_df.text.apply(preprocess_text)

In [8]:
# print(final_df.text)

In [9]:
# final_df.to_csv("/content/drive/MyDrive/NLP243/assignments/1/clean_hw1_train-1.csv")

In [10]:
class Vocabulary(object):
    """Class to process text and extract Vocabulary for mapping"""

    def __init__(self, token_to_idx=None, add_unk=True, unk_token="<unk>"):
        """
        Args:
            token_to_idx (dict): a pre-existing map of tokens to indices
            add_unk (bool): a flag that indicates whether to add the UNK token
            unk_token (str): the UNK token to add into the Vocabulary
        """

        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx

        self._idx_to_token = {idx: token 
                              for token, idx in self._token_to_idx.items()}

        self._add_unk = add_unk
        self._unk_token = unk_token
        
        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token(unk_token) 
        
        
    def to_serializable(self):
        """ returns a dictionary that can be serialized """
        return {'token_to_idx': self._token_to_idx, 
                'add_unk': self._add_unk, 
                'unk_token': self._unk_token}

    @classmethod
    def from_serializable(cls, contents):
        """ instantiates the Vocabulary from a serialized dictionary """
        return cls(**contents)

    def add_token(self, token):
        """Update mapping dicts based on the token.

        Args:
            token (str): the item to add into the Vocabulary
        Returns:
            index (int): the integer corresponding to the token
        """
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def lookup_token(self, token):
        """Retrieve the index associated with the token 
          or the UNK index if token isn't present.
        
        Args:
            token (str): the token to look up 
        Returns:
            index (int): the index corresponding to the token
        Notes:
            `unk_index` needs to be >=0 (having been added into the Vocabulary) 
              for the UNK functionality 
        """
        if self._add_unk:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

    def lookup_index(self, index):
        """Return the token associated with the index
        
        Args: 
            index (int): the index to look up
        Returns:
            token (str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the Vocabulary
        """
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)

In [11]:
class Vectorizer(object):
    """ The Vectorizer which coordinates the Vocabularies and puts them to use"""
    def __init__(self, text_vocab):
        """
        Args:
            review_vocab (Vocabulary): maps words to integers
            rating_vocab (Vocabulary): maps class labels to integers
        """
        self.text_vocab = text_vocab
        # self.label_vocab = label_vocab

    def vectorize(self, text):
        """Create a collapsed one-hit vector for the review
        
        Args:
            review (str): the review
        Returns:
            one_hot (np.ndarray): the collapsed one-hot encoding
        """
        one_hot = np.zeros(len(self.text_vocab), dtype=np.float32)
        
        for token in text.split(" "):
            if token not in string.punctuation:
                one_hot[self.text_vocab.lookup_token(token)] = 1

        return one_hot

    @classmethod
    def from_dataframe(cls, df, cutoff=100):
        """Instantiate the vectorizer from the dataset dataframe
        
        Args:
            review_df (pandas.DataFrame): the review dataset
            cutoff (int): the parameter for frequency-based filtering
        Returns:
            an instance of the ReviewVectorizer
        """
        text_vocab = Vocabulary(add_unk=True)
        
        # label_vocab = Vocabulary(add_unk=False)
        
        # # Add ratings
        # for rating in sorted(set(df.labels)):
        #     rating_vocab.add_token(rating)

        # Add top words if count > provided count
        word_counts = Counter()
        for text in df.text:
            for word in text.split(" "):
                if word not in string.punctuation:
                    word_counts[word] += 1
        
        print(word_counts.most_common(100))

        for word, count in dict(word_counts.most_common(cutoff)).items():
            text_vocab.add_token(word)
        
        # for word, count in word_counts.items():
        #     if count > cutoff:
        #         text_vocab.add_token(word)

        # return cls(review_vocab, rating_vocab)
        
        return cls(text_vocab)

    @classmethod
    def from_serializable(cls, contents):
        """Intantiate a ReviewVectorizer from a serializable dictionary
        
        Args:
            contents (dict): the serializable dictionary
        Returns:
            an instance of the ReviewVectorizer class
        """
        text_vocab = Vocabulary.from_serializable(contents['text_vocab'])
        
        # rating_vocab =  Vocabulary.from_serializable(contents['rating_vocab'])

        # return cls(review_vocab=review_vocab, rating_vocab=rating_vocab)

        return cls(text_vocab=text_vocab)

    def to_serializable(self):
        """Create the serializable dictionary for caching
        
        Returns:
            contents (dict): the serializable dictionary
        """
        # return {'review_vocab': self.review_vocab.to_serializable(),
        #         'rating_vocab': self.rating_vocab.to_serializable()}

        return {'text_vocab': self.text_vocab.to_serializable()}

In [12]:
# Label Indexer
# For converting labels into indices 0 - L1, 1 - L2

class LabelIndexer(object):
    def __init__(self, labels):

        # self.labels = list(set([_split_label for label in movie_df.labels.unique().flatten() for _split_label in label.split()]))
        
        self.labels = labels
        self.n_labels = len(self.labels)
        self.label2idx = {label: i for i, label in enumerate(self.labels)}
        self.idx2label = {i:label for label, i in self.label2idx.items()}

    def encode(self, y):
        one_hot_encoded = np.zeros(self.n_labels, dtype=int)
        normalized_label = list(set([_split_label for _split_label in y.split()]))
        for _label in normalized_label:
          one_hot_encoded[self.label2idx[_label]] = 1.
        return one_hot_encoded

    def encode_batch(self, ys):
        return torch.LongTensor([self.encode(y) for y in ys])
    
    @classmethod
    def from_dataframe(cls, df):
        labels = list(set([_split_label for label in df.labels.unique().flatten() for _split_label in label.split()]))
        return cls(labels)
    
    @classmethod
    def from_serializable(cls, contents):
        """Intantiate a ReviewVectorizer from a serializable dictionary
        
        Args:
            contents (dict): the serializable dictionary
        Returns:
            an instance of the ReviewVectorizer class
        """
        labels = LabelIndexer.from_serializable(contents['labels'])
        
        # rating_vocab =  Vocabulary.from_serializable(contents['rating_vocab'])

        # return cls(review_vocab=review_vocab, rating_vocab=rating_vocab)

        return cls(labels=labels)

    def to_serializable(self):
        """Create the serializable dictionary for caching
        
        Returns:
            contents (dict): the serializable dictionary
        """
        # return {'review_vocab': self.review_vocab.to_serializable(),
        #         'rating_vocab': self.rating_vocab.to_serializable()}

        return {'labels': self.labels.to_serializable()}


In [13]:
class MovieDataset(Dataset):
    def __init__(self, movie_df, vectorizer, label_indexer):
        """
        Args:
            movie_df (pandas.DataFrame): the dataset
            vectorizer (ReviewVectorizer): vectorizer instantiated from dataset
        """
        self.movie_df = movie_df
        self._vectorizer = vectorizer
        self._label_indexer = label_indexer

        self.train_df = self.movie_df[self.movie_df.split=='train']
        self.train_size = len(self.train_df)

        self.val_df = self.movie_df[self.movie_df.split=='val']
        self.validation_size = len(self.val_df)

        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val': (self.val_df, self.validation_size)}

        self.set_split('train')

    @classmethod
    def load_dataset_and_make_vectorizer_and_label_indexer(cls, movie_csv):
        """Load dataset and make a new vectorizer from scratch
        
        Args:
            review_csv (str): location of the dataset
        Returns:
            an instance of ReviewDataset
        """
        movie_df = pd.read_csv(movie_csv)



        return cls(movie_df, Vectorizer.from_dataframe(movie_df), LabelIndexer.from_dataframe(movie_df))

    def get_vectorizer(self):
        """ returns the vectorizer """
        return self._vectorizer

    def set_split(self, split="train"):
        """ selects the splits in the dataset using a column in the dataframe 
        
        Args:
            split (str): one of "train" or "val"
        """
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        """the primary entry point method for PyTorch datasets
        
        Args:
            index (int): the index to the data point 
        Returns:
            a dict of the data point's features (x_data) and label (y_target)
        """
        row = self._target_df.iloc[index]

        text_vector = \
            self._vectorizer.vectorize(row.text)

        label_index = \
            self._label_indexer.encode(row.labels)

        return {'x_data': text_vector,
                'y_target': label_index}

    def get_num_batches(self, batch_size):
        """Given a batch size, return the number of batches in the dataset
        
        Args:
            batch_size (int)
        Returns:
            number of batches in the dataset
        """
        return len(self) // batch_size

In [14]:
dataset = MovieDataset.load_dataset_and_make_vectorizer_and_label_indexer(movie_csv="/content/drive/MyDrive/NLP243/assignments/1/clean_hw1_train-1.csv")
vectorizer = dataset.get_vectorizer()

[('the', 919), ('movies', 873), ('of', 422), ('movie', 407), ('me', 385), ('in', 377), ('what', 351), ('show', 329), ('for', 318), ('find', 238), ('who', 236), ('i', 233), ('is', 215), ('to', 194), ('by', 177), ('list', 173), ('a', 167), ('was', 160), ('produced', 156), ('about', 141), ('directed', 139), ('are', 104), ('want', 100), ('with', 97), ('all', 95), ('can', 93), ('director', 93), ('information', 87), ('when', 86), ('films', 84), ('will', 81), ('did', 78), ('see', 77), ('search', 77), ('you', 76), ('rated', 75), ('made', 73), ('spielberg', 72), ('rating', 70), ('that', 70), ('released', 68), ('how', 68), ('from', 68), ('avatar', 66), ('on', 65), ('pg', 64), ('finding', 64), ('nemo', 64), ('s', 64), ('has', 60), ('please', 60), ('language', 57), ('look', 57), ('budget', 53), ('producer', 51), ('like', 49), ('some', 48), ('ferrell', 47), ('up', 47), ('tom', 46), ('cast', 46), ('steven', 45), ('would', 44), ('get', 43), ('dancing', 43), ('date', 42), ('info', 42), ('have', 42), (

In [15]:
# data loader

def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"):
    """
    A generator function which wraps the PyTorch DataLoader. It will 
      ensure each tensor is on the write device location.
    """
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

In [16]:
batched_data = generate_batches(dataset=dataset, batch_size=2)

In [17]:
# Zipping the dataloader with range(N) lets us only print the first N batches
for _, batch in zip(range(5), batched_data):
    print(batch["x_data"].shape, batch["y_target"].shape)
    # Do something here; maybe print the batch to see if it looks right to you?
    # print(batch.shape)

torch.Size([2, 101]) torch.Size([2, 19])
torch.Size([2, 101]) torch.Size([2, 19])
torch.Size([2, 101]) torch.Size([2, 19])
torch.Size([2, 101]) torch.Size([2, 19])
torch.Size([2, 101]) torch.Size([2, 19])
