**Data Preprocessing** <br/>
This notebook tokenizes the IMDB reviews into phrases, builds the vocabulary with phrase embeddings, and saves the vocabulary and the train, validation and test data for subsequent use in the project.

In [None]:
import random

import nltk
import numpy as np
import spacy
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
from torch.utils.data import Dataset,DataLoader

nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
def extract_phrases(my_tree, phrase):
  """
  Collect every subtree of a parse tree whose label is one of the requested labels.

  The tree is walked depth-first in pre-order: the node itself is checked
  first, then each of its child subtrees from left to right.

  Parameters
  ----------
  my_tree: The parse tree (or subtree) to search. It must provide label() and
           copy(), and iterating over it must yield its children.
  phrase: Container of node labels to keep (for example ['NP', 'VB']).
          Membership is tested with `in`.

  Returns
  -------
  a list with a copy of every matching subtree, in pre-order
  """
  my_phrases = []
  if my_tree.label() in phrase:
    # copy(True) requests a deep copy, so the result does not alias the input tree.
    my_phrases.append(my_tree.copy(True))

  for child in my_tree:
    # Leaves are not trees and are skipped. isinstance (instead of an exact
    # type comparison) also descends into subclasses of nltk.Tree.
    if isinstance(child, nltk.Tree):
      my_phrases.extend(extract_phrases(child, phrase))

  return my_phrases


def custom_tokenizer(text):
  """
  Turn a sentence into phrase-level tokens.

  The text is word-tokenized and POS-tagged with nltk, chunked with the
  regular-expression grammar below, and every chunk whose label is in the
  kept list becomes one token: the chunk's words joined with '_'.

  Parameters
  ----------
  text: The sentence to be tokenized

  Returns
  -------
  a list of tokens (strings), one per extracted chunk
  """
  chunk_grammar = """NP: {<RB>*<DT>?(<JJ>|<JJS>|<JJR>)*(<NN>|<NNP>|<NNS>)+}
               RBJJ:{(<RB>|<RBR>|<RBS>)+(<JJ>|<JJS>|<JJR>)+}
               JJ: {<JJ>}
               JJS: {<JJS>}
               JJR: {<JJR>}
               VB: {<VB>}
               VBG: {<VBG>}
               VBN: {<VBN>}
               VBP: {<VBP>}
               VBZ: {<VBZ>}
               VBD: {<VBD>}
               MD: {<MD>}
               RB: {<RB>}
               RBR: {<RBR>}
               RBS: {<RBS>}
               PRP: {<PRP>}
               IN: {<IN>}
               CC: {<CC>}
                """
  # Chunk labels that are turned into tokens. VBG and MD have grammar rules
  # above but are not listed here, so those chunks produce no token.
  # NOTE(review): confirm that leaving out VBG and MD is intentional.
  kept_labels = ['NP','VBD','IN','VB','VBN','VBP','VBZ','RBR','RB','RBS','PRP','JJ','JJS','JJR','RBJJ','CC']

  chunker = nltk.RegexpParser(chunk_grammar)
  tagged_words = nltk.pos_tag(nltk.tokenize.word_tokenize(text))
  chunks = extract_phrases(chunker.parse(tagged_words), kept_labels)

  # Each leaf is indexed at [0] to take the word; the words of one chunk are joined with '_'.
  return ["_".join([leaf[0] for leaf in chunk.leaves()]) for chunk in chunks]

In [None]:
# Review text is tokenized into phrase tokens by custom_tokenizer; lower=True
# additionally asks the Field to lowercase the text.
TEXT = torchtext.data.Field(
    tokenize=custom_tokenizer,
    lower=True,
)
# Sentiment labels are represented with dtype torch.float.
LABEL = torchtext.data.LabelField(dtype=torch.float)

In [None]:
# Seed the shuffle used by split() so the saved train/val/test files are the
# same on every run of the notebook.
SPLIT_SEED = 1234
random.seed(SPLIT_SEED)
split_state = random.getstate()

# skip_header drops the CSV header row; the two fields are named 'review' and 'sentiment'.
imdb_data = torchtext.data.TabularDataset(
    path='IMDB Dataset.csv',
    format="CSV",
    fields=[('review',TEXT),('sentiment',LABEL)],
    skip_header = True
)

# 80% of the examples go to training; the remaining 20% is halved into
# validation and test.
train_data,val_test_data = imdb_data.split(0.8, random_state=split_state)
val_data,test_data = val_test_data.split(0.5, random_state=split_state)

In [None]:
# Both vocabularies are built from the training split only. The text vocabulary
# is capped at 200000 entries and loads the "glove.6B.100d" vectors.
TEXT.build_vocab(
    train_data,
    max_size=200000,
    vectors="glove.6B.100d",
)
LABEL.build_vocab(train_data)

In [None]:
# Vector matrix attached to the vocabulary by build_vocab(..., vectors="glove.6B.100d").
weights = TEXT.vocab.vectors

In [None]:
def create_embeddings(word2idx, weights, embeddingbag):
  """
  Build an embedding matrix in which every '_'-joined phrase gets a pooled vector.

  Parameters
  ----------
  word2idx: Mapping from vocabulary entry to its row index in the matrix.
  weights: Tensor of word vectors, one row per vocabulary entry. It is left
           unmodified; the result is a detached clone.
  embeddingbag: Callable (an EmbeddingBag instance in this notebook) that takes
                a LongTensor of shape (1, n) of row indices and returns the
                pooled vector for those rows.

  Returns
  -------
  a copy of `weights` in which the row of every entry containing '_' is
  replaced by the embeddingbag output for the entry's '_'-separated parts.
  """
  phrase_weights = weights.detach().clone()

  # Iterate over a snapshot of the items: the lookups below go through
  # word2idx[...] and may add keys if the mapping creates defaults on access.
  for entry, row in list(word2idx.items()):
    if '_' not in entry:
      continue
    part_ids = [word2idx[part] for part in entry.split('_')]
    # A single bag holding all parts of the phrase.
    phrase_weights[row] = embeddingbag(torch.LongTensor([part_ids]))

  return phrase_weights

In [None]:
# Frozen EmbeddingBag over the vocabulary vectors; mode="mean" and freeze=True
# are the library defaults, spelled out here to show that phrase vectors are averages.
embeddingbag = nn.EmbeddingBag.from_pretrained(
    weights,
    freeze=True,
    mode="mean",
)
modified_embeddings = create_embeddings(
    word2idx=TEXT.vocab.stoi,
    weights=weights,
    embeddingbag=embeddingbag,
)

In [None]:
# Install the phrase-aware matrix as the vocabulary's vectors. The vector width
# is read from the matrix instead of being hard-coded to 100, so this cell keeps
# working if a GloVe file of a different dimension is used.
TEXT.vocab.set_vectors(
    TEXT.vocab.stoi,
    modified_embeddings,
    modified_embeddings.shape[1],
)

**Saving the vocabulary and the data for subsequent use.**

In [None]:
import dill

In [None]:
# Serialize the TEXT field object with dill.
with open("/content/vocab", "wb") as vocab_file:
  dill.dump(TEXT, vocab_file)

In [None]:
# Serialize the LABEL field object with dill.
with open("/content/label", "wb") as label_file:
  dill.dump(LABEL, label_file)

In [None]:
# Round-trip check: read back the file written above and make sure the restored
# field carries the same vocabulary. dill.load can execute arbitrary code, so it
# should only be used on files produced by this project.
with open("/content/vocab", "rb") as vocab_file:
  TEXT1 = dill.load(vocab_file)

assert TEXT1.vocab.itos == TEXT.vocab.itos, "reloaded vocabulary differs from the saved one"

In [None]:
# Only the list of examples is written; the field definitions are saved separately.
with open("/content/train_data", "wb") as train_file:
  dill.dump(train_data.examples, train_file)

In [None]:
# Serialize the fields attribute of the training dataset.
with open("/content/fields", "wb") as fields_file:
  dill.dump(train_data.fields, fields_file)

In [None]:
# Serialize the examples of the validation split.
with open("/content/val_data", "wb") as val_file:
  dill.dump(val_data.examples, val_file)

In [None]:
# Serialize the examples of the test split.
with open("/content/test_data", "wb") as test_file:
  dill.dump(test_data.examples, test_file)