# Sources
* [YT](https://www.youtube.com/watch?v=KRgq4VnCr7I&t=114s)
* [YT-code](https://github.com/aladdinpersson/Machine-Learning-Collection/blob/master/ML/Pytorch/more_advanced/torchtext/torchtext_tutorial1.py)
* [BucketIterator](https://gmihaila.medium.com/better-batches-with-pytorchtext-bucketiterator-12804a545e2a)
* [BucketIterator](https://github.com/gmihaila/ml_things/blob/master/notebooks/pytorch/pytorchtext_bucketiterator.ipynb)
* [PyTorch-Official-Tutorial](https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html)
* [Variable sentence length](https://medium.com/@sonicboom8/sentiment-analysis-with-variable-length-sequences-in-pytorch-6241635ae130)
* [torchtext.data.Field](https://pytorch.org/text/_modules/torchtext/data/field.html)
* [torchtext.data.TabularDataset](https://pytorch.org/text/_modules/torchtext/data/dataset.html)
* [tochtext.datasets.translation](https://pytorch.org/text/_modules/torchtext/datasets/translation.html)
* [torchtext.data.BucketIterator](https://torchtext.readthedocs.io/en/latest/data.html#torchtext.data.BucketIterator)
* [Sentiment-analysis-transformer](https://towardsdatascience.com/fine-grained-sentiment-analysis-part-3-fine-tuning-transformers-1ae6574f25a6)
* [Kaggle-IMDB dataset](https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews)

# Imports

In [1]:
import torch, torchtext
import torch.nn as nn
import torch.optim as optim
import spacy, io, os, sys
import pandas as pd
import dill as pickle
from IPython.display import display, HTML
from torchtext.utils import unicode_csv_reader
import transformer.Constants as Constants
from torchtext.data import Field, TabularDataset, BucketIterator

# Some environment checks

In [2]:
print('Torchtext version:{}, Torch version:{}'.format(torchtext.__version__, torch.__version__))
print('Is CUDA available:{}'.format(torch.cuda.is_available()))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('Device is{}'.format(device))

Torchtext version:0.8.0, Torch version:1.7.1
Is CUDA available:True
Device iscuda:0


# Setting up the tokenization for the language model

## Some constants

In [3]:
MIN_FREQ = 3
MAX_LEN = 256

## Load a pre-built tokenization model

In [4]:
src_lang_model = spacy.load('en')

In [18]:
type(src_lang_model)

spacy.lang.en.English

In [20]:
vars(src_lang_model)

{'_meta': {'lang': 'en',
  'name': 'core_web_sm',
  'license': 'MIT',
  'author': 'Explosion',
  'url': 'https://explosion.ai',
  'email': 'contact@explosion.ai',
  'description': 'English multi-task CNN trained on OntoNotes. Assigns context-specific token vectors, POS tags, dependency parse and named entities.',
  'sources': [{'name': 'OntoNotes 5',
    'url': 'https://catalog.ldc.upenn.edu/LDC2013T19',
    'license': 'commercial (licensed by Explosion)'}],
  'pipeline': ['tagger', 'parser', 'ner'],
  'version': '2.3.1',
  'spacy_version': '>=2.3.0,<2.4.0',
  'parent_package': 'spacy',
  'accuracy': {'las': 89.7572754092,
   'uas': 91.6570115569,
   'token_acc': 99.756964111,
   'las_per_type': {'advmod': {'p': 85.6065101297,
     'r': 84.9512113055,
     'f': 85.2776018577},
    'aux': {'p': 97.9464841319, 'r': 98.0772654442, 'f': 98.0118311613},
    'nsubj': {'p': 95.530627567, 'r': 94.7522887555, 'f': 95.1398662913},
    'root': {'p': 89.5162856958, 'r': 91.1692936754, 'f': 90.3352

In [30]:
ip_text = 'I am a good person, and I have a few dollars $!! and no dollars'
temp = src_lang_model.tokenizer(ip_text)
temp

I am a good person, and I have a few dollars $!! and no dollars

In [42]:
def tokenize_src(ip_text):
    return [tok.text for tok in src_lang_model.tokenizer(ip_text)]

## Build `source`, and `target` fields.

In [43]:
print(tokenize_src('I am a good person, and I have a few dollars $!! and no dollars'))

['I', 'am', 'a', 'good', 'person', ',', 'and', 'I', 'have', 'a', 'few', 'dollars', '$', '!', '!', 'and', 'no', 'dollars']


In [48]:
text = Field(tokenize=tokenize_src, lower=True, pad_token=Constants.PAD_WORD, sequential=True,
               init_token=Constants.BOS_WORD, eos_token=Constants.EOS_WORD, is_target=False)
label = Field(sequential=False, use_vocab=False, is_target=True)
fields = {'text':('text', text), 'label':('label', label)} ## a dictionary of tuples.
#fields = [('text', text), ('label', label)]

In [50]:
fields.values()

dict_values([('text', <torchtext.data.field.Field object at 0x7fbbabdb7910>), ('label', <torchtext.data.field.Field object at 0x7fbbabdb7d50>)])

## Load the dataset


````##Source: https://stackoverflow.com/questions/17912307/u-ufeff-in-python-string
path = '/home/visionteam/tf_tutorials/imdb_dataset/Test.csv'
with io.open(os.path.expanduser(path), encoding="utf-8") as f:
    reader = unicode_csv_reader(f, delimiter=',')
    print(next(reader))
    ['\ufefftext', 'label'] <-- Notice the extra \ufeff, to prevent this use encoding="utf-8-sig"
````

In [None]:
path = 'imdb_dataset/Train.csv'
df = pd.read_csv(path, encoding="utf-8-sig")
display(df.head())
df.to_csv(path, encoding='utf-8', index=False)
df = pd.read_csv(path)
display(df.head()) 

In [None]:
path = 'imdb_dataset/Test.csv'
df = pd.read_csv(path, encoding="utf-8-sig")
display(df.head())
df.to_csv(path, encoding='utf-8', index=False)
df = pd.read_csv(path)
display(df.head()) 

In [None]:
path = 'imdb_dataset/Valid.csv'
df = pd.read_csv(path, encoding="utf-8-sig")
display(df.head())
df.to_csv(path, encoding='utf-8', index=False)
df = pd.read_csv(path)
display(df.head()) 

* x is an object that contains 'text', and 'label' attributes. Value of the 'text' is simply the input sentence, and the value of the 'label' is simply the label for the review.

In [51]:
def filter_examples_with_length(x):
    return len(vars(x)['text']) <= MAX_LEN 

In [52]:
train_data, test_data, valid_data = TabularDataset.splits(path='/home/visionteam/tf_tutorials/imdb_dataset', 
                                                          train='Train.csv',validation='Valid.csv', 
                                                          test='Test.csv',format='csv', fields=fields,
                                                         filter_pred=filter_examples_with_length)
# <--- instantiates a Dataset class and return it
# try without passing filter_pred=filter_examples_with_length



In [60]:
vars(x)['text']

['someone',
 'needed',
 'to',
 'make',
 'a',
 'car',
 'payment',
 '...',
 'this',
 'is',
 'truly',
 'awful',
 '...',
 'makes',
 'jean',
 'claude',
 "'s",
 'cyborg',
 'look',
 'like',
 'gone',
 'with',
 'the',
 'wind',
 '...',
 'this',
 'is',
 'an',
 'hour',
 'i',
 'wish',
 'i',
 'could',
 'sue',
 'to',
 'get',
 'back',
 '...',
 'luckily',
 'it',
 'produced',
 'severe',
 'somnolence',
 '...',
 'from',
 'which',
 'i',
 'fell',
 'asleep',
 '.',
 'how',
 'can',
 'actors',
 'of',
 'this',
 'caliber',
 'create',
 'this',
 'dog',
 '?',
 'i',
 'would',
 'rather',
 'spend',
 'the',
 'time',
 'watching',
 'algae',
 'grow',
 'on',
 'the',
 'side',
 'of',
 'a',
 'fish',
 'tank',
 'than',
 'partake',
 'of',
 'this',
 'wholly',
 'awful',
 'concoction',
 'of',
 'several',
 'genre',
 '.',
 'i',
 'now',
 'use',
 'the',
 'dvd',
 'as',
 'a',
 'coaster',
 'on',
 'my',
 'coffee',
 'table',
 '.',
 '$',
 '5.99',
 'at',
 'walmart',
 'is',
 'far',
 'too',
 'much',
 'to',
 'spend',
 'on',
 'this',
 'movie',
 '.

In [56]:
x = test_data[0]
vars(x).keys()

dict_keys(['text', 'label'])

In [53]:
len(test_data)

3268

In [61]:
len(train_data)

25588

In [62]:
type(train_data)

torchtext.data.dataset.TabularDataset

In [74]:
vars(train_data).keys()

dict_keys(['examples', 'fields'])

In [75]:
example_text = vars(train_data.examples[0])['text']
print('Example text:{}\n'.format(example_text))
example_label = vars(train_data.examples[0])['label']
print('Example label:{}'.format(example_label))

Example text:['i', 'grew', 'up', '(', 'b.', '1965', ')', 'watching', 'and', 'loving', 'the', 'thunderbirds', '.', 'all', 'my', 'mates', 'at', 'school', 'watched', '.', 'we', 'played', '"', 'thunderbirds', '"', 'before', 'school', ',', 'during', 'lunch', 'and', 'after', 'school', '.', 'we', 'all', 'wanted', 'to', 'be', 'virgil', 'or', 'scott', '.', 'no', 'one', 'wanted', 'to', 'be', 'alan', '.', 'counting', 'down', 'from', '5', 'became', 'an', 'art', 'form', '.', 'i', 'took', 'my', 'children', 'to', 'see', 'the', 'movie', 'hoping', 'they', 'would', 'get', 'a', 'glimpse', 'of', 'what', 'i', 'loved', 'as', 'a', 'child', '.', 'how', 'bitterly', 'disappointing', '.', 'the', 'only', 'high', 'point', 'was', 'the', 'snappy', 'theme', 'tune', '.', 'not', 'that', 'it', 'could', 'compare', 'with', 'the', 'original', 'score', 'of', 'the', 'thunderbirds', '.', 'thankfully', 'early', 'saturday', 'mornings', 'one', 'television', 'channel', 'still', 'plays', 'reruns', 'of', 'the', 'series', 'gerry', '

In [78]:
train_data.fields

{'text': <torchtext.data.field.Field at 0x7fbbabdb7910>,
 'label': <torchtext.data.field.Field at 0x7fbbabdb7d50>}

## Build the vocabulary

In [91]:
print(list(text.vocab.stoi.values())[0:20]) ## shouldn't work because we haven't build the vocabulary yet, and try making
# the BucketIterator without building the vocabulary

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]


In [92]:
print(list(text.vocab.stoi.keys())[0:20])

['<unk>', '<blank>', '<s>', '</s>', 'the', '.', ',', 'and', 'a', 'of', 'to', 'is', 'i', 'it', 'this', 'in', 'that', 'movie', 'was', '"']


In [93]:
print(len(list(text.vocab.stoi.keys())))

30195


In [80]:
text.build_vocab(train_data.text, min_freq=MIN_FREQ)
#text.build_vocab(valid_data.text, min_freq=MIN_FREQ)
#text.build_vocab(test_data.text, min_freq=MIN_FREQ)

## Save the Field (that has built vocabulary), and examples

In [94]:
data = {
    'fields_with_and_without_vocab': {'text': text, 'label':label},
    'train_examples': train_data.examples,
    'valid_examples': valid_data.examples,
    'test_examples': test_data.examples}

In [95]:
save_data = os.path.expanduser('~/tf_tutorials/imdb_dataset/imdb_fields_and_vocab.pkl')
print(save_data)

/home/visionteam/tf_tutorials/imdb_dataset/imdb_fields_and_vocab.pkl


In [96]:
pickle.dump(data, open(save_data, 'wb'))

In [97]:
data = pickle.load(open(save_data, 'rb'))

In [98]:
print(list(data['fields_with_and_without_vocab']['text'].vocab.stoi.keys())[0:20])
print(list(data['fields_with_and_without_vocab']['text'].vocab.stoi.values())[0:20])
print
print(list(data['fields_with_and_without_vocab']['text'].vocab.stoi.keys())[-1])
print(list(data['fields_with_and_without_vocab']['text'].vocab.stoi.values())[-1])

['<unk>', '<blank>', '<s>', '</s>', 'the', '.', ',', 'and', 'a', 'of', 'to', 'is', 'i', 'it', 'this', 'in', 'that', 'movie', 'was', '"']
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
·
30194


In [99]:
train_iterator, test_iterator, valid_iterator = BucketIterator.splits(
    (train_data, test_data, valid_data), batch_size=2, device=device
)  #<--- bucketiterator expects a dataset object and fields that already have vocabularay built.



In [100]:
len(train_iterator)

12794

In [101]:
len(test_iterator)

1634

In [102]:
len(valid_iterator)

1611

In [103]:
count = 0
for batch in train_iterator:
    print(batch.text.shape)
    print(batch.label.shape)
    count += 1
    if(count == 5):
        sys.exit()



torch.Size([240, 2])
torch.Size([2])
torch.Size([225, 2])
torch.Size([2])
torch.Size([230, 2])
torch.Size([2])
torch.Size([203, 2])
torch.Size([2])
torch.Size([174, 2])
torch.Size([2])


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [104]:
batch.text

tensor([[    2,     2],
        [   12,    12],
        [  877,    86],
        [   14,    29],
        [  490,   705],
        [   31,    10],
        [   46,  4375],
        [11111,     4],
        [  578,   237],
        [   51,   127],
        [10849,  2502],
        [    5,    42],
        [   12,    91],
        [  202,    54],
        [  927,     4],
        [   12,    26],
        [   81,     5],
        [    5,    60],
        [   13,     8],
        [   18,  1192],
        [   59,    33],
        [  379,    14],
        [ 2406,   896],
        [   10,    35],
        [ 1694,     9],
        [ 2918,     4],
        [    7,   109],
        [  454,  1380],
        [    4,   131],
        [  490,    12],
        [    5,   146],
        [   12,   112],
        [  177,  1742],
        [    4,     5],
        [ 2488,    56],
        [   82,    82],
        [    4,    42],
        [  125,   129],
        [   12,   122],
        [   32,  2328],
        [  112,     6],
        [  102, 