# Sentiment Analysis Classification Hans-on


## Hans-on target
- This is for beginners of NLP.

## Datasets
- Cornell MR(movie review) Dataset (https://www.cs.cornell.edu/people/pabo/movie-review-data/)

## Implement Models
- RNN
- LSTM
- Bi-LSTM
- LSTM with Attention
- CNN

## References
- [Recent Trends in Deep Learning Based Natural Language Processing, 2018](https://arxiv.org/pdf/1708.02709.pdf)

# Hans On

## Pre processing
- Conell MR data is formmated as sentence and label file.
- So we need to convert TSV dataset for training easily.
- And also we need to split dataset as `train`, `dev(valid)`, `test` for training.
- ref. [preprocessing.py](preprocessing.py)

In [6]:
import os

class TSVGenerator(object):
    def __init__(self, root_dir='data'):
        self.root_dir = root_dir
        self.phase = ['train', 'dev', 'test']
        self.corpus_path = os.path.join(root_dir, "{}.sen")
        self.label_path = os.path.join(root_dir, "{}.lab")

    def __call__(self, phase):
        assert phase in self.phase, 'Unable phase'

        corpus_path = self.corpus_path.format(phase)
        label_path = self.label_path.format(phase)

        corpus = [line.replace('\n', '').strip()
                  for line in open(corpus_path, 'r').readlines()]
        label = [line.replace('\n', '').strip()
                 for line in open(label_path, 'r').readlines()]

        with open(os.path.join(self.root_dir, f'{phase}.tsv'), 'w') as f:
            for sen, lab in zip(corpus, label):
                f.write('{}\t{}\n'.format(sen, lab))

In [7]:
generator = TSVGenerator()
target = ['train', 'dev', 'test']
for val in target:
    generator(val)

## Dataset Loader
- After converting Conell MR dataset to TSV format, now we need to implement dataset loader for training.
- In hands-on we use the torchtext library(https://pytorch.org/text/stable/index.html).
- ref. [dataset.py](dataset.py)

### Prequisite
- You need to install spacy lauguage first
``` bash
python -m spacy download en
python -m spacy download en_core_web_md
```

In [11]:
!python -m spacy download en
!python -m spacy download en_core_web_md

[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the
full pipeline package name 'en_core_web_sm' instead.[0m
[33mDEPRECATION: https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl#egg=en_core_web_sm==3.0.0 contains an egg fragment with a non-PEP 508 name pip 25.0 will enforce this behaviour change. A possible replacement is to use the req @ url syntax, and remove the egg fragment. Discussion can be found at https://github.com/pypa/pip/issues/11617[0m[33m
[0mCollecting en-core-web-sm==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl (13.7 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.7/13.7 MB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.0.0
[38

In [13]:
import os

import torch
from torchtext.data import Field, LabelField, TabularDataset, Iterator
from torchtext.vocab import Vectors

torch.manual_seed(1234)
torch.cuda.manual_seed(1234)


class MyDataset(object):

    def __init__(self, root_dir='data', batch_size=64, use_vector=True, pdevice = 'cpu'):
        self.TEXT = Field(sequential=True, use_vocab=True, tokenizer_language='en_core_web_sm',
                          tokenize='spacy', lower=True, batch_first=True)
        self.LABEL = LabelField(dtype=torch.float)
        vectors = Vectors(name='mr_vocab.txt', cache='./')
        dataset_path = os.path.join(root_dir, '{}.tsv')
        self.dataset = {}
        self.dataloader = {}
        for target in ['train', 'dev', 'test']:
            self.dataset[target] = TabularDataset(
                path=dataset_path.format(target),
                format='tsv',
                fields=[('text', self.TEXT), ('label', self.LABEL)]
            )
            if use_vector:
                self.TEXT.build_vocab(self.dataset[target], max_size=25000, vectors=vectors)
            else:
                self.TEXT.build_vocab(self.dataset[target], max_size=25000)

            self.LABEL.build_vocab(self.dataset[target])
            self.dataloader[target] = Iterator(self.dataset[target],
                                               batch_size=batch_size,
                                               device=pdevice,
                                               repeat=False,
                                               sort_key=lambda x: len(x.text),
                                               shuffle=True)

In [18]:
# Sampling dataset format
dataset = MyDataset(batch_size=1)
for idx, v in enumerate(dataset.dataloader['test']):
    print(f"{v.text},{v.label}")
    if idx == 10:
        break

tensor([[  5, 296, 442,   2]]),tensor([1.])
tensor([[  54,  346,  322,  145, 1570,    8,  140,    4, 4116,    7, 1240, 1830,
          304,   37, 2749,    2]]),tensor([0.])
tensor([[2562,    4,  374, 4440,   16,    4, 1292, 2042, 1136,   12, 2015,    8,
         2510,    4,  495,  612,   16,  495,  612,  658]]),tensor([1.])
tensor([[   4,  106,  292,    4,   19,    8,    5, 1450,  276,    2]]),tensor([1.])
tensor([[  80,  101, 1088,  265,    8,  306,   36,    4, 1541,   76,    6,  259,
           50,   31,  108,  151, 1901,   45,  258,    3,    6, 3827,   11, 1057,
         4291,   12,  279,  263,  255,  356,   91,    8,  567,  664,    2]]),tensor([0.])
tensor([[   5, 4723,  349,  224,   36,   21, 1866,  769, 1198,   58,   12, 1916,
           23,  469,    6,    5, 1235,    8, 1652,   44,  301,    2]]),tensor([1.])
tensor([[ 773,   37, 1420,    6,  581,   11, 3002,  106,    3,    4,   18,   11,
          332, 1526,   13,   25,  607,    2,  118,   12,  305,    6,  751,    2]]),tensor([1

## Training
- Now you just select model, hyperparameter, structure, and so on like below
- ref. [train.py](train.py)


In [31]:
!mkdir -p checkpoints
!PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python python train.py \
    --optim=adam \
    --lr=1e-3 \
    --batch_size=64 \
    --epoch=15 \
    --ed=300 \
    --model=lstm_attn

Namespace(optim='adam', lr=0.001, batch_size=64, epoch=1, cuda=False, ed=300, word_vector=True, model='lstm_attn', hd=512, layer=2, bidirectional=True, dropout=True)
Model: LSTM with Attension
Optim: Adam
Epoch: 01, Train Loss: 0.571, Train Acc: 69.81%, Val. Loss: 0.509, Val. Acc: 74.88%, Test Loss: 0.495, Test Acc: 75.49%


- If you want realtime training inforamtion on tensorboard, just type like below and enter it

In [24]:
%tensorboard --logdir logs/fit

UsageError: Line magic function `%tensorboard` not found.


## Inference
- Finally, you can use your trainied model!
- ref. [demo.py](demo.py)

In [27]:
import torch
import torch.nn.functional as F
import spacy
import argparse
import re

from dataset import MyDataset
nlp = spacy.load('en_core_web_sm')

def get_models(ckpt_path, device, dataset):
    from train import build_model
    info = torch.load(ckpt_path)
    args = info['args']
    pth = info['pth']

    model = build_model(args, device, dataset)
    model.load_state_dict(pth)
    return model


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--ckpt_path', type=str, required=True)
    parser.add_argument('input', type=str)
    args = parser.parse_args()


    device = torch.device("cpu")
    dataset = MyDataset(batch_size=1, use_vector=True)
    model = get_models(args.ckpt_path, device, dataset)

    tokenized = [tok.text for tok in nlp.tokenizer(args.input)]
    indexed = [dataset.TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    tensor = torch.cat(2*[tensor], dim=1)
    preds = model(tensor)
    max_preds = preds.argmax(dim=1)
    print(max_preds[0].item())



TypeError: 'required' is an invalid argument for positionals