<a href="https://colab.research.google.com/github/remre/StriveSchool-ai/blob/main/m7NLP/d3/notes/Amazon_Reviews_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

To perform sentence classification, and many other classification tasks for NLP, we need to do three main steps:

- Preprocessing the data
- Prepare the dataloader
- Build the model

Of course, all of these steps requires a lot of other steps, and also they can include many different solutions. 

To make you to jumpstart on this task, I will provide you a pretty clean dataset, the Amazon Reviews one, that you can extensively find online, and it's also included in the `torxchtext.datasets` module. 

For this example, I will use just a little part of it, to give some guidance on how to start, without actually training the whole model.

### Load the data



In [1]:
import pandas as pd
import spacy
import torch 
import torchtext  ## for downloading the data from pytorch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data.dataloader import default_collate
from torch.utils.data.dataset import random_split
from torch.utils.data.dataset import ConcatDataset
from torch.utils.data.dataset import Subset

In [2]:
torchtext.datasets.AmazonReviewFull(root='./data/' , split= ('train','test'))

(<torchtext.data.datasets_utils._RawTextIterableDataset at 0x7f5c752aba10>,
 <torchtext.data.datasets_utils._RawTextIterableDataset at 0x7f5b82d83fd0>)

In [3]:
df = pd.read_csv('/content/data/AmazonReviewFull/amazon_review_full_csv/train.csv', nrows= 4000, header=None)

In [4]:
df.rename({0: 'ratings', 1: 'review_title',2:"review"}, axis=1, inplace=True)

In [5]:
df["reviews"] = df["review_title"] + " " + df["review"]

In [6]:
df.head()

Unnamed: 0,ratings,review_title,review,reviews
0,3,more like funchuck,Gave this to my dad for a gag gift after direc...,more like funchuck Gave this to my dad for a g...
1,5,Inspiring,I hope a lot of people hear this cd. We need m...,Inspiring I hope a lot of people hear this cd....
2,5,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...,The best soundtrack ever to anything. I'm read...
3,4,Chrono Cross OST,The music of Yasunori Misuda is without questi...,Chrono Cross OST The music of Yasunori Misuda ...
4,5,Too good to be true,Probably the greatest soundtrack in history! U...,Too good to be true Probably the greatest soun...


In [7]:
df.drop(["review_title", "review"], axis=1, inplace=True)

In [8]:
df.ratings = df.ratings.apply(lambda x: int(x) - 1)

In [9]:
df.ratings.unique()

array([2, 4, 3, 0, 1])

In [10]:
nlp = spacy.load('en_core_web_sm')

# preprocessing


In [11]:
def preprocessing(sentences):
    """ perform : tokenization & lemmatization, removes stopwords & punctuations, lower cases"""

    doc = nlp(sentences)
    # tokenization, we return the tokens that aren't stopwords or punctuations
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
    return tokens

In [12]:
txt = "This is the day's what i'm looking for actually"

In [13]:
preprocessing(txt)

['day', 'look', 'actually']

# Encoder - Fastext 

In [14]:
from torchtext.vocab import FastText  # Glove, FastText, Word2Vec


In [15]:
fasttext = FastText("simple") 

In [16]:
fasttext.dim

300

In [17]:

def token_encoder(token, vec):

    """ encodes a single word"""
    if token == "<pad>":
        return 1
    else:
        try:
            return vec.stoi[token]  # if the token is in the vocabulary, string to index
        except:
            if type(token) != str:
                print("Error, we need a word which is in string format")
            else:
                return 0  # if the token is not in the vocabulary, return 0
                

In [18]:
token_encoder("Paramveer", fasttext)

0

In [19]:
fasttext.itos[2610], fasttext.stoi["hello"]

('hello', 2610)

In [20]:
def encoder(tokens, voc):
    """ encodes a list of tokens"""
    return [token_encoder(token, voc) for token in tokens]


In [21]:
preprocessing(txt)

['day', 'look', 'actually']

In [22]:
txt = "Python is kicking me and, so is Paramveer "
encoder(preprocessing(txt),fasttext)

[7856, 5577, 0]

In [23]:
fasttext.itos[66032]

'murtaza'

In [24]:
fasttext.itos[0], fasttext.itos[1]

('</s>', '.')

In [25]:
def padding(list_of_indexes, max_seq_len, padding_index = 1):
    output = list_of_indexes + (max_seq_len - len(list_of_indexes)) * [padding_index]
    return output[:max_seq_len]

In [26]:
list_of_indexes = encoder(preprocessing(txt),fasttext)
list_of_indexes

[7856, 5577, 0]

In [27]:
padding(list_of_indexes, 10)

[7856, 5577, 0, 1, 1, 1, 1, 1, 1, 1]

In [28]:
ntxt = "Python is kicking me and, so is Paramveer Python is kicking me and, so is Paramveer  Python is kicking me and, so is Paramveer  Python is kicking me and, so is Paramveer  Python is kicking me and, so is Paramveer  Python is kicking me and, so is Paramveer  Python is kicking me and, so is Paramveer  Python is kicking me and, so is Paramveer  Python is kicking me and, so is Paramveer  Python is kicking me and, so is Paramveer Python is kicking me and, so is Paramveer  "
list_of_indexes = encoder(preprocessing(ntxt),fasttext)
len(list_of_indexes)

42

In [29]:
padding(list_of_indexes, 10)

[7856, 5577, 0, 7856, 5577, 0, 0, 7856, 5577, 0]

# Dataloader

In [30]:
class TrainData(Dataset):
    def __init__(self, df, max_seq_len =32):
        self.max_seq_len = max_seq_len
        train_iter =iter(df.reviews.values)
        self.vec = FastText("simple")
        self.vec.vectors[1] = -torch.ones(self.vec.vectors[1].shape[0])
        self.vec.vectors[0] = torch.zeros(self.vec.vectors[0].shape[0])
        self.vectorize = lambda x: self.vec.vectors[x]
        self.labels = df.ratings.values
        self.sequences =[padding(encoder(preprocessing(sequence),fasttext),max_seq_len=32) for sequence in train_iter]

    def __len__(self):
        return len(self.sequences)
           
    def __getitem__(self, idx):
        assert len(self.sequences[idx]) == self.max_seq_len
        return self.sequences[idx], self.labels[idx]  #idx so thatT we dont flood the memory with tensors 

        
   

In [31]:
#[padding(encoder(preprocessing(sequence),fasttext),max_seq_len=32) for sequence in df.reviews.values[:10]]

In [32]:
fasttext.vectors[7856].shape

torch.Size([300])

In [33]:
dataset = TrainData(df[:20])

In [None]:
dataset[1]

In [35]:
df

Unnamed: 0,ratings,reviews
0,2,more like funchuck Gave this to my dad for a g...
1,4,Inspiring I hope a lot of people hear this cd....
2,4,The best soundtrack ever to anything. I'm read...
3,3,Chrono Cross OST The music of Yasunori Misuda ...
4,4,Too good to be true Probably the greatest soun...
...,...,...
3995,1,Horrible. I have had the first DVD since I bou...
3996,1,Cheap! I haven't been able to watch this much ...
3997,4,More Excellent Instruction from the Hoopnotica...
3998,3,"Great book. As a former teenage girl, I decide..."


In [36]:
dataset.vectorize(0).shape

torch.Size([300])

In [37]:
iter(df.reviews.values)

<iterator at 0x7f5b804e3310>

In [38]:
def collate(batch, vectorizer = dataset.vectorize):
    # batch of sentence
    # from these sentences we want the tokens, for each sentence
    # we want to get the tokens, and then we want to get the embeddings
    # we need to pass vectorizer
    inputs = torch.stack([torch.stack([vectorizer(token) for token in sentence[0]]) for sentence in batch])  
    targets = torch.LongTensor([item[1] for item in batch])

    return inputs, targets



In [39]:
batch_size = 16
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate)
train_iter = iter(train_loader)
sentence, target = next(train_iter)
sentence.shape

torch.Size([16, 32, 300])

# Model 

In [55]:
emb_dim = fasttext.dim 

from torch import nn
import torch.nn.functional as F
emb_dim = 300
class Classifier(nn.Module):
    def __init__(self, max_seq_len, emb_dim, hidden1=16, hidden2=16):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(max_seq_len*emb_dim, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, 5)
        self.out = nn.LogSoftmax(dim=1)
    
    
    def forward(self, inputs):
        x = F.relu(self.fc1(inputs.squeeze(1).float()))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return self.out(x)


In [56]:
MAX_SEQ_LEN = 32
model = Classifier(MAX_SEQ_LEN, 300, 16, 16)
model

Classifier(
  (fc1): Linear(in_features=9600, out_features=16, bias=True)
  (fc2): Linear(in_features=16, out_features=16, bias=True)
  (fc3): Linear(in_features=16, out_features=5, bias=True)
  (out): LogSoftmax(dim=1)
)

In [59]:
criterion = nn.NLLLoss()
optimizer = optim.Adagrad(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)

In [62]:

epochs = 15
print_every = 40

for e in range(epochs):
    running_loss = 0
    print(f"Epoch: {e+1}/{epochs}")

    for i, (sentences, labels) in enumerate(iter(train_loader)):

        sentences.resize_(sentences.size()[0], 32* emb_dim)
        
        optimizer.zero_grad()
        
        output = model.forward(sentences)   # 1) Forward pass
        loss = criterion(output, labels) # 2) Compute loss
        loss.backward()                  # 3) Backward pass
        optimizer.step()                 # 4) Update model
        
        running_loss += loss.item()
        
        if i % print_every == 0:
            print(f"\tIteration: {i}\t Loss: {running_loss/print_every:.4f}")
            running_loss = 0

Epoch: 1/15
	Iteration: 0	 Loss: 0.0234
Epoch: 2/15
	Iteration: 0	 Loss: 0.0255
Epoch: 3/15
	Iteration: 0	 Loss: 0.0225
Epoch: 4/15
	Iteration: 0	 Loss: 0.0226
Epoch: 5/15
	Iteration: 0	 Loss: 0.0203
Epoch: 6/15
	Iteration: 0	 Loss: 0.0214
Epoch: 7/15
	Iteration: 0	 Loss: 0.0218
Epoch: 8/15
	Iteration: 0	 Loss: 0.0231
Epoch: 9/15
	Iteration: 0	 Loss: 0.0200
Epoch: 10/15
	Iteration: 0	 Loss: 0.0201
Epoch: 11/15
	Iteration: 0	 Loss: 0.0189
Epoch: 12/15
	Iteration: 0	 Loss: 0.0193
Epoch: 13/15
	Iteration: 0	 Loss: 0.0199
Epoch: 14/15
	Iteration: 0	 Loss: 0.0198
Epoch: 15/15
	Iteration: 0	 Loss: 0.0178


In [None]:
epochs = 10
print_every = 100

for e in range(epochs):
    running_loss = 0
    for i, (sentence, labels) in enumerate(iter(train_loader)): 
        sentence.resize_(sentence.size()[0], max_seq_len*emb_dim)

        optimizer.zero_grad()
        
        output = model.forward(sentence)

        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

        if i % print_every == 0:
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Loss: {:.4f}".format(running_loss/print_every))
            running_loss = 0

In [59]:
type(labels)

torch.Tensor

In [None]:
dff = pd.read_csv("/content/data/AmazonReviewFull/amazon_review_full_csv/test.csv", nrows=3000, header=None)
dff

In [65]:
dff.rename({0:"star", 1:"rating1", 2:"rating2"}, axis=1, inplace=True)

Since we are going to predict the number of stars a certain product has got based on the semantics of the text, we could merge the title of the review together with the body of the review, just by concatenating them:

In [66]:
dff["review"] = dff["rating1"] + " " +  dff["rating2"]

In [None]:
dff

and then of course we can drop the other two columns:

In [68]:
dff.drop(columns=["rating1", "rating2"], inplace=True)

In [69]:
dff

Unnamed: 0,star,review
0,1,mens ultrasheer This model may be ok for seden...
1,4,Surprisingly delightful This is a fast read fi...
2,2,"Works, but not as advertised I bought one of t..."
3,2,Oh dear I was excited to find a book ostensibl...
4,2,"Incorrect disc! I am a big JVC fan, but I do n..."
...,...,...
2995,2,A MAJOR ( PUN INTENDED) DISAPPOINTMENT I was s...
2996,4,Good Inside look at the U.S. Open The author d...
2997,1,"A Good Open Spoiled The subtitle should be, ""I..."
2998,2,"Good ideas, but horrible context I praise Oste..."


👏

The `star`column is what we want to predict, given the text of the review. I think we are all Amazon users, and we are all aware of how many stars a rating can have, but let's just double check:

In [None]:
dff.star.unique()

array([1, 4, 2, 3, 5])

In [70]:
dff.star = dff.star.apply(lambda x: int(x) -1)

Ok, now that our data are in order, we need to preprocess them. We can take advantage of spacy for basically of the steps:

In [71]:
nlp = spacy.load("en_core_web_sm")

Let's create a function that, given a sentence, it preprocess it by doing:
- tokenization
- removing stopwords
- remove special characters/punctuation
- make everything lower case
- lemmatize it

With spacy, we can do it in a very compact form:

In [None]:
def preprocessing(sentence):
    """
    params sentence: a str containing the sentence we want to preprocess
    return the tokens list
    """
    doc = nlp(sentence)
    tokens = [token.lemma_ for token in doc if not token.is_punct and not token.is_stop]
    return tokens
    

In [None]:
preprocessing("This is an example! Hello")

['example', 'hello']

The preprocessing phase has not finished yet. In fact, we want to create a neural network, and a neural network works with numbers. In general, computers work with numbers...

So we need to use embeddings to transform a sentence into a tensor: the embeddings are usually one-dimensional, and in the following example they will have size 300, that means that if you have a sentence of 10 words (after have it preprocessed), the shape of the sentence will be $10\times 300$. You will notice another dimension, that is the batch size. So you will train and run a model that receive as input a tensor of shape:

`batch_size*length_of_the_sentence*embedding_size`.

Let's do things in order:

In [72]:
import torch
from collections import Counter
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm, tqdm_notebook

If you are using the whole dataset, you should not need to split the dataset into train and test 'cause it should be already. If not, and if you are using any other dataset, remember to split into train and test (eventually validation).

In [73]:
train_dff, test_dff = dff.iloc[:2000], dff.iloc[2000:]

To get the vectors for each token, we are going to use some pretrained embeddings. Specifically, we are going to use the FastText embeddings that you can find at this link https://pytorch.org/text/stable/vocab.html#fasttext .

We need to download and load them by doing:

In [74]:
from torchtext.vocab import FastText

In [None]:
fasttext = FastText("simple")

You can run `help(fasttext)` and/or `dir(fasttext)` to get more info about the methods and the attributes this object contains.

In [None]:
dir(fasttext)

I want to highlight a couple of things:

- `dim` is the dimensions of the vectors (in our case it is 300)
- `itos` stands for *index to string* and it maps an integer to the corresponding string. The reason for having such a method is that it's much lighter to store integers and use them to index the vectors instead of having a string per word (In addition to that, heuristics can be used so that the most frequent words get lower value for the index, resulting in a better memory management. I know, it sounds like minor things, but the model is going to make billions of operations!)
- `stoi` is the opposite: it's a dictionary that given the string returns the index



Above you can see the embeddings associated with the word "hello". Let's inspect the shape:

In [None]:
fasttext["hello"].shape

torch.Size([300])

300, as anticipated. 

Let's inspect what's the index associated with "hello":

In [None]:
fasttext.stoi["hello"]

2610

and viceversa:

In [None]:
fasttext.stoi

We can create and *encoder* which can transform each word into an integer:

In [None]:
def token_encoder(token, vec):
    if token == "<pad>":
        return 1
    else:
        try:
            return vec.stoi[token]
        except:
            return 0

In [None]:
def encoder(tokens, vec):
    return [token_encoder(token, vec) for token in tokens]

In [75]:
text = "Antonio is learning Python"
encoder(preprocessing(text), fasttext)

[2228, 1660, 7856]

Why all those zeros?
Well, in the function that we have defined, we have put a try and except, in which we are basically saying: if the word is not in the vocabulary, return the index 0. Clearly, Antonio and Python weren't in the corpus used by FastText!


What about the `<pad>` thing? 

Well, not all the reviews have same length, so we need to find a solution for it. Why? Cause our Neural Network is waiting for input that are all of the same size! It needs to know how many weights it needs to initialize!

There are several possibilities, but the easiest is to just set a cap with a `max_seq_len` parameter, so that all the reviews that are shorter than that length will be padded by using a vector associated with the padding index, and all the ones that are longer than `max_seq_len` will be just cut.

Do you see problems? I actually don't see that much problems for it. I think that the sentiment of a comment can be seen already from the first words of the review.

In the encoder part, the `<pad>` is a made up token that we know is very unlikely to be part of the text. To that, I assigned the index 1. 

You may ask: what does it happen to things at index 0 and 1? Well, let's inspect them:

In [76]:
fasttext.itos[0], fasttext.itos[1]

('</s>', '.')

and in our preprocessing pipeline they can never appear! So we are fine with that!

Now let's create a function for padding:

In [None]:
def padding(list_of_indexes, max_seq_len, padding_index=1):
    output = list_of_indexes + (max_seq_len - len(list_of_indexes))*[padding_index]
    return output[:max_seq_len]

In [77]:
text = "this is a sample review"
list_of_indexes = encoder(preprocessing(text), fasttext)
list_of_indexes

[3697, 1363]

In [None]:
padding(list_of_indexes, max_seq_len=10)

[3697, 1363, 1, 1, 1, 1, 1, 1, 1, 1]

In this way, any sentence shorter than 10 becomes of length 10 and anything longer...

In [None]:
text = "this is a sample review this is a sample review this is a sample review this is a sample review this is a sample review v this is a sample review this is a sample review this is a sample review this is a sample review this is a sample review"
list_of_indexes = encoder(preprocessing(text), fasttext)
padding(list_of_indexes, max_seq_len=10)

[3697, 1363, 3697, 1363, 3697, 1363, 3697, 1363, 3697, 1363]

...get just cut to ten!

All right. I feel confident enough to say that we have all of what we need for the preprocessing part!

Now we need to create the:


### Data Loader

Yes, they are back. [Is it a good or a bad memory?]("https://github.com/Strive-School/ai_mar21/blob/main/M5_Deep_Learning/D7/Custom%20DataLoader%20and%20Dataset.ipynb")

If you take a look at that notebook, you remember that to create a custom data loader you need to override some method of the `Dataset` class from `torch.utils.data`. Before doing so, let's define the steps we need to do while loading the data:

- Receive as input a row from the dataframe that we have defined above, that contains two columns: "star" and "review"
- we separate "star" from "review"
- we preprocess the "review" columns by doing what we have so far (tokenization etc but excluding the embeddings for now)
- Padding 
- Store a list containing the sequence of indices with the associated labels

Then we need to override also the `__len__` and the `__getitem__`methods of the `Dataset` class.

Ok, stop talking, more action:

In [None]:
class TrainData(Dataset):
    def __init__(self, dff, max_seq_len=32): # dff is the input dff, max_seq_len is the max lenght allowed to a sentence before cutting or padding
        self.max_seq_len = max_seq_len
        
        counter = Counter()
        train_iter = iter(dff.review.values)
        self.vec = FastText("simple")
        self.vec.vectors[1] = -torch.ones(self.vec.vectors[1].shape[0]) # replacing the vector associated with 1 (padded value) to become a vector of -1.
        self.vec.vectors[0] = torch.zeros(self.vec.vectors[0].shape[0]) # replacing the vector associated with 0 (unknown) to become zeros
        self.vectorizer = lambda x: self.vec.vectors[x]
        self.labels = dff.star
        sequences = [padding(encoder(preprocessing(sequence), self.vec), max_seq_len) for sequence in dff.review.tolist()]
        self.sequences = sequences
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, i):
        assert len(self.sequences[i]) == self.max_seq_len
        return self.sequences[i], self.labels[i]

In [None]:
dataset = TrainData(train_dff, max_seq_len=32)

When we index dataset with a `dataset[index]` notation, we get the pair containing the padded sequence of indices with the associated label: 

In [None]:
dataset[0]

([468,
  0,
  868,
  2613,
  58316,
  360,
  818,
  12130,
  1044,
  13126,
  520,
  42977,
  2996,
  14931,
  197,
  2901,
  992,
  10051,
  42977,
  0,
  0,
  2603,
  0,
  4085,
  454,
  1736,
  631,
  338,
  3332,
  10770,
  5302,
  5512],
 0)

In [None]:
dataset[1][0]

[15391,
 47950,
 1508,
 934,
 4672,
 11584,
 15402,
 24369,
 14401,
 542,
 71101,
 1097,
 3851,
 19201,
 0,
 5815,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1]

What are the ones there? They are the product of the padding! 

What is the vector associated with the index 1?

In [None]:
dataset.vec.vectors[1]

All negative ones! Makes sense! This is what we have defined!

Storing into memory a lot of tensors containing all the embedded vectors, it can be very costly. This is why we load them by indexing with an integer. However, when we train our model, we need the embedded vectors!

So let's define the `collate` function that will index our vocabulary only when it needs it!

As argument it takes the batch (which will contains a `batch_size*max_seq_len` shape tensor) and the vectorizer. What is the vectorizer in our case? It's the vectorizer we have built in the TrainData class, that assign the vector associated with an index.

In [None]:
def collate(batch, vectorizer=dataset.vectorizer):
    inputs = torch.stack([torch.stack([vectorizer(token) for token in sentence[0]]) for sentence in batch])
    target = torch.LongTensor([item[1] for item in batch]) # Use long tensor to avoid unwanted rounding
    return inputs, target

And now, we can use the `DataLoader` class as we did for images:

In [None]:
batch_size = 16
train_loader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate)


In [None]:
next(iter(train_loader))[0].shape

torch.Size([16, 32, 300])

Ready to train? Following is a small model to *makes things to run on my computer*. You can expect to be kicked out if you come at the debrief with this model! 



In [None]:
from torch import nn
import torch.nn.functional as F
emb_dim = 300
class Classifier(nn.Module):
    def __init__(self, max_seq_len, emb_dim, hidden1=16, hidden2=16):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(max_seq_len*emb_dim, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, 5)
        self.out = nn.LogSoftmax(dim=1)
    
    
    def forward(self, inputs):
        x = F.relu(self.fc1(inputs.squeeze(1).float()))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return self.out(x)

In [None]:
MAX_SEQ_LEN = 32
model = Classifier(MAX_SEQ_LEN, 300, 16, 16)
model

Classifier(
  (fc1): Linear(in_features=9600, out_features=16, bias=True)
  (fc2): Linear(in_features=16, out_features=16, bias=True)
  (fc3): Linear(in_features=16, out_features=5, bias=True)
  (out): LogSoftmax(dim=1)
)

In [None]:
from torch import optim
criterion = nn.NLLLoss()

# Only train the classifier parameters, feature parameters are frozen
optimizer = optim.Adam(model.parameters(), lr=0.003)


In [None]:
dataiter = iter(train_loader)
sentences, labels = dataiter.next()

In [78]:
# Forward pass through the network
sentence_idx = 0
sentences.resize_(16, 1, MAX_SEQ_LEN*emb_dim).shape
log_ps = model.forward(sentences[sentence_idx,:])

sentence = sentences[sentence_idx]
torch.exp(log_ps)

tensor([[0.0398, 0.0718, 0.1070, 0.5720, 0.2095]], grad_fn=<ExpBackward>)

We got 5 probabilities: one for each of the possible rating star!

In [None]:
epochs = 3
print_every = 40

for e in range(epochs):
    running_loss = 0
    print(f"Epoch: {e+1}/{epochs}")

    for i, (sentences, labels) in enumerate(iter(train_loader)):

        sentences.resize_(sentences.size()[0], 32* emb_dim)
        
        optimizer.zero_grad()
        
        output = model.forward(sentences)   # 1) Forward pass
        loss = criterion(output, labels) # 2) Compute loss
        loss.backward()                  # 3) Backward pass
        optimizer.step()                 # 4) Update model
        
        running_loss += loss.item()
        
        if i % print_every == 0:
            print(f"\tIteration: {i}\t Loss: {running_loss/print_every:.4f}")
            running_loss = 0

Epoch: 1/3
	Iteration: 0	 Loss: 0.0412
	Iteration: 40	 Loss: 1.6470
	Iteration: 80	 Loss: 1.5952
	Iteration: 120	 Loss: 1.6148
Epoch: 2/3
	Iteration: 0	 Loss: 0.0385
	Iteration: 40	 Loss: 1.5419
	Iteration: 80	 Loss: 1.4906
	Iteration: 120	 Loss: 1.6104
Epoch: 3/3
	Iteration: 0	 Loss: 0.0343
	Iteration: 40	 Loss: 1.4839
	Iteration: 80	 Loss: 1.3513
	Iteration: 120	 Loss: 1.4886


Eventually:

In [79]:
from torchtext import datasets

In [82]:
train, test = datasets.AmazonReviewFull()

### Exercises

- Create a real training process: use the train, val, test split for the dataset
- Create a training loop that includes validation and test at the end
    - You can borrow from your previous work, no need to write it from scratch
- If you want to, feel free to change dataset


In [94]:
dff.rename(columns={'star': 'ratings', 'review': 'reviews'}, inplace=True)

In [229]:
frames = [df, dff]

alldataf = pd.concat(frames)

In [230]:
alldataf

Unnamed: 0,ratings,reviews
0,2,more like funchuck Gave this to my dad for a g...
1,4,Inspiring I hope a lot of people hear this cd....
2,4,The best soundtrack ever to anything. I'm read...
3,3,Chrono Cross OST The music of Yasunori Misuda ...
4,4,Too good to be true Probably the greatest soun...
...,...,...
2995,1,A MAJOR ( PUN INTENDED) DISAPPOINTMENT I was s...
2996,3,Good Inside look at the U.S. Open The author d...
2997,0,"A Good Open Spoiled The subtitle should be, ""I..."
2998,1,"Good ideas, but horrible context I praise Oste..."


In [210]:
torchtext.datasets.AmazonReviewFull(root='./data/')

(<torchtext.data.datasets_utils._RawTextIterableDataset at 0x7f5b7857fbd0>,
 <torchtext.data.datasets_utils._RawTextIterableDataset at 0x7f5b7857fa90>)

In [231]:
train_df = alldataf.groupby('ratings').apply(lambda x: x.sample(1000)).reset_index(drop=True)
x_train, y_train  = train_df.iloc[:, 1:], train_df.iloc[:, 0]

valid_df = alldataf.groupby('ratings').apply(lambda x: x.sample(200)).reset_index(drop=True)
x_valid, y_valid  = valid_df.iloc[:, :1], valid_df.iloc[:, 0]

test_df  = alldataf.groupby('ratings').apply(lambda x: x.sample(200)).reset_index(drop=True)
x_test, y_test    = test_df.iloc[:, :1], test_df.iloc[:, 0]

In [232]:
x_train

Unnamed: 0,reviews
0,The Paradise War I wanted to like this book bu...
1,Buyers Beware!!!!!!!!! This is one of those ch...
2,"Horrible! Only 4 of the Columbia shorts, one w..."
3,that kid wit wickie yell like a woman all movi...
4,BUYER BEWARE This is the biggest waste of mone...
...,...
4995,WOW! My favorite of all time....Giovanni intri...
4996,Beautiful but unbelievably sad This is a very ...
4997,"A sweeping, epic novel that touches the heart ..."
4998,Classic When I heard they were making a file o...


In [233]:
import numpy as np
class GetData(Dataset):


  def __init__(self, X,Y,dataframe):
    self.dataframe = dataframe

    self.X = X
    self.Y = Y


    self.labels   = np.asarray(self.Y.iloc[0])

  def __len__(self):

    return len(self.X)

  def __getitem(self,idx):

    review_label = self.labels[idx]

    reviews = self.X.iloc[idx,0]
    reviews = np.array(reviews)

    return review_label, reviews

In [234]:
train_ds = GetData(x_train, y_train,alldataf)
train_dataloader = torch.utils.data.DataLoader(train_ds, batch_size=16, shuffle=True)

valid_ds = GetData(x_valid, y_valid,alldataf)
valid_dataloader = torch.utils.data.DataLoader(valid_ds, batch_size=16, shuffle=True)

test_ds = GetData(x_test, y_test,alldataf)
test_dataloader = torch.utils.data.DataLoader(test_ds, batch_size=16, shuffle=True)

In [None]:
i,review,star = enumerate(iter(train_dataloader))
print(star,review)

In [166]:
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x7f5b785627d0>

In [None]:
epochs = 10
train_size = len(train_ds)
valid_size = len(valid_ds)

valid_conf_matrixes = []

for epoch in range(epochs):
    labels = torch.tensor([]).to(device).detach()
    preds  = torch.tensor([]).to(device).detach()
    
    total_preds = 0
    correct_preds = 0
    
    train_running_loss = 0.0
    
    for index, data in enumerate(train_loader):
        model.train()
        
        batch_inputs, batch_labels = data[0][:].to(device).type(torch.float), data[1][:].to(device)
        
        outputs = model(batch_inputs)
        
        loss = criterion(outputs, batch_labels) # expects distribution from model softmax as pred and target_index as target
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_running_loss += loss.mean()
        
        labels = torch.cat((labels, batch_labels))
        #total_preds += 1
        
        for index, item in enumerate(outputs):
            #if labels[index] == torch.argmax(item):
            #    correct_preds += 1
                
            preds  = torch.cat((preds, torch.argmax(item).unsqueeze(-1)))
        
        if index+1 == int(train_size / batch_size):
            print(f'Training Epoch: {epoch+1}, step: {index+1}, mean training loss: {train_running_loss / int(train_size / batch_size)}')
            train_running_loss = 0.0
    
    print('Calculating conf_matrix')
    conf_mat = confusion_matrix(labels.cpu().numpy(), preds.cpu().numpy())
    
    total = np.sum(conf_mat)
    
    correct_count = 0
    
    for i, data in enumerate(conf_mat[0]):
        correct_count += conf_mat[i][i]
    
    
    print(f'Training Epoch {epoch+1}:\n Accuracy: {correct_count/total}\n{conf_mat}\n')
    print()

In [None]:

    
    
    total_preds = 0
    correct_preds = 0
    
    valid_running_loss = 0.0
    
    for index, data in enumerate(valid_dataloader):
        model.eval()
        
        batch_inputs, batch_labels = data[0][:].to(device).type(torch.float), data[1][:].to(device)
        
        outputs = model(batch_inputs)
        
        loss = criterion(outputs, batch_labels) # expects distribution from model softmax as pred and target_index as target
        
        train_running_loss += loss.mean()
        
        labels = torch.cat((labels, batch_labels))
        #total_preds += 1
        
        for i, item in enumerate(outputs):
            #if labels[i] == torch.argmax(item):
            #    correct_preds += 1
                
            preds  = torch.cat((preds, torch.argmax(item).unsqueeze(-1)))
        
        if index+1 == int(train_size / batch_size):
            print(f'Validation Epoch: {epoch+1}, step: {index+1}, mean validation loss: {valid_running_loss / int(valid_size / batch_size)}')
            train_running_loss = 0.0
        
    print('Calculating conf_matrix')
    conf_mat = confusion_matrix(labels.cpu().numpy(), preds.cpu().numpy())
    
    total = np.sum(conf_mat)
    
    correct_count = 0
    
    for i, data in enumerate(conf_mat[0]):
        correct_count += conf_mat[i][i]
    
    valid_conf_matrixes.append(conf_mat)   
    
    print(f'Validation Epoch {epoch+1}:\n Accuracy: {correct_count/total}\n{conf_mat}')
    print()