In [2]:
# import libraries

import pandas as pd
import spacy
import torch
from collections import Counter
from torchtext.vocab import Vocab
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm, tqdm_notebook
from torch import nn
import torch.nn.functional as F
from torch import optim

In [3]:
# loading train data

train_df = pd.read_csv('/content/drive/MyDrive/amazon_reviews_data/train.csv', header=None)
train_df.head()

Unnamed: 0,0,1,2
0,3,more like funchuck,Gave this to my dad for a gag gift after direc...
1,5,Inspiring,I hope a lot of people hear this cd. We need m...
2,5,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
3,4,Chrono Cross OST,The music of Yasunori Misuda is without questi...
4,5,Too good to be true,Probably the greatest soundtrack in history! U...


In [4]:
# loading test data

test_df = pd.read_csv('/content/drive/MyDrive/amazon_reviews_data/test.csv', header=None)
test_df.head()

Unnamed: 0,0,1,2
0,1,mens ultrasheer,"This model may be ok for sedentary types, but ..."
1,4,Surprisingly delightful,This is a fast read filled with unexpected hum...
2,2,"Works, but not as advertised",I bought one of these chargers..the instructio...
3,2,Oh dear,I was excited to find a book ostensibly about ...
4,2,Incorrect disc!,"I am a big JVC fan, but I do not like this mod..."


In [5]:
# rename columns
train_df.rename({0:"star", 1:"rating1", 2:"rating2"}, axis = 1, inplace = True)

# merge the two reviews columns
train_df["review"] = train_df["rating1"] + " " +  train_df["rating2"]

# drop unnecessary columns
train_df.drop(columns=["rating1", "rating2"], inplace=True)

In [6]:
# sample train_df

train_df = train_df.groupby('star', group_keys=False).apply(lambda x: x.sample(1000))

In [7]:
# rename columns
test_df.rename({0:"star", 1:"rating1", 2:"rating2"}, axis = 1, inplace = True)

# merge the two reviews columns
test_df["review"] = test_df["rating1"] + " " +  test_df["rating2"]

# drop unnecessary columns
test_df.drop(columns=["rating1", "rating2"], inplace=True)

In [16]:
# sample test_df

test_df = test_df.groupby('star', group_keys=False).apply(lambda x: x.sample(200))

In [41]:
test_df['tmp'] = test_df['review'].apply(preprocessing).apply(len)
test_df.describe()

Unnamed: 0,star,tmp
count,1000.0,1000.0
mean,3.0,35.796
std,1.414921,19.473253
min,1.0,7.0
25%,2.0,20.0
50%,3.0,32.0
75%,4.0,49.0
max,5.0,141.0


In [9]:
# load nlp model pretrained from spacy

nlp = spacy.load("en_core_web_sm")

In [10]:
# define preprocessing function: take the lemma of the token if the token is not punctuation or stop-word

def preprocessing(sentence):
    """
    params sentence: a str containing the sentence we want to preprocess
    return the tokens list
    """
    doc = nlp(sentence)
    tokens = [token.lemma_ for token in doc if not token.is_punct and not token.is_stop]
    return tokens

In [11]:
# define custom dataloader for train data

class TrainData(Dataset):
    # initiate the class with df and maximum number of tokens as argument
    def __init__(self, df, max_seq_len=32):
        self.max_seq_len = max_seq_len
        counter = Counter()    # instanziate counter
        train_iter = iter(df.review.values)    # make the review column iterable
        for text in train_iter:
            counter.update(preprocessing(text))    # update the counter object with the number of words and occurrencies
        self.vocab = Vocab(counter, min_freq=1)    # create pytorch Vocab from the counter with all the words (min_freq=1)
        self.vocab.load_vectors("fasttext.simple.300d")    # load pretrained embeddings
        label_pipeline = lambda x: int(x) -1     # make the label range 0-4 instead of 1-5
        self.token2idx = lambda x: self.vocab[x]  # Converts token to index
        self.idx2token = lambda x: self.vocab.itos[int(x)]   # converts index to token
        self.encode = lambda x:[self.token2idx(token) for token in preprocessing(x)]    # encode every token with its index
        self.pad = lambda x: x + (max_seq_len - len(x)) * [self.token2idx("<pad>")]     # add the index of "<pad>" as many time as needed to reach max_seq_len
        sequences = [self.encode(sequence)[:max_seq_len] for sequence in df.review.tolist()]    # truncate the sequence if it's longer than max_seq_len
        sequence, self.labels = zip(*[(sequence, label_pipeline(label)) for sequence, label in zip(sequences, df.star.tolist()) if sequence]) # map every sequence to its label
        self.sequences = [self.pad(sequence) for sequence in sequences]

    # mandatory methods for custom dataloader
    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, i):
        assert len(self.sequences[i]) == self.max_seq_len
        return self.sequences[i], self.labels[i]

In [13]:
# instantiate a object of TrainData with the data loaded before

train_dataset = TrainData(train_df, max_seq_len=32)

.vector_cache/wiki.simple.vec: 293MB [00:25, 11.4MB/s]                           
  0%|          | 0/111051 [00:00<?, ?it/s]Skipping token b'111051' with 1-dimensional vector [b'300']; likely a header
100%|█████████▉| 110866/111051 [00:30<00:00, 7680.68it/s]

In [14]:
# define custom dataloader for test data (with the Vocab took from the train to avoid data leakages)

class TestData(Dataset):
    # initiate the class with df and maximum number of tokens as argument
    def __init__(self, df, max_seq_len=32, vocab=train_dataset.vocab):
        self.max_seq_len = max_seq_len
        self.vocab = vocab
        label_pipeline = lambda x: int(x) -1     # make the label range 0-4 instead of 1-5
        self.token2idx = lambda x: self.vocab[x]  # Converts token to index
        self.idx2token = lambda x: self.vocab.itos[int(x)]   # converts index to token
        self.encode = lambda x:[self.token2idx(token) for token in preprocessing(x)]    # encode every token with its index
        self.pad = lambda x: x + (max_seq_len - len(x)) * [self.token2idx("<pad>")]     # add the index of "<pad>" as many time as needed to reach max_seq_len
        sequences = [self.encode(sequence)[:max_seq_len] for sequence in df.review.tolist()]    # truncate the sequence if it's longer than max_seq_len
        sequence, self.labels = zip(*[(sequence, label_pipeline(label)) for sequence, label in zip(sequences, df.star.tolist()) if sequence]) # map every sequence to its label
        self.sequences = [self.pad(sequence) for sequence in sequences]

    # mandatory methods for custom dataloader
    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, i):
        assert len(self.sequences[i]) == self.max_seq_len
        return self.sequences[i], self.labels[i]

In [17]:
# instantiate a object of TestData with the data loaded before

test_dataset = TestData(test_df, max_seq_len=32)

In [19]:
# define collate function to create the batch (convert the sentences to tensor)

def collate(batch, vectorizer=train_dataset.vocab.vectors):
    # stacking the tokens, transforming them into vector with the vectorizer then staking them again to get the actual batch
    inputs = torch.stack([torch.stack([vectorizer[token] for token in sentence[0]]) for sentence in batch])

    # creating a tensor for the target
    target = torch.LongTensor([item[1] for item in batch])
    return inputs, target

In [113]:
# define batch_size and create a dataloader from the dataset

batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate)
dataloaders = {"train": train_loader, "val": test_loader}
dataset_sizes = {"train": len(train_df), "val": len(test_df)}


In [103]:
## Created a class with layers and activation functions to train Neural Networks
max_seq_len = 32
emb_dim = 300
class Classifier(nn.Module):
    def __init__(self, max_seq_len, emb_dim, hidden1=256, hidden2=128):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(max_seq_len * emb_dim, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, 5)
        self.out = nn.LogSoftmax(dim=1)

    def forward(self, inputs):
        x = F.relu(self.fc1(inputs.squeeze(1).float()))
        x = self.fc2(x)
        x =  self.fc3(x)

        return self.out(x)

In [86]:
model = Classifier(32, 300, 256, 128) # 32 = max_seq_leng, 300 = dim of vectors
model

Classifier(
  (fc1): Linear(in_features=9600, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=5, bias=True)
  (out): LogSoftmax(dim=1)
)

In [97]:
# NLLLoss for log_soft_max activation and Adam optimizer

criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.003)
#exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

In [None]:
"""
dataiter = iter(train_loader)
sentences, labels = dataiter.next()
sentences.resize_(16, 1, 32*emb_dim).shape
sentence_idx = 0
log_ps = model.forward(sentences[sentence_idx, :])
sentence = sentences[sentence_idx]
torch.exp(log_ps)
"""

In [88]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [89]:
device

device(type='cuda', index=0)

In [115]:
# training loop
emb_dim = 300
epochs = 10
steps = 0
running_loss = 0
print_every = 50
max_accuracy = 0
model = model.to(device)
for epoch in range(epochs):
    for sentences, labels in train_loader:
        sentences.resize_(sentences.size()[0], 32*emb_dim)
        #sentences.resize_(sentences.size()[0], 1, max_seq_len * emb_dim) # resize the vector to be of [batch size, max_seq_len * embedding_dimension] (fc layer input)

        steps += 1
        # Move input and label tensors to the default device
        sentences, labels = sentences.to(device), labels.to(device)
        
        ## bringing the weights to zero 
        optimizer.zero_grad()
        
        log_ps = model.forward(sentences)
        loss = criterion(log_ps, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        
        if steps % print_every == 0:
            test_loss = 0
            accuracy = 0
            model.eval()
            with torch.no_grad():
                for sentences, labels in test_loader:
                    sentences.resize_(sentences.size()[0], 1, 32* emb_dim)
                    sentences, labels = sentences.to(device), labels.to(device)
                    log_ps = model.forward(sentences)
                    batch_loss = criterion(log_ps, labels)
                    
                    test_loss += batch_loss.item()
                    
                    # Calculate accuracy
                    ps = torch.exp(log_ps)
                    top_p, top_class = ps.topk(1, dim=1)
                    equals = top_class == labels.view(*top_class.shape)
                    accuracy += torch.mean(equals.type(torch.FloatTensor)).item()
                    if accuracy >= max_accuracy:
                        max_accuracy = accuracy
                        torch.save(model.state_dict(), 'checkpoint.pth')
            print(f"Epoch {epoch+1}/{epochs}.. "
                  f"Train loss: {running_loss/print_every:.3f}.. "
                  f"Test loss: {test_loss/len(test_loader):.3f}.. "
                  f"Test accuracy: {accuracy/len(test_loader):.3f}")
            running_loss = 0
            model.train()

Epoch 1/10.. Train loss: 0.055.. Test loss: 27.616.. Test accuracy: 0.282
Epoch 1/10.. Train loss: 0.038.. Test loss: 26.142.. Test accuracy: 0.285
Epoch 1/10.. Train loss: 0.002.. Test loss: 25.273.. Test accuracy: 0.294
Epoch 1/10.. Train loss: 0.019.. Test loss: 25.660.. Test accuracy: 0.300
Epoch 1/10.. Train loss: 0.030.. Test loss: 26.888.. Test accuracy: 0.294
Epoch 1/10.. Train loss: 0.100.. Test loss: 28.030.. Test accuracy: 0.275
Epoch 2/10.. Train loss: 0.019.. Test loss: 28.541.. Test accuracy: 0.279
Epoch 2/10.. Train loss: 0.030.. Test loss: 29.374.. Test accuracy: 0.284
Epoch 2/10.. Train loss: 0.003.. Test loss: 27.727.. Test accuracy: 0.290
Epoch 2/10.. Train loss: 0.080.. Test loss: 27.105.. Test accuracy: 0.280
Epoch 2/10.. Train loss: 0.126.. Test loss: 26.783.. Test accuracy: 0.285
Epoch 2/10.. Train loss: 0.022.. Test loss: 27.000.. Test accuracy: 0.281
Epoch 3/10.. Train loss: 0.024.. Test loss: 27.492.. Test accuracy: 0.291
Epoch 3/10.. Train loss: 0.004.. Test 

In [108]:
def train_model(model, criterion, optimizer, dataloaders, emb_dim, epochs=15):
    since = time.time()
    best_acc = 0.0

    for e in range(0,epochs):
        print("_____Epoch {}/{} ____".format(e+1, epochs))
        
        for phase in ["train","val"]:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode
            
            running_loss = 0.0
            running_corrects = 0

            for sentences, labels in dataloaders[phase]:
                # sentences.resize_(sentences.shape[0], 1,  32*emb_dim)
                sentences.resize_(sentences.size()[0], 32*emb_dim)
                sentences, labels = sentences.to(device), labels.to(device)
                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(sentences)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * sentences.size(0)
                running_corrects += torch.sum(preds == labels.data)
            #if phase == 'train':
                #scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))  

In [114]:
import time
train_model(model, criterion, optimizer, dataloaders,emb_dim=300, epochs=10)

_____Epoch 1/10 ____
train Loss: 0.1288 Acc: 0.9854

val Loss: 24.5293 Acc: 0.2860

_____Epoch 2/10 ____
train Loss: 0.0809 Acc: 0.9922

val Loss: 25.0115 Acc: 0.3090

_____Epoch 3/10 ____
train Loss: 0.1264 Acc: 0.9872

val Loss: 25.3238 Acc: 0.3010

_____Epoch 4/10 ____
train Loss: 0.0799 Acc: 0.9918

val Loss: 26.9982 Acc: 0.2910

_____Epoch 5/10 ____
train Loss: 0.0437 Acc: 0.9938

val Loss: 27.2009 Acc: 0.3100

_____Epoch 6/10 ____
train Loss: 0.0370 Acc: 0.9930

val Loss: 24.4489 Acc: 0.2920

_____Epoch 7/10 ____
train Loss: 0.0539 Acc: 0.9958

val Loss: 27.0281 Acc: 0.2830

_____Epoch 8/10 ____
train Loss: 0.0958 Acc: 0.9888

val Loss: 32.3303 Acc: 0.2930

_____Epoch 9/10 ____
train Loss: 0.1615 Acc: 0.9848

val Loss: 26.2909 Acc: 0.2860

_____Epoch 10/10 ____
train Loss: 0.1352 Acc: 0.9902

val Loss: 26.6597 Acc: 0.2970

Training complete in 0m 16s
Best val Acc: 0.000000


In [None]:
from google.colab import drive
drive.mount('/content/drive')

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=d7499417-dbdf-4925-a286-60df25457ef5' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>