In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amazonreviews/test.ft.txt.bz2
/kaggle/input/amazonreviews/train.ft.txt.bz2


In [2]:
import pandas as pd
import numpy as np
import bz2
import matplotlib.pyplot as plt
import sklearn
import tensorflow as tf
from tensorflow.keras import models, layers, optimizers
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences

%matplotlib inline

In [3]:
device = "cuda:0"

In [4]:
def labels_texts(file):
    labels = []
    texts = []
    for line in bz2.BZ2File(file):
        x = line.decode("utf-8")
        labels.append(int(x[9]) - 1)
        texts.append(x[10:].strip())
    return np.array(labels), texts



In [5]:
train_label, train_text = labels_texts('../input/amazonreviews/train.ft.txt.bz2')
test_label, test_text = labels_texts('../input/amazonreviews/test.ft.txt.bz2')

In [6]:
print(train_label[0])
print(train_text[0])

1
Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^


In [7]:
import re
not_numChar = re.compile(r'[\W]')
no_encode = re.compile(r'[^a-z0-1\s]')
def normalisation(texts):
    norm_text = []
    for word in texts:
        lower = word.lower()
        not_punct = not_numChar.sub(r' ', lower)
        exclude_no_encode = no_encode.sub(r'', not_punct)
        norm_text.append(exclude_no_encode)
    return norm_text

In [8]:
train_text = normalisation(train_text)
test_text = normalisation(test_text)

In [9]:
print(train_text[0])

stuning even for the non gamer  this sound track was beautiful  it paints the senery in your mind so well i would recomend it even to people who hate vid  game music  i have played the game chrono cross but out of all of the games i have ever played it has the best music  it backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras  it would impress anyone who cares to listen    


In [10]:
y_train = np.array(train_label)
y_test = np.array(test_label)

In [11]:
y_test.shape

(400000,)

In [12]:
max_features = 8192
maxlen = 128

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_text)

word_index = tokenizer.word_index
vocab_size = len(tokenizer.word_index) + 1
print("Vocabulary Size :", vocab_size)

Vocabulary Size : 905946


In [13]:
import os 
with open('amazon_dictionary.txt', 'w') as file:
    for key in word_index.keys():
        file.write(key + " " + str(word_index[key]) + ",")

In [14]:
training_token = tokenizer.texts_to_sequences(train_text)
testing_token = tokenizer.texts_to_sequences(test_text)

In [15]:
x_train = pad_sequences(training_token, maxlen = maxlen, padding = 'post')
x_test = pad_sequences(testing_token, maxlen = maxlen, padding = 'post')

In [16]:
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
BATCH_SIZE = 50

train_data = TensorDataset(torch.from_numpy(x_train), torch.from_numpy(y_train))
test_data = TensorDataset(torch.from_numpy(x_test), torch.from_numpy(y_test))

train_loader = DataLoader(train_data, shuffle=True, batch_size=BATCH_SIZE, drop_last = True)
test_loader = DataLoader(test_data, shuffle=True, batch_size=BATCH_SIZE, drop_last = True)

In [17]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)
    
class SentimentNet(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.2):
        super(SentimentNet, self).__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(drop_prob)
        self.embedding_dim = embedding_dim
        self.sigmoid = nn.Sigmoid()
        
        
        self.encoder = nn.Embedding(vocab_size, embedding_dim)
        self.pos_encoder = PositionalEncoding(embedding_dim, drop_prob)
        encoder_layer = nn.TransformerEncoderLayer(embedding_dim, 2, hidden_dim)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
        
        
        self.fc = nn.Linear(embedding_dim * 128 , 1)

    
    def forward(self, src):
        src = self.encoder(src) * math.sqrt(self.embedding_dim)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src)
        output = output.view(output.size(0), -1)
        
        output = self.sigmoid(self.fc(output))
        return output

In [18]:
def model_params(model):
    pp=0
    for p in list(model.parameters()):
        nn=1
        for s in list(p.size()):
            nn = nn*s
        pp += nn
    return pp

In [19]:
import math

output_size = 1
embedding_dim = 128
hidden_dim = 4
n_layers = 1

model = SentimentNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
model.to(device)
print(model_params(model))
lr=0.01
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

116045189


In [20]:
epochs = 1
counter = 0
print_every = 500
clip = 5
valid_loss_min = np.Inf

model.train()
for i in range(epochs):
    
    for inputs, labels in train_loader:
        counter += 1
        inputs, labels = inputs.to(device), labels.to(device)
        model.zero_grad()
        output = model(inputs.long())
        
        loss = criterion(output.squeeze(1), labels.float())
        
        
        loss.backward()
        #nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        if counter%print_every == 0:
            val_losses = []
            model.eval()
            for inp, lab in test_loader:
                inp, lab = inp.to(device), lab.to(device)
                out = model(inp.long())
                val_loss = criterion(out.squeeze(1), lab.float())
                val_losses.append(val_loss.item())
                
            model.train()
            print("Epoch: {}/{}...".format(i+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))
            if np.mean(val_losses) <= valid_loss_min:
                torch.save(model.state_dict(), './state_dict.pt')
                print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,np.mean(val_losses)))
                valid_loss_min = np.mean(val_losses)

Epoch: 1/1... Step: 500... Loss: 44.000000... Val Loss: 50.000000
Validation loss decreased (inf --> 50.000000).  Saving model ...
Epoch: 1/1... Step: 1000... Loss: 50.000000... Val Loss: 50.000000
Validation loss decreased (50.000000 --> 50.000000).  Saving model ...
Epoch: 1/1... Step: 1500... Loss: 56.000000... Val Loss: 50.000000
Validation loss decreased (50.000000 --> 50.000000).  Saving model ...
Epoch: 1/1... Step: 2000... Loss: 40.000000... Val Loss: 50.000000
Validation loss decreased (50.000000 --> 50.000000).  Saving model ...
Epoch: 1/1... Step: 2500... Loss: 52.000000... Val Loss: 50.000000
Validation loss decreased (50.000000 --> 50.000000).  Saving model ...
Epoch: 1/1... Step: 3000... Loss: 44.000000... Val Loss: 50.000000
Validation loss decreased (50.000000 --> 50.000000).  Saving model ...
Epoch: 1/1... Step: 3500... Loss: 48.000000... Val Loss: 50.000000
Validation loss decreased (50.000000 --> 50.000000).  Saving model ...
Epoch: 1/1... Step: 4000... Loss: 46.0000

In [21]:
sentence = "I love you"
trial = torch.tensor(pad_sequences(tokenizer.texts_to_sequences([sentence]), maxlen = maxlen)).long().to(device)

model(trial)

tensor([[1.]], device='cuda:0', grad_fn=<SigmoidBackward>)

In [22]:
import torch.onnx
torch.onnx.export(model,               # model being run
                  trial,                  # model input (or a tuple for multiple inputs)
                  "transformer-amazon.onnx", opset_version = 11)