In [142]:
import pandas as pd
import gzip
import json

import numpy as np 
import matplotlib.pyplot as plt

import spacy

import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset

from live_plot import LivePlot

In [3]:
# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [4]:
# import gc
# torch.cuda.empty_cache()
# gc.collect()

In [36]:
device = 'cuda'

In [6]:
df = pd.read_csv('./sampled_df.csv')

In [7]:
texts = np.array(df['reviewText'])
labels = np.array(df['overall']).reshape(-1, 1)

In [8]:
for text in texts:
  if type(text) != str:
    print(text)
    print('warning')

In [9]:
# df = df.dropna()

In [10]:
nlp = spacy.load('en_core_web_lg')

def prepare_text(text):
  doc = nlp(text)
  vectors =  torch.tensor(np.array([token.vector for token in doc]), dtype=torch.float32)
  return vectors


In [11]:
texts_matrices = [prepare_text(text) for text in texts]


In [12]:
X = pad_sequence(texts_matrices, batch_first=True)
y = torch.tensor(labels, dtype=torch.float32)

In [13]:
# changing a shape of X because pytorch's conv1d takes the second dimension as channels (and we want word features to be channels and slide by ngram words crops)

X = X.movedim(1, 2)

In [14]:
X.shape

torch.Size([10000, 300, 358])

In [15]:
dev_size = 0.15
test_size = 0.15

bord1 = int(len(X) * (1 - (dev_size + test_size)))
bord2 = int(len(X) * (1 - test_size))

X_train, X_dev, X_test = X[:bord1].to(device), X[bord1:bord2].to(device), X[bord2:]
y_train, y_dev, y_test = y[:bord1].to(device), y[bord1:bord2].to(device), y[bord2:]

In [16]:
# %matplotlib inline
# plt.hist(np.array(y_test.clone().to('cpu')).reshape(-1))

In [17]:
class CustomDataset(Dataset):
    def __init__(self, input_data, labels):
        self.input_data = input_data
        self.labels = labels

    def __len__(self):
        return len(self.input_data)

    def __getitem__(self, idx):
        return self.input_data[idx], self.labels[idx]


In [18]:
train_ds = CustomDataset(X_train, y_train)
train_dl = DataLoader(train_ds, batch_size=256, shuffle=False, drop_last=True)

In [19]:
input_channels = X_train.shape[1]
input_len = X_train.shape[2]
output_size = 1

In [20]:
class SentimentBinaryClass(nn.Module):
  def __init__(self, input_channels, input_len, output_size):
    super().__init__()
    
    self.cnn_mpool_linear_relu_sigmoid = nn.Sequential(
      nn.Conv1d(input_channels, 64, kernel_size=5, padding=2),
      nn.MaxPool1d(kernel_size=2, stride=2),
      
      nn.Conv1d(64, 64, kernel_size=5, padding=2),
      nn.MaxPool1d(kernel_size=2, stride=2),

      nn.Flatten(),

      nn.Linear(64*(input_len // 2 // 2), 2048),
      nn.ReLU(),
      nn.Linear(2048, 2048),
      nn.ReLU(),
      nn.Linear(2048, output_size),
      nn.Sigmoid()
    )


  def forward(self, x):
    return self.cnn_mpool_linear_relu_sigmoid(x)


In [59]:
model = SentimentBinaryClass(input_channels, input_len, output_size).to(device)

train_loss_histroty = []
dev_loss_history = []

In [20]:
# torch.nn.utils.parameters_to_vector(model.parameters()).numel()

In [203]:
loss_fn = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0000001, weight_decay=2)

In [200]:
plt.close()
%matplotlib qt
live_plot = live_plot.LivePlot(slice_prop=0.01)

In [204]:
for epoch in range(10):
  print(f'Epoch {epoch+1}')
  optimizer.zero_grad()

  for (X_batch, y_batch) in train_dl:
    batch_outputs = model(X_batch)
    loss_batch = loss_fn(batch_outputs, y_batch)
    loss_batch.backward()
    optimizer.step()

  with torch.no_grad():
    # measure train and dev loss at the end of epoch
    train_outputs = model(X_train)
    train_loss = loss_fn(train_outputs, y_train)
    train_loss_histroty.append(train_loss.item())

    dev_outputs = model(X_dev)
    dev_loss = loss_fn(dev_outputs, y_dev)
    dev_loss_history.append(dev_loss.item())

    print(f'Train loss: {train_loss}; Dev loss: {dev_loss}')
    live_plot.update(train_loss_histroty, dev_loss_history)

Epoch 1
Train loss: 0.217936173081398; Dev loss: 0.35606250166893005
Epoch 2
Train loss: 0.21803295612335205; Dev loss: 0.356123685836792
Epoch 3
Train loss: 0.21790429949760437; Dev loss: 0.3563642203807831
Epoch 4
Train loss: 0.21806053817272186; Dev loss: 0.3561486005783081
Epoch 5
Train loss: 0.21784688532352448; Dev loss: 0.35623452067375183
Epoch 6
Train loss: 0.21785248816013336; Dev loss: 0.3561565577983856
Epoch 7
Train loss: 0.21788650751113892; Dev loss: 0.3561250865459442
Epoch 8
Train loss: 0.21780245006084442; Dev loss: 0.35620248317718506
Epoch 9
Train loss: 0.2178519368171692; Dev loss: 0.35611605644226074
Epoch 10
Train loss: 0.21776936948299408; Dev loss: 0.35615843534469604
Epoch 11
Train loss: 0.21779771149158478; Dev loss: 0.35610726475715637
Epoch 12
Train loss: 0.21773959696292877; Dev loss: 0.3561405837535858
Epoch 13
Train loss: 0.21775715053081512; Dev loss: 0.3561057150363922
Epoch 14
Train loss: 0.21770671010017395; Dev loss: 0.3561266362667084
Epoch 15
Trai

KeyboardInterrupt: 

In [None]:
plt.close()
%matplotlib inline

plt.plot(train_loss_histroty)
plt.plot(dev_loss_history)

In [93]:
torch.save(model.state_dict(), './models/best_model_regular_cnn_arch1.pt')

In [205]:
with torch.no_grad():
  X_test = X_test.to(device)
  y_test = y_test.to(device)

  print('ACCRURACIES:')
  accuracies = []

  Xs = [X_train, X_dev, X_test]
  ys = [y_train, y_dev, y_test]
  comments = ['1) Train', '2) Dev', '3) Test']

  for x_i, y_i, comment in zip(Xs, ys, comments):
    output = model(x_i).to('cpu')
    labels_pred = torch.tensor([round(val.item()) for val in output], device='cpu')


    y_true = y_i.clone().to('cpu').reshape(-1)

    accuracy = (labels_pred == y_true).sum().item() / y_i.shape[0]
    accuracies.append(accuracy)

    print(comment)
    print((labels_pred == y_true).sum().item(), '/', len(y_true))
    print(accuracy)

ACCRURACIES:


1) Train
6550 / 7000
0.9357142857142857
2) Dev
1268 / 1500
0.8453333333333334
3) Test
1303 / 1500
0.8686666666666667


In [86]:
with torch.no_grad():
  X_test = X_test.to(device)
  y_test = y_test.to(device)

  print('ACCRURACIES:')
  accuracies = []

  Xs = [X_train, X_dev, X_test]
  ys = [y_train, y_dev, y_test]
  comments = ['1) Train', '2) Dev', '3) Test']

  for x_i, y_i, comment in zip(Xs, ys, comments):
    output = model(x_i).to('cpu')
    labels_pred = torch.tensor([round(val.item()) for val in output], device='cpu')


    y_true = y_i.clone().to('cpu').reshape(-1)

    accuracy = (labels_pred == y_true).sum().item() / y_i.shape[0]
    accuracies.append(accuracy)

    print(comment)
    print((labels_pred == y_true).sum().item(), '/', len(y_true))
    print(accuracy)

ACCRURACIES:
1) Train
6556 / 7000
0.9365714285714286
2) Dev
1274 / 1500
0.8493333333333334
3) Test
1293 / 1500
0.862


In [78]:
with torch.no_grad():
  X_test = X_test.to(device)
  y_test = y_test.to(device)

  print('ACCRURACIES:')
  accuracies = []

  Xs = [X_train, X_dev, X_test]
  ys = [y_train, y_dev, y_test]
  comments = ['1) Train', '2) Dev', '3) Test']

  for x_i, y_i, comment in zip(Xs, ys, comments):
    output = model(x_i).to('cpu')
    labels_pred = torch.tensor([round(val.item()) for val in output], device='cpu')


    y_true = y_i.clone().to('cpu').reshape(-1)

    accuracy = (labels_pred == y_true).sum().item() / y_i.shape[0]
    accuracies.append(accuracy)

    print(comment)
    print((labels_pred == y_true).sum().item(), '/', len(y_true))
    print(accuracy)

ACCRURACIES:


1) Train
6476 / 7000
0.9251428571428572
2) Dev
1270 / 1500
0.8466666666666667
3) Test
1284 / 1500
0.856
