In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

import spacy

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset

from live_plot import LivePlot

In [2]:
device = 'cuda'

In [3]:
df = pd.read_csv('./../sampled_df.csv')

In [4]:
texts = np.array(df['reviewText'])
labels = np.array(df['overall']).reshape(-1, 1)

In [5]:
for text in texts:
  if type(text) != str:
    print(text)
    print('warning')

In [6]:
nlp = spacy.load('en_core_web_lg')

def prepare_texts(texts):
    text_matrices = []
    docs = nlp.pipe(texts, batch_size=1000)
    for doc in docs:
        mat = np.array([token.vector for token in doc])
        mat = torch.tensor(mat, dtype=torch.float32)
        text_matrices.append(mat)
        
    return text_matrices

text_matrices = prepare_texts(texts)

In [11]:
torch.save(text_matrices, 'text_matrices.pt')

In [12]:
text_matrices_loaded = torch.load('text_matrices.pt')

In [14]:
len(text_matrices_loaded)

10000

In [10]:
for i in range(10):
    print(text_matrices[i].shape)

torch.Size([13, 300])
torch.Size([33, 300])
torch.Size([39, 300])
torch.Size([244, 300])
torch.Size([59, 300])
torch.Size([7, 300])
torch.Size([82, 300])
torch.Size([131, 300])
torch.Size([9, 300])
torch.Size([66, 300])


In [8]:
text_matrices.shape

torch.Size([10000, 300])

In [7]:
texts = np.array(df['reviewText'])

nlp = spacy.load('en_core_web_lg')

def prepare_text(text):
  doc = nlp(text)
  vectors =  torch.tensor(np.array(doc.vector), dtype=torch.float32)
  return vectors

texts_matrices = [prepare_text(text) for text in texts]

In [6]:
# df = df.dropna()

In [7]:
nlp = spacy.load('en_core_web_lg')

def prepare_text(text):
  doc = nlp(text)
  vectors =  torch.tensor(np.array([token.vector for token in doc]), dtype=torch.float32)
  return vectors

texts_matrices = [prepare_text(text) for text in texts]

In [None]:
pt_vec = np.vectorize(prepare_text)

In [11]:
text_matrices2 = pt_vec(texts)

ValueError: only one element tensors can be converted to Python scalars

In [9]:
texts_matrices = [pt_vec(text) for text in texts]

TypeError: Argument 'string' has incorrect type (expected str, got numpy.str_)

In [None]:
X = pad_sequence(texts_matrices, batch_first=True)
y = torch.tensor(labels, dtype=torch.float32)

In [None]:
# changing a shape of X because pytorch's conv1d takes the second dimension as channels (and we want word features to be channels and slide by ngram words crops)

X = X.movedim(1, 2)

In [None]:
X.shape

torch.Size([10000, 300, 358])

In [None]:
dev_size = 0.15
test_size = 0.15

bord1 = int(len(X) * (1 - (dev_size + test_size)))
bord2 = int(len(X) * (1 - test_size))

X_train, X_dev, X_test = X[:bord1].to(device), X[bord1:bord2].to(device), X[bord2:]
y_train, y_dev, y_test = y[:bord1].to(device), y[bord1:bord2].to(device), y[bord2:]

In [None]:
class CustomDataset(Dataset):
    def __init__(self, input_data, labels):
        self.input_data = input_data
        self.labels = labels

    def __len__(self):
        return len(self.input_data)

    def __getitem__(self, idx):
        return self.input_data[idx], self.labels[idx]

In [None]:
train_ds = CustomDataset(X_train, y_train)
train_dl = DataLoader(train_ds, batch_size=256, shuffle=False, drop_last=True)

In [None]:
input_channels = X_train.shape[1]
input_len = X_train.shape[2]
output_size = 1

In [None]:
from collections.abc import Iterable

class Multiple1DConv(nn.Module):
  def __init__(self, input_channels: int, out_channels: int = 64, kernel_sizes: Iterable[int] = (3, 4, 5)):
    super().__init__()

    self.multiple_1d_convs = nn.ModuleList()

    for kernel_size in kernel_sizes:

      self.multiple_1d_convs.append(
        nn.Sequential(
          nn.Conv1d(input_channels, out_channels,
                kernel_size=kernel_size, padding=(kernel_size // 2)),
          nn.AdaptiveMaxPool1d(1)
        )
      )

  def forward(self, inputs):
    outputs = torch.concat([conv_1d_max(inputs) for conv_1d_max in self.multiple_1d_convs], dim=1)
    return outputs

In [None]:
class Mulpiple1DBinaryClass(nn.Module):
  def __init__(self, input_channels, output_size, kernel_sizes=(3, 4, 5), out_channels=64):
    super().__init__()
    
    self.multiple_cnn_max_linear_relu_sigmoid = nn.Sequential(
      Multiple1DConv(input_channels, out_channels=out_channels, kernel_sizes=kernel_sizes),

      nn.Flatten(),

      nn.Linear(out_channels * len(kernel_sizes), 256),
      nn.ReLU(),
      nn.Linear(256, 256),
      nn.ReLU(),
      nn.Linear(256, output_size),
      nn.Sigmoid()
    )


  def forward(self, x):
    return self.multiple_cnn_max_linear_relu_sigmoid(x)


In [None]:
model = Mulpiple1DBinaryClass(input_channels, output_size, kernel_sizes=(3, 4, 5, 6, 7), out_channels=64).to(device)

train_loss_histroty = []
dev_loss_history = []


In [None]:
torch.nn.utils.parameters_to_vector(model.parameters()).numel()

625793

In [None]:
loss_fn = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0000003, weight_decay=2, betas=(0.9, 0.999))

In [190]:
plt.close()
%matplotlib qt
live_plot = LivePlot(slice_prop=0.03)

In [None]:
epochs = 10

In [197]:
def train():
  for epoch in range(epochs):
    model.train()
    print(f'Epoch {epoch+1}')
    optimizer.zero_grad()

    for (X_batch, y_batch) in train_dl:
      batch_outputs = model(X_batch)
      loss_batch = loss_fn(batch_outputs, y_batch)
      loss_batch.backward()
      optimizer.step()

    with torch.no_grad():
      model.eval()
      # measure train and dev loss at the end of epoch
      train_outputs = model(X_train)
      train_loss = loss_fn(train_outputs, y_train)
      train_loss_histroty.append(train_loss.item())

      dev_outputs = model(X_dev)
      dev_loss = loss_fn(dev_outputs, y_dev)
      dev_loss_history.append(dev_loss.item())

      print(f'Train loss: {train_loss}; Dev loss: {dev_loss}')
      live_plot.update(train_loss_histroty, dev_loss_history)

Epoch 1
Train loss: 0.12367183715105057; Dev loss: 0.2571112811565399


In [None]:
plt.close()
%matplotlib inline

plt.plot(train_loss_histroty)
plt.plot(dev_loss_history)

In [183]:
# torch.save(model.state_dict(), './models/best_model_mc_cnn_arch2.pt')

In [198]:
def test(X_test, y_test):
  with torch.no_grad():
    model.eval()

    X_test = X_test.to(device)
    y_test = y_test.to(device)

    print('ACCRURACIES:')
    accuracies = []

    Xs = [X_train, X_dev, X_test]
    ys = [y_train, y_dev, y_test]
    comments = ['1) Train', '2) Dev', '3) Test']

    for x_i, y_i, comment in zip(Xs, ys, comments):
      output = model(x_i).to('cpu')
      labels_pred = torch.tensor([round(val.item()) for val in output], device='cpu')


      y_true = y_i.clone().to('cpu').reshape(-1)

      accuracy = (labels_pred == y_true).sum().item() / y_i.shape[0]
      accuracies.append(accuracy)

      print(comment)
      print((labels_pred == y_true).sum().item(), '/', len(y_true))
      print(round(accuracy*100, 3), '%')

In [199]:
test(X_test, y_test)

ACCRURACIES:
1) Train
6869 / 7000
98.129 %
2) Dev
1346 / 1500
89.733 %
3) Test
1349 / 1500
89.933 %


In [181]:
test(X_test, y_test)

ACCRURACIES:
1) Train
6863 / 7000
98.043 %
2) Dev
1343 / 1500
89.533 %
3) Test
1350 / 1500
90.0 %
