<a href="https://colab.research.google.com/github/skanderbenmansour/nlp_study_group/blob/master/nina/movie_reviews_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install skorch

Collecting skorch
[?25l  Downloading https://files.pythonhosted.org/packages/42/21/4936b881b33de285faa0b36209afe4f9724a0875b2225abdc63b23d384a3/skorch-0.8.0-py3-none-any.whl (113kB)
[K     |██▉                             | 10kB 16.0MB/s eta 0:00:01[K     |█████▊                          | 20kB 1.7MB/s eta 0:00:01[K     |████████▋                       | 30kB 2.3MB/s eta 0:00:01[K     |███████████▌                    | 40kB 2.5MB/s eta 0:00:01[K     |██████████████▍                 | 51kB 2.0MB/s eta 0:00:01[K     |█████████████████▎              | 61kB 2.3MB/s eta 0:00:01[K     |████████████████████▏           | 71kB 2.5MB/s eta 0:00:01[K     |███████████████████████         | 81kB 2.7MB/s eta 0:00:01[K     |██████████████████████████      | 92kB 2.9MB/s eta 0:00:01[K     |████████████████████████████▉   | 102kB 2.8MB/s eta 0:00:01[K     |███████████████████████████████▊| 112kB 2.8MB/s eta 0:00:01[K     |████████████████████████████████| 122kB 2.8MB/s 
Install

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import re
from random import shuffle

from skorch import NeuralNetClassifier
from sklearn.datasets import make_classification

torch.manual_seed(1)

<torch._C.Generator at 0x7f19236d2bf0>

# Embeddings

## Load and preprocess

In [0]:
path_pos = '/content/drive/My Drive/Datasets/review_polarity/txt_sentoken/pos'
path_neg = '/content/drive/My Drive/Datasets/review_polarity/txt_sentoken/neg'

In [0]:
def preprocess(doc):
    return [re.sub('[^\w]|[\d]','', word.lower()) for word in doc]

def remove_words(vocab):
    return [i for i in vocab if i not in ['', 'a','it','the', 'i']]

def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)

def make_ngram_vector(sentence, word_to_ix, n=3):
    ngram_vectors = []
    for i in range(len(sentence) - n):
        curr_ngram = [[], None]     
        for j in range(n):
            if j == n-1:
                curr_ngram[1] = sentence[i + j]
            else:
                curr_ngram[0].append(sentence[i + j])
        ngram_vectors.append(curr_ngram)

    return ngram_vectors

In [5]:
import glob

data = []

# load data in
for path in glob.glob(path_pos+'/*.txt'):
  with open(path, 'r') as f:
    data.append((preprocess(f.read().replace('\n', ' ').split()), "P"))

print('finished positive reviews')

for path in glob.glob(path_neg+'/*.txt'):
  with open(path, 'r') as f:
    data.append((preprocess(f.read().replace('\n', ' ').split()), "N"))

print('finished negative reviews')

finished positive reviews
finished negative reviews


## Transform sentences

In [6]:
data[0][0][1]

'am'

In [0]:
import numpy as np

data_text = np.array([np.array(text)[:50] for text, label in data])

In [0]:
data_text = np.concatenate(data_text)

In [26]:
set("hello there".split())

{'hello', 'there'}

In [0]:
vocab = set(remove_words(data_text))
word_to_ix = {word: i for i, word in enumerate(vocab)}

# triples
ngram = []
X_ngram = []

ngram = make_ngram_vector(remove_words(data_text), word_to_ix)

In [0]:
X = [make_context_vector(i, word_to_ix) for i,j in ngram]
y = [torch.tensor([word_to_ix[j]], dtype=torch.long) for i,j in ngram ]

In [29]:
X[1], y[1]

(tensor([5478, 5480]), tensor([1498]))

# Model

In [0]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10
VOCAB_SIZE = len(vocab)


class NGramLanguageModeler(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        # print(inputs.shape)
        embeds = self.embeddings(inputs).view((-1, 20))
        out = self.linear1(embeds)
        out = F.relu(out)
        out = self.linear2(out)
        log_probs = F.softmax(out, dim=1)
        return log_probs


net_ngram = NeuralNetClassifier(
    NGramLanguageModeler(VOCAB_SIZE, EMBEDDING_DIM, CONTEXT_SIZE),
    max_epochs=20,
    lr=0.1,
    optimizer=optim.SGD,
    optimizer__lr=0.01,
    #train_split=None,
    # Shuffle training data on each epoch
    iterator_train__shuffle=False,
)

In [31]:
torch.stack(X).squeeze().shape

torch.Size([76806, 2])

In [0]:
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(torch.stack(X).squeeze(), torch.stack(y).squeeze(), test_size=0.25, random_state=42)

In [33]:
# X_train.shape, y_train.shape

(torch.Size([57604, 2]), torch.Size([57604]))

# Train

In [0]:
X_e = torch.stack(X).squeeze()
y_e = torch.stack(y).squeeze()

In [35]:
X_e.shape, y_e.shape

(torch.Size([76806, 2]), torch.Size([76806]))

In [36]:
net_ngram.fit(X_e, y_e)  



  epoch    train_loss    valid_acc    valid_loss      dur
-------  ------------  -----------  ------------  -------
      1        [36m9.3681[0m       [32m0.0318[0m        [35m9.2758[0m  26.2587
      2        [36m9.1761[0m       [32m0.0329[0m        [35m9.0737[0m  32.1723
      3        [36m8.9557[0m       0.0329        [35m8.8415[0m  31.9417
      4        [36m8.7282[0m       0.0324        [35m8.6360[0m  31.6852
      5        [36m8.5474[0m       0.0316        [35m8.4833[0m  31.5739
      6        [36m8.4090[0m       0.0324        [35m8.3587[0m  31.5368
      7        [36m8.2891[0m       0.0329        [35m8.2466[0m  31.5278
      8        [36m8.1824[0m       [32m0.0338[0m        [35m8.1500[0m  31.7271
      9        [36m8.0929[0m       0.0332        [35m8.0713[0m  31.5961
     10        [36m8.0199[0m       [32m0.0346[0m        [35m8.0074[0m  31.8888
     11        [36m7.9594[0m       [32m0.0356[0m        [35m7.9542[0m  31.8606
   

<class 'skorch.classifier.NeuralNetClassifier'>[initialized](
  module_=NGramLanguageModeler(
    (embeddings): Embedding(12359, 10)
    (linear1): Linear(in_features=20, out_features=128, bias=True)
    (linear2): Linear(in_features=128, out_features=12359, bias=True)
  ),
)