In [3]:
!pip install skorch

Collecting skorch
[?25l  Downloading https://files.pythonhosted.org/packages/42/21/4936b881b33de285faa0b36209afe4f9724a0875b2225abdc63b23d384a3/skorch-0.8.0-py3-none-any.whl (113kB)
[K     |██▉                             | 10kB 19.4MB/s eta 0:00:01[K     |█████▊                          | 20kB 1.7MB/s eta 0:00:01[K     |████████▋                       | 30kB 2.5MB/s eta 0:00:01[K     |███████████▌                    | 40kB 1.7MB/s eta 0:00:01[K     |██████████████▍                 | 51kB 2.1MB/s eta 0:00:01[K     |█████████████████▎              | 61kB 2.5MB/s eta 0:00:01[K     |████████████████████▏           | 71kB 2.9MB/s eta 0:00:01[K     |███████████████████████         | 81kB 2.2MB/s eta 0:00:01[K     |██████████████████████████      | 92kB 2.5MB/s eta 0:00:01[K     |████████████████████████████▉   | 102kB 2.7MB/s eta 0:00:01[K     |███████████████████████████████▊| 112kB 2.7MB/s eta 0:00:01[K     |████████████████████████████████| 122kB 2.7MB/s 
Install

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import re
from random import shuffle

from skorch import NeuralNetClassifier
from sklearn.datasets import make_classification

torch.manual_seed(1)

<torch._C.Generator at 0x7fc6a313a090>

# Bag of words

## Load and preprocess


In [0]:
path_pos = '/content/drive/My Drive/Datasets/review_polarity/txt_sentoken/pos'
path_neg = '/content/drive/My Drive/Datasets/review_polarity/txt_sentoken/neg'

In [0]:
def preprocess(doc):
    return [re.sub('[^\w]|[\d]','', word.lower()) for word in doc]

def generate_word_mapping(data):
    # word_to_ix maps each word in the vocab to a unique integer, which will be its
    # index into the Bag of words vector
    word_to_ix = {}
    for sent, _ in data:
        for word in sent:
            if word not in word_to_ix:
                word_to_ix[word] = len(word_to_ix)
    return word_to_ix

def make_bow_vector(sentence, word_to_ix):
    vec = torch.zeros(len(word_to_ix))
    for word in sentence:
        vec[word_to_ix[word]] += 1
    return vec.view(1, -1)

def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)

def make_ngram_vector(sentence, word_to_ix, n=3):
    ngram_vectors = []
    for i in range(len(sentence) - n):
        curr_ngram = []      
        for j in range(n):
            curr_ngram.append(sentence[i + j])

        ngram_vectors.append(make_context_vector(curr_ngram, word_to_ix))

    return torch.stack(ngram_vectors)

def make_target(label, label_to_ix):
    return torch.LongTensor([label_to_ix[label]])

def get_probs(sm, output):
    probabilities = sm(output) 
    return probabilities

In [7]:
preprocess(['hello','there!','Why.','are','you**!2','here'])

['hello', 'there', 'why', 'are', 'you', 'here']

In [96]:
import glob

data = []

# load data in
for path in glob.glob(path_pos+'/*.txt'):
  with open(path, 'r') as f:
    data.append((preprocess(f.read().replace('\n', ' ').split()), "P"))

print('finished positive reviews')

for path in glob.glob(path_neg+'/*.txt'):
  with open(path, 'r') as f:
    data.append((preprocess(f.read().replace('\n', ' ').split()), "N"))

print('finished negative reviews')

finished positive reviews
finished negative reviews


## Transform sentences

In [0]:
label_to_ix = {"N": 0, "P": 1}
word_to_ix = generate_word_mapping(data)

X = []
y = []
for instance, label in data:
    words = [i for i in instance if i not in ['', 'a','it','the']][:200]
    X.append(make_bow_vector(words, word_to_ix))
    y.append(make_target(label, label_to_ix))

In [98]:
[i for i in data[0][0] if i not in ['', 'a','it', 'the']][:2]

['i', 'am']

In [99]:
(data[0][1], data[1][1]), X[:2], y[:2]

(('P', 'P'),
 [tensor([[6., 1., 1.,  ..., 0., 0., 0.]]),
  tensor([[2., 1., 0.,  ..., 0., 0., 0.]])],
 [tensor([1]), tensor([1])])

## Build model

In [0]:
VOCAB_SIZE = len(word_to_ix)
NUM_LABELS = 2
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10

class BoWClassifier(nn.Module):  # inheriting from nn.Module!

    def __init__(self, num_labels, vocab_size):
        # calls the init function of nn.Module.  Dont get confused by syntax,
        # just always do it in an nn.Module
        super(BoWClassifier, self).__init__()

        # Define the parameters that you will need.  In this case, we need A and b,
        # the parameters of the affine mapping.
        # Torch defines nn.Linear(), which provides the affine map.
        # Make sure you understand why the input dimension is vocab_size
        # and the output is num_labels!
        self.linear = nn.Linear(vocab_size, num_labels)

        # NOTE! The non-linearity log softmax does not have parameters! So we don't need
        # to worry about that here

    def forward(self, bow_vec):
        # Pass the input through the linear layer,
        # then pass that through log_softmax.
        # Many non-linearities and other functions are in torch.nn.functional
        return F.softmax(self.linear(bow_vec), dim=1)



net_bag = NeuralNetClassifier(
    BoWClassifier(NUM_LABELS, VOCAB_SIZE),
    criterion=torch.nn.NLLLoss,
    max_epochs=20,
    lr=0.00025,
    optimizer=optim.SGD,
    optimizer__lr=0.1,
    # Shuffle training data on each epoch
    iterator_train__shuffle=False,
)

# Train

In [0]:
import numpy as np

In [42]:
torch.stack(X).shape
torch.stack(X).squeeze().shape

torch.Size([2000, 47038])

In [0]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(torch.stack(X).squeeze(), torch.stack(y).squeeze(), test_size=0.25, random_state=42)

In [132]:
y_train, y_test[:10]

(array([0, 1, 1, ..., 1, 0, 0]), array([0, 1, 0, 1, 0, 0, 1, 0, 1, 0]))

In [160]:
net_bag.fit(X_train, y_train)

  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m3.6371[0m       [32m0.5067[0m        [35m5.8536[0m  0.2304
      2        4.6363       0.5067        [35m3.3638[0m  0.2280
      3        [36m3.0277[0m       0.4933        5.2702  0.2146
      4        3.4089       0.4967        4.1637  0.2140
      5        3.3032       0.5000        3.7861  0.2161
      6        [36m2.8683[0m       [32m0.5433[0m        [35m2.7286[0m  0.2089
      7        [36m2.5024[0m       [32m0.5800[0m        [35m2.0790[0m  0.2164
      8        [36m2.1876[0m       [32m0.5967[0m        [35m1.7096[0m  0.2121
      9        [36m1.8969[0m       [32m0.6100[0m        [35m1.5508[0m  0.2088
     10        [36m1.6466[0m       [32m0.6167[0m        [35m1.3984[0m  0.2160
     11        [36m0.9415[0m       0.5167        3.0954  0.2118
     12        3.3791       0.5267        3.0802  0.2122
     13      

<class 'skorch.classifier.NeuralNetClassifier'>[initialized](
  module_=BoWClassifier(
    (linear): Linear(in_features=47038, out_features=2, bias=True)
  ),
)

# Evaluation

In [0]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [162]:
y_pred = net_bag.predict(X_test)
accuracy_score(y_test, y_pred)

0.644

In [163]:
confusion_matrix(y_test, y_pred)

array([[161,  82],
       [ 96, 161]])