In [1]:
import os
import math
import sys
import gzip

import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
import torch.utils.data
import time

## Load training text and labels

In [2]:
answers = []
with open('classes.train', 'r') as f:
    for line in f:
        l = line.strip()
        answers.append(int(l))
    f.close()
print('Number of answers: {}'.format(len(answers)))
print('First 10 answers:, ', answers[:10])

Number of answers: 1000
First 10 answers:,  [2, 2, 1, 1, 1, 1, 2, 2, 2, 2]


In [3]:
docs = []
with open('docs.train', 'r') as f:
    for line in f:
        l = line.strip()
        docs.append(l)
    f.close()
print('Number of docs: {}'.format(len(docs)))
max_len = -1
for doc in docs:
    if len(doc.split()) > max_len:
        max_len = len(doc.split())
        
print('length of doc with > 1000 words:')
for doc in docs:
    if len(doc.split()) > 1000:
        print(len(doc.split()))
print('Max doc length is {}'.format(max_len))
print('The first doc: ', docs[0])

Number of docs: 1000
length of doc with > 1000 words:
1010
1193
1420
Max doc length is 1420
The first doc:  Good :* much smaller than I expected . It slides easily into a coat pocket . * great image quality . Even a novice / lazy photographer like me can get amazing pictures ! * easy to use ( see above ) Bad :* battery dies when connected to the computer . Buy a spare . * software program not intuitive . Also - " Olympus Master " ? Creepy . I edit in the camera or using MS Photo Editor or Photoshop . * no viewfinder means sometimes you 're not sure what you 're shooting . It 's more than a mild annoyance , but in balance it 's okay because you 're getting a great camera at a reduced price .


In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Assuming that we are on a CUDA machine, this should print a CUDA device:

print(device)

cuda:0


## Build word2idx
threshold is > 1; no modification on the original words (e.g. capital words are kept)

In [5]:
word2idx = {'[PAD]':0,'[UNK]':1}
word_count = {}
threshold = 0
for doc in docs:
    words = doc.split()
    for word in words:
        if word not in word_count:
            word_count[word] = 1
        else:
            word_count[word] += 1
for word, count in word_count.items():
    if count > threshold:
        word2idx[word] = len(word2idx)

print('Number of distinct words in docs.train: {}'.format(len(word_count)))
print('Size of word to index: {}'.format(len(word2idx)))

Number of distinct words in docs.train: 10526
Size of word to index: 10528


In [6]:
class textDataset(torch.utils.data.dataset.Dataset):
    def __init__(self, samples, classes, train=True):
        self.samples = samples
        self.classes = classes
        self.train = train
    
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, index):
        min_augs_sample_length = 100
        min_cropped_length = 50
        sample = self.samples[index]
#         if len(sample) > 1000:
#             sample = sample[:1000] # use only the first 1000 tokens
        sample_tokenized = sample.split()
        sample_indexes = []
        for token in sample_tokenized:
            if token in word2idx:
                sample_indexes.append(int(word2idx[token]))
            else:
                sample_indexes.append(int(word2idx['[PAD]'])) # NAACL 2015
        if len(sample_indexes) > min_augs_sample_length and self.train: # random crop for longer samples
            
            start = random.randint(0,len(sample_indexes)- min_cropped_length - 1) # expect at least 50 tokens between start and end
            end = random.randint(start + min_cropped_length, len(sample_indexes) - 1)
        
            sample_indexes = sample_indexes[start:end]

        if len(sample_indexes) < 1000:
            sample_indexes.extend([0]* (1000 - len(sample_indexes)))
        if len(sample_indexes) > 1000:
            sample_indexes = sample_indexes[:1000]
            
        # class 1 = new class 0; class 2 = new class 1
        cls = int(self.classes[index] -1)
        
        return torch.tensor(sample_indexes), cls
dataset = textDataset(docs, answers)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=True)

## Load pretrained word embeddings
code modified from the pdf

In [7]:
start = time.time()
# word2idx = {} # a Python hash table mapping word string to integer index
# TODO: to be initialized from the training data
with gzip.open("../vectors.txt.gz", "rt", encoding="utf8") as f:
    # First line contains the embedding vocabulary size and dimension
    # .strip() removes the trailing newline
    first_line = f.readline().strip()
    first_line_tokens = first_line.split(' ')
    emb_dim = first_line_tokens[1]
    emb_dim = int(emb_dim)
    # Embedding is to be stored in a PyTorch tensor, size is
    # training_text_vocab_size x emb_dim
    # initialized randomly between -0.25 to 0.25
    embeddings = torch.rand(len(word2idx), emb_dim) * 0.5 - 0.25
    # Read the remaining lines
    
    # get capitalization information
    num_small = 0
    num_total = 0
    num_have_pretrained_emb = 0
    for line in f: # traverse line in file descriptor f
        # Remove line trailing newline
        line = line.strip()
        # Split into word string and vector string
        # First space-separated column is the word string
        first_space_pos = line.find(' ', 1)
        word = line[:first_space_pos]
        # Check if word is in word2idx, skip if not
        num_total += 1
        if word[0].capitalize() != word[0]:
            # the word is not capitalized
            num_small += 1
        # trying to find special tokens
        if word[0] == '[' or word[0] == '<':
            print(word)
        if word not in word2idx:
            continue
        else:
            num_have_pretrained_emb += 1
        # Word index corresponds to the row in the embedding tensor
        idx = word2idx[word]
        # The remaining column is the vector string
        emb_str = line[first_space_pos+1:].strip()
        # Convert all vector strings into a list of floating point
        emb = [float(t) for t in emb_str.split(' ')]
        # Convert Python list into PyTorch tensor
        embeddings[idx] = torch.tensor(emb)
# Do not forget to close the file
f.close()
end = time.time()
print('Loading vectors takes {:.3f} seconds'.format(end-start))
print('num small {}'.format(num_small))
print('num total {}'.format(num_total))
print('Number of words with pretrained embedding: {}'.format(num_have_pretrained_emb))

</s>
Loading vectors takes 48.799 seconds
num small 632919
num total 3000000
Number of words with pretrained embedding: 8823


In [8]:
# print(embeddings.shape)
# print(len(word2idx))
# print(emb_dim)
# print(list(embeddings.shape))
# print([len(word2idx), emb_dim])
# print(list(embeddings.shape) == [len(word2idx), emb_dim])

# Define the model
embedding dim = 300, as stated in pdf,
out_channels = 100, tentative,
assume max text length is 1000 (will truncate)

In [17]:
number_of_filters = 1000
max_seq_length = 1000
stride = 1
#padding = kernel_size - 1

# fix the number of pooling units and dynamically determine the pooling region size on each
# data point so that the entire data is covered without overlapping.
# NAACL 2015

num_maxpool_outputs = 10

conv_mod_list = [3,4] # 1000: NAACL 2015

class ConvModules(nn.Module):
    def __init__(self, kernel_size):
        super(ConvModules, self).__init__()
        padding = kernel_size - 1 
        self.conv = nn.Conv1d(in_channels=emb_dim, out_channels=number_of_filters, kernel_size=kernel_size, stride=1, padding=padding)
        self.bn = nn.BatchNorm1d(num_features=number_of_filters)
        # see torch.nn documentation for the calculation, dilation is 1
        self.pool_size = int(((max_seq_length+2*padding-kernel_size)/stride+1)/num_maxpool_outputs)
        self.pool = nn.MaxPool1d(kernel_size=self.pool_size, stride=self.pool_size)
    
    def forward(self, x):
        x = self.conv(x)
        x = self.pool(F.relu(x))
        x = self.bn(x) # NAACL 2015 bn after pool
        x = x.view(-1, number_of_filters*num_maxpool_outputs)
        return x
    
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=len(word2idx), embedding_dim=emb_dim, scale_grad_by_freq=True, _weight=embeddings)
        self.conv_mods = [ConvModules(i).to(device) for i in conv_mod_list] # any other ways to put submodules on GPU?
        #self.bn = nn.BatchNorm1d(num_features=number_of_filters * len(self.conv_mods))
        self.fc = nn.Linear(in_features=number_of_filters * len(self.conv_mods)*num_maxpool_outputs, out_features=2)
        self.drop = nn.Dropout(p=0.5)
        # freeze the embedding layer
#         for param in self.embedding.parameters():
#             param.requires_grad = False
            
    def forward(self, x):
        x = self.embedding(x)
        # print('emb x', x.shape) need to reshape, see
        # https://mp.weixin.qq.com/s?__biz=MzI4MDYzNzg4Mw==&mid=2247489032&idx=4&sn=9e759a05ff5b9da255bae4fcf94e2af5&chksm=ebb42edcdcc3a7ca20eea203dae13105936926e1450f6b67f07010267aa840cc9fb455a66ee2&mpshare=1&scene=1&srcid=0313vVi2q20p2k3TqK6WRGjm&key=5064705dbe24d988c44e89bc5aacade833720db5bed7ef054acb62cd87625a58e4346ea63c5517f60db8e53826aae0a9fd520f9c74acbc5e1aec0c98653314da813162beff57b51a6b19cb03b1724563&ascene=1&uin=MTYxMjc0MjA2NA%3D%3D&devicetype=Windows+10&version=62060728&lang=zh_CN&pass_ticket=dYRZzFLSxLIOmdH5Lwk1l3ymeBOxSu%2B%2FSA7iiOPvUZImOklm%2BihH0Pt8NzO4bH7k
        x = x.permute(0,2,1)
        x = torch.cat([conv_m(x) for conv_m in self.conv_mods], dim=1)
        x = self.fc(self.drop(x)) # dropout following NAACL 2015
        return x
net = Net()
net.to(device)

Net(
  (embedding): Embedding(10528, 300, scale_grad_by_freq=True)
  (fc): Linear(in_features=20000, out_features=2, bias=True)
  (drop): Dropout(p=0.5)
)

In [18]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr = 0.01, weight_decay = 0.0005)
scheduler = StepLR(optimizer, step_size=10, gamma=0.1)

## Train

In [19]:
start = time.time()

total_epochs = 35

for epoch in range(total_epochs):
    scheduler.step()
    running_loss = 0.0
    for i, data in enumerate(dataloader, 0):
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    print("epoch {}, running loss: {:.4f}".format(epoch, running_loss))
    for param_group in optimizer.param_groups:
        print('lr: ', param_group['lr'])
    running_loss = 0.0
print('Finished Training')
end = time.time()
print('Training takes {:.2f} seconds'.format(end - start))

epoch 0, running loss: 186.5204
lr:  0.01
epoch 1, running loss: 50.5820
lr:  0.01
epoch 2, running loss: 25.1066
lr:  0.01
epoch 3, running loss: 7.1287
lr:  0.01
epoch 4, running loss: 9.8152
lr:  0.01
epoch 5, running loss: 8.7699
lr:  0.01
epoch 6, running loss: 16.0904
lr:  0.01
epoch 7, running loss: 20.6023
lr:  0.01
epoch 8, running loss: 27.2779
lr:  0.01
epoch 9, running loss: 11.1746
lr:  0.01
epoch 10, running loss: 5.7833
lr:  0.001
epoch 11, running loss: 9.3249
lr:  0.001
epoch 12, running loss: 4.3535
lr:  0.001
epoch 13, running loss: 6.8687
lr:  0.001
epoch 14, running loss: 3.5222
lr:  0.001
epoch 15, running loss: 5.0858
lr:  0.001
epoch 16, running loss: 2.0556
lr:  0.001
epoch 17, running loss: 4.9100
lr:  0.001
epoch 18, running loss: 1.4182
lr:  0.001
epoch 19, running loss: 3.1585
lr:  0.001
epoch 20, running loss: 2.4918
lr:  0.00010000000000000002
epoch 21, running loss: 3.8448
lr:  0.00010000000000000002
epoch 22, running loss: 1.9604
lr:  0.0001000000000000

## Eval

In [20]:
test_docs = []
with open('docs.test', 'r') as f:
    for line in f:
        l = line.strip()
        test_docs.append(l)
    f.close()

print('Number of test docs: {}'.format(len(test_docs)))
max_len = -1
for doc in test_docs:
    if len(doc.split()) > max_len:
        max_len = len(doc.split())
        
print('length of doc with > 1000 words:')
for doc in test_docs:
    if len(doc.split()) > 1000:
        print(len(doc.split()))
print('Max doc length is {}'.format(max_len))
print('The first doc: ', test_docs[1])

Number of test docs: 1000
length of doc with > 1000 words:
1185
Max doc length is 1185
The first doc:  I have had the same snow problem as others describe in the reviews . I cleaned the heads - tried everything . One thing the unit does well is heat the room . Samsung produced a portable worldwide heater , capable of warming a small sized bedroom in just under an hour . If the VCR portion doesn't work for you , I recommend using it as a heater . Very satisfied with my heater purchase .


In [21]:
test_answers = []
with open('classes.test', 'r') as f:
    for line in f:
        l = line.strip()
        test_answers.append(int(l))
    f.close()
print('Number of answers: {}'.format(len(test_answers)))
print('First 10 answers:, ', test_answers[:10])

Number of answers: 1000
First 10 answers:,  [1, 1, 2, 2, 2, 2, 2, 2, 2, 2]


In [22]:
test_dataset = textDataset(test_docs, test_answers, train=False)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=64)

In [23]:
correct = 0
total = 0
net.eval()
wrong_list = []
# with some analysis
with torch.no_grad():
    for index, data in enumerate(test_dataloader):
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = net(inputs)
        _, predicted = torch.max(outputs.data, 1)
#         print('labels ', labels)
#         print('predicted, ', predicted.data)
        id = 0
        for l, p in zip(labels, predicted.data):
            if l != p:
                wrong_list.append(index * 8 + id)
                id += 1
        total += labels.size(0)
        correct += (predicted == labels).sum().item()# -1 because predictions are 0,1, labels are 1,2
print(total)
print(correct)
print('Accuracy of the network on the 1000 test docs: {:.4f}'.format(100 * correct / total))

1000
806
Accuracy of the network on the 1000 test docs: 80.6000


In [24]:
# -ve = 1, +ve = 2
num_1 = 0
num_2 = 0

for i in wrong_list:
    #print(len(test_docs[i].split()))
    if test_answers[i] == 1:
        num_1 += 1
    else:
        num_2 += 1
print(num_1, ' -ve')
print(num_2, ' +ve')

88  -ve
106  +ve
