In [1]:
import os
import math
import sys
import gzip

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
import time

## Load training text and labels

In [2]:
answers = []
with open('classes.train', 'r') as f:
    for line in f:
        l = line.strip()
        answers.append(int(l))
    f.close()
print('Number of answers: {}'.format(len(answers)))
print('First 10 answers:, ', answers[:10])

Number of answers: 1000
First 10 answers:,  [2, 2, 1, 1, 1, 1, 2, 2, 2, 2]


In [3]:
docs = []
with open('docs.train', 'r') as f:
    for line in f:
        l = line.strip()
        docs.append(l)
    f.close()
print('Number of docs: {}'.format(len(docs)))
max_len = -1
for doc in docs:
    if len(doc.split()) > max_len:
        max_len = len(doc.split())
        
print('length of doc with > 1000 words:')
for doc in docs:
    if len(doc.split()) > 1000:
        print(len(doc.split()))
print('Max doc length is {}'.format(max_len))
print('The first doc: ', docs[1])

Number of docs: 1000
length of doc with > 1000 words:
1010
1193
1420
Max doc length is 1420
The first doc:  I recently purchased a Samsung HDTVSamsung LN46A550 46-inch 1080p LCD HDTVwhich I also occasionally use as a computer monitor . This keyboard works great when I 'm sitting on the couch , 15 feet or so away from the computer , and using the TV as my main monitor . When I 'm back at my desk using the desktop monitor , I use the standard keyboard that 's hardwired to the computer . This keyboard costs a little more than most the other wireless keyboards I looked at . However , it got good customer ratings for it 's performance from a distance , and has the mouse pad built in . Those two qualities are what finally sold me on it . For anyone interested , I use an AB switch boxManual Switchbox Db15f for Vga Monitors 2x1 Abto switch to / from the TV monitor and my desktop monitor . In my case , I already have two monitors connected to the video card on the computer . The AB switch allow

In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Assuming that we are on a CUDA machine, this should print a CUDA device:

print(device)

cuda:0


## Build word2idx
threhold is > 1; no modification on the original words (e.g. capital words are kept)

In [5]:
word2idx = {'[UNK]':0}
word_count = {}
threshold = 1
for doc in docs:
    words = doc.split()
    for word in words:
        if word not in word_count:
            word_count[word] = 1
        else:
            word_count[word] += 1
for word, count in word_count.items():
    if count > threshold:
        word2idx[word] = len(word2idx)

print('Number of distinct words in docs.train: {}'.format(len(word_count)))
print('Size of word to index: {}'.format(len(word2idx)))

Number of distinct words in docs.train: 10526
Size of word to index: 4971


In [6]:
class textDataset(torch.utils.data.dataset.Dataset):
    def __init__(self, samples, classes):
        self.samples = samples
        self.classes = classes
    
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, index):
        sample = self.samples[index]
        if len(sample) > 1000:
            sample = sample[:1000] # use only the first 1000 tokens
        sample_tokenized = sample.split()
        sample_indexes = []
        for token in sample_tokenized:
            if token in word2idx:
                sample_indexes.append(int(word2idx[token]))
            else:
                sample_indexes.append(int(word2idx['[UNK]']))
        
        # if the sample is short, append 0
        # shoudl this be different from UNK?
        if len(sample_indexes) < 1000:
            sample_indexes.extend([0]* (1000 - len(sample_indexes)))
        
        # class 1 = new class 0; class 2 = new class 1
        cls = int(self.classes[index] -1)
        
        return torch.tensor(sample_indexes), cls
dataset = textDataset(docs, answers)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=4)

## Load pretrained word embeddings
code modified from the pdf

In [7]:
start = time.time()
# word2idx = {} # a Python hash table mapping word string to integer index
# TODO: to be initialized from the training data
with gzip.open("../vectors.txt.gz", "rt", encoding="utf8") as f:
    # First line contains the embedding vocabulary size and dimension
    # .strip() removes the trailing newline
    first_line = f.readline().strip()
    first_line_tokens = first_line.split(' ')
    emb_dim = first_line_tokens[1]
    emb_dim = int(emb_dim)
    # Embedding is to be stored in a PyTorch tensor, size is
    # training_text_vocab_size x emb_dim
    # initialized randomly between -0.25 to 0.25
    embeddings = torch.rand(len(word2idx), emb_dim) * 0.5 - 0.25
    # Read the remaining lines
    
    # get capitalization information
    num_small = 0
    num_total = 0
    num_have_pretrained_emb = 0
    for line in f: # traverse line in file descriptor f
        # Remove line trailing newline
        line = line.strip()
        # Split into word string and vector string
        # First space-separated column is the word string
        first_space_pos = line.find(' ', 1)
        word = line[:first_space_pos]
        # Check if word is in word2idx, skip if not
        num_total += 1
        if word[0].capitalize() != word[0]:
            # the word is not capitalized
            num_small += 1
        # trying to find special tokens
        if word[0] == '[' or word[0] == '<':
            print(word)
        if word not in word2idx:
            continue
        else:
            num_have_pretrained_emb += 1
        # Word index corresponds to the row in the embedding tensor
        idx = word2idx[word]
        # The remaining column is the vector string
        emb_str = line[first_space_pos+1:].strip()
        # Convert all vector strings into a list of floating point
        emb = [float(t) for t in emb_str.split(' ')]
        # Convert Python list into PyTorch tensor
        embeddings[idx] = torch.tensor(emb)
# Do not forget to close the file
f.close()
end = time.time()
print('Loading vectors takes {:.3f} seconds'.format(end-start))
print('num small {}'.format(num_small))
print('num total {}'.format(num_total))
print('Number of words with pretrained embedding: {}'.format(num_have_pretrained_emb))

</s>
Loading vectors takes 50.532 seconds
num small 632919
num total 3000000
Number of words with pretrained embedding: 4670


In [8]:
print(embeddings.shape)
print(len(word2idx))
print(emb_dim)
print(list(embeddings.shape))
print([len(word2idx), emb_dim])
print(list(embeddings.shape) == [len(word2idx), emb_dim])

torch.Size([4971, 300])
4971
300
[4971, 300]
[4971, 300]
True


# Define the model
embedding dim = 300, as stated in pdf,
out_channels = 100, tentative,
assume max text length is 1000 (will truncate)

In [12]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=len(word2idx), embedding_dim=emb_dim, _weight=embeddings)
        self.conv1 = nn.Conv1d(in_channels=emb_dim, out_channels=100, kernel_size=2, stride=1, padding=1)
        self.pool = nn.MaxPool1d(kernel_size = 1000-2+1)
        self.fc = nn.Linear(in_features = 100, out_features = 2)
    def forward(self, x):
        x = self.embedding(x)
        # print('emb x', x.shape) need to reshape, see
        # https://mp.weixin.qq.com/s?__biz=MzI4MDYzNzg4Mw==&mid=2247489032&idx=4&sn=9e759a05ff5b9da255bae4fcf94e2af5&chksm=ebb42edcdcc3a7ca20eea203dae13105936926e1450f6b67f07010267aa840cc9fb455a66ee2&mpshare=1&scene=1&srcid=0313vVi2q20p2k3TqK6WRGjm&key=5064705dbe24d988c44e89bc5aacade833720db5bed7ef054acb62cd87625a58e4346ea63c5517f60db8e53826aae0a9fd520f9c74acbc5e1aec0c98653314da813162beff57b51a6b19cb03b1724563&ascene=1&uin=MTYxMjc0MjA2NA%3D%3D&devicetype=Windows+10&version=62060728&lang=zh_CN&pass_ticket=dYRZzFLSxLIOmdH5Lwk1l3ymeBOxSu%2B%2FSA7iiOPvUZImOklm%2BihH0Pt8NzO4bH7k
        x = x.permute(0,2,1)
        x = self.conv1(x)
        x = self.pool(F.relu(x))
        x = F.dropout(input=x)
        x = x.view(-1, 100)
        x = self.fc(x)
        return x
net = Net()
net.to(device)

Net(
  (embedding): Embedding(4971, 300)
  (conv1): Conv1d(300, 100, kernel_size=(2,), stride=(1,), padding=(1,))
  (pool): MaxPool1d(kernel_size=999, stride=999, padding=0, dilation=1, ceil_mode=False)
  (fc): Linear(in_features=100, out_features=2, bias=True)
)

In [13]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr = 0.001, momentum=0.9)

## Train

In [14]:
for epoch in range(50):
    running_loss = 0.0
    for i, data in enumerate(dataloader, 0):
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)
        #print(inputs.shape)
        optimizer.zero_grad()
        outputs = net(inputs)
        #print(outputs)
        #print(labels)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
#         if i% 200 == 0:
#             print('[%d, %5d] loss: %.3f' %
#                 (epoch + 1, i + 1, running_loss / 200))
    print(running_loss)
    running_loss = 0.0
print('Finished Training')

173.2241666316986
172.62680304050446
171.10860818624496
166.51930689811707
163.99474176764488
158.7173831164837
152.78773164749146
143.11353754997253
136.37956902384758
128.5713141709566
120.45381104946136
111.71452312171459
108.76357926428318
98.17673103511333
89.2974879220128
86.41171844303608
83.49641493707895
79.63828664273024
73.5645200908184
64.28662864118814
62.70934849232435
59.29467452317476
55.69131428003311
53.599727258086205
51.795485600829124
46.6338609829545
41.34715363383293
44.30530908703804
38.99793766438961
36.99736427515745
41.70802564918995
32.163846768438816
32.88529112935066
27.97194766998291
29.1137617751956
26.957217887043953
26.402758844196796
23.967619754374027
27.069493129849434
22.097992904484272
20.538870438933372
20.39600621163845
18.33538281172514
19.67107642441988
19.940915182232857
17.754491567611694
15.653979979455471
15.200925543904305
16.867719128727913
13.601072020828724
Finished Training
