# Model definition

In [31]:
import torch
import torch.autograd as autograd
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class CharacterCNN(nn.Module):
    
    def __init__(self, vocab_size, text_length, conv_kernels, conv_dim, 
                 linear_dim, output_dim, init_weights, dropout_prob, pool_kernel):
        
        super(CharacterCNN, self).__init__()
        
        self.conv1 = nn.Sequential(
            nn.Conv1d(vocab_size, conv_dim, conv_kernels[0]),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size = pool_kernel, stride = pool_kernel)# stride provides non-overlapping propagation
        )
        
        self.conv2 = nn.Sequential(
            nn.Conv1d(conv_dim, conv_dim, conv_kernels[1]),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size = pool_kernel, stride = pool_kernel)
        )
        
        self.conv3 = nn.Sequential(
            nn.Conv1d(conv_dim, conv_dim, conv_kernels[2]),
            nn.ReLU()
        )
            
        self.conv4 = nn.Sequential(
            nn.Conv1d(conv_dim, conv_dim, conv_kernels[3]),
            nn.ReLU()
        )
        
        self.conv5 = nn.Sequential(
            nn.Conv1d(conv_dim, conv_dim, conv_kernels[4]),
            nn.ReLU()
        )
        
        self.conv6 = nn.Sequential(
            nn.Conv1d(conv_dim, conv_dim, conv_kernels[5]),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size = pool_kernel, stride = pool_kernel)
        )
        
        self.fc1 = nn.Sequential(
            nn.Linear(int(conv_dim * (text_length - 96) / 27), linear_dim),
            nn.ReLU(),
            nn.Dropout(dropout_prob)
        )
        
        self.fc2 = nn.Sequential(
            nn.Linear(linear_dim, linear_dim),
            nn.ReLU(),
            nn.Dropout(dropout_prob)
        )
        
        self.fc3 = nn.Linear(linear_dim, output_dim)
        self.logsoftmax = nn.LogSoftmax(dim = 0)
        
        self.weights_init(init_weights)
    
    def forward(self, input):
        
# 6 convolutions with ReLU activations + 3 max poolings
#         print('input dim:', input.size())
        output = self.conv1(input)
#         print('after conv1:', output.size())
        output = self.conv2(output)
#         print('after conv2:', output.size())
        output = self.conv3(output)
#         print('after conv3:', output.size())
        output = self.conv4(output)
#         print('after conv4:', output.size())
        output = self.conv5(output)
#         print('after conv5:', output.size())
        output = self.conv6(output)
        
# flatten the input for the linear layer
#         print('after conv6:', output.size())
        output = output.view(output.size()[0], -1)
#         print('flattened:', output.size())
        
# 3 affine maps
        output = self.fc1(output)
#         print('after fc1:', output.size())
        output = self.fc2(output)
#         print('after fc2:', output.size())
        output = self.fc3(output)
#         print('after fc3:', output.size())
        
# logsoftmax
        output = self.logsoftmax(output)
        
        return output
    
# initialize weights with normal distribution
    def weights_init(self, init_weights):
        for m in self.modules():
            if type(m) == nn.Linear:
                m.weight.data.normal_(init_weights[0][0], init_weights[0][1])
            elif type(m) == nn.Conv1d:
                m.weight.data.normal_(init_weights[1][0], init_weights[1][1])
                

# Prepare data for DataLoader

In [32]:
import csv
import os.path as op
import re
import codecs
import json
from torch.utils.data import DataLoader, Dataset

class TextLoader(Dataset):
    def __init__(self, label_data_path, alphabet_path, text_length):

        self.label_data_path = label_data_path
        # read alphabet
        with open(alphabet_path) as alphabet_file:
            alphabet = str(''.join(json.load(alphabet_file)))
        self.alphabet = alphabet
        self.text_length = text_length
        self.load()
        self.y = torch.LongTensor(self.label)
            
    def __len__(self):
        return len(self.label)

    def __getitem__(self, idx):
        X = self.oneHotEncode(idx)
        y = self.y[idx]
        return X, y


    def load(self, lowercase=True):
        self.label = []
        self.data = []
        with open(self.label_data_path, 'rt') as f:
            rdr = csv.reader(f, delimiter=',', quotechar='"')
            # num_samples = sum(1 for row in rdr)
            for index, row in enumerate(rdr):
                self.label.append(int(row[0]))
                txt = ' '.join(row[1:])
                if lowercase:
                    txt = txt.lower()                
                self.data.append(txt)

    def oneHotEncode(self, idx):
        X = torch.zeros(len(self.alphabet), self.text_length)
        sequence = self.data[idx]
        for index_char, char in enumerate(sequence[::-1]): # iterate over reversed sequence
            if self.char2Index(char)!=-1:
                X[self.char2Index(char)][index_char] = 1.0
        return X

    def char2Index(self, character):
        return self.alphabet.find(character)

    def get_class_weight(self):
        
        num_samples = self.__len__()
        label_set = set(self.label)
        num_class = [self.label.count(c) for c in label_set]
        class_weight = [num_samples / float(self.label.count(c)) for c in label_set]    
        
        return class_weight, num_class
    

In [46]:
def train(model, optimizer, loader, criterion):
    
    model.train()
    
    for inputs, labels in loader:
        
        inputs = Variable(inputs)
        labels = Variable(labels)
        
        optimizer.zero_grad()
        out = model(inputs)
        loss = criterion(out, labels)
        
        a = list(model.parameters())[0].clone()
        loss.backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 400)
        optimizer.step()
        b = list(model.parameters())[0].clone()
        print(torch.equal(a.data, b.data)) # checked that weights are updated
    

def evaluate(loader, model, criterion):

    model.eval()
    
    correct = 0
    total = 0
    avg_loss = 0
    for batch_num, data in enumerate(loader):
        
#         print('\n batch_num:', batch_num)
        
        inputs, labels = data
        inputs = Variable(inputs)
        
#         print('target size:', labels.size())
        
        out = model(inputs)
        
#         print('out:', out)
        
        predicted = Variable(torch.max(out.data, 1)[1])
        
#         if batch_num == 0:
#             print('predicted tensor:', predicted)
    #         print('predicted tensor.data:', predicted.data)
#             print('target:', labels)
        total += labels.size(0)
        correct += (predicted.view(labels.size()).data == labels).sum()
        accuracy = 100 * correct / total
        
        labels = Variable(labels)
        loss = criterion(out, labels)
        avg_loss += (loss.data[0] - avg_loss) / (batch_num + 1)
    
    return avg_loss, accuracy

def fit(model, optimizer, scheduler, criterion, train_loader, test_loader, n_epochs):

    train_log, train_acc_log = [], []
    val_log, val_acc_log = [], []

    for epoch in range(n_epochs):
        
#         print('number of epoch:', epoch)
        
        scheduler.step()
        
        train(model, optimizer, train_loader, criterion)
#         print('train evaluation')
        train_loss, train_acc = evaluate(train_loader, model, criterion)
#         print('test evaluation')
        val_loss, val_acc = evaluate(test_loader, model, criterion)

        train_log.append(train_loss)
        train_acc_log.append(train_acc)

        val_log.append(val_loss)
        val_acc_log.append(val_acc)

        print (('Epoch [%d/%d], LR: %f, Loss (train/test): %.4f/%.4f,'+\
               ' Acc (train/test): %.4f/%.4f' )
                   %(epoch+1, n_epochs, \
                     optimizer.state_dict()['param_groups'][0]['lr'], train_loss, val_loss, train_acc, val_acc))
            
    return train_log, train_acc_log, val_log, val_acc_log

# Fit the model

In [None]:
torch.manual_seed(1)

# Define model hyperparameters
VOCAB_SIZE = 70
TEXT_LENGTH = 1014
CONV_KERNELS = [7, 7, 3, 3, 3, 3]
POOL_KERNEL = 3
CONV_DIM = 256
LINEAR_DIM = 1024
INIT_WEIGHTS = [[0, 0.02], [0, 0.02]]
DROPOUT_PROB = 0.5
OUTPUT_DIM = 4

# Define training parameters
EPOCHS_NUM = 100
BATCH_SIZE = 64

alphabet_path = 'data/alphabet.json'
train_data_path = 'data/train_1k.csv'
test_data_path = 'data/test_1k.csv'
              
loss_func = nn.NLLLoss()
model = CharacterCNN(VOCAB_SIZE, TEXT_LENGTH, CONV_KERNELS, CONV_DIM, 
                     LINEAR_DIM, OUTPUT_DIM, INIT_WEIGHTS, DROPOUT_PROB, POOL_KERNEL)
optimizer = optim.SGD(model.parameters(), lr = 0.01, momentum = 0.9)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1) # halve learning rate 10 times every 3 epochs

train_dataset = TextLoader(train_data_path, alphabet_path, TEXT_LENGTH)
train_loader = DataLoader(train_dataset, batch_size = BATCH_SIZE)

test_dataset = TextLoader(test_data_path, alphabet_path, TEXT_LENGTH)
test_loader = DataLoader(test_dataset, batch_size = BATCH_SIZE)

# Model fit
fit(model, optimizer, scheduler, loss_func, train_loader, test_loader, EPOCHS_NUM)


False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
Epoch [1/100], LR: 0.010000, Loss (train/test): 4.1279/4.1295, Acc (train/test): 19.7197/18.6000
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
Epoch [2/100], LR: 0.010000, Loss (train/test): 4.1279/4.1295, Acc (train/test): 21.8218/20.8000
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
Epoch [3/100], LR: 0.010000, Loss (train/test): 4.1279/4.1295, Acc (train/test): 23.0230/21.4000
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
Epoch [4/100], LR: 0.001000, Loss (train/test): 4.1279/4.1295, Acc (train/test): 23.4234/20.1000
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
Epoch [5/100], LR: 0.001000, Loss (train/test): 4.1279/4.1295, Acc (train/test): 23.5235/20.9000
False
False
False
False
False
False

In [33]:
list(model.parameters())[0].grad

Variable containing:
( 0 ,.,.) = 
1.00000e-05 *
  0.4413 -0.4662  0.6021  ...  -0.2693  0.1294  0.1249
 -0.7685  0.0540  0.0092  ...  -0.0284 -0.1575 -0.2028
 -0.2328  0.0836 -0.0354  ...   1.0756 -0.0948  0.2401
           ...             ⋱             ...          
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000

( 1 ,.,.) = 
1.00000e-05 *
  0.9597  1.0848 -0.2259  ...  -0.5702  0.4546  0.9083
 -0.6620 -0.0403  0.3498  ...   0.0265  0.1766  0.9024
 -0.5377  0.1443  0.0402  ...  -0.8583 -0.0681 -0.9353
           ...             ⋱             ...          
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
  0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000

( 2 ,.,.) = 
1.00000e-05 *
  1.1175 -0.5392  0.0826  ...  -0.7855  0.5442  0.8627
 -0.0286 -0.0270 -0.5062  ...   0.7827 -0.0836 -0.7614
 -2.1394  0.5224

In [60]:
for i in range(1):
#     _, y = train_dataset[i]
#     if y != train_dataset.label[i]:
    data, _ = train_dataset[i]
    for row_num, row in enumerate(data):
        print(row_num, row.numpy())
    print(train_dataset.data[i][::-1], train_dataset.label[i], len(train_dataset.data[i]))
#     else: continue

# X, y = TextLoader(train_data_path, alphabet_path, TEXT_LENGTH)

0 [0. 0. 0. ... 0. 0. 0.]
1 [0. 0. 0. ... 0. 0. 0.]
2 [0. 0. 0. ... 0. 0. 0.]
3 [0. 0. 0. ... 0. 0. 0.]
4 [0. 0. 0. ... 0. 0. 0.]
5 [0. 0. 0. ... 0. 0. 0.]
6 [0. 0. 0. ... 0. 0. 0.]
7 [0. 0. 0. ... 0. 0. 0.]
8 [0. 0. 1. ... 0. 0. 0.]
9 [0. 0. 0. ... 0. 0. 0.]
10 [0. 0. 0. ... 0. 0. 0.]
11 [0. 0. 0. ... 0. 0. 0.]
12 [0. 0. 0. ... 0. 0. 0.]
13 [0. 1. 0. ... 0. 0. 0.]
14 [0. 0. 0. ... 0. 0. 0.]
15 [0. 0. 0. ... 0. 0. 0.]
16 [0. 0. 0. ... 0. 0. 0.]
17 [0. 0. 0. ... 0. 0. 0.]
18 [0. 0. 0. ... 0. 0. 0.]
19 [0. 0. 0. ... 0. 0. 0.]
20 [0. 0. 0. ... 0. 0. 0.]
21 [0. 0. 0. ... 0. 0. 0.]
22 [0. 0. 0. ... 0. 0. 0.]
23 [0. 0. 0. ... 0. 0. 0.]
24 [0. 0. 0. ... 0. 0. 0.]
25 [0. 0. 0. ... 0. 0. 0.]
26 [0. 0. 0. ... 0. 0. 0.]
27 [0. 0. 0. ... 0. 0. 0.]
28 [0. 0. 0. ... 0. 0. 0.]
29 [0. 0. 0. ... 0. 0. 0.]
30 [0. 0. 0. ... 0. 0. 0.]
31 [0. 0. 0. ... 0. 0. 0.]
32 [0. 0. 0. ... 0. 0. 0.]
33 [0. 0. 0. ... 0. 0. 0.]
34 [0. 0. 0. ... 0. 0. 0.]
35 [0. 0. 0. ... 0. 0. 0.]
36 [0. 0. 0. ... 0. 0. 0.]
37 [0. 0. 0

In [50]:
seq = [0,1,2,3,4,5,6,7,8,9]
for i,j in enumerate(seq[::-1]):
    print(i,j)

0 9
1 8
2 7
3 6
4 5
5 4
6 3
7 2
8 1
9 0


In [35]:
train_loader = DataLoader(train_dataset, batch_size = BATCH_SIZE)

for i, data in enumerate(train_loader):
    if i < 1:
        batch, target = data
        for inputs in batch:
            print('\n The input is:', inputs)
#             print('\n The target is:', label)
        i += 1
    else:
        break


 The input is: 
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
[torch.FloatTensor of size 70x1014]


 The input is: 
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
[torch.FloatTensor of size 70x1014]


 The input is: 
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0

 The input is: 
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
[torch.FloatTensor of size 70x1014]


 The input is: 
    0     0     0  ...      0     0     0
    0     0     1  ...      0     0     0
    0     0     0  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
[torch.FloatTensor of size 70x1014]


 The input is: 
    0     0     1  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0 

 The input is: 
    0     0     1  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
[torch.FloatTensor of size 70x1014]


 The input is: 
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
[torch.FloatTensor of size 70x1014]


 The input is: 
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0 

In [23]:
label = []
data = []

label_data_path = 'data/train_5k.csv'

with open(label_data_path, 'rt') as f: 
    rdr = csv.reader(f, delimiter=',', quotechar='"')
    for index, row in enumerate(rdr):
        label.append(int(row[0]))
        txt = ' '.join(row[1:])
        txt = txt.lower()                
        data.append(txt)

In [26]:
for i in range(10):
    print(i, ' ', label[i], '\n', data[i], '\n')

0   2 
 wall st. bears claw back into the black (reuters) reuters - short-sellers, wall street's dwindling\band of ultra-cynics, are seeing green again. 

1   2 
 carlyle looks toward commercial aerospace (reuters) reuters - private investment firm carlyle group,\which has a reputation for making well-timed and occasionally\controversial plays in the defense industry, has quietly placed\its bets on another part of the market. 

2   2 
 oil and economy cloud stocks' outlook (reuters) reuters - soaring crude prices plus worries\about the economy and the outlook for earnings are expected to\hang over the stock market next week during the depth of the\summer doldrums. 

3   2 
 iraq halts oil exports from main southern pipeline (reuters) reuters - authorities have halted oil export\flows from the main pipeline in southern iraq after\intelligence showed a rebel militia could strike\infrastructure, an oil official said on saturday. 

4   2 
 oil prices soar to all-time record, posing new men