# Model definition

In [1]:
import torch
import torch.autograd as autograd
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class CharacterCNN(nn.Module):
    
    def __init__(self, vocab_size, text_length, conv_kernels, conv_dim, 
                 linear_dim, output_dim, init_weights, dropout_prob, pool_kernel):
        
        super(CharacterCNN, self).__init__()
        
        self.conv1 = nn.Sequential(
            nn.Conv1d(vocab_size, conv_dim, conv_kernels[0]),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size = pool_kernel, stride = pool_kernel)# stride provides non-overlapping propagation
        )
        
        self.conv2 = nn.Sequential(
            nn.Conv1d(conv_dim, conv_dim, conv_kernels[1]),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size = pool_kernel, stride = pool_kernel)
        )
        
        self.conv3 = nn.Sequential(
            nn.Conv1d(conv_dim, conv_dim, conv_kernels[2]),
            nn.ReLU()
        )
            
        self.conv4 = nn.Sequential(
            nn.Conv1d(conv_dim, conv_dim, conv_kernels[3]),
            nn.ReLU()
        )
        
        self.conv5 = nn.Sequential(
            nn.Conv1d(conv_dim, conv_dim, conv_kernels[4]),
            nn.ReLU()
        )
        
        self.conv6 = nn.Sequential(
            nn.Conv1d(conv_dim, conv_dim, conv_kernels[5]),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size = pool_kernel, stride = pool_kernel)
        )
        
        self.fc1 = nn.Sequential(
            nn.Linear(int(conv_dim * (text_length - 96) / 27), linear_dim),
            nn.ReLU(),
            nn.Dropout(dropout_prob)
        )
        
        self.fc2 = nn.Sequential(
            nn.Linear(linear_dim, linear_dim),
            nn.ReLU(),
            nn.Dropout(dropout_prob)
        )
        
        self.fc3 = nn.Linear(linear_dim, output_dim)
        self.logsoftmax = nn.LogSoftmax(dim = 0)
        
        self.weights_init(init_weights)
    
    def forward(self, input):
        
# 6 convolutions with ReLU activations + 3 max poolings
        output = self.conv1(input)
        output = self.conv2(output)
        output = self.conv3(output)
        output = self.conv4(output)
        output = self.conv5(output)
        output = self.conv6(output)
        
# flatten the input for the linear layer
        output = output.view(output.size()[0], -1)
        
# 3 affine maps
        output = self.fc1(output)
        output = self.fc2(output)
        output = self.fc3(output)
        
# logsoftmax
        output = self.logsoftmax(output)
        
        return output
    
# initialize weights with normal distribution
    def weights_init(self, init_weights):
        for m in self.modules():
            if type(m) == nn.Linear:
                m.weight.data.normal_(init_weights[0][0], init_weights[0][1])
            elif type(m) == nn.Conv1d:
                m.weight.data.normal_(init_weights[1][0], init_weights[1][1])
                

#  Prepare data for DataLoader

In [2]:
import csv
import os.path as op
import re
import codecs
import json
from torch.utils.data import DataLoader, Dataset

class TextLoader(Dataset):
    def __init__(self, label_data_path, alphabet_path, text_length):

        self.label_data_path = label_data_path
        # read alphabet
        with open(alphabet_path) as alphabet_file:
            alphabet = str(''.join(json.load(alphabet_file)))
        self.alphabet = alphabet
        self.text_length = text_length
        self.load()
        self.y = torch.LongTensor(self.label)
            
    def __len__(self):
        return len(self.label)

    def __getitem__(self, idx):
        X = self.oneHotEncode(idx)
        y = self.y[idx]
        return X, y


    def load(self, lowercase=True):
        self.label = []
        self.data = []
        with open(self.label_data_path, 'rt') as f:
            rdr = csv.reader(f, delimiter=',', quotechar='"')
            # num_samples = sum(1 for row in rdr)
            for index, row in enumerate(rdr):
                self.label.append(int(row[0]))
                txt = ' '.join(row[1:])
                if lowercase:
                    txt = txt.lower()                
                self.data.append(txt)

    def oneHotEncode(self, idx):
        X = torch.zeros(len(self.alphabet), self.text_length)
        sequence = self.data[idx]
        for index_char, char in enumerate(sequence[::-1]): # iterate over reversed sequence
            if self.char2Index(char)!=-1:
                X[self.char2Index(char)][index_char] = 1.0
        return X

    def char2Index(self, character):
        return self.alphabet.find(character)

    def get_class_weight(self):
        
        num_samples = self.__len__()
        label_set = set(self.label)
        num_class = [self.label.count(c) for c in label_set]
        class_weight = [num_samples / float(self.label.count(c)) for c in label_set]    
        
        return class_weight, num_class
    

# Train/eval

In [6]:
def train(model, optimizer, loader, criterion):
    
    model.train()
    
    for inputs, labels in loader:
        
        inputs = Variable(inputs)
        labels = Variable(labels)
        
        optimizer.zero_grad()
        out = model(inputs)
        loss = criterion(out, labels)
        
#         a = list(model.parameters())[0].clone()
        loss.backward()
#         torch.nn.utils.clip_grad_norm(model.parameters(), 400)
        optimizer.step()
#         b = list(model.parameters())[0].clone()
#         print(torch.equal(a.data, b.data)) # checked that weights are updated
    

def evaluate(loader, model, criterion):

    model.eval()
    
    correct = 0
    total = 0
    avg_loss = 0
    for batch_num, data in enumerate(loader):
        
        inputs, labels = data
        inputs = Variable(inputs)
        
        out = model(inputs)
        
        predicted = Variable(torch.max(out.data, 1)[1])
        
        total += labels.size(0)
        correct += (predicted.view(labels.size()).data == labels).sum()
        accuracy = 100 * correct / total
        
        labels = Variable(labels)
        loss = criterion(out, labels)
        avg_loss += (loss.data[0] - avg_loss) / (batch_num + 1)
    
    return avg_loss, accuracy

def fit(model, optimizer, scheduler, criterion, train_loader, test_loader, n_epochs):

    train_log, train_acc_log = [], []
    val_log, val_acc_log = [], []

    for epoch in range(n_epochs):
        
        scheduler.step()
        
        train(model, optimizer, train_loader, criterion)
        train_loss, train_acc = evaluate(train_loader, model, criterion)
        val_loss, val_acc = evaluate(test_loader, model, criterion)

        train_log.append(train_loss)
        train_acc_log.append(train_acc)

        val_log.append(val_loss)
        val_acc_log.append(val_acc)

        print (('Epoch [%d/%d], LR: %f, Loss (train/test): %.4f/%.4f,'+\
               ' Acc (train/test): %.4f/%.4f' )
                   %(epoch+1, n_epochs, \
                     optimizer.state_dict()['param_groups'][0]['lr'], train_loss, val_loss, train_acc, val_acc))
            
    return train_log, train_acc_log, val_log, val_acc_log

# Model training

In [7]:
torch.manual_seed(1)

# Define model hyperparameters
VOCAB_SIZE = 70
TEXT_LENGTH = 1014
CONV_KERNELS = [7, 7, 3, 3, 3, 3]
POOL_KERNEL = 3
CONV_DIM = 256
LINEAR_DIM = 1024
INIT_WEIGHTS = [[0, 0.02], [0, 0.02]]
DROPOUT_PROB = 0.5
OUTPUT_DIM = 4

# Define training parameters
EPOCHS_NUM = 10
BATCH_SIZE = 64

alphabet_path = 'data/alphabet.json'
train_data_path = 'data/train_1k.csv'
test_data_path = 'data/test_1k.csv'
              
loss_func = nn.NLLLoss()
# loss_func = nn.CrossEntropy()
model = CharacterCNN(VOCAB_SIZE, TEXT_LENGTH, CONV_KERNELS, CONV_DIM, 
                     LINEAR_DIM, OUTPUT_DIM, INIT_WEIGHTS, DROPOUT_PROB, POOL_KERNEL)
optimizer = optim.SGD(model.parameters(), lr = 0.01, momentum = 0.9)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1) # halve learning rate 10 times every 3 epochs

train_dataset = TextLoader(train_data_path, alphabet_path, TEXT_LENGTH)
train_loader = DataLoader(train_dataset, batch_size = BATCH_SIZE)

test_dataset = TextLoader(test_data_path, alphabet_path, TEXT_LENGTH)
test_loader = DataLoader(test_dataset, batch_size = BATCH_SIZE)

# Model fit
fit(model, optimizer, scheduler, loss_func, train_loader, test_loader, EPOCHS_NUM)


Epoch [1/10], LR: 0.010000, Loss (train/test): 4.1279/4.1295, Acc (train/test): 19.7197/18.6000
Epoch [2/10], LR: 0.010000, Loss (train/test): 4.1279/4.1295, Acc (train/test): 21.8218/20.8000
Epoch [3/10], LR: 0.010000, Loss (train/test): 4.1279/4.1295, Acc (train/test): 23.0230/21.4000
Epoch [4/10], LR: 0.001000, Loss (train/test): 4.1279/4.1295, Acc (train/test): 23.4234/20.1000
Epoch [5/10], LR: 0.001000, Loss (train/test): 4.1279/4.1295, Acc (train/test): 23.5235/20.9000
Epoch [6/10], LR: 0.001000, Loss (train/test): 4.1279/4.1295, Acc (train/test): 23.5235/21.0000
Epoch [7/10], LR: 0.000100, Loss (train/test): 4.1279/4.1295, Acc (train/test): 23.7237/20.8000
Epoch [8/10], LR: 0.000100, Loss (train/test): 4.1279/4.1295, Acc (train/test): 23.9239/20.7000
Epoch [9/10], LR: 0.000100, Loss (train/test): 4.1279/4.1295, Acc (train/test): 24.0240/20.6000
Epoch [10/10], LR: 0.000010, Loss (train/test): 4.1279/4.1295, Acc (train/test): 24.0240/20.6000


([4.127929776906967,
  4.1279273480176935,
  4.1279269605875015,
  4.12792693078518,
  4.127926513552666,
  4.127926662564277,
  4.1279266923666,
  4.127926751971245,
  4.127926692366599,
  4.1279266923666],
 [19.71971971971972,
  21.82182182182182,
  23.023023023023022,
  23.423423423423422,
  23.523523523523522,
  23.523523523523522,
  23.723723723723722,
  23.923923923923923,
  24.024024024024023,
  24.024024024024023],
 [4.129515290260315,
  4.129512429237366,
  4.129512235522271,
  4.129511386156082,
  4.12951086461544,
  4.12951135635376,
  4.129511207342148,
  4.129511207342148,
  4.129511147737503,
  4.129511207342148],
 [18.6, 20.8, 21.4, 20.1, 20.9, 21.0, 20.8, 20.7, 20.6, 20.6])