#### CNN's for NLP


* http://www.wildml.com/2015/11/understanding-convolutional-neural-networks-for-nlp/
* dataset: https://github.com/spro/practical-pytorch

In [170]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F

### Import text-data from txt.files
__Dataset import__

In [2]:
# dataset location: ./dataset/names/*.txt
import glob

all_filenames = glob.glob('dataset/names/*.txt')
print(all_filenames)

['dataset/names/Arabic.txt', 'dataset/names/Chinese.txt', 'dataset/names/Czech.txt', 'dataset/names/Dutch.txt', 'dataset/names/English.txt', 'dataset/names/French.txt', 'dataset/names/German.txt', 'dataset/names/Greek.txt', 'dataset/names/Irish.txt', 'dataset/names/Italian.txt', 'dataset/names/Japanese.txt', 'dataset/names/Korean.txt', 'dataset/names/Polish.txt', 'dataset/names/Portuguese.txt', 'dataset/names/Russian.txt', 'dataset/names/Scottish.txt', 'dataset/names/Spanish.txt', 'dataset/names/Vietnamese.txt']


__Convert to non-ascii characters__

In [145]:
import unicodedata
import string

all_letters = string.ascii_letters + "_- .,;'0123456789"
n_letters = len(all_letters)

# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

print(unicode_to_ascii('Ślusàrski'))

Slusarski


__Determine categories and words inside each txt file__

In [146]:

# Build the category_lines dictionary, a list of names per language
category_lines = {}
all_categories = []

# Read a file and split into lines
def readLines(filename):
    lines = open(filename).read().strip().split('\n')
    return [unicode_to_ascii(line) for line in lines]

# create a list of words for each category
for filename in all_filenames:
    category = filename.split('/')[-1].split('.')[0]
    all_categories.append(category)
    category_lines[category] = readLines(filename)

n_categories = len(all_categories)
print('n_categories =', n_categories)

# all_categories contains the keys to iterate over the category_lines dict
print(all_categories)

n_categories = 18
['Arabic', 'Chinese', 'Czech', 'Dutch', 'English', 'French', 'German', 'Greek', 'Irish', 'Italian', 'Japanese', 'Korean', 'Polish', 'Portuguese', 'Russian', 'Scottish', 'Spanish', 'Vietnamese']


__Creating Tensors__
 
Usually add padding to the character-sequences to normalise length for the CNN input. I'll try to Avoid this by treating the words as sequences of bi-grams:

e.g. bigram-tensor for the word 'every'
    
|height (4)|width (2)     |
|------|---:|
|   |'ev'|
|   |'ve'|
|   |'er'|
|   |'ry'|

In [147]:
# index all possible bigrams
possible_bigrams = []
for letter_1 in all_letters:
    for letter_2 in all_letters:
        possible_bigrams.append(letter_1 + letter_2)
# reversed index & convert possible bigrams to dict
all_bigrams = {bigram: index for index, bigram in enumerate(possible_bigrams)}
possible_bigrams = {index: bigram for index, bigram in enumerate(possible_bigrams)}
    
print(possible_bigrams[0])
print(all_bigrams['a_'])

aa
52


In [148]:
# function that takes a list of characters and ouputs bi-gram tensors with the same label
def word_to_bigrams(word):
    bigrams = []
    if len(word) < 2:
        # words consisting of a single letter are padded with a space ' '
        return [word + ' ']
    else:
        list_of_chars = list(word)
        # n-1 bigrams in a word
        for i in range(len(list_of_chars) - 1):
            bigrams.append([list_of_chars[i] + list_of_chars[i + 1]])
        return bigrams

In [149]:
word_to_bigrams('test_word')

[['te'], ['es'], ['st'], ['t_'], ['_w'], ['wo'], ['or'], ['rd']]

In [150]:
num_batches = 1 

def word_to_tensor(word):
    # e.g. for the word 'every' a 4 by 2 tensor
    tensor = torch.zeros(len(word) - 1, num_batches, len(possible_bigrams))
    print(type(tensor)) ## hiermee verder! goed de tensor-grootte bekijken
    for bigram_nr, bigram in enumerate(word_to_bigrams(word)):
        tensor[bigram_nr][0][all_bigrams[bigram[0]]] =1
    return tensor

In [205]:
word_to_tensor('test_word').size()

<class 'torch.FloatTensor'>


torch.Size([8, 1, 4761])

In [206]:
# the X input is a list of all tensors, representing all names
x_input = []
for category in all_categories:
    for name in category_lines[category]:
        x_input.append(word_to_tensor(name))

# the Y labels are the categories, where arabic is 0 and vietnamese is 17
y_input = []
for idx, category in enumerate(all_categories):
    for i in range(0, len(category_lines[category])):
        y_input.append(idx)
        

<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatT

<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatT

<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatT

<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatT

<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatTensor'>
<class 'torch.FloatT

In [290]:
data = []

for idx, _ in enumerate(x_input):
    data.append([x_input[idx], y_input[idx]])
    
from random import shuffle
shuffle(data)

In [293]:
import pandas as pd

dataframe = {
    'Amount of labels' : [data[i][1] for i in range(9)],
    'Size of some tensors': [tensor.size() for tensor in x_input[9000:9010]]
}

pd.DataFrame(dataframe)

ValueError: arrays must all be same length

#### CNN model
* 2 convolutional layers
* 2 pooling

In [237]:
class CNN(nn.Module):
    def __init__(self, input_ch, conv1_ch, output_ch, kernel_size, fc_dim, output_size):
        super(CNN, self).__init__()
        
        # Convolution 1
        self.cnn1 = nn.Conv2d(in_channels=input_ch, out_channels=conv1_ch, kernel_size=kernel_size, stride=1,padding=2)
        self.activation1 = nn.ReLU()
        
        # Max pool 1
        self.maxpool1 = nn.MaxPool2d(kernel_size=2)
        
        #Convolution2
        self.cnn2 = nn.Conv2d(in_channels=conv1_ch, out_channels=output_ch, kernel_size=kernel_size, stride=1,padding=2)
        self.activation2 = nn.ReLU()
        
        # Max pool 1
        self.maxpool1 = nn.MaxPool2d(kernel_size=2)
        
        # Fully connected 
        self.fc = nn.Linear(output_ch, output_size)
        
        
    def forward(self, x):
        # 3D tensor to 4D for the conv layer:
        x = x.unsqueeze(0)
        x = self.cnn1(x)
        x = self.activation1(x)
        x = self.maxpool1(x)
        x = self.cnn2(x)
        x = self.activation2(x)
        x = self.maxpool2(x)
        # Resize
        # - original size: [wordlength - 1, batch_size, possible_bigrams: 4761]
        print(x.size)
        # - x.size
        # - new output size: [wordlength - 1, batch_size, possible_bigrams: 4761]
        x = x.view(x.size(0),-1)
        x = self.fc(x)
        return x

__Kernel size__
* $O = \frac{W-K+2P}{S}+1$
  * $O$: output heigth/length
  * $W$: input height/length
  * $K$: kernel size
  * $P$: padding
    * $ P = \frac{K-1}{2}$
  * $S$: Stride
* $O$ = len(word_to_bigrams)

In [238]:
# input_ch = 1
# conv1_ch = 16
# output_ch, = 32
# kernel_size = 2 to 5
# fc_dim = 1
# output_size = 18 classes
### non-sliding kernel_height = 4761 (possible_bigrams)
### sliding could be e.g. 529 (possible_bigrams/9)
model = CNN(1,16,32,2,1,18)
if torch.cuda.is_available():
    model.cuda()

In [239]:
#define some parameters

epochs = 10
batch_size = 32
learning_rate = 0.01
momentum = 0.9


#define loss and optimizer

criterion = nn.CrossEntropyLoss() #cross entropy loss = log softmax + NLL loss
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)

In [240]:
#for plotting

plot_loss = []
plot_correct = []

In [241]:
# Considering a cuda model, otherwise remove .cpu() or write if/else 
print(model.cpu().parameters())
print(len(list(model.cpu().parameters())))
print('Conv1 kernels:\n',list(model.cpu().parameters())[0].size())
print('Conv1 bias kernels:\n',list(model.cpu().parameters())[1].size())
print('Conv2 kernels (depth 16):\n',list(model.cpu().parameters())[2].size())
print('Conv2 bias kernels:\n',list(model.cpu().parameters())[3].size())
print('Fully connected layer:\n',list(model.cpu().parameters())[4].size())
print('Fully connected bias:\n',list(model.cpu().parameters())[5].size())


<generator object Module.parameters at 0x7fa394348f68>
6
Conv1 kernels:
 torch.Size([16, 1, 2, 2])
Conv1 bias kernels:
 torch.Size([16])
Conv2 kernels (depth 16):
 torch.Size([32, 16, 2, 2])
Conv2 bias kernels:
 torch.Size([32])
Fully connected layer:
 torch.Size([18, 32])
Fully connected bias:
 torch.Size([18])


In [244]:
def train(model, criterion, optimizer, x, y):
    x = Variable(x, requires_grad=False)
    x = x.cuda()
    y = Variable(y, requires_grad=False)
    y = y.cuda()
    
    # reset gradient
    optimizer.zero_grad()

    # forward pass
    fx = model.forward(x)
    
    # get the loss
    loss = criterion(fx, y)

    # backward pass
    loss.backward()

    # update parameters
    optimizer.step()

    # return the actual loss data, not the Variable
    return loss.data[0]

In [288]:

data[0][0]


( 0  ,.,.) = 
   0   0   0  ...    0   0   0

( 1  ,.,.) = 
   0   0   0  ...    0   0   0

( 2  ,.,.) = 
   0   0   0  ...    0   0   0

( 3  ,.,.) = 
   0   0   0  ...    0   0   0

( 4  ,.,.) = 
   0   0   0  ...    0   0   0

( 5  ,.,.) = 
   0   0   0  ...    0   0   0
[torch.FloatTensor of size 6x1x4761]

In [289]:
iter = 0 

for e in range(1, epochs+1):
    loss = 0.
    for i, d in enumerate(data):
        x = data[i][0] #converts list of indices to tensor of indices
        y = torch.LongTensor(data[i][1])

        loss += train(model, criterion, optimizer, x, y)
    plot_loss.append(loss/num_examples)
    print("Epoch %02d, loss = %f" % (e, loss / num_examples))

RuntimeError: Need input.size[1] == 1 but got 6 instead.

In [None]:
list_of_lines = [li for li in category_lines.values()]
[line for line in list_of_lines[17]]

In [73]:
[line[0][:][0] for cat, line in enumerate(category_lines.values())]

['Khoury',
 'Ang',
 'Abl',
 'Aalsburg',
 'Abbas',
 'Abel',
 'Abbing',
 'Adamidis',
 'Adam',
 'Abandonato',
 'Abe',
 'Ahn',
 'Adamczak',
 'Abreu',
 'Ababko',
 'Smith',
 'Abana',
 'Nguyen']

In [202]:
28*28*2

1568