#### CNN's for NLP


* http://www.wildml.com/2015/11/understanding-convolutional-neural-networks-for-nlp/
* dataset: https://github.com/spro/practical-pytorch

In [2]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F

### Import text-data from txt.files
__Dataset import__

In [3]:
# dataset location: ./dataset/names/*.txt
import glob

all_filenames = glob.glob('dataset/names/*.txt')
print(all_filenames)

['dataset/names/Arabic.txt', 'dataset/names/Chinese.txt', 'dataset/names/Czech.txt', 'dataset/names/Dutch.txt', 'dataset/names/English.txt', 'dataset/names/French.txt', 'dataset/names/German.txt', 'dataset/names/Greek.txt', 'dataset/names/Irish.txt', 'dataset/names/Italian.txt', 'dataset/names/Japanese.txt', 'dataset/names/Korean.txt', 'dataset/names/Polish.txt', 'dataset/names/Portuguese.txt', 'dataset/names/Russian.txt', 'dataset/names/Scottish.txt', 'dataset/names/Spanish.txt', 'dataset/names/Vietnamese.txt']


__Convert to non-ascii characters__

In [4]:
import unicodedata
import string

all_letters = string.ascii_letters + "_- .,;'0123456789"
n_letters = len(all_letters)

# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

print(unicode_to_ascii('Ślusàrski'))

Slusarski


__Determine categories and words inside each txt file__

In [5]:

# Build the category_lines dictionary, a list of names per language
category_lines = {}
all_categories = []

# Read a file and split into lines
def readLines(filename):
    lines = open(filename).read().strip().split('\n')
    return [unicode_to_ascii(line) for line in lines]

# create a list of words for each category
for filename in all_filenames:
    category = filename.split('/')[-1].split('.')[0]
    all_categories.append(category)
    category_lines[category] = readLines(filename)

n_categories = len(all_categories)
print('n_categories =', n_categories)

# all_categories contains the keys to iterate over the category_lines dict
print(all_categories)

n_categories = 18
['Arabic', 'Chinese', 'Czech', 'Dutch', 'English', 'French', 'German', 'Greek', 'Irish', 'Italian', 'Japanese', 'Korean', 'Polish', 'Portuguese', 'Russian', 'Scottish', 'Spanish', 'Vietnamese']


__Creating Tensors__
 
Usually add padding to the character-sequences to normalise length for the CNN input. I'll try to Avoid this by treating the words as sequences of bi-grams:

e.g. bigram-tensor for the word 'every'
    
|height (4)|width (2)     |
|------|---:|
|'e'|'v'|
|'v'|'e'|
|'e'|'r'|
|'r'|'y'|

In [20]:
# index all possible bigrams
possible_bigrams = []
for letter_1 in all_letters:
    for letter_2 in all_letters:
        possible_bigrams.append(letter_1 + letter_2)
# reversed index & convert possible bigrams to dict
all_bigrams = {bigram: index for index, bigram in enumerate(possible_bigrams)}
possible_bigrams = {index: bigram for index, bigram in enumerate(possible_bigrams)}
    
print(possible_bigrams[0])
print(all_bigrams['a_'])

aa
52


In [21]:
# function that takes a list of characters and ouputs bi-gram tensors with the same label
def word_to_bigrams(word):
    bigrams = []
    if len(word) < 2:
        # words consisting of a single letter are padded with a space ' '
        return [word + ' ']
    else:
        list_of_chars = list(word)
        # n-1 bigrams in a word
        for i in range(len(list_of_chars) - 1):
            bigrams.append([list_of_chars[i] + list_of_chars[i + 1]])
        return bigrams

In [22]:
for index, i in enumerate(word_to_bigrams('test_word')):
    for letter in i:
        print(index, letter)

0 te
1 es
2 st
3 t_
4 _w
5 wo
6 or
7 rd


In [23]:
num_batches = 1 

# every letter in a word should be represented by a vector
def word_to_tensor(word):
    list_of_chars = list(word)
    tensor = torch.zeros(2, num_batches, n_letters)
    tensors = []
    
    # each tensor is a single vector with a 1 for every bigram appearing
    for index, letter in enumerate(list_of_chars):
        letter_index = all_letters.find(letter)
        tensor[0][0][letter_index] = 1
        tensors.append(tensor)
    return tensors


In [24]:
type(word_to_tensor('test')[0])

torch.FloatTensor

In [465]:
num_batches = 1 

def letter_to_tensor(letter):
    tensor = torch.zeros(1, n_letters)
    letter_index = all_letters.find(letter)
    tensor[0][letter_index] = 1
    return tensor

# create a tensor for each 
def word_to_tensor(word):
    tensor = torch.zeros(2, num_batches, n_letters)
    tensors = []
    # each tensor is a single vector with a 1 for every bigram appearing
    # this has to change!
    for bigram in word_to_bigrams(word):
        for index, letter in enumerate(bigram):
            letter_index = all_letters.find(letter)
            # mistake here
            tensor[index][0][letter_index] = 1
        tensors.append(tensor)
    return tensors

In [477]:
print(type(word_to_tensor('test_word')))
print(word_to_tensor('test_word')[0].size())

<class 'list'>
torch.Size([2, 1, 69])


In [479]:
19

[
 (0 ,.,.) = 
 
 Columns 0 to 18 
     0   1   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0
 
 Columns 19 to 37 
     0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
 
 Columns 38 to 56 
     0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
 
 Columns 57 to 68 
     0   0   0   0   0   0   0   0   0   0   0   0
 
 (1 ,.,.) = 
 
 Columns 0 to 18 
     0   0   0   0   1   0   0   0   0   0   0   0   0   1   0   0   0   0   0
 
 Columns 19 to 37 
     0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
 
 Columns 38 to 56 
     0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
 
 Columns 57 to 68 
     0   0   0   0   0   0   0   0   0   0   0   0
 [torch.FloatTensor of size 2x1x69], 
 (0 ,.,.) = 
 
 Columns 0 to 18 
     0   1   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0
 
 Columns 19 to 37 
     0   0   0   0   0   0   0   0   0   0   0   0   0   

In [384]:
# the X input is a list of all tensors, representing all names
x_input = []
for category in all_categories:
    for name in category_lines[category]:
        x_input.append(word_to_tensor(name))

# the Y labels are the categories, where arabic is 0 and vietnamese is 17
y_input = []
for idx, category in enumerate(all_categories):
    for i in range(0, len(category_lines[category])):
        y_input.append(idx)
        

#### Create a pytorch dataset from the data


In [None]:
from torch.utils.data import DataSet

class wordclassification(DataSet):
    def __init__(self, input_list, label_list, transform=None):
        '''
        Args:
            input_list: list with input values
            label_list: list with labels
            transform: Optional transform that may be applied
        '''
        self.inputs = [x for x in input_list]
        self.labels = [torch.LongTensor(y) for y in label_list]
        self.transform = transform
        
    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self,idx):
        
        sample = 
        return sample
        

In [396]:
data = []

for idx, _ in enumerate(x_input):
    data.append([x_input[idx], y_input[idx]])
    
from random import shuffle
shuffle(data)

from torch.utils.data import TensorDataset
for x, y in data:
    dataset = TensorDataset(x, torch.LongTensor(2,y))


In [397]:
import pandas as pd

dataframe = {
    'Amount of labels' : [data[i][1] for i in range(10)],
    'Size of some tensors': [tensor.size() for tensor in x_input[9000:9010]]
}

pd.DataFrame(dataframe)

Unnamed: 0,Amount of labels,Size of some tensors
0,10,"(2, 1, 4761)"
1,14,"(2, 1, 4761)"
2,14,"(2, 1, 4761)"
3,4,"(2, 1, 4761)"
4,14,"(2, 1, 4761)"
5,0,"(2, 1, 4761)"
6,4,"(2, 1, 4761)"
7,3,"(2, 1, 4761)"
8,0,"(2, 1, 4761)"
9,4,"(2, 1, 4761)"


#### CNN model
* 2 convolutional layers
* 2 pooling

In [327]:
class CNN(nn.Module):
    def __init__(self, input_ch, conv1_ch, output_ch, kernel_size, fc_dim, output_size):
        super(CNN, self).__init__()
        
        # Convolution 1
        self.cnn1 = nn.Conv2d(in_channels=input_ch, out_channels=conv1_ch, kernel_size=kernel_size, stride=1,padding=2)
        self.activation1 = nn.ReLU()
        
        # Max pool 1
        self.maxpool1 = nn.MaxPool2d(kernel_size=2)
        
        #Convolution2
        self.cnn2 = nn.Conv2d(in_channels=conv1_ch, out_channels=output_ch, kernel_size=kernel_size, stride=1,padding=2)
        self.activation2 = nn.ReLU()
        
        # Max pool 1
        self.maxpool1 = nn.MaxPool2d(kernel_size=2)
        
        # Fully connected 
        self.fc = nn.Linear(output_ch, output_size)
        
        
    def forward(self, x):
        # 3D tensor to 4D for the conv layer:
        x = x.unsqueeze(0)
        x = self.cnn1(x)
        x = self.activation1(x)
        x = self.maxpool1(x)
        x = self.cnn2(x)
        x = self.activation2(x)
        x = self.maxpool2(x)
        # Resize
        # - original size: [wordlength - 1, batch_size, possible_bigrams: 4761]
        print(x.size())
        # - x.size
        # - new output size: [wordlength - 1, batch_size, possible_bigrams: 4761]
        x = x.view(x.size(0),-1)
        x = self.fc(x)
        return x

__Kernel size__
* $O = \frac{W-K+2P}{S}+1$
  * $O$: output heigth/length
  * $W$: input height/length
  * $K$: kernel size
  * $P$: padding
    * $ P = \frac{K-1}{2}$
  * $S$: Stride
* $O$ = len(word_to_bigrams)

In [355]:
# input_ch = 1
# conv1_ch = 16
# output_ch, = 32
# kernel_size = 2 to 5
# fc_dim = 1
# output_size = 18 classes
### non-sliding kernel_height = 4761 (possible_bigrams)
### sliding could be e.g. 529 (possible_bigrams/9)
model = CNN(2,16,32,2,1,18)
if torch.cuda.is_available():
    model.cuda()

In [356]:
#define some parameters

epochs = 10
batch_size = 32
learning_rate = 0.01
momentum = 0.9


#define loss and optimizer

criterion = nn.CrossEntropyLoss() #cross entropy loss = log softmax + NLL loss
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)

In [357]:
#for plotting

plot_loss = []
plot_correct = []

In [358]:
# Considering a cuda model, otherwise remove .cpu() or write if/else 
print(model.cpu().parameters())
print(len(list(model.cpu().parameters())))
print('Conv1 kernels:\n',list(model.cpu().parameters())[0].size())
print('Conv1 bias kernels:\n',list(model.cpu().parameters())[1].size())
print('Conv2 kernels (depth 16):\n',list(model.cpu().parameters())[2].size())
print('Conv2 bias kernels:\n',list(model.cpu().parameters())[3].size())
print('Fully connected layer:\n',list(model.cpu().parameters())[4].size())
print('Fully connected bias:\n',list(model.cpu().parameters())[5].size())


<generator object Module.parameters at 0x7fa3a2fcbd00>
6
Conv1 kernels:
 torch.Size([16, 2, 2, 2])
Conv1 bias kernels:
 torch.Size([16])
Conv2 kernels (depth 16):
 torch.Size([32, 16, 2, 2])
Conv2 bias kernels:
 torch.Size([32])
Fully connected layer:
 torch.Size([18, 32])
Fully connected bias:
 torch.Size([18])


In [359]:
def train(model, criterion, optimizer, x, y):
    x = Variable(x, requires_grad=False)
    y = Variable(y, requires_grad=False)
    
    # reset gradient
    optimizer.zero_grad()

    # forward pass
    fx = model.forward(x)
    
    # get the loss
    loss = criterion(fx, y)

    # backward pass
    loss.backward()

    # update parameters
    optimizer.step()

    # return the actual loss data, not the Variable
    return loss.data[0]

In [375]:

torch.LongTensor(data[0][1]).cuda()


 1.4034e+14
 1.4034e+14
[torch.cuda.LongTensor of size 2 (GPU 0)]

In [401]:
dataset[0]

(
     0     0     0  ...      0     0     0
 [torch.FloatTensor of size 1x4761], 
  9.4173e+13
  9.4173e+13
  0.0000e+00
 -4.2950e+09
 [torch.LongTensor of size 4])

In [400]:
iter = 0 

for e in range(1, epochs+1):
    loss = 0.
    for i in range(len(dataset)):
        x = dataset[i].cuda() #converts list of indices to tensor of indices
        y = torch.LongTensor(2, data[i][1]).cuda()

        loss += train(model, criterion, optimizer, x, y)
    plot_loss.append(loss/num_examples)
    print("Epoch %02d, loss = %f" % (e, loss / num_examples))

AttributeError: 'tuple' object has no attribute 'cuda'

In [None]:
list_of_lines = [li for li in category_lines.values()]
[line for line in list_of_lines[17]]

In [73]:
[line[0][:][0] for cat, line in enumerate(category_lines.values())]

['Khoury',
 'Ang',
 'Abl',
 'Aalsburg',
 'Abbas',
 'Abel',
 'Abbing',
 'Adamidis',
 'Adam',
 'Abandonato',
 'Abe',
 'Ahn',
 'Adamczak',
 'Abreu',
 'Ababko',
 'Smith',
 'Abana',
 'Nguyen']

In [202]:
28*28*2

1568