#### CNN's for NLP


* http://www.wildml.com/2015/11/understanding-convolutional-neural-networks-for-nlp/
* dataset: https://github.com/spro/practical-pytorch

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F

### Import text-data from txt.files
__Dataset import__

In [2]:
# dataset location: ./dataset/names/*.txt
import glob

all_filenames = glob.glob('dataset/names/*.txt')
print(all_filenames)

['dataset/names/Arabic.txt', 'dataset/names/Chinese.txt', 'dataset/names/Czech.txt', 'dataset/names/Dutch.txt', 'dataset/names/English.txt', 'dataset/names/French.txt', 'dataset/names/German.txt', 'dataset/names/Greek.txt', 'dataset/names/Irish.txt', 'dataset/names/Italian.txt', 'dataset/names/Japanese.txt', 'dataset/names/Korean.txt', 'dataset/names/Polish.txt', 'dataset/names/Portuguese.txt', 'dataset/names/Russian.txt', 'dataset/names/Scottish.txt', 'dataset/names/Spanish.txt', 'dataset/names/Vietnamese.txt']


__Convert to non-ascii characters__

In [3]:
import unicodedata
import string

all_letters = string.ascii_letters + "_- .,;'0123456789"
n_letters = len(all_letters)

# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

print(unicode_to_ascii('Ślusàrski'))

Slusarski


__Determine categories and words inside each txt file__

In [4]:

# Build the category_lines dictionary, a list of names per language
category_lines = {}
all_categories = []

# Read a file and split into lines
def readLines(filename):
    lines = open(filename).read().strip().split('\n')
    return [unicode_to_ascii(line) for line in lines]

# create a list of words for each category
for filename in all_filenames:
    category = filename.split('/')[-1].split('.')[0]
    all_categories.append(category)
    category_lines[category] = readLines(filename)

n_categories = len(all_categories)
print('n_categories =', n_categories)

# all_categories contains the keys to iterate over the category_lines dict
print(all_categories)

n_categories = 18
['Arabic', 'Chinese', 'Czech', 'Dutch', 'English', 'French', 'German', 'Greek', 'Irish', 'Italian', 'Japanese', 'Korean', 'Polish', 'Portuguese', 'Russian', 'Scottish', 'Spanish', 'Vietnamese']


__Creating Tensors__
 
Usually add padding to the character-sequences to normalise length for the CNN input. I'll try to Avoid this by treating the words as sequences of bi-grams:

e.g. bigram-tensor for the word 'every'
    
|height (4)|width (2)     |
|------|---:|
|   |'ev'|
|   |'ve'|
|   |'er'|
|   |'ry'|

In [5]:
# index all possible bigrams
possible_bigrams = []
for letter_1 in all_letters:
    for letter_2 in all_letters:
        possible_bigrams.append(letter_1 + letter_2)
# reversed index & convert possible bigrams to dict
all_bigrams = {bigram: index for index, bigram in enumerate(possible_bigrams)}
possible_bigrams = {index: bigram for index, bigram in enumerate(possible_bigrams)}
    
print(possible_bigrams[0])
print(all_bigrams['a_'])

aa
52


In [6]:
# function that takes a list of characters and ouputs bi-gram tensors with the same label
def word_to_bigrams(word):
    bigrams = []
    if len(word) < 2:
        # words consisting of a single letter are padded with a space ' '
        return [word + ' ']
    else:
        list_of_chars = list(word)
        # n-1 bigrams in a word
        for i in range(len(list_of_chars) - 1):
            bigrams.append([list_of_chars[i] + list_of_chars[i + 1]])
        return bigrams

In [7]:
word_to_bigrams('test_word')

[['te'], ['es'], ['st'], ['t_'], ['_w'], ['wo'], ['or'], ['rd']]

In [8]:
num_batches = 1 

# split word into bigrams, create a 
def word_to_tensor(word):
    # e.g. for the word 'every' a 4 by 2 tensor
    tensor = torch.zeros(2, num_batches, len(possible_bigrams))
    # each tensor is a single vector with a 1 for every bigram appearing
    # this has to change!
    for bigram in word_to_bigrams(word):
        tensor[0][0][all_bigrams[bigram[0]]] =1
    return tensor

In [9]:
word_to_tensor('test_word')


( 0  ,.,.) = 
   0   0   0  ...    0   0   0

( 1  ,.,.) = 
   0   0   0  ...    0   0   0
[torch.FloatTensor of size 2x1x4761]

In [10]:
# the X input is a list of all tensors, representing all names
x_input = []
for category in all_categories:
    for name in category_lines[category]:
        x_input.append(word_to_tensor(name))

# the Y labels are the categories, where arabic is 0 and vietnamese is 17
y_input = []
for idx, category in enumerate(all_categories):
    for i in range(0, len(category_lines[category])):
        y_input.append(idx)
        

In [11]:
data = []

for idx, _ in enumerate(x_input):
    data.append([x_input[idx], y_input[idx]])
    
from random import shuffle
shuffle(data)


In [12]:
import pandas as pd

dataframe = {
    'Amount of labels' : [data[i][1] for i in range(10)],
    'Size of some tensors': [tensor.size() for tensor in x_input[9000:9010]]
}

pd.DataFrame(dataframe)

Unnamed: 0,Amount of labels,Size of some tensors
0,6,"(2, 1, 4761)"
1,14,"(2, 1, 4761)"
2,14,"(2, 1, 4761)"
3,3,"(2, 1, 4761)"
4,14,"(2, 1, 4761)"
5,0,"(2, 1, 4761)"
6,14,"(2, 1, 4761)"
7,0,"(2, 1, 4761)"
8,4,"(2, 1, 4761)"
9,8,"(2, 1, 4761)"


#### CNN model
* 2 convolutional layers
* 2 pooling

In [13]:
class CNN(nn.Module):
    def __init__(self, input_ch, conv1_ch, output_ch, kernel_size, fc_dim, output_size):
        super(CNN, self).__init__()
        
        # Convolution 1
        self.cnn1 = nn.Conv2d(in_channels=input_ch, out_channels=conv1_ch, kernel_size=kernel_size, stride=1,padding=2)
        self.activation1 = nn.ReLU()
        
        # Max pool 1
        self.maxpool1 = nn.MaxPool2d(kernel_size=2)
        
        #Convolution2
        self.cnn2 = nn.Conv2d(in_channels=conv1_ch, out_channels=output_ch, kernel_size=kernel_size, stride=1,padding=2)
        self.activation2 = nn.ReLU()
        
        # Max pool 2
        self.maxpool2 = nn.MaxPool2d(kernel_size=2)
        
        # Fully connected 
        self.fc = nn.Linear(76288, output_size)
        
        
    def forward(self, x):
        # 3D tensor to 4D for the conv layer:
        x = x.unsqueeze(0)
        x = self.cnn1(x)
        x = self.activation1(x)
        x = self.maxpool1(x)
        x = self.cnn2(x)
        x = self.activation2(x)
        x = self.maxpool2(x)
        # Resize
        # - original size: [wordlength - 1, batch_size, possible_bigrams: 4761]
        # - x.size
        # - new output size: [wordlength - 1, batch_size, possible_bigrams: 4761]
        x = x.view(x.size(0),-1)
        x = self.fc(x)
        return x

__Kernel size__
* $O = \frac{W-K+2P}{S}+1$
  * $O$: output heigth/length
  * $W$: input height/length
  * $K$: kernel size
  * $P$: padding
    * $ P = \frac{K-1}{2}$
  * $S$: Stride
* $O$ = len(word_to_bigrams)

In [14]:
# input_ch = 1
# conv1_ch = 16
# output_ch, = 32
# kernel_size = 2 to 5
# fc_dim = 1
# output_size = 18 classes
### non-sliding kernel_height = 4761 (possible_bigrams)
### sliding could be e.g. 529 (possible_bigrams/9)
model = CNN(2,16,32,2,1,18)
if torch.cuda.is_available():
    model.cuda()

In [15]:
#define some parameters

epochs = 10
batch_size = 32
learning_rate = 0.01
momentum = 0.9


#define loss and optimizer

criterion = nn.CrossEntropyLoss() #cross entropy loss = log softmax + NLL loss
optimizer = optim.Adam(model.parameters())

In [16]:
#for plotting

plot_loss = []
plot_correct = []

In [17]:
# Considering a cuda model, otherwise remove .cpu() or write if/else 
"""print(model.parameters())
print(len(list(model.parameters())))
print('Conv1 kernels:\n',list(model.parameters())[0].size())
print('Conv1 bias kernels:\n',list(model.parameters())[1].size())
print('Conv2 kernels (depth 16):\n',list(model.parameters())[2].size())
print('Conv2 bias kernels:\n',list(model.parameters())[3].size())
print('Fully connected layer:\n',list(model.parameters())[4].size())
print('Fully connected bias:\n',list(model.parameters())[5].size())"""


"print(model.parameters())\nprint(len(list(model.parameters())))\nprint('Conv1 kernels:\n',list(model.parameters())[0].size())\nprint('Conv1 bias kernels:\n',list(model.parameters())[1].size())\nprint('Conv2 kernels (depth 16):\n',list(model.parameters())[2].size())\nprint('Conv2 bias kernels:\n',list(model.parameters())[3].size())\nprint('Fully connected layer:\n',list(model.parameters())[4].size())\nprint('Fully connected bias:\n',list(model.parameters())[5].size())"

In [18]:
def train(model, criterion, optimizer, x, y):
    x = Variable(x, requires_grad=False)
    y = Variable(y, requires_grad=False)
    
    # reset gradient
    optimizer.zero_grad()

    # forward pass
    fx = model.forward(x)
    
    # get the loss
    loss = criterion(fx, y)

    # backward pass
    loss.backward()

    # update parameters
    optimizer.step()

    # return the actual loss data, not the Variable
    return loss.data[0]

In [19]:

torch.LongTensor(data[0][1]).cuda()


 1.4039e+14
 9.4418e+13
 0.0000e+00
 0.0000e+00
 0.0000e+00
 0.0000e+00
[torch.cuda.LongTensor of size 6 (GPU 0)]

In [20]:
data[0]

[
 ( 0  ,.,.) = 
    0   0   0  ...    0   0   0
 
 ( 1  ,.,.) = 
    0   0   0  ...    0   0   0
 [torch.FloatTensor of size 2x1x4761], 6]

In [21]:
iter = 0 

for e in range(1, epochs+1):
    loss = 0.
    for i, tensor in enumerate(data):

        x = data[i][0].cuda() #converts list of indices to tensor of indices
        y = torch.LongTensor([data[i][1]]).cuda()
        
        loss += train(model, criterion, optimizer, x, y)
    plot_loss.append(loss/len(data))
    print("Epoch %02d, loss = %f" % (e, loss / len(data)))

Epoch 01, loss = 0.950206
Epoch 02, loss = 0.637751
Epoch 03, loss = 0.571735
Epoch 04, loss = 0.537836
Epoch 05, loss = 0.516724
Epoch 06, loss = 0.503568
Epoch 07, loss = 0.493467
Epoch 08, loss = 0.481044
Epoch 09, loss = 0.478606
Epoch 10, loss = 0.470019


In [22]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
%matplotlib inline

plt.figure()
plt.plot(all_losses)


 14
[torch.FloatTensor of size 1]

In [None]:
# Keep track of correct guesses in a confusion matrix
confusion = torch.zeros(n_categories, n_categories)
n_confusion = 10000

# Just return an output given a line
def evaluate(line_tensor):
    return model(line_tensor)

# Go through a bunch of examples and record which are correctly guessed
for i in range(n_confusion):
    category, line, category_tensor, line_tensor = random_training_pair()
    output = evaluate(line_tensor)
    guess, guess_i = category_from_output(output)
    category_i = all_categories.index(category)
    confusion[category_i][guess_i] += 1

# Normalize by dividing every row by its sum
for i in range(n_categories):
    confusion[i] = confusion[i] / confusion[i].sum()

# Set up plot
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(confusion.numpy())
fig.colorbar(cax)

# Set up axes
ax.set_xticklabels([''] + all_categories, rotation=90)
ax.set_yticklabels([''] + all_categories)

# Force label at every tick
ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

plt.show()

In [23]:
list_of_lines = [li for li in category_lines.values()]
[line for line in list_of_lines[17]]

['Nguyen',
 'Tron',
 'Le',
 'Pham',
 'Huynh',
 'Hoang',
 'Phan',
 'Vu',
 'Vo',
 'Dang',
 'Bui',
 'Do',
 'Ho',
 'Ngo',
 'Duong',
 'Ly',
 'An',
 'an',
 'Bach',
 'Banh',
 'Cao',
 'Chau',
 'Chu',
 'Chung',
 'Chu',
 'Diep',
 'Doan',
 'Dam',
 'Dao',
 'Dinh',
 'Doan',
 'Giang',
 'Ha',
 'Han',
 'Kieu',
 'Kim',
 'La',
 'Lac',
 'Lam',
 'Lieu',
 'Luc',
 'Luong',
 'Luu',
 'Ma',
 'Mach',
 'Mai',
 'Nghiem',
 'Phi',
 'Pho',
 'Phung',
 'Quach',
 'Quang',
 'Quyen',
 'Ta',
 'Thach',
 'Thai',
 'Sai',
 'Thi',
 'Than',
 'Thao',
 'Thuy',
 'Tieu',
 'To',
 'Ton',
 'Tong',
 'Trang',
 'Trieu',
 'Trinh',
 'Truong',
 'Van',
 'Vinh',
 'Vuong',
 'Vuu']

In [24]:
[line[0][:][0] for cat, line in enumerate(category_lines.values())]

['K',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'A',
 'S',
 'A',
 'N']

In [25]:
28*28*2

1568