#### CNN's for NLP


* http://www.wildml.com/2015/11/understanding-convolutional-neural-networks-for-nlp/
* dataset: https://github.com/spro/practical-pytorch

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable


### Import text-data from txt.files
__Dataset import__

In [2]:
# dataset location: ./dataset/names/*.txt
import glob

all_filenames = glob.glob('dataset/names/*.txt')
print(all_filenames)

['dataset/names/Arabic.txt', 'dataset/names/Chinese.txt', 'dataset/names/Czech.txt', 'dataset/names/Dutch.txt', 'dataset/names/English.txt', 'dataset/names/French.txt', 'dataset/names/German.txt', 'dataset/names/Greek.txt', 'dataset/names/Irish.txt', 'dataset/names/Italian.txt', 'dataset/names/Japanese.txt', 'dataset/names/Korean.txt', 'dataset/names/Polish.txt', 'dataset/names/Portuguese.txt', 'dataset/names/Russian.txt', 'dataset/names/Scottish.txt', 'dataset/names/Spanish.txt', 'dataset/names/Vietnamese.txt']


__Convert to non-ascii characters__

In [3]:
import unicodedata
import string

all_letters = string.ascii_letters + "- .,;'0123456789"
n_letters = len(all_letters)

# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

print(unicode_to_ascii('Ślusàrski'))

Slusarski


__Determine categories and words inside each txt file__

In [4]:

# Build the category_lines dictionary, a list of names per language
category_lines = {}
all_categories = []

# Read a file and split into lines
def readLines(filename):
    lines = open(filename).read().strip().split('\n')
    return [unicode_to_ascii(line) for line in lines]

# create a list of words for each category
for filename in all_filenames:
    category = filename.split('/')[-1].split('.')[0]
    all_categories.append(category)
    category_lines[category] = readLines(filename)

n_categories = len(all_categories)
print('n_categories =', n_categories)

# all_categories contains the keys to iterate over the category_lines dict
print(all_categories)

n_categories = 18
['Arabic', 'Chinese', 'Czech', 'Dutch', 'English', 'French', 'German', 'Greek', 'Irish', 'Italian', 'Japanese', 'Korean', 'Polish', 'Portuguese', 'Russian', 'Scottish', 'Spanish', 'Vietnamese']


__Creating Tensors__
 
Usually add padding to the character-sequences to normalise length for the CNN input. I'll try to Avoid this by treating the words as sequences of bi-grams:

e.g. bigram-tensor for the word 'every'
    
|height (4)|width (2)     |
|------|---:|
|   |'ev'|
|   |'ve'|
|   |'er'|
|   |'ry'|

In [29]:
# index all possible bigrams
possible_bigrams = []
for letter_1 in all_letters:
    for letter_2 in all_letters:
        possible_bigrams.append(letter_1 + letter_2)
# reversed index & convert possible bigrams to dict
all_bigrams = {bigram: index for index, bigram in enumerate(possible_bigrams)}
possible_bigrams = {index: bigram for index, bigram in enumerate(possible_bigrams)}
    
print(possible_bigrams[0])
print(all_bigrams['aa'])

aa
0


In [45]:
# function that takes a list of characters and ouputs bi-gram tensors with the same label
def word_to_bigrams(word):
    bigrams = []
    if len(word) < 2:
        # words consisting of a single letter are padded with a space ' '
        return [word + ' ']
    else:
        list_of_chars = list(word)
        # n-1 bigrams in a word
        for i in range(len(list_of_chars) - 1):
            bigrams.append([list_of_chars[i] + list_of_chars[i + 1]])
        return bigrams

In [48]:
word_to_bigrams('test_word')

[['te'], ['es'], ['st'], ['t_'], ['_w'], ['wo'], ['or'], ['rd']]

In [138]:
num_batches = 1 

def word_to_tensor(word):
    # e.g. for the word 'every' a 4 by 2 tensor
    tensor = torch.zeros(len(word) - 1, num_batches, len(possible_bigrams))
    print(type(tensor)) ## hiermee verder! goed de tensor-grootte bekijken
    for bigram_nr, bigram in enumerate(word_to_bigrams(word)):
        tensor[bigram_nr][0][all_bigrams[bigram[0]]] =1
    return tensor

In [142]:
word_to_tensor('test').size()

<class 'torch.FloatTensor'>


torch.Size([3, 1, 4624])

In [140]:
type(tensor)

torch.FloatTensor

In [98]:
list_of_lines = [li for li in category_lines.values()]
[line for line in list_of_lines[17]]

['Nguyen',
 'Tron',
 'Le',
 'Pham',
 'Huynh',
 'Hoang',
 'Phan',
 'Vu',
 'Vo',
 'Dang',
 'Bui',
 'Do',
 'Ho',
 'Ngo',
 'Duong',
 'Ly',
 'An',
 'an',
 'Bach',
 'Banh',
 'Cao',
 'Chau',
 'Chu',
 'Chung',
 'Chu',
 'Diep',
 'Doan',
 'Dam',
 'Dao',
 'Dinh',
 'Doan',
 'Giang',
 'Ha',
 'Han',
 'Kieu',
 'Kim',
 'La',
 'Lac',
 'Lam',
 'Lieu',
 'Luc',
 'Luong',
 'Luu',
 'Ma',
 'Mach',
 'Mai',
 'Nghiem',
 'Phi',
 'Pho',
 'Phung',
 'Quach',
 'Quang',
 'Quyen',
 'Ta',
 'Thach',
 'Thai',
 'Sai',
 'Thi',
 'Than',
 'Thao',
 'Thuy',
 'Tieu',
 'To',
 'Ton',
 'Tong',
 'Trang',
 'Trieu',
 'Trinh',
 'Truong',
 'Van',
 'Vinh',
 'Vuong',
 'Vuu']

In [73]:
[line[0][:][:] for cat, line in enumerate(category_lines.values())]

['Khoury',
 'Ang',
 'Abl',
 'Aalsburg',
 'Abbas',
 'Abel',
 'Abbing',
 'Adamidis',
 'Adam',
 'Abandonato',
 'Abe',
 'Ahn',
 'Adamczak',
 'Abreu',
 'Ababko',
 'Smith',
 'Abana',
 'Nguyen']

In [35]:
all_letters.find('a')

0