In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim

In [2]:
import unicodedata
import string

# Every character the model knows about: ASCII letters plus a few symbols and digits.
all_letters = string.ascii_letters + "_- .,;'0123456789"


# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(s):
    """Return ``s`` reduced to the characters listed in ``all_letters``.

    The string is NFD-normalised first, so an accented letter is split into
    its base letter followed by combining marks. Combining marks (Unicode
    category ``Mn``) are dropped, and so is any remaining character that is
    not part of ``all_letters``.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            # combining accent left over from the decomposition
            continue
        if ch in all_letters:
            kept.append(ch)
    return ''.join(kept)


print(unicode_to_ascii('Ślusàrski'))

Slusarski


In [3]:
# character -> index; numbering starts at 1, so index 0 is never assigned
char2idx = dict(zip(all_letters, range(1, len(all_letters) + 1)))

# index -> character, using the same 1-based numbering
index2char = dict(enumerate(all_letters, start=1))

print("number of letters: %i \n" % len(char2idx))

number of letters: 69 



In [4]:
# Sample words covering every length from 4 to 12 characters
# (two of them are 4 characters long).
words = [
    'four',
    'word',
    'word5',
    'AaBbCc',
    'example',
    'eightlet',
    'ninelette',
    'tenletters',
    'elevenlette',
    'twelveletter',
]

In [5]:
# Length of the shortest and of the longest sample word.
word_lengths = list(map(len, words))
min_len, max_len = min(word_lengths), max(word_lengths)
print(min_len, max_len)

4 12


In [6]:
# Encode every word as the list of its character indices and print
# the length of each encoded word as it is produced.
word_idx = []

for word in words:
    encoded = [char2idx[c] for c in word]
    word_idx.append(encoded)
    print(len(encoded))

4
4
5
6
7
8
9
10
11
12


In [14]:
# create batches of words
# Buckets are keyed by every second length starting at min_len; the bucket
# keyed `size` collects the encoded words of length `size` or `size + 1`.
bucket_sizes = range(min_len, max_len + 1, 2)
batches = {size: [] for size in bucket_sizes}
print('empty batches:', batches)

for encoded in word_idx:
    n_chars = len(encoded)
    for size in bucket_sizes:
        if n_chars in (size, size + 1):
            batches[size].append(encoded)
print('size_based batches:\n', batches)

empty batches: {4: [], 6: [], 8: [], 10: [], 12: []}
size_based batches:
 {4: [[6, 15, 21, 18], [23, 15, 18, 4], [23, 15, 18, 4, 65]], 6: [[27, 1, 28, 2, 29, 3], [5, 24, 1, 13, 16, 12, 5]], 8: [[5, 9, 7, 8, 20, 12, 5, 20], [14, 9, 14, 5, 12, 5, 20, 20, 5]], 10: [[20, 5, 14, 12, 5, 20, 20, 5, 18, 19], [5, 12, 5, 22, 5, 14, 12, 5, 20, 20, 5]], 12: [[20, 23, 5, 12, 22, 5, 12, 5, 20, 20, 5, 18]]}


In [8]:
class CNN(nn.Module):
    """Character-level convolutional model for one index-encoded word.

    Pipeline: embedding lookup -> a single Conv2d whose kernel covers two
    adjacent characters across the full embedding width -> adaptive max
    pooling to a fixed 3 x 1 map -> a linear layer applied over the trailing
    size-1 dimension.

    Args:
        input_size: number of rows in the embedding table; every index passed
            to ``forward`` must be smaller than this value.
        output_size: number of output features of the final linear layer.
        embedding_dim: width of each character embedding (and of the conv
            kernel). ``None`` (default) falls back to ``len(char2idx)`` from
            the notebook namespace, the value that used to be hard-coded.
        verbose: when True (default) ``forward`` prints the tensor size after
            each stage; pass False to silence the trace.
    """

    def __init__(self, input_size, output_size, embedding_dim=None, verbose=True):
        super(CNN, self).__init__()

        if embedding_dim is None:
            # Backward-compatible default: one embedding column per known character.
            embedding_dim = len(char2idx)

        self.input_size = input_size
        self.embedding_dim = embedding_dim
        self.verbose = verbose

        self.embedding = nn.Embedding(input_size, embedding_dim)

        # The kernel is as wide as the embedding, so the convolution slides
        # only along the character axis, two characters at a time.
        self.conv = nn.Conv2d(in_channels=1, out_channels=1,
                              kernel_size=(2, embedding_dim))
        # Adaptive pooling gives a 3 x 1 map regardless of the word length.
        self.pool = nn.AdaptiveMaxPool2d((3, 1), return_indices=False)

        self.linear = nn.Linear(1, output_size)  # n_channels, output_size

    def _trace(self, label, tensor):
        """Print ``label`` followed by the size of ``tensor`` when verbose."""
        if self.verbose:
            print(label, tensor.size())

    def forward(self, x):
        """Run the model on one encoded word.

        Args:
            x: 1-D tensor of character indices. It needs at least two
                entries because the conv kernel spans two positions.

        Returns:
            Tensor of size ``(1, 1, 3, output_size)``.
        """
        self._trace('\nn_length:', x)

        x = self.embedding(x)
        self._trace('embed:', x)

        # Add batch and channel dimensions in front: (1, 1, n_chars, embedding_dim).
        x = x.unsqueeze(0).unsqueeze(0)
        self._trace('unsqueezed:', x)

        x = self.conv(x)
        self._trace('conv_x:', x)
        x = self.pool(x)
        self._trace('pool:', x)

        return self.linear(x)

In [9]:
# char2idx assigns indices 1..len(char2idx) and never uses 0, so the embedding
# table needs len(char2idx) + 1 rows; with only len(char2idx) rows the highest
# index (the last character of all_letters) would be out of range.
vocab_size = len(char2idx) + 1
cnn = CNN(vocab_size, 18)

In [10]:
# Feed each encoded word through the network one at a time.
for word in word_idx:
    name_var = Variable(torch.LongTensor(word))

    # Character positions 0..n-1 as a float column vector; it is built here
    # but not passed to the model.
    positions = torch.LongTensor(list(range(len(name_var)))).float()
    index_var = Variable(positions.unsqueeze(1))

    print(name_var.size())
    cnn(name_var)

torch.Size([4])

n_length: torch.Size([4])
embed: torch.Size([4, 69])
unsqueezed: torch.Size([1, 1, 4, 69])
conv_x: torch.Size([1, 1, 3, 1])
pool: torch.Size([1, 1, 3, 1])
torch.Size([4])

n_length: torch.Size([4])
embed: torch.Size([4, 69])
unsqueezed: torch.Size([1, 1, 4, 69])
conv_x: torch.Size([1, 1, 3, 1])
pool: torch.Size([1, 1, 3, 1])
torch.Size([5])

n_length: torch.Size([5])
embed: torch.Size([5, 69])
unsqueezed: torch.Size([1, 1, 5, 69])
conv_x: torch.Size([1, 1, 4, 1])
pool: torch.Size([1, 1, 3, 1])
torch.Size([6])

n_length: torch.Size([6])
embed: torch.Size([6, 69])
unsqueezed: torch.Size([1, 1, 6, 69])
conv_x: torch.Size([1, 1, 5, 1])
pool: torch.Size([1, 1, 3, 1])
torch.Size([7])

n_length: torch.Size([7])
embed: torch.Size([7, 69])
unsqueezed: torch.Size([1, 1, 7, 69])
conv_x: torch.Size([1, 1, 6, 1])
pool: torch.Size([1, 1, 3, 1])
torch.Size([8])

n_length: torch.Size([8])
embed: torch.Size([8, 69])
unsqueezed: torch.Size([1, 1, 8, 69])
conv_x: torch.Size([1, 1, 7, 1])

In [11]:
# Scratch check: list the iterations produced by a bound of (8 - 4) // 2.
n_iterations = (8 - 4) // 2
for i in range(n_iterations):
    print(i)

0
1


In [12]:
# Scratch check of floor division: 5 // 3 evaluates to 1.
5//3

1