In [1]:
import warnings;warnings.filterwarnings('ignore')

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [3]:
from bidict import bidict

In [4]:
text = "qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM,./;'[]1234567890-=!@#$%^&*()_+{}:<>?\"'"

In [5]:
c2i=bidict()

In [6]:
c2i['<pad>'] = len(c2i)
c2i['<unk>'] =  len(c2i)

In [7]:
for i,c in enumerate(set(text)):
    c2i[c]=i+2

In [8]:
c2i

bidict({'<pad>': 0, '<unk>': 1, 'W': 2, 'S': 3, '5': 4, 'O': 5, '/': 6, 'i': 7, '_': 8, '!': 9, 'U': 10, 'm': 11, 'Q': 12, 'Y': 13, '+': 14, 'f': 15, 'X': 16, 'C': 17, 'L': 18, '3': 19, '8': 20, 'b': 21, '}': 22, ']': 23, ',': 24, 'n': 25, '@': 26, 'H': 27, 'F': 28, '>': 29, 'o': 30, '=': 31, '%': 32, 'u': 33, 't': 34, '<': 35, 'N': 36, '4': 37, '{': 38, 'r': 39, 'c': 40, 'j': 41, '(': 42, '$': 43, '"': 44, '#': 45, '-': 46, 'A': 47, 'G': 48, ':': 49, 'y': 50, "'": 51, 'x': 52, '*': 53, 's': 54, 'E': 55, 'R': 56, '?': 57, 'D': 58, ';': 59, 'k': 60, 'Z': 61, 'h': 62, '6': 63, 'M': 64, 'e': 65, '[': 66, 'v': 67, 'l': 68, 'K': 69, 'q': 70, 'g': 71, ')': 72, 'a': 73, 'P': 74, 'p': 75, '&': 76, '^': 77, 'I': 78, '0': 79, '9': 80, '2': 81, 'J': 82, 'd': 83, 'V': 84, '7': 85, 'w': 86, '.': 87, 'z': 88, 'T': 89, '1': 90, 'B': 91})

In [9]:
# joblib.dump(c2i,"c2i.pkl")

In [10]:
#c2i = joblib.load("c2i.pkl")

In [11]:
def to_cuda(x):
    if torch.cuda.is_available():
        x = x.cuda()
    return x

## Model:

In [12]:
class CharCNN(nn.Module):
    def __init__(self,vocab,emb_dim=8,num_filters=50,kernels=[2,3,4],unk_token='<unk>'):
        """
        vocab: Vocabulary dict (char to index dict, <pad> should be 0)
        emb_dim: embedding dimension
        num_filters: hidden dimension
        kernels: list of window sizes
        """
        super().__init__()
        self.vocab = vocab
        self.unk_token=unk_token
        self.emb = nn.Embedding(len(vocab),emb_dim,padding_idx=0)
        self.convs = nn.ModuleList([
            nn.Conv1d(emb_dim,num_filters,kernel_size=k) for k in kernels
        ])
        
    def forward(self,inp):
        """
        inp: List of words, if vocab is character level 
            list of list of tokens, if vocab is word level
        Example: 
            ["Character","level","cnn"] OR [["Char","level","cnn"],["Jupyter","notebook"]]
        """
        x = self.embed(inp)
        emb = self.emb(x)
        emb = emb.transpose(1,2)
        convolved = torch.cat([F.relu(conv(emb)) for conv in self.convs],dim=2)
        rep = F.max_pool1d(convolved,convolved.shape[2]).squeeze(-1)
        return rep
        
    def stoi(self,item):
        return torch.LongTensor([self.vocab[c] if c in self.vocab else self.vocab[self.unk_token] for c in item])
    
    def embed(self,items):
        """
        items: list of words/documents
        """
        tokens = [self.stoi(item) for item in items]
        
        lengths = [len(item) for item in items]
        
        max_len = max(lengths)
        
        token_indices = to_cuda(torch.stack([F.pad(item,pad=(0,max_len-l)) for l,item in zip(lengths,tokens)]))
        
        return token_indices
        

In [13]:
cnn = CharCNN(c2i)

In [14]:
o = cnn(["deep","learning"])

In [15]:
o

tensor([[0.4685, 1.3153, 0.7533, 1.0488, 0.5932, 0.7788, 0.8945, 0.4982, 0.7091,
         0.6757, 0.5001, 0.2864, 0.3897, 0.6170, 0.1782, 0.4083, 0.9988, 0.7238,
         0.1919, 0.8164, 0.6779, 0.9645, 1.3307, 1.1420, 0.9077, 0.6954, 1.1642,
         1.3908, 1.0360, 0.8405, 0.6351, 1.0089, 0.4347, 0.6679, 0.5625, 0.6854,
         1.3044, 0.4633, 0.4044, 0.8451, 0.8105, 0.4896, 0.3921, 0.4114, 0.8870,
         0.6930, 0.4992, 0.7758, 0.5003, 0.5569],
        [0.9756, 0.4208, 1.0318, 1.3228, 0.7362, 1.2971, 1.5304, 1.6063, 1.2590,
         1.2203, 1.0692, 1.0547, 0.8174, 0.9306, 1.0132, 1.0609, 0.9413, 1.0682,
         1.1147, 1.1471, 1.8688, 1.3351, 0.7733, 1.2357, 0.8448, 0.4439, 1.3979,
         1.1420, 1.0289, 1.2483, 0.6529, 0.7585, 0.9880, 0.7411, 0.9505, 1.0240,
         1.3919, 0.5283, 1.0585, 1.7394, 0.7236, 1.0748, 0.8614, 0.8359, 1.1646,
         0.9773, 0.8523, 1.6156, 1.1179, 1.8229]], grad_fn=<SqueezeBackward1>)

In [17]:
o.size()

torch.Size([2, 50])

## Training Process:

Soon...