# Neural Net Language models

> Basic neural-network-based language modeling

In [3]:
#| default_exp models.lm

In [4]:
#| hide
%load_ext autoreload
%autoreload 2
from nbdev.showdoc import *

In [246]:
#| export
import torch.nn as nn
import torch
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence

from torchtext.vocab import vocab

from matplotlib import pyplot as plt
import pandas as pd
import numpy as np

from typing import Dict, List, Tuple, Optional, Set
from collections import Counter, OrderedDict

from plum import dispatch

## Build vocab

In [263]:
#| export
class Vocab:
    """Token <-> index vocabulary built from a pandas Series.

    Each entry of `data` is split into tokens with `list(entry)` (characters,
    when the entries are strings). Tokens are counted, ordered by decreasing
    count, and handed to torchtext's `vocab` factory together with `specials`.

    Unknown tokens map to the default index: the index of '<unk>' when it is
    one of the specials, otherwise -1.

    Attributes:
        voc: the underlying torchtext vocabulary object.
    """

    def __init__(self, data:pd.Series, specials=['<pad>', '<unk>', '<bos>', '<eos>']):
        """Count the tokens of every entry in `data` and build the vocabulary.

        Args:
            data: Series whose entries are iterables of tokens (e.g. strings).
            specials: special tokens forwarded to torchtext's `vocab` factory.
        """
        # Work on a private copy so the shared default list is never aliased.
        specials = list(specials)

        counts = Counter()
        for entry in data:
            counts.update(list(entry))

        # most_common() sorts by count, descending, keeping first-seen order on ties.
        by_frequency = OrderedDict(counts.most_common())
        self.voc = vocab(by_frequency, specials=specials)

        # Fix: the original looked up the undefined name `voc` here (NameError).
        if '<unk>' in specials:
            self._default_index = self.voc['<unk>']
        else:
            self._default_index = -1
        self.voc.set_default_index(self._default_index)

        self._stoi = self.voc.get_stoi()
        self._itos = self.voc.get_itos()

    @dispatch
    def stoi(self, token:str)->int:
        """Return the index of `token`, or the default index if it is unknown."""
        # Plain-dict lookup would bypass the default index configured in __init__.
        return self._stoi.get(token, self._default_index)

    @dispatch
    def stoi(self, tokens:List[str])->List[int]:
        """Return the indices of `tokens`; unknown tokens get the default index."""
        return [self._stoi.get(tok, self._default_index) for tok in tokens]

    # TODO: add a stoi overload for List[List[str]] -> List[List[int]].

    @dispatch
    def itos(self, index:int)->str:
        """Return the token stored at `index`."""
        # NOTE(review): `_itos` comes from get_itos(); if it is a plain list, a
        # default index of -1 would select its last token — confirm before use.
        return self._itos[index]

    @dispatch
    def itos(self, indices:List[int])->List[str]:
        """Return the tokens stored at each of `indices`."""
        return [self._itos[index] for index in indices]
    


### Usage

In [258]:
# Read one name per row into a single 'name' column and build the vocabulary.
df = pd.read_csv('../data/names.txt', names=['name'], header=None)
v = Vocab(df['name'])

# Single-token lookups: token -> index, then index -> token.
print(v.stoi('e'), v.stoi('m'), sep='\n')
print(v.itos(4))

# List lookups, in both directions.
print(v.stoi(['e', 'm']))
print(v.itos([5, 14]))

5
14
a
[5, 14]
['e', 'm']


## Data formatting
Given the last n tokens, we predict token n+1.

In [276]:
# Split the example name into characters.
s = [*"alexandra"]
print(s)

# Pair each character with its successor.
bigram = list(zip(s, s[1:]))
print(bigram)

# Same idea with a window of three consecutive characters.
trigram = list(zip(s, s[1:], s[2:]))

['a', 'l', 'e', 'x', 'a', 'n', 'd', 'r', 'a']
[('a', 'l'), ('l', 'e'), ('e', 'x'), ('x', 'a'), ('a', 'n'), ('n', 'd'), ('d', 'r'), ('r', 'a')]


In [323]:
pad_value = 0  # presumably the index of '<pad>' (first special token) — confirm against the vocab
context_length = 5

# Encode the whole sequence once; every window below is a slice of it.
encoded = v.stoi(s)

# One (context, target) pair per target position: the context is the (up to)
# `context_length` tokens that precede the target.
X = []
y = []
for target_pos in range(1, len(encoded)):
    window = encoded[max(0, target_pos - context_length):target_pos]
    # Left-pad contexts shorter than context_length. This also covers
    # sequences shorter than context_length, which used to raise IndexError.
    X.append([pad_value] * (context_length - len(window)) + window)
    y.append(encoded[target_pos])

# Loop names are distinct from X / y so the target list is not overwritten
# (the original `for x, y in zip(X, y)` left `y` bound to an int).
for context, target in zip(X, y):
    print(context, target)

[0, 0, 0, 0, 4] 8
[0, 0, 0, 4, 8] 5
[0, 0, 4, 8, 5] 28
[0, 4, 8, 5, 28] 4
[4, 8, 5, 28, 4] 6
[8, 5, 28, 4, 6] 16
[5, 28, 4, 6, 16] 9
[28, 4, 6, 16, 9] 4


In [6]:
#| hide
# Run nbdev's notebook export.
import nbdev
nbdev.nbdev_export()