In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [3]:
# Define a simple neural network
class SimpleNet(nn.Module):
    """Small feed-forward network that scores the next character.

    Args:
        n: number of context positions (rows of each input sample).
        m: width of the per-position projection produced by ``fc1``.
        h: width of the hidden layer.
        l: width of every input row and of the output score vector.
    """

    def __init__(self, n, m, h, l):
        super().__init__()
        self.n, self.m = n, m
        # Layers are created in the original order (fc1, fc2, fc3) so that a
        # seeded initialisation yields the same parameters.
        self.fc1 = nn.Linear(l, m)  # applied to each of the n rows independently
        self.fc2 = nn.Linear(n * m, h, bias=True)
        self.fc3 = nn.Linear(h, l, bias=True)

    def forward(self, x):
        """Map ``(..., n, l)`` input to ``(batch, l)`` unnormalised scores.

        A single un-batched ``(n, l)`` sample comes back as ``(1, l)``
        because the projected rows are flattened with ``reshape(-1, n * m)``.
        """
        projected = self.fc1(x)                            # (..., n, m)
        flattened = projected.reshape(-1, self.n * self.m)  # (batch, n * m)
        hidden = torch.tanh(self.fc2(flattened))           # (batch, h)
        # No softmax here: the raw scores are returned to the caller.
        return self.fc3(hidden)


In [4]:

# Alphabet for the Persian names model: 33 letters written space-separated and
# split into a list. List position matters: generate_dicts assigns each letter
# the integer id (position + 1), with id 0 reserved for the '.' token.
persian_chars_list = "آ ا ب پ ت ث ج چ ح خ د ذ ر ز ژ س ش ص ض ط ظ ع غ ف ق ک گ ل م ن و ه ی".split()



In [5]:
class NameDataset(Dataset):
    """Next-character prediction samples built from a text file of names.

    Every line of the file is cleaned (see ``clean_dataset``) and expanded
    into sliding-window samples: a context of ``block_size`` character ids,
    one-hot encoded, paired with the id of the character that follows it.
    Id 0 is the '.' token; it is used both as left padding of the context
    and as the end-of-name target.

    Attributes:
        device: device the sample tensors are stored on.
        X: float tensor of shape ``(num_samples, block_size, len(chars_list) + 1)``.
        Y: integer tensor of shape ``(num_samples,)`` holding target ids.
    """

    def __init__(self, path, chars_list, block_size, device) -> None:
        """Read ``path`` (UTF-8, one name per line) and build all samples."""
        with open(path, 'r', encoding='utf-8') as file:
            names = file.read().splitlines()

        self.device = device
        self.X, self.Y = self.generate_data_set(block_size, names, chars_list)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        return self.X[index], self.Y[index]

    def _omit_unnecessary_chars(self, string, chars_list):
        """Return ``string`` with every character not in ``chars_list`` removed."""
        allowed = set(chars_list)  # constant-time membership tests
        return "".join(ch for ch in string if ch in allowed)

    def _omit_parantheses(self, string, char):
        """Return the part of ``string`` before the first ``char`` (all of it if absent)."""
        index = string.find(char)
        return string if index == -1 else string[:index]

    def clean_dataset(self, strings, chars_list):
        """Normalise raw lines into names made only of ``chars_list`` characters.

        Each line is cut at the first space, then at the first '(' and then
        at the first '[', after which characters outside ``chars_list`` are
        dropped. Results with fewer than two characters are discarded.

        Returns:
            list of cleaned names, in input order.
        """
        cleaned = []
        for raw in strings:
            name = raw
            for delimiter in (' ', '(', '['):
                name = self._omit_parantheses(name, delimiter)
            name = self._omit_unnecessary_chars(name, chars_list)
            if len(name) > 1:
                cleaned.append(name)
        return cleaned

    def generate_dicts(self, chars_list):
        """Build the ``(char -> id, id -> char)`` lookup tables.

        Characters of ``chars_list`` get ids ``1..n`` in list order, '.' gets
        id 0 and the zero-width non-joiner (U+200C) gets id ``n + 1``.
        """
        n = len(chars_list)
        char_int = {ch: i for i, ch in enumerate(chars_list, start=1)}
        int_char = {i: ch for i, ch in enumerate(chars_list, start=1)}
        # Special tokens are inserted after the alphabet, as before.
        char_int['.'] = 0
        int_char[0] = '.'
        char_int['\u200c'] = n + 1
        int_char[n + 1] = '\u200c'
        return (char_int, int_char)

    def generate_data_set(self, block_size, path, chars_list):
        """Turn raw name strings into one-hot contexts and target ids.

        Args:
            block_size: number of preceding characters in each context.
            path: despite the name, the list of raw name strings (not a file
                path); it is passed straight to ``clean_dataset``.
            chars_list: allowed alphabet; fixes the one-hot width at
                ``len(chars_list) + 1`` (ids ``0..n``). The U+200C id
                ``n + 1`` would not fit in that width, but cleaned names
                only contain ``chars_list`` characters, so it is not emitted
                unless ``chars_list`` itself contains U+200C.

        Returns:
            ``(X, Y)`` on ``self.device``: ``X`` is float with shape
            ``(num_samples, block_size, len(chars_list) + 1)`` and ``Y`` is an
            integer tensor with shape ``(num_samples,)``.
        """
        names = self.clean_dataset(path, chars_list)
        char_int = self.generate_dicts(chars_list)[0]

        contexts, targets = [], []
        for name in names:
            context = [0] * block_size  # start fully padded with the '.' id
            for ch in name + '.':
                ix = char_int[ch]
                contexts.append(context)
                targets.append(ix)
                context = context[1:] + [ix]  # slide the window by one

        # An explicit integer dtype and shape keep an empty dataset valid:
        # torch.tensor([]) would otherwise be a float tensor of shape (0,),
        # which F.one_hot rejects.
        X = torch.tensor(contexts, dtype=torch.long).reshape(len(contexts), block_size)
        X = X.to(device=self.device)
        X = F.one_hot(X, num_classes=len(chars_list) + 1).float()
        Y = torch.tensor(targets, dtype=torch.long).to(device=self.device)

        return (X, Y)

In [130]:
def train_loop(dataloader, model, loss_fn, optimizer):
    """Run one training epoch over ``dataloader``.

    For every batch: forward pass, loss, backward pass and optimizer step.
    Progress (current loss and number of samples seen) is printed every
    100 batches, starting with the first one.

    Args:
        dataloader: iterable of ``(X, y)`` batches exposing a ``.dataset``
            attribute whose length is the total sample count.
        model: module to train; it is switched to training mode.
        loss_fn: callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: optimizer bound to ``model``'s parameters.
    """
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # The batch is named ``X``; the former ``print(x)`` referred to a
        # lowercase name that is not defined in this function.
        pred = model(X)
        loss = loss_fn(pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            # Separate name so the loss tensor is not rebound to a float.
            loss_value, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss_value:>7f}  [{current:>5d}/{size:>5d}]")

In [8]:
# Instantiate the network, loss function, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The context length is shared by the network (n) and the dataset
# (block_size), and the network width must equal the one-hot width that
# NameDataset produces, len(chars_list) + 1. Both are derived from a single
# source so they cannot drift apart.
BLOCK_SIZE = 4
NUM_EPOCHS = 100

net = SimpleNet(n=BLOCK_SIZE, m=5, h=100, l=len(persian_chars_list) + 1).to(device)
print(next(net.parameters()).device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.01)
dataset = NameDataset('persianNames.txt', persian_chars_list, block_size=BLOCK_SIZE, device=device)
# NOTE(review): batches are drawn in the same order every epoch (shuffle is
# left at its default) - confirm this is intended for training.
dataloader = DataLoader(dataset=dataset, batch_size=32)

# Training loop
for epoch in range(NUM_EPOCHS):  # loop over the dataset multiple times
    train_loop(dataloader, net, criterion, optimizer)
print('Finished Training')


cuda:0
loss: 3.551260  [   32/ 4380]
loss: 2.243391  [ 3232/ 4380]
loss: 2.700101  [   32/ 4380]
loss: 2.196141  [ 3232/ 4380]
loss: 2.723104  [   32/ 4380]
loss: 2.115409  [ 3232/ 4380]
loss: 2.571261  [   32/ 4380]
loss: 2.180636  [ 3232/ 4380]
loss: 2.544547  [   32/ 4380]
loss: 2.199903  [ 3232/ 4380]
loss: 2.481555  [   32/ 4380]
loss: 2.086476  [ 3232/ 4380]
loss: 2.355093  [   32/ 4380]
loss: 2.145050  [ 3232/ 4380]
loss: 2.592844  [   32/ 4380]
loss: 2.110377  [ 3232/ 4380]
loss: 2.230784  [   32/ 4380]
loss: 2.060028  [ 3232/ 4380]
loss: 2.298032  [   32/ 4380]
loss: 2.139713  [ 3232/ 4380]
loss: 2.434499  [   32/ 4380]
loss: 2.055780  [ 3232/ 4380]
loss: 2.235286  [   32/ 4380]
loss: 1.954289  [ 3232/ 4380]
loss: 2.275639  [   32/ 4380]
loss: 2.008130  [ 3232/ 4380]
loss: 2.283372  [   32/ 4380]
loss: 2.033384  [ 3232/ 4380]
loss: 2.312065  [   32/ 4380]
loss: 1.974115  [ 3232/ 4380]
loss: 2.149675  [   32/ 4380]
loss: 2.063908  [ 3232/ 4380]
loss: 2.383432  [   32/ 4380]
los

In [133]:
def generate_dicts(chars_list):
    """Return ``(char_int, int_char)`` lookup tables for ``chars_list``.

    Characters receive ids ``1..n`` in list order. Two special entries are
    added afterwards: '.' with id 0 and the zero-width non-joiner (U+200C)
    with id ``n + 1``.
    """
    numbered = list(enumerate(chars_list, start=1))
    char_int = {ch: idx for idx, ch in numbered}
    int_char = {idx: ch for idx, ch in numbered}

    # Special tokens go in last, so they override any clash with the alphabet.
    last_id = len(chars_list) + 1
    char_int.update({'.': 0, '\u200c': last_id})
    int_char.update({0: '.', last_id: '\u200c'})
    return (char_int, int_char)

In [134]:
# Build the char <-> id lookup tables for the Persian alphabet.
# NOTE(review): a later cell rebinds these same global names for the English
# alphabet, so this cell has to be re-run before decoding Persian output again.
char_int, int_char = generate_dicts(persian_chars_list)

In [143]:

# Greedy decoding: seed the context with two letters, then keep appending the
# highest-scoring next character until the end-of-name token (id 0, '.').
MAX_GENERATED_CHARS = 50  # safety cap: greedy argmax can cycle and never emit id 0

first_char = char_int['ک']
second_char = char_int['ب']

num_classes = net.fc1.in_features  # one-hot width the network was built with
context_ids = [0] * (net.n - 2) + [first_char, second_char]  # left-pad with the '.' id
tensor_context = F.one_hot(torch.tensor(context_ids), num_classes).to(device=device).float()
sequence = [torch.tensor(first_char), torch.tensor(second_char)]

with torch.no_grad():  # inference only, no autograd graph needed
    for _ in range(MAX_GENERATED_CHARS):
        i_next_char = torch.argmax(net(tensor_context))
        if i_next_char == 0:
            break
        sequence.append(i_next_char)
        next_row = F.one_hot(i_next_char, num_classes).to(device=device).float().unsqueeze(0)
        # Slide the window: drop the oldest row, append the new character.
        tensor_context = torch.cat((tensor_context[1:], next_row))


In [144]:
# Inspect the generated ids (0-dim tensors). The two seed entries are created
# with torch.tensor(...) on the CPU; the rest come from the network's output.
sequence

[tensor(26),
 tensor(3),
 tensor(2, device='cuda:0'),
 tensor(14, device='cuda:0')]

In [145]:
# Decode the generated ids back into characters via the id -> char table.
# NOTE(review): the name `str` shadows the builtin for the rest of the kernel
# session. The next cell displays this variable by name, so a rename has to
# be made in both cells together.
str = ""
for x in sequence:
    str += int_char[x.item()]

In [146]:
# Display the decoded name. Here `str` is the variable assigned in the
# previous cell, not the builtin type.
str

'کباز'

In [71]:
# Alphabet for the English names model: the 26 lowercase letters, split into a
# list. List position matters: generate_dicts assigns each letter the integer
# id (position + 1), with id 0 reserved for the '.' token.
eng_chars_list = "a b c d e f g h i j k l m n o p q r s t u v w x y z".split()

In [None]:

# NameDataset one-hot encodes contexts with len(chars_list) + 1 classes, which
# is 27 for the 26-letter English alphabet. The network width is derived from
# the alphabet because any other value (such as a hard-coded 28) makes fc1
# reject the batches with a shape error.
BLOCK_SIZE = 4  # shared by the network (n) and the dataset (block_size)

net2 = SimpleNet(n=BLOCK_SIZE, m=5, h=100, l=len(eng_chars_list) + 1).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net2.parameters(), lr=0.01)
dataset2 = NameDataset('names.txt', eng_chars_list, block_size=BLOCK_SIZE, device=device)
dataloader2 = DataLoader(dataset=dataset2, batch_size=32)

# Training loop
for epoch in range(10):  # loop over the dataset multiple times
    train_loop(dataloader2, net2, criterion, optimizer)
print('Finished Training')


In [127]:
# Build the char <-> id lookup tables for the English alphabet.
# This rebinds the global char_int / int_char, replacing any tables that were
# built earlier for another alphabet.
char_int, int_char = generate_dicts(eng_chars_list)

In [128]:
# Print both lookup tables, then show how many entries they hold
# (the alphabet plus the two special tokens).
for table in (char_int, int_char):
    print(table)
len(char_int.values())

{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '.': 0, '\u200c': 27}
{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.', 27: '\u200c'}


28

In [97]:
# Greedy decoding: seed the context with two letters, then keep appending the
# highest-scoring next character until the end-of-name token (id 0, '.').
MAX_GENERATED_CHARS = 50  # safety cap: greedy argmax can cycle and never emit id 0

first_char = char_int['a']
second_char = char_int['b']

# Take the one-hot width from the network itself so this cell always agrees
# with however net2 was constructed.
num_classes = net2.fc1.in_features
context_ids = [0] * (net2.n - 2) + [first_char, second_char]  # left-pad with the '.' id
tensor_context = F.one_hot(torch.tensor(context_ids), num_classes).to(device=device).float()
sequence = [torch.tensor(first_char), torch.tensor(second_char)]

with torch.no_grad():  # inference only, no autograd graph needed
    for _ in range(MAX_GENERATED_CHARS):
        i_next_char = torch.argmax(net2(tensor_context))
        if i_next_char == 0:
            break
        sequence.append(i_next_char)
        next_row = F.one_hot(i_next_char, num_classes).to(device=device).float().unsqueeze(0)
        # Slide the window: drop the oldest row, append the new character.
        tensor_context = torch.cat((tensor_context[1:], next_row))


In [98]:
# Decode the generated ids back into characters and print the name.
# A descriptive variable name is used so the builtin `str` is not shadowed.
generated_name = "".join(int_char[idx.item()] for idx in sequence)
print(generated_name)

abar
