# GENERATIVE LSTM FOR NICKNAME GENERATION

### READ AND PREPROCESS DATA

In [1]:
%env CUDA_LAUNCH_BLOCKING=1
import pandas as pd
import numpy as np

env: CUDA_LAUNCH_BLOCKING=1


In [2]:
# Load the raw nickname list: a headerless CSV decoded as Windows-1251.
df = pd.read_csv("dota_names.csv", header=None, encoding= 'Windows-1251')

In [3]:
# Preview the raw table of nicknames as loaded.
df

Unnamed: 0,0
0,ele 15avg nabivayu
1,kitamef
2,burythatbodyitbloodyougotit
3,АКАШИ СЕЙДЖУРО
4,АК 47 МАСТЕР 666
...,...
1959193,Mookki
1959194,Boombl4
1959195,(жмых) KIng_ak47-Awp
1959196,hyiloebanoe1111


In [4]:
# Character classes that survive cleaning; their concatenation order is
# reused later to assign token ids, so it must stay stable.
digits = "1234567890"
cyrillic_letters = "абвгдеёжзийклмнопрстуфхцчшщъыьэюя"
allowed_special = ".-_()? "

allowed_chars = "".join((cyrillic_letters, digits, allowed_special))

def strip(text):
    """Normalise one nickname, or return "" when it should be discarded.

    The nickname is lower-cased and every character outside `allowed_chars`
    is removed. It is rejected (empty string returned) when

    * any allowed character occurs more than 10 times in the lower-cased
      input, or
    * nothing but digits / special characters remains after filtering
      (this includes the case where nothing remains at all).
    """
    lowered = text.lower()

    # Reject spammy names that repeat a single allowed character.
    for ch in allowed_chars:
        if lowered.count(ch) > 10:
            return ""

    kept = [ch for ch in lowered if ch in allowed_chars]

    # The three character classes are disjoint, so "not only digits and
    # specials" is the same as "contains at least one Cyrillic letter".
    has_letter = any(ch in cyrillic_letters for ch in kept)
    return "".join(kept) if has_letter else ""

In [5]:
import swifter
# Clean every nickname with strip(); astype(str) ensures each cell is passed
# to strip() as text.
df[0] = df[0].astype(str).swifter.apply(strip)

Pandas Apply:   0%|          | 0/1959198 [00:00<?, ?it/s]

In [6]:
# strip() returns "" for rejected nicknames; turn those into NaN so that
# dropna() can remove them.
df[0] = df[0].replace("", np.nan)

In [7]:
# Drop rows that contain NaN, then drop duplicate rows.
df = df.dropna().drop_duplicates()

In [8]:
# Split each nickname into a list of single characters, ready for
# per-character encoding.
df[0] = df[0].astype(str).swifter.apply(lambda x: list(x))

Pandas Apply:   0%|          | 0/189090 [00:00<?, ?it/s]

In [9]:
# Preview the cleaned data: each remaining nickname is now a list of characters.
df

Unnamed: 0,0
3,"[а, к, а, ш, и, , с, е, й, д, ж, у, р, о]"
4,"[а, к, , 4, 7, , м, а, с, т, е, р, , 6, 6, 6]"
5,"[а, м, , а, , п, у, м, а]"
6,"[а, н, а, л, ь, н, ы, й, , м, у, д, р, е, ц]"
7,"[а, л, е, , б, а, р, н, и]"
...,...
1959147,"[_, н, е, г, р]"
1959154,"[п, р, а, й, м, ?]"
1959155,"[м, о, б, ш, и, д]"
1959164,"[ф, в, ы, ф, ы, ф, в, ы]"


In [10]:
# Reserved token ids: 0 closes a nickname, 1 opens it.
eos, sos = 0, 1

# Character ids start at 2 so they never collide with the reserved tokens.
char_to_int = {char: idx for idx, char in enumerate(allowed_chars, start=2)}

# Vocabulary size = every allowed character plus the two reserved tokens.
n_vocab = 2 + len(allowed_chars)

def encode(text: list) -> list:
    """Map a list of characters to token ids, wrapped as sos ... eos.

    Raises KeyError for a character that is not in `char_to_int`.
    """
    body = [char_to_int[ch] for ch in text]
    return [sos, *body, eos]

In [11]:
# Replace each character list with its token-id sequence (sos, ids..., eos).
df[0] = df[0].apply(encode)
df

Unnamed: 0,0
3,"[1, 2, 13, 2, 27, 11, 51, 20, 7, 12, 6, 9, 22,..."
4,"[1, 2, 13, 51, 38, 41, 51, 15, 2, 20, 21, 7, 1..."
5,"[1, 2, 15, 51, 2, 51, 18, 22, 15, 2, 0]"
6,"[1, 2, 16, 2, 14, 31, 16, 30, 12, 51, 15, 22, ..."
7,"[1, 2, 14, 7, 51, 3, 2, 19, 16, 11, 0]"
...,...
1959147,"[1, 47, 16, 7, 5, 19, 0]"
1959154,"[1, 18, 19, 2, 12, 15, 50, 0]"
1959155,"[1, 15, 17, 3, 27, 11, 6, 0]"
1959164,"[1, 23, 4, 30, 23, 30, 23, 4, 30, 0]"


In [12]:
# Shuffle the nicknames before the sequential train/validation split.
# A fixed random_state makes the shuffle (and therefore the split)
# reproducible across kernel restarts.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

### MODEL DEFINITION AND DATA RESHAPING

In [13]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data

In [14]:
from tqdm.notebook import tqdm

In [15]:


# Build (prefix -> next token) training patterns from the encoded nicknames.
#
# For an encoded nickname t[0..n-1] every prefix t[:i] (1 <= i < n) becomes
# one input and t[i] its target. Inputs are left-padded with 0 to a fixed
# width of `seq_length - 1` tokens.
seq_length = 34
dataX = []
dataY = []

length = len(df)
window = seq_length - 1  # fixed input width (was the hard-coded literal 33)

# Iterate over the column directly (much cheaper than iterrows) and pass the
# total so the progress bar can show real progress.
for text in tqdm(df[0], total=length):
    for i in range(1, len(text)):
        # Keep at most the last `window` tokens. Without this, a prefix
        # longer than the window gets no padding and yields a row of the
        # wrong length, which makes torch.tensor(dataX) fail on ragged input.
        seq_in = text[max(0, i - window):i]
        seq_out = text[i]
        dataX.append([0] * (window - len(seq_in)) + seq_in)
        dataY.append(seq_out)
n_patterns = len(dataX)

0it [00:00, ?it/s]

In [16]:
# Inputs: one float feature per time step -> (n_patterns, seq_length - 1, 1).
feature_shape = (n_patterns, seq_length - 1, 1)
X = torch.tensor(dataX, dtype=torch.float).reshape(feature_shape)

# Targets: the id of the token that follows each prefix.
y = torch.tensor(dataY)

In [17]:
class CharModel(nn.Module):
    """Stacked LSTM that scores the next token given a prefix of token ids.

    Input is a float tensor of shape (batch, seq_len, 1) holding one raw
    token id per time step; output is a (batch, vocab_size) tensor of
    unnormalised scores (logits) for the next token.

    The LSTM is built with ``bias=False``. With no bias terms and a zero
    initial state, all-zero time steps leave the hidden and cell state at
    zero, so zero left-padding does not change the result and prefixes can
    be fed unpadded at inference time.

    Args:
        hidden_size: width of every LSTM layer (default 512).
        num_layers: number of stacked LSTM layers (default 4).
        dropout: dropout probability used between LSTM layers and before
            the output layer (default 0.3).
        vocab_size: number of output classes. Defaults to the module-level
            ``n_vocab`` at construction time.

    The defaults reproduce the original fixed architecture, so checkpoints
    saved from ``CharModel()`` remain loadable.
    """
    def __init__(self, hidden_size=512, num_layers=4, dropout=0.3, vocab_size=None):
        super().__init__()
        if vocab_size is None:
            vocab_size = n_vocab
        self.lstm = nn.LSTM(input_size=1, hidden_size=hidden_size, num_layers=num_layers,
                            batch_first=True, dropout=dropout, bias=False)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(hidden_size, vocab_size)
    def forward(self, x):
        # L2-normalise each sequence along the time axis (dim=1 is also the
        # default of F.normalize; spelled out here because it is easy to
        # misread as a per-feature normalisation).
        x = nn.functional.normalize(x, dim=1)
        x, _ = self.lstm(x)
        # take only the last output
        x = x[:, -1, :]
        # produce output
        x = self.linear(self.dropout(x))
        return x

### MODEL TRAINING

In [18]:
# Training hyperparameters and model instance.
n_epochs = 75
batch_size = 3000
model = CharModel()
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# To force CPU execution instead, use: device = torch.device("cpu")
model.to(device)

CharModel(
  (lstm): LSTM(1, 512, num_layers=4, bias=False, batch_first=True, dropout=0.3)
  (dropout): Dropout(p=0.3, inplace=False)
  (linear): Linear(in_features=512, out_features=52, bias=True)
)

In [19]:
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Cosine decay from lr=0.001 towards eta_min over T_max scheduler steps.
# The schedule only takes effect if scheduler.step() is called during training.
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, eta_min=0.0001, T_max=120)
# reduction="sum": the loss is the total over a batch, not the mean.
loss_fn = nn.CrossEntropyLoss(reduction="sum")

# Fraction of the patterns held out for validation. The (misspelled) name is
# kept as-is; it is now actually used instead of a second hard-coded 0.1.
precentage = 0.1
size = len(X)
split = int(size*(1 - precentage))


# Sequential split: the first `split` patterns train, the rest validate.
X_train, y_train = X[:split], y[:split]
X_test, y_test = X[split:], y[split:]

train_loader = data.DataLoader(data.TensorDataset(X_train, y_train), shuffle=True, batch_size=batch_size)
# The validation loss is a plain sum over all batches, so shuffling the
# validation set buys nothing.
test_loader = data.DataLoader(data.TensorDataset(X_test, y_test), shuffle=False, batch_size=batch_size)

In [None]:
# Train for n_epochs, validating after every epoch and remembering the
# weights with the lowest summed validation cross-entropy.
best_model = None
best_loss = np.inf
for epoch in tqdm(range(n_epochs)):
    model.train()
    for X_batch, y_batch in tqdm(train_loader, leave=False):
        y_pred = model(X_batch.to(device))
        loss = loss_fn(y_pred, y_batch.to(device))
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    # Advance the learning-rate schedule once per epoch; without this call
    # the scheduler is never applied and the learning rate stays constant.
    scheduler.step()
    # Validation
    model.eval()
    loss = 0.0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            y_pred = model(X_batch.to(device))
            # .item() keeps the running total a plain Python float.
            loss += loss_fn(y_pred, y_batch.to(device)).item()
    if loss < best_loss:
        best_loss = loss
        # state_dict() returns references to the live parameters, which later
        # optimizer steps overwrite in place. Snapshot real copies (on the CPU)
        # so "best" does not silently become "latest".
        best_model = {name: tensor.detach().cpu().clone()
                      for name, tensor in model.state_dict().items()}
    print("Epoch %d: Cross-entropy: %.4f" % (epoch, loss))

# v3.pth: best validation weights; v3_latest.pth: weights after the last epoch.
# Both are stored together with the character -> id table.
torch.save([best_model, char_to_int], "v3.pth")
torch.save([model.state_dict(), char_to_int], "v3_latest.pth")

In [38]:
# Generation using the trained model
# NOTE: torch.load unpickles the file; only load checkpoints you trust.
# map_location lets a checkpoint written on a GPU machine load on a CPU-only one.
best_model, char_to_int = torch.load("v3.pth", map_location=device)
# char_to_int holds only the characters; the reserved ids 0 (eos) and 1 (sos)
# are extra, which matches the model's 52 output features.
n_vocab = len(char_to_int) + 2
int_to_char = dict((i, c) for c, i in char_to_int.items())
model.load_state_dict(best_model)
model.to(device)
# Inference mode: disables dropout so generation is deterministic.
model.eval()

CharModel(
  (lstm): LSTM(1, 512, num_layers=4, bias=False, batch_first=True, dropout=0.3)
  (dropout): Dropout(p=0.3, inplace=False)
  (linear): Linear(in_features=512, out_features=52, bias=True)
)

In [39]:
# NOTE(review): this name shadows the built-in input() and is not referenced
# in the visible notebook; kept so that existing state is unchanged.
input = ""
def encode_prompt(text: list) -> torch.Tensor:
    """Encode a prompt as a 1-D float tensor of token ids on `device`.

    Args:
        text: the prompt as a sequence of single characters (a list or a
            plain string both work).

    Returns:
        Tensor of shape (len(text) + 1,): the sos token followed by one id
        per character. No eos token is appended.

    Raises:
        KeyError: if the prompt contains a character that is not in
            `char_to_int` (for example upper-case or Latin letters).
    """
    unknown = [c for c in text if c not in char_to_int]
    if unknown:
        raise KeyError(f"unsupported character(s) in prompt: {unknown!r}")
    return torch.tensor([sos] + [char_to_int[c] for c in text], dtype=torch.float, device=device)

def generate_name(prompt: str, max_length: int = 64) -> str:
    """Greedily extend `prompt` one character at a time.

    At every step the token with the highest model score is appended.
    Generation stops when the model predicts a token that is not a
    character (the eos token 0, or the sos token 1), or when the name has
    reached `max_length` characters.

    Args:
        prompt: starting characters; must consist of characters known to
            `encode_prompt`.
        max_length: upper bound on the length of the returned name. Guards
            against an endless loop when the model never predicts eos.

    Returns:
        The prompt followed by the generated characters.
    """
    tokens = encode_prompt(list(prompt))

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        while len(prompt) < max_length:
            prediction = int(model(tokens.view(1, -1, 1)).argmax())
            new_char = int_to_char.get(prediction)
            if new_char is None:
                # eos, or a reserved token with no character: stop here.
                break
            prompt += new_char
            # Append the new id instead of re-encoding the whole string.
            tokens = torch.cat((tokens, tokens.new_tensor([prediction])), 0)

    return prompt

In [44]:
# Greedy completion of the prefix "ол".
print(generate_name("ол"))

олег барабуль
олег барабуль
олег барабуль
олег барабуль
олег барабуль
олег барабуль
олег барабуль
олег барабуль
олег барабуль
олег барабуль


In [185]:
def decode(encoded_name: torch.tensor) -> str:
    """Turn a token-id sequence back into text.

    The first element is skipped (encode_prompt puts the sos token there).
    Decoding stops at the first 0 token, or at the end of the sequence if
    there is none.
    """
    pieces = []
    for token in encoded_name[1:]:
        if token == 0:
            break
        pieces.append(int_to_char[int(token)])
    return "".join(pieces)

In [279]:
def generate_names(prompt: str, alpha: float = 0.2, noise: bool = True, k: int = 10, split_steps: int = 5, max_length: int = 64) -> list:
    """Generate several completions of `prompt` by branching on close runner-up tokens.

    The main candidate is extended greedily. During its first `split_steps`
    steps, each of the other top-`k` tokens whose score is close to the
    best one starts an extra branch (main prefix + that token). Every
    branch is afterwards completed greedily.

    Args:
        prompt: starting characters; must be known to `encode_prompt`.
        alpha: a runner-up branches when best_score / its_score is below
            1 + alpha (+ noise).
        noise: when True, a uniform random value in [0, 1/3) is added to
            the threshold for every runner-up, so results vary between calls.
        k: number of top tokens inspected per step.
        split_steps: number of initial steps during which branching is allowed.
        max_length: cap on the number of characters per name; guards
            against endless loops when eos is never predicted.

    Returns:
        A list of decoded names; the greedy main candidate comes first.
        The list may contain duplicates.

    NOTE(review): the closeness test divides raw model scores (logits).
    That is only meaningful while both scores are positive — consider
    comparing softmax probabilities instead. Behaviour is left unchanged here.
    """
    names = [encode_prompt(list(prompt))]
    token_dtype = names[0].dtype

    def _unfinished(tokens):
        # A candidate is done once it ends with eos or has hit the length
        # cap (tokens = sos + characters, hence the "- 1").
        return int(tokens[-1]) != eos and tokens.numel() - 1 < max_length

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        steps = 0
        while _unfinished(names[0]):
            predictions = model(names[0].view(1, -1, 1))
            topk = torch.topk(predictions, k)
            prediction0 = topk.indices[0][0].reshape(1)
            value0 = topk.values[0][0].reshape(1)
            noise_value = 0

            if steps < split_steps:
                for i in range(1, k):
                    valuei = topk.values[0][i].reshape(1)
                    predictioni = topk.indices[0][i].reshape(1)

                    if noise:
                        noise_value = torch.rand(1).to(device) / 3

                    if value0/valuei < 1 + alpha + noise_value:
                        names.append(torch.cat((names[0], predictioni.to(token_dtype)), 0))

            names[0] = torch.cat((names[0], prediction0.to(token_dtype)), 0)
            steps += 1

        # Complete every branch greedily. Branches that already end in eos
        # are left alone instead of being run through the model again.
        for i in range(1, len(names)):
            while _unfinished(names[i]):
                prediction = model(names[i].view(1, -1, 1)).argmax().reshape(1)
                names[i] = torch.cat((names[i], prediction.to(token_dtype)), 0)

    return [decode(name) for name in names]

In [280]:
# Candidate completions for the prefix "глобу" with the default branching settings.
generate_names("глобу")

['глобус в деле',
 'глобул ',
 'глобус',
 'глобус на мать',
 'глобус по кайфу',
 'глобус ',
 'глобус с пивом',
 'глобус в попку',
 'глобус в кашу',
 'глобус в тапках']

### SAVE MODEL AS TORCHSCRIPT TO LOAD IN C++

In [281]:
# Move the model's parameters to the CPU before exporting it.
model.cpu()

CharModel(
  (lstm): LSTM(1, 512, num_layers=4, bias=False, batch_first=True, dropout=0.3)
  (dropout): Dropout(p=0.3, inplace=False)
  (linear): Linear(in_features=512, out_features=52, bias=True)
)

In [282]:
# Compile with TorchScript scripting (despite the variable name, this is
# torch.jit.script, not tracing). Switch to eval mode first so the exported
# module is saved with dropout disabled.
traced_script_module = torch.jit.script(model.eval())

In [285]:
# Write the scripted module to "1.pt".
traced_script_module.save("1.pt")

In [32]:
# Print the character set; character ids are assigned in this order, starting at 2.
print(allowed_chars)

абвгдеёжзийклмнопрстуфхцчшщъыьэюя1234567890.-_()? 


In [33]:
# Print the id -> character table (ids 0 and 1 are the reserved eos/sos tokens and have no entry).
print(int_to_char)

{2: 'а', 3: 'б', 4: 'в', 5: 'г', 6: 'д', 7: 'е', 8: 'ё', 9: 'ж', 10: 'з', 11: 'и', 12: 'й', 13: 'к', 14: 'л', 15: 'м', 16: 'н', 17: 'о', 18: 'п', 19: 'р', 20: 'с', 21: 'т', 22: 'у', 23: 'ф', 24: 'х', 25: 'ц', 26: 'ч', 27: 'ш', 28: 'щ', 29: 'ъ', 30: 'ы', 31: 'ь', 32: 'э', 33: 'ю', 34: 'я', 35: '1', 36: '2', 37: '3', 38: '4', 39: '5', 40: '6', 41: '7', 42: '8', 43: '9', 44: '0', 45: '.', 46: '-', 47: '_', 48: '(', 49: ')', 50: '?', 51: ' '}


In [34]:
# Sanity check: a 6-character prompt encodes to 7 tokens (sos + 6 characters).
encode_prompt("123123").shape

torch.Size([7])