# GENERATIVE LSTM FOR NICKNAME GENERATION

### READ AND PREPROCESS DATA

In [1]:
%env CUDA_LAUNCH_BLOCKING=1
import pandas as pd
import numpy as np

env: CUDA_LAUNCH_BLOCKING=1


In [9]:
# Load the raw nickname dump. header=None means the file has no header row,
# so the single column of names is labelled 0. The text is decoded as
# Windows-1251.
df = pd.read_csv("dota_names.csv", header=None, encoding= 'Windows-1251')

In [10]:
df

Unnamed: 0,0
0,ele 15avg nabivayu
1,kitamef
2,burythatbodyitbloodyougotit
3,АКАШИ СЕЙДЖУРО
4,АК 47 МАСТЕР 666
...,...
1959193,Mookki
1959194,Boombl4
1959195,(жмых) KIng_ak47-Awp
1959196,hyiloebanoe1111


In [3]:
digits = "1234567890"
cyrillic_letters = u"абвгдеёжзийклмнопрстуфхцчшщъыьэюя"
allowed_special = ".-_()? "

# The full alphabet kept by strip(). Integer codes are later assigned by
# position in this string, so the concatenation order matters.
allowed_chars = cyrillic_letters + digits + allowed_special


def strip(text):
    """Normalise a raw nickname to the allowed alphabet.

    The name is lower-cased and every character outside ``allowed_chars`` is
    dropped. An empty string is returned (meaning "discard this name") when
    any allowed character occurs more than 10 times in the lower-cased name,
    or when only digits and special characters survive the filtering.
    """
    lowered = text.lower()

    # Reject names that repeat a single allowed character more than 10 times.
    for symbol in allowed_chars:
        if lowered.count(symbol) > 10:
            return ""

    kept = [symbol for symbol in lowered if symbol in allowed_chars]

    # At least one surviving character must be a letter, i.e. neither a digit
    # nor one of the special characters.
    contains_letter = False
    for symbol in kept:
        if symbol not in digits and symbol not in allowed_special:
            contains_letter = True
            break
    if not contains_letter:
        return ""

    return "".join(kept)

In [12]:
import swifter
# Normalise every nickname with strip(); names it rejects come back as "".
# Importing swifter is what makes the `.swifter` accessor available here.
df[0] = df[0].astype(str).swifter.apply(strip)

Pandas Apply:   0%|          | 0/1959198 [00:00<?, ?it/s]

In [13]:
# Turn the "" placeholders into NaN so that dropna() can remove those rows.
df[0] = df[0].replace("", np.nan)

In [14]:
# Drop rows with a missing name, then drop repeated names.
df = df.dropna().drop_duplicates()

In [15]:
# Split every name into a list of its individual characters.
df[0] = df[0].astype(str).swifter.apply(lambda x: list(x))

Pandas Apply:   0%|          | 0/189090 [00:00<?, ?it/s]

In [16]:
df

Unnamed: 0,0
3,"[а, к, а, ш, и, , с, е, й, д, ж, у, р, о]"
4,"[а, к, , 4, 7, , м, а, с, т, е, р, , 6, 6, 6]"
5,"[а, м, , а, , п, у, м, а]"
6,"[а, н, а, л, ь, н, ы, й, , м, у, д, р, е, ц]"
7,"[а, л, е, , б, а, р, н, и]"
...,...
1959147,"[_, н, е, г, р]"
1959154,"[п, р, а, й, м, ?]"
1959155,"[м, о, б, ш, и, д]"
1959164,"[ф, в, ы, ф, ы, ф, в, ы]"


In [4]:
# Reserved token codes: end-of-sequence and start-of-sequence.
eos = 0
sos = 1

# Real characters are numbered from 2 upwards because 0 and 1 are reserved.
char_to_int = {char: position + 2 for position, char in enumerate(allowed_chars)}

# Every allowed character plus the two reserved codes.
n_vocab = len(allowed_chars) + 2


def encode(text: list) -> list:
    """Map a list of characters to integer codes framed as ``sos ... eos``.

    Raises ``KeyError`` for a character that is not a key of ``char_to_int``.
    """
    codes = [sos]
    for char in text:
        codes.append(char_to_int[char])
    codes.append(eos)
    return codes

In [18]:
# Replace each character list with its integer encoding (sos ... eos).
df[0] = df[0].apply(encode)
df

Unnamed: 0,0
3,"[1, 2, 13, 2, 27, 11, 51, 20, 7, 12, 6, 9, 22,..."
4,"[1, 2, 13, 51, 38, 41, 51, 15, 2, 20, 21, 7, 1..."
5,"[1, 2, 15, 51, 2, 51, 18, 22, 15, 2, 0]"
6,"[1, 2, 16, 2, 14, 31, 16, 30, 12, 51, 15, 22, ..."
7,"[1, 2, 14, 7, 51, 3, 2, 19, 16, 11, 0]"
...,...
1959147,"[1, 47, 16, 7, 5, 19, 0]"
1959154,"[1, 18, 19, 2, 12, 15, 50, 0]"
1959155,"[1, 15, 17, 3, 27, 11, 6, 0]"
1959164,"[1, 23, 4, 30, 23, 30, 23, 4, 30, 0]"


In [19]:
# Shuffle the rows once and renumber them. The fixed seed makes the resulting
# order, and everything derived from it, reproducible after a kernel restart.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

### MODEL DEFINITION AND DATA RESHAPING

In [5]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data

In [4]:
from tqdm.notebook import tqdm

In [22]:

seq_length = 34
# Number of input positions per example: the full length minus the target.
window = seq_length - 1

dataX = []
dataY = []

length = len(df)

# Iterate over the encoded names directly and hand tqdm the row count so the
# progress bar shows real progress instead of a bare "0it" counter.
for text in tqdm(df[0], total=length):
    # Every proper prefix of the name predicts the token that follows it.
    for i in range(1, len(text)):
        # Keep at most the last `window` tokens. Without this, a name longer
        # than the window yields a negative pad count and a longer row, which
        # makes the later torch.tensor(dataX) call fail on ragged input.
        seq_in = text[:i][-window:]
        seq_out = text[i]
        # Left-pad with 0 so all rows have exactly `window` entries.
        dataX.append((window - len(seq_in)) * [0] + seq_in)
        dataY.append(seq_out)
n_patterns = len(dataX)

0it [00:00, ?it/s]

In [23]:
# Inputs become a float tensor of shape (n_patterns, seq_length - 1, 1),
# i.e. one scalar feature per time step. Targets stay an integer tensor.
features = torch.tensor(dataX, dtype=torch.float)
X = features.reshape(n_patterns, seq_length - 1, 1)
y = torch.tensor(dataY)

In [6]:
class CharModel(nn.Module):
    """Stacked LSTM that scores the next token given a sequence of token codes.

    Args:
        vocab_size: number of output classes. ``None`` (default) falls back to
            the notebook-level ``n_vocab``, matching the original behaviour.
        hidden_size: LSTM hidden width (default 512).
        num_layers: number of stacked LSTM layers (default 4).
        dropout: dropout probability used between LSTM layers and before the
            output layer (default 0.2).
        noise_std: scale of the Gaussian noise added to the last hidden state
            (default 0.1, the same scale as the original ``randn / 10``).

    The defaults reproduce the original architecture, so existing checkpoints
    keep loading: the parameter names (``lstm.*``, ``linear.*``) are unchanged.
    """

    def __init__(self, vocab_size=None, hidden_size=512, num_layers=4, dropout=0.2, noise_std=0.1):
        super().__init__()
        if vocab_size is None:
            vocab_size = n_vocab
        self.noise_std = noise_std
        self.lstm = nn.LSTM(input_size=1, hidden_size=hidden_size, num_layers=num_layers,
                            batch_first=True, dropout=dropout, bias=False)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return logits of shape (batch, vocab_size) for input (batch, seq, 1)."""
        # F.normalize defaults to dim=1, which is the sequence axis for this
        # batch-first input.
        x = nn.functional.normalize(x)
        x, _ = self.lstm(x)
        # take only the last output
        x = x[:, -1, :]
        # Add noise to make output different each time. It is applied
        # unconditionally, in eval mode as well as in training.
        x = nn.functional.normalize(x)
        # randn_like draws the noise on x's own device and dtype. The previous
        # hard-coded device="cuda" made every forward pass fail on CPU.
        x = x + torch.randn_like(x) * self.noise_std
        # produce output
        x = self.linear(self.dropout(x))
        return x

### MODEL TRAINING

In [7]:
# Training hyper-parameters.
n_epochs = 100
batch_size = 3000

# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the network and move its parameters to the chosen device. The call is
# left as the last expression so the cell still displays the module summary.
model = CharModel()
model.to(device)

CharModel(
  (lstm): LSTM(1, 512, num_layers=4, bias=False, batch_first=True, dropout=0.2)
  (dropout): Dropout(p=0.2, inplace=False)
  (linear): Linear(in_features=512, out_features=52, bias=True)
)

In [26]:
optimizer = optim.Adam(model.parameters(), lr=0.001)
# NOTE(review): T_max (120) differs from n_epochs (100) in the config cell, so
# a once-per-epoch schedule would never reach eta_min. Confirm this is intended.
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, eta_min=0.0001, T_max=120)
# Summed rather than averaged, so reported losses scale with the sample count.
loss_fn = nn.CrossEntropyLoss(reduction="sum")

# Fraction of the patterns held out for validation. The original spelling of
# the variable name is kept in case other cells refer to it.
precentage = 0.1
size = len(X)
split = int(size*(1 - precentage))

# Train on the first part only. Training on all of X, as before, also fitted
# the validation rows, so the validation loss could not detect over-fitting
# and best-model selection was meaningless.
X_train, y_train = X[:split], y[:split]
X_test, y_test = X[split:], y[split:]

train_loader = data.DataLoader(data.TensorDataset(X_train, y_train), shuffle=True, batch_size=batch_size)
# The validation loss is a plain sum, so batch order is irrelevant: no shuffle.
test_loader = data.DataLoader(data.TensorDataset(X_test, y_test), shuffle=False, batch_size=batch_size)

In [None]:
# Train for n_epochs, keeping a snapshot of the weights that reached the
# lowest summed validation loss, then write that snapshot to disk.
best_model = None
best_loss = np.inf
for epoch in tqdm(range(n_epochs)):
    model.train()
    for X_batch, y_batch in tqdm(train_loader):
        y_pred = model(X_batch.to(device))
        loss = loss_fn(y_pred, y_batch.to(device))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Advance the learning-rate schedule once per epoch. The scheduler was
    # created but never stepped, so the learning rate stayed at its start value.
    scheduler.step()
    # Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            y_pred = model(X_batch.to(device))
            # .item() accumulates a Python float instead of a device tensor.
            val_loss += loss_fn(y_pred, y_batch.to(device)).item()
    if val_loss < best_loss:
        best_loss = val_loss
        # state_dict() hands back the live parameter tensors, so storing it
        # directly would be overwritten by later epochs. Clone each tensor
        # (on the CPU, which also keeps the saved file device-independent).
        best_model = {name: tensor.detach().cpu().clone()
                      for name, tensor in model.state_dict().items()}
    print("Epoch %d: Cross-entropy: %.4f" % (epoch, val_loss))

torch.save([best_model, char_to_int], "v2.pth")

In [8]:
# Generation using the trained model
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
# map_location lets a checkpoint written on a GPU machine load on a CPU-only one.
best_model, char_to_int = torch.load("v2.pth", map_location=device)
# +2 for the eos / sos codes, which are not keys of char_to_int.
n_vocab = len(char_to_int) + 2
int_to_char = {code: char for char, code in char_to_int.items()}
# Switch to inference mode so dropout is disabled while generating/exporting.
# The noise added in CharModel.forward is unconditional, so outputs still vary.
model.eval()
model.load_state_dict(best_model)

<All keys matched successfully>

In [9]:
def encode_prompt(text: list) -> torch.Tensor:
    """Encode prompt characters as a 1-D float tensor on ``device``.

    The result starts with ``sos`` and has no trailing ``eos``, because the
    model is expected to continue the sequence.

    Raises:
        KeyError: if a character is not a key of ``char_to_int``.
    """
    # The former module-level `input = ""` was dropped: it was never read and
    # it shadowed the builtin input().
    codes = [sos] + [char_to_int[c] for c in text]
    return torch.tensor(codes, dtype=torch.float, device=device)

def generate_name(prompt: str, max_length: int = 32) -> str:
    """Extend ``prompt`` one character at a time until the model emits ``eos``.

    Args:
        prompt: initial characters of the name (may be empty); each must be a
            key of ``char_to_int``.
        max_length: upper bound on the length of the returned string. The
            default of 32 matches the training window of 33 tokens
            (``sos`` plus 32 characters). A prompt that is already this long
            is returned unchanged.

    Returns:
        The prompt followed by the generated characters.
    """
    # Inference only: no_grad avoids building an autograd graph on every step.
    with torch.no_grad():
        # The length cap guarantees termination even if eos is never predicted.
        while len(prompt) < max_length:
            output = encode_prompt(list(prompt))
            prediction = int(model(output.view(1, -1, 1)).argmax())
            # eos ends the name. Any other code without a character (such as
            # sos) cannot be rendered and used to raise KeyError, so stop too.
            if prediction == eos or prediction not in int_to_char:
                break
            prompt += int_to_char[prediction]

    return prompt

In [10]:
# Print ten names generated from an empty prompt.
for _ in range(10):
    name = generate_name("")
    print(name)

пожилой катки
степан баранов
капитан банан
данил санго
бот канеки
подпивас 2000
генерал санираа
барашек
степан помоскун
саша сакан


### SAVE MODEL AS TORCHSCRIPT TO LOAD IN C++

In [11]:
# Compile the model to TorchScript. Despite the variable name this uses
# scripting (torch.jit.script), not tracing. model.eval() returns the same
# module with training mode switched off, so the exported module does not
# start out with dropout enabled.
traced_script_module = torch.jit.script(model.eval())

In [12]:
# Write the scripted module to disk in the current working directory.
traced_script_module.save("TracedGenLSTM.pt")