In [3]:
import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm, trange
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [4]:
LETTERS = "абвгдежзийклмнопрстуфхцчшщъыьэюя"
VOWELS = "аеиоуэюяы"
N_LETTERS = len(LETTERS)
MAX_WORD_LENGTH = 23

In [5]:
train = open('data/train_stresses_labels.txt', 'r', encoding="utf8").readlines()
train_word = []
x = []
for i in range(len(train)):
    train[i] = train[i][:-1]
    if len(train[i]) <= MAX_WORD_LENGTH:
        train_word.append(train[i].replace('^', '').replace('ё', 'е'))
    else:
        x.append(train[i].replace('^', '').replace('ё', 'е'))
train[:5], len(x), x[:5]

(['аа^к', 'аа^ка', 'аа^ке', 'аа^ки', 'аа^ков'],
 495,
 ['абдоминомедиастинальный',
  'абдоминоперикардиостомией',
  'абдоминоперикардиостомиею',
  'абдоминоперикардиостомии',
  'абдоминоперикардиостомия'])

In [6]:
test = open('data/public_test_stresses.txt', 'r', encoding="utf8").readlines()
for i in range(len(test)):
    test[i] = test[i][:-1].replace('ё', '')
print(*test[:10])

аакам ааками ааленец аама аамами аамов аамом аамы аангичами аангичах


In [7]:
def letter_to_index(letter):
    return LETTERS.find(letter)

def letter_to_tensor(letter):
    tensor = torch.zeros(1, N_LETTERS)
    tensor[0][letter_to_index(letter)] = 1
    return tensor

# <1 x max_word_length x n_letters>,
# or an array of one-hot letter vectors
def word_to_tensor(word):
    tensor = torch.zeros(1, MAX_WORD_LENGTH, N_LETTERS)
    for i, letter in enumerate(word):
        tensor[0][i][letter_to_index(letter)] = 1
    return tensor

In [8]:
X, y = [], []
for i in tqdm(range(len(train_word))):
    if len(train[i]) <= MAX_WORD_LENGTH:
        X.append(word_to_tensor(train_word[i]))
        correct_pos = train[i].find('^')
        y.append(torch.zeros(MAX_WORD_LENGTH))
        y[-1][correct_pos] = 1
X = torch.stack(X)
y = torch.stack(y)
y[:5]

100%|██████████| 587995/587995 [01:56<00:00, 5048.25it/s]


tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0.]])

In [9]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.05, random_state=42)

In [10]:
class CustomDataset(Dataset):
    def __init__(self, data_tensor, target_tensor):
        self.data = data_tensor
        self.target = target_tensor

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.target[idx]

In [11]:
train_files = CustomDataset(X_train, y_train)
val_files = CustomDataset(X_val, y_val)

In [12]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [29]:
class CNN(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.conv_layers = nn.Sequential(
            # 1 x 23 x 32
            nn.Conv2d(1, 8, kernel_size=(3, N_LETTERS)),
            nn.LeakyReLU(),
            nn.BatchNorm2d(8),
            
            # 8 x 21 x 1
            nn.Conv2d(8, 16, kernel_size=(3, 1), padding=(0, 1)),
            nn.LeakyReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.BatchNorm2d(16),
            nn.Dropout(0.3),
            
            # 16 x 9 x 1
            nn.Conv2d(16, 32, kernel_size=(3, 1), padding=(0, 1)),
            nn.LeakyReLU(),
            nn.BatchNorm2d(32),
            nn.Dropout(0.3),
            
            # 32 x 7 x 12
            nn.Conv2d(32, 64, kernel_size=(3, 1), padding=(0, 1)),
            nn.LeakyReLU(),
            nn.BatchNorm2d(64),
            nn.Dropout(0.3),
            
            # 64 x 5 x 10
            nn.Conv2d(64, 64, kernel_size=(3, 1), padding=(0, 1)),
            nn.LeakyReLU(),
            nn.BatchNorm2d(64),
            nn.Dropout(0.3),
            
            # 64 x 3 x 8
            nn.Conv2d(64, 32, kernel_size=(3, 1), padding=(0, 1)),
            nn.LeakyReLU(),
            nn.BatchNorm2d(32),
            nn.Dropout(0.3)
            
            # 32 x 2 x 7
        )
        
        self.fully_conected = nn.Sequential(
            nn.Linear(288, 256),
            nn.LeakyReLU(),
            # nn.Dropout(0.25),
            nn.BatchNorm1d(256),
            
            nn.Linear(256, 256),
            nn.LeakyReLU(),
            nn.Dropout(0.25),
            nn.BatchNorm1d(256),
            
            # nn.Linear(512, 128),
            # nn.LeakyReLU(),
            # # nn.Dropout(0.25),
            # nn.BatchNorm1d(128),
            
            # nn.Linear(128, 64),
            # nn.LeakyReLU(),
            # # nn.Dropout(0.25),
            # nn.BatchNorm1d(64),
            
            nn.Linear(256, MAX_WORD_LENGTH),
            nn.Softmax()
        )
        
    def forward(self, x):
        x = self.conv_layers(x)
        # print(x.shape)
        
        x = nn.Flatten()(x)
        x = self.fully_conected(x)
        
        return x

In [30]:
def fit_epoch(model, train_loader, criterion, optimizer):
    running_loss = 0.0
    running_corrects = 0
    processed_data = 0
    
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        
        outputs = model(inputs)
        # print(outputs.shape, labels[0])
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        preds = torch.argmax(outputs, 1)
        running_loss += loss.item() * inputs.size(0)
        running_corrects += torch.sum(preds == torch.argmax(labels.data, 1))
        processed_data += inputs.size(0)
              
    train_loss = running_loss / processed_data
    train_acc = running_corrects.cpu().numpy() / processed_data
    return train_loss, train_acc

In [31]:
def eval_epoch(model, val_loader, criterion):
    model.eval()
    running_loss = 0.0
    running_corrects = 0
    processed_size = 0

    for inputs, labels in val_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        with torch.set_grad_enabled(False):
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            preds = torch.argmax(outputs, 1)

        running_loss += loss.item() * inputs.size(0)
        running_corrects += torch.sum(preds == torch.argmax(labels.data))
        processed_size += inputs.size(0)
    val_loss = running_loss / processed_size
    val_acc = running_corrects.double() / processed_size
    return val_loss, val_acc

In [32]:
def train(model, epochs, batch_size):
    train_loader = DataLoader(train_files, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_files, batch_size=batch_size, shuffle=False)

    history = []
    log_template = "\nEpoch {ep:03d} train_loss: {t_loss:0.4f} \
        val_loss {v_loss:0.4f} train_acc {t_acc:0.4f} val_acc {v_acc:0.4f}"
    
    with tqdm(desc="epoch", total=epochs) as pbar_outer:
        # opt = torch.optim.Adam(model.parameters(), lr=0.01)
        opt = torch.optim.AdamW(model.parameters())
        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=opt, gamma=0.9)
        criterion = nn.CrossEntropyLoss()
        
        for epoch in range(epochs):
            train_loss, train_acc = fit_epoch(model, train_loader, criterion, opt)
            print("loss", train_loss)
            
            val_loss, val_acc = eval_epoch(model, val_loader, criterion)
            history.append((train_loss, train_acc, val_loss, val_acc))
            
            pbar_outer.update(1)
            tqdm.write(log_template.format(ep=epoch+1, t_loss=train_loss,\
                                           v_loss=val_loss, t_acc=train_acc, v_acc=val_acc))
            
            scheduler.step()
            
    return history

In [33]:
cnn = CNN().to(device)

params_count = sum(p.numel() for p in cnn.parameters() if p.requires_grad)
print(f'Number of trainable parameters: {params_count}')

history = train(model=cnn, epochs=20, batch_size=64)

Number of trainable parameters: 174623


  return self._call_impl(*args, **kwargs)


loss 3.035331511752309


epoch:   5%|▌         | 1/20 [01:35<30:07, 95.11s/it]


Epoch 001 train_loss: 3.0353         val_loss 3.0051 train_acc 0.1653 val_acc 0.1507
loss 3.0028662674864357


epoch:  10%|█         | 2/20 [02:58<26:30, 88.36s/it]


Epoch 002 train_loss: 3.0029         val_loss 3.0563 train_acc 0.2023 val_acc 0.0922
loss 2.9798568596958446


epoch:  15%|█▌        | 3/20 [04:24<24:38, 86.98s/it]


Epoch 003 train_loss: 2.9799         val_loss 2.9695 train_acc 0.2254 val_acc 0.1440
loss 2.96442479779755


epoch:  20%|██        | 4/20 [05:48<22:53, 85.87s/it]


Epoch 004 train_loss: 2.9644         val_loss 2.9620 train_acc 0.2406 val_acc 0.1362
loss 2.956842594754175


epoch:  25%|██▌       | 5/20 [07:11<21:15, 85.05s/it]


Epoch 005 train_loss: 2.9568         val_loss 2.9587 train_acc 0.2487 val_acc 0.1373
loss 2.9475669289263204


epoch:  30%|███       | 6/20 [08:37<19:53, 85.25s/it]


Epoch 006 train_loss: 2.9476         val_loss 2.9481 train_acc 0.2583 val_acc 0.1289
loss 2.9349627736231514


epoch:  35%|███▌      | 7/20 [10:02<18:29, 85.33s/it]


Epoch 007 train_loss: 2.9350         val_loss 2.9355 train_acc 0.2712 val_acc 0.1275
loss 2.9269637216839253


epoch:  40%|████      | 8/20 [11:28<17:02, 85.23s/it]


Epoch 008 train_loss: 2.9270         val_loss 2.9279 train_acc 0.2793 val_acc 0.1253
loss 2.9202250831121295


epoch:  45%|████▌     | 9/20 [12:52<15:33, 84.90s/it]


Epoch 009 train_loss: 2.9202         val_loss 2.9234 train_acc 0.2863 val_acc 0.1250
loss 2.9150648637367924


epoch:  50%|█████     | 10/20 [14:19<14:14, 85.50s/it]


Epoch 010 train_loss: 2.9151         val_loss 2.9185 train_acc 0.2915 val_acc 0.1254
loss 2.909644093025011


epoch:  55%|█████▌    | 11/20 [15:41<12:42, 84.69s/it]


Epoch 011 train_loss: 2.9096         val_loss 2.9150 train_acc 0.2971 val_acc 0.1222
loss 2.905733845213733


epoch:  60%|██████    | 12/20 [17:01<11:04, 83.01s/it]


Epoch 012 train_loss: 2.9057         val_loss 2.9127 train_acc 0.3011 val_acc 0.1266
loss 2.901428609972758


epoch:  65%|██████▌   | 13/20 [18:17<09:26, 80.96s/it]


Epoch 013 train_loss: 2.9014         val_loss 2.9106 train_acc 0.3057 val_acc 0.1267
loss 2.8981465963399957


epoch:  70%|███████   | 14/20 [19:43<08:15, 82.59s/it]


Epoch 014 train_loss: 2.8981         val_loss 2.9030 train_acc 0.3088 val_acc 0.1265
loss 2.894935080661325


epoch:  75%|███████▌  | 15/20 [21:10<06:59, 83.81s/it]


Epoch 015 train_loss: 2.8949         val_loss 2.9025 train_acc 0.3121 val_acc 0.1251
loss 2.891700633077846


epoch:  80%|████████  | 16/20 [22:36<05:38, 84.69s/it]


Epoch 016 train_loss: 2.8917         val_loss 2.8980 train_acc 0.3154 val_acc 0.1264
loss 2.888791224826757


epoch:  85%|████████▌ | 17/20 [24:03<04:15, 85.32s/it]


Epoch 017 train_loss: 2.8888         val_loss 2.8985 train_acc 0.3185 val_acc 0.1250
loss 2.8865455570532785


epoch:  90%|█████████ | 18/20 [25:25<02:48, 84.14s/it]


Epoch 018 train_loss: 2.8865         val_loss 2.8967 train_acc 0.3208 val_acc 0.1250
loss 2.8842906756735327


epoch:  95%|█████████▌| 19/20 [26:45<01:22, 82.91s/it]


Epoch 019 train_loss: 2.8843         val_loss 2.8947 train_acc 0.3232 val_acc 0.1251
loss 2.882311581153485


epoch: 100%|██████████| 20/20 [28:06<00:00, 84.33s/it]


Epoch 020 train_loss: 2.8823         val_loss 2.8929 train_acc 0.3252 val_acc 0.1247





In [34]:
X_test, y_test = [], []
for i in tqdm(range(len(test))):
    if len(test[i]) <= MAX_WORD_LENGTH:
        X_test.append(word_to_tensor(test[i]))
        y_test.append(torch.tensor(i, dtype=torch.int))
    else:
        X_test.append(word_to_tensor("а"))
        y_test.append(torch.tensor(-1, dtype=torch.int))
    
X_test = torch.stack(X_test)

100%|██████████| 294253/294253 [00:55<00:00, 5343.52it/s]


In [35]:
precalc_res = open('precalced.txt', 'r', encoding="utf8").readlines()
for i in range(len(test)):
    precalc_res[i] = precalc_res[i][:-1]

In [None]:
cnn.eval()
test_files = CustomDataset(X_test, y_test)
test_loader = DataLoader(test_files, batch_size=64, shuffle=False)

res = [0] * len(y_test)
answered_count = 0

for inputs, labels in test_loader:
    inputs = inputs.to(device)
    labels = labels.to(device)

    with torch.set_grad_enabled(False):
        outputs = cnn(inputs)
        preds = torch.argmax(outputs, 1)
        # print(preds[:10], labels[:10])
        
        for prediction, pos in zip(preds, labels):
            # print(prediction, pos)
            if pos != -1 and precalc_res[answered_count].count('ё') == 0:
                # print(1)
                res[answered_count] = prediction
            else:
                # print(2)
                res[answered_count] = precalc_res[answered_count]
            answered_count += 1

In [None]:
with open('cnn2_predictions.txt', 'w', encoding="utf8") as f:
    for i, x in enumerate(res):
        if torch.is_tensor(x):
            f.write(f"{test[i][:x + 1]}^{test[i][x + 1:]}\n")
        else:
            f.write(f"{x}\n")