In [4]:
import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm, trange
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [5]:
LETTERS = "абвгдежзийклмнопрстуфхцчшщъыьэюя"
VOWELS = "аеиоуэюяы"
N_LETTERS = len(LETTERS)
MAX_WORD_LENGTH = 23

In [6]:
train = open('data/train_stresses_labels.txt', 'r', encoding="utf8").readlines()
train_word = []
x = []
for i in range(len(train)):
    train[i] = train[i][:-1]
    if len(train[i]) <= MAX_WORD_LENGTH:
        train_word.append(train[i].replace('^', '').replace('ё', 'е'))
    else:
        x.append(train[i].replace('^', '').replace('ё', 'е'))
train[:5], len(x), x[:5]

(['аа^к', 'аа^ка', 'аа^ке', 'аа^ки', 'аа^ков'],
 495,
 ['абдоминомедиастинальный',
  'абдоминоперикардиостомией',
  'абдоминоперикардиостомиею',
  'абдоминоперикардиостомии',
  'абдоминоперикардиостомия'])

In [7]:
test = open('data/public_test_stresses.txt', 'r', encoding="utf8").readlines()
for i in range(len(test)):
    test[i] = test[i][:-1].replace('ё', '')
print(*test[:10])

аакам ааками ааленец аама аамами аамов аамом аамы аангичами аангичах


In [8]:
def letter_to_index(letter):
    return LETTERS.find(letter)

def letter_to_tensor(letter):
    tensor = torch.zeros(1, N_LETTERS)
    tensor[0][letter_to_index(letter)] = 1
    return tensor

# <1 x max_word_length x n_letters>,
# or an array of one-hot letter vectors
def word_to_tensor(word):
    tensor = torch.zeros(1, MAX_WORD_LENGTH, N_LETTERS)
    for i, letter in enumerate(word):
        tensor[0][i][letter_to_index(letter)] = 1
    return tensor

In [9]:
X, y = [], []
for i in tqdm(range(len(train_word))):
    if len(train[i]) <= MAX_WORD_LENGTH:
        X.append(word_to_tensor(train_word[i]))
        correct_pos = train[i].find('^')
        y.append(torch.zeros(MAX_WORD_LENGTH))
        y[-1][correct_pos] = 1
X = torch.stack(X)
y = torch.stack(y)
y[:5]

100%|██████████| 587995/587995 [01:53<00:00, 5194.21it/s]


tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0.]])

In [10]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.05, random_state=42)

In [11]:
class CustomDataset(Dataset):
    def __init__(self, data_tensor, target_tensor):
        self.data = data_tensor
        self.target = target_tensor

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.target[idx]

In [12]:
train_files = CustomDataset(X_train, y_train)
val_files = CustomDataset(X_val, y_val)

In [13]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [14]:
class CNN(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.conv_layers = nn.Sequential(
            # 1 x 23 x 32
            nn.Conv2d(1, 8, kernel_size=3),
            nn.LeakyReLU(),
            nn.BatchNorm2d(8),
            
            # 1 x 21 x 30
            nn.Conv2d(8, 16, kernel_size=3),
            nn.LeakyReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.BatchNorm2d(16),
            nn.Dropout(0.2),
            
            # 16 x 9 x 14
            nn.Conv2d(16, 32, kernel_size=3),
            nn.LeakyReLU(),
            nn.BatchNorm2d(32),
            nn.Dropout(0.2),
            
            # 32 x 7 x 12
            nn.Conv2d(32, 64, kernel_size=3),
            nn.LeakyReLU(),
            nn.BatchNorm2d(64),
            nn.Dropout(0.2),
            
            # 64 x 5 x 10
            nn.Conv2d(64, 64, kernel_size=3),
            nn.LeakyReLU(),
            nn.BatchNorm2d(64),
            nn.Dropout(0.2),
            
            # 64 x 3 x 8
            nn.Conv2d(64, 32, kernel_size=2),
            nn.LeakyReLU(),
            nn.BatchNorm2d(32),
            nn.Dropout(0.2)
            
            # 32 x 2 x 7
        )
        
        self.fully_conected = nn.Sequential(
            nn.Linear(448, 256),
            nn.LeakyReLU(),
            # nn.Dropout(0.25),
            nn.BatchNorm1d(256),
            
            nn.Linear(256, 512),
            nn.LeakyReLU(),
            # nn.Dropout(0.25),
            nn.BatchNorm1d(512),
            
            # nn.Linear(512, 128),
            # nn.LeakyReLU(),
            # # nn.Dropout(0.25),
            # nn.BatchNorm1d(128),
            
            # nn.Linear(128, 64),
            # nn.LeakyReLU(),
            # # nn.Dropout(0.25),
            # nn.BatchNorm1d(64),
            
            nn.Linear(512, MAX_WORD_LENGTH),
            nn.Softmax()
        )
        
    def forward(self, x):
        x = self.conv_layers(x)
        # print(x.shape)
        
        x = nn.Flatten()(x)
        x = self.fully_conected(x)
        
        return x

In [15]:
def fit_epoch(model, train_loader, criterion, optimizer):
    running_loss = 0.0
    running_corrects = 0
    processed_data = 0
    
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        
        outputs = model(inputs)
        # print(outputs.shape, labels[0])
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        preds = torch.argmax(outputs, 1)
        running_loss += loss.item() * inputs.size(0)
        running_corrects += torch.sum(preds == torch.argmax(labels.data, 1))
        processed_data += inputs.size(0)
              
    train_loss = running_loss / processed_data
    train_acc = running_corrects.cpu().numpy() / processed_data
    return train_loss, train_acc

In [16]:
def eval_epoch(model, val_loader, criterion):
    model.eval()
    running_loss = 0.0
    running_corrects = 0
    processed_size = 0

    for inputs, labels in val_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        with torch.set_grad_enabled(False):
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            preds = torch.argmax(outputs, 1)

        running_loss += loss.item() * inputs.size(0)
        running_corrects += torch.sum(preds == torch.argmax(labels.data))
        processed_size += inputs.size(0)
    val_loss = running_loss / processed_size
    val_acc = running_corrects.double() / processed_size
    return val_loss, val_acc

In [17]:
def train(model, epochs, batch_size):
    train_loader = DataLoader(train_files, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_files, batch_size=batch_size, shuffle=False)

    history = []
    log_template = "\nEpoch {ep:03d} train_loss: {t_loss:0.4f} \
        val_loss {v_loss:0.4f} train_acc {t_acc:0.4f} val_acc {v_acc:0.4f}"
    
    with tqdm(desc="epoch", total=epochs) as pbar_outer:
        # opt = torch.optim.Adam(model.parameters(), lr=0.01)
        opt = torch.optim.AdamW(model.parameters())
        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=opt, gamma=0.9)
        criterion = nn.CrossEntropyLoss()
        
        for epoch in range(epochs):
            train_loss, train_acc = fit_epoch(model, train_loader, criterion, opt)
            print("loss", train_loss)
            
            val_loss, val_acc = eval_epoch(model, val_loader, criterion)
            history.append((train_loss, train_acc, val_loss, val_acc))
            
            pbar_outer.update(1)
            tqdm.write(log_template.format(ep=epoch+1, t_loss=train_loss,\
                                           v_loss=val_loss, t_acc=train_acc, v_acc=val_acc))
            
            scheduler.step()
            
    return history

In [18]:
cnn = CNN().to(device)

params_count = sum(p.numel() for p in cnn.parameters() if p.requires_grad)
print(f'Number of trainable parameters: {params_count}')

history = train(model=cnn, epochs=20, batch_size=64)

Number of trainable parameters: 329831


  return self._call_impl(*args, **kwargs)


loss 3.040870185943894


epoch:   5%|▌         | 1/20 [01:40<31:57, 100.94s/it]


Epoch 001 train_loss: 3.0409         val_loss 3.0145 train_acc 0.1608 val_acc 0.1395
loss 3.0075056600141363


epoch:  10%|█         | 2/20 [03:08<27:51, 92.88s/it] 


Epoch 002 train_loss: 3.0075         val_loss 3.0008 train_acc 0.1966 val_acc 0.1398
loss 2.9840349384675235


epoch:  15%|█▌        | 3/20 [04:35<25:35, 90.34s/it]


Epoch 003 train_loss: 2.9840         val_loss 2.9672 train_acc 0.2211 val_acc 0.1355
loss 2.953312252727308


epoch:  20%|██        | 4/20 [06:02<23:43, 88.96s/it]


Epoch 004 train_loss: 2.9533         val_loss 2.9437 train_acc 0.2529 val_acc 0.1339
loss 2.932711900154874


epoch:  25%|██▌       | 5/20 [07:29<22:03, 88.25s/it]


Epoch 005 train_loss: 2.9327         val_loss 2.9269 train_acc 0.2737 val_acc 0.1325
loss 2.915481936874475


epoch:  30%|███       | 6/20 [08:56<20:30, 87.88s/it]


Epoch 006 train_loss: 2.9155         val_loss 2.9060 train_acc 0.2912 val_acc 0.1306
loss 2.889054240895065


epoch:  35%|███▌      | 7/20 [10:23<18:57, 87.52s/it]


Epoch 007 train_loss: 2.8891         val_loss 2.8893 train_acc 0.3179 val_acc 0.1268
loss 2.8711840747295008


epoch:  40%|████      | 8/20 [11:50<17:28, 87.37s/it]


Epoch 008 train_loss: 2.8712         val_loss 2.8728 train_acc 0.3363 val_acc 0.1275
loss 2.8575099316352426


epoch:  45%|████▌     | 9/20 [13:19<16:08, 88.04s/it]


Epoch 009 train_loss: 2.8575         val_loss 2.8629 train_acc 0.3502 val_acc 0.1277
loss 2.844894647101032


epoch:  50%|█████     | 10/20 [14:46<14:35, 87.56s/it]


Epoch 010 train_loss: 2.8449         val_loss 2.8585 train_acc 0.3631 val_acc 0.1296
loss 2.8338582280456954


epoch:  55%|█████▌    | 11/20 [16:12<13:03, 87.04s/it]


Epoch 011 train_loss: 2.8339         val_loss 2.8451 train_acc 0.3741 val_acc 0.1268
loss 2.8243612945355867


epoch:  60%|██████    | 12/20 [17:37<11:32, 86.54s/it]


Epoch 012 train_loss: 2.8244         val_loss 2.8401 train_acc 0.3839 val_acc 0.1277
loss 2.8163941994208903


epoch:  65%|██████▌   | 13/20 [19:03<10:03, 86.26s/it]


Epoch 013 train_loss: 2.8164         val_loss 2.8318 train_acc 0.3919 val_acc 0.1278
loss 2.8093066807087967


epoch:  70%|███████   | 14/20 [20:31<08:41, 86.87s/it]


Epoch 014 train_loss: 2.8093         val_loss 2.8260 train_acc 0.3993 val_acc 0.1275
loss 2.8027984217115107


epoch:  75%|███████▌  | 15/20 [22:00<07:17, 87.54s/it]


Epoch 015 train_loss: 2.8028         val_loss 2.8232 train_acc 0.4058 val_acc 0.1271
loss 2.7972074117662657


epoch:  80%|████████  | 16/20 [23:28<05:50, 87.53s/it]


Epoch 016 train_loss: 2.7972         val_loss 2.8199 train_acc 0.4114 val_acc 0.1267
loss 2.7923147620449944


epoch:  85%|████████▌ | 17/20 [24:55<04:22, 87.44s/it]


Epoch 017 train_loss: 2.7923         val_loss 2.8172 train_acc 0.4164 val_acc 0.1264
loss 2.7880024643303165


epoch:  90%|█████████ | 18/20 [26:23<02:55, 87.81s/it]


Epoch 018 train_loss: 2.7880         val_loss 2.8142 train_acc 0.4207 val_acc 0.1270
loss 2.783861698220692


epoch:  95%|█████████▌| 19/20 [27:53<01:28, 88.29s/it]


Epoch 019 train_loss: 2.7839         val_loss 2.8106 train_acc 0.4248 val_acc 0.1274
loss 2.7798562160590192


epoch: 100%|██████████| 20/20 [29:22<00:00, 88.12s/it]


Epoch 020 train_loss: 2.7799         val_loss 2.8097 train_acc 0.4290 val_acc 0.1272





In [19]:
X_test, y_test = [], []
for i in tqdm(range(len(test))):
    if len(test[i]) <= MAX_WORD_LENGTH:
        X_test.append(word_to_tensor(test[i]))
        y_test.append(torch.tensor(i, dtype=torch.int))
    else:
        X_test.append(word_to_tensor("а"))
        y_test.append(torch.tensor(-1, dtype=torch.int))
    
X_test = torch.stack(X_test)

100%|██████████| 294253/294253 [00:54<00:00, 5413.61it/s]


In [21]:
precalc_res = open('precalced.txt', 'r', encoding="utf8").readlines()
for i in range(len(test)):
    precalc_res[i] = precalc_res[i][:-1]

In [27]:
cnn.eval()
test_files = CustomDataset(X_test, y_test)
test_loader = DataLoader(test_files, batch_size=64, shuffle=False)

res = [0] * len(y_test)
answered_count = 0

for inputs, labels in test_loader:
    inputs = inputs.to(device)
    labels = labels.to(device)

    with torch.set_grad_enabled(False):
        outputs = cnn(inputs)
        preds = torch.argmax(outputs, 1)
        # print(preds[:10], labels[:10])
        
        for prediction, pos in zip(preds, labels):
            # print(prediction, pos)
            if pos != -1 and precalc_res[answered_count].count('ё') == 0:
                # print(1)
                res[answered_count] = prediction
            else:
                # print(2)
                res[answered_count] = precalc_res[answered_count]
            answered_count += 1

In [37]:
with open('cnn_predictions.txt', 'w', encoding="utf8") as f:
    for i, x in enumerate(res):
        if torch.is_tensor(x):
            f.write(f"{test[i][:x + 1]}^{test[i][x + 1:]}\n")
        else:
            f.write(f"{x}\n")