In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/machine-translation-ioai/train.csv
/kaggle/input/machine-translation-ioai/test.csv


In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import random

In [3]:
# Load the training pairs; the code below reads its "data" and "label" columns.
train = pd.read_csv("/kaggle/input/machine-translation-ioai/train.csv")

# Collect every distinct character occurring in either column
# (each value is stringified first, so non-string cells are handled too).
all_chars = set()
for column in ("data", "label"):
    for value in train[column]:
        all_chars.update(str(value))
all_chars = sorted(all_chars)

# Id layout: 0 = <PAD>, 1 = <SOS>, real characters start at 2,
# and <EOS> takes the next free id after everything else.
char_to_idx = {}
for position, symbol in enumerate(all_chars, start=2):
    char_to_idx[symbol] = position
char_to_idx["<PAD>"] = 0
char_to_idx["<SOS>"] = 1
char_to_idx["<EOS>"] = len(char_to_idx)

# Reverse lookup used when turning predicted ids back into text.
idx_to_char = {}
for symbol, position in char_to_idx.items():
    idx_to_char[position] = symbol

# Source and target share one vocabulary, so both sizes are the same.
input_vocab_size = output_vocab_size = len(char_to_idx)

In [4]:
def string_to_tensor(s, max_length, add_sos_eos=False):
    """Encode ``s`` as a fixed-length 1-D ``torch.long`` tensor of vocabulary ids.

    The value is stringified with ``str`` and mapped character by character
    through the module-level ``char_to_idx``. Characters missing from the
    vocabulary are mapped to the ``<PAD>`` id. Unused trailing positions are
    left at 0, which is the ``<PAD>`` id in this notebook.

    Text that does not fit is truncated instead of raising ``IndexError``:
    to ``max_length`` characters normally, or to ``max_length - 2`` characters
    when ``add_sos_eos`` is set, so that ``<EOS>`` is always present.

    Args:
        s: Value to encode (converted with ``str``).
        max_length: Length of the returned tensor.
        add_sos_eos: If True, wrap the text as ``<SOS> ... <EOS>``.

    Returns:
        A tensor of shape ``(max_length,)`` and dtype ``torch.long``.

    Raises:
        ValueError: If ``add_sos_eos`` is True and ``max_length`` is below 2,
            because ``<SOS>`` and ``<EOS>`` alone would not fit.
    """
    pad_id = char_to_idx["<PAD>"]
    text = str(s)

    if add_sos_eos:
        if max_length < 2:
            raise ValueError("max_length must be at least 2 when add_sos_eos=True")
        # Reserve one slot each for <SOS> and <EOS>.
        text = text[:max_length - 2]
        ids = [char_to_idx["<SOS>"]]
        ids.extend(char_to_idx.get(ch, pad_id) for ch in text)
        ids.append(char_to_idx["<EOS>"])
    else:
        text = text[:max_length]
        ids = [char_to_idx.get(ch, pad_id) for ch in text]

    tensor = torch.zeros(max_length, dtype=torch.long)
    if ids:
        tensor[:len(ids)] = torch.tensor(ids, dtype=torch.long)
    return tensor

# Longest stringified training input, plus 2 spare positions.
# NOTE(review): the original comment credited the +2 to SOS/EOS, but inputs are
# encoded via string_to_tensor(..., max_input_length) without add_sos_eos, so
# the two extra slots are effectively padding headroom.
max_input_length = max(map(len, map(str, train["data"]))) + 2
# Target buffer sized for a "DD-MM-YYYY" string plus one slot each for <SOS> and <EOS>.
max_output_length = 2 + len("DD-MM-YYYY")

In [5]:
# Load the test split; later cells read its 'data' and 'id' columns.
test = pd.read_csv(filepath_or_buffer='/kaggle/input/machine-translation-ioai/test.csv')

In [6]:
class DateDataset(Dataset):
    """Dataset over a DataFrame with "data" and "label" columns.

    Each item is a pair of id tensors produced by ``string_to_tensor``: the
    encoded "data" value and the encoded "label" value wrapped in SOS/EOS.
    """

    def __init__(self, df):
        # Keep a reference to the frame; rows are encoded lazily on access.
        self.df = df

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(input_tensor, target_tensor)`` for the row at position ``idx``."""
        row = self.df.iloc[idx]
        encoded_input = string_to_tensor(row["data"], max_input_length)
        encoded_target = string_to_tensor(row["label"], max_output_length, add_sos_eos=True)
        return encoded_input, encoded_target

# Wrap the training frame and serve it in shuffled mini-batches.
batch_size = 32
train_dataset = DateDataset(train)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

In [7]:
class Encoder(nn.Module):
    """Embeds a batch of token ids and runs them through a batch-first LSTM."""

    def __init__(self, input_size, embedding_size, hidden_size, num_layers=1):
        """
        Args:
            input_size: Number of rows in the embedding table (vocabulary size).
            embedding_size: Width of each token embedding.
            hidden_size: LSTM hidden-state width.
            num_layers: Number of stacked LSTM layers.
        """
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=embedding_size)
        self.lstm = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )

    def forward(self, x):
        """Encode ``x`` (integer token ids, batch dimension first).

        Returns:
            ``(outputs, hidden, cell)``: the per-step LSTM outputs and the
            final hidden and cell states as returned by ``nn.LSTM``.
        """
        outputs, final_state = self.lstm(self.embedding(x))
        hidden, cell = final_state
        return outputs, hidden, cell

In [8]:
class Attention(nn.Module):
    """Additive attention: scores each encoder step against one decoder state.

    The score for a step is ``v(tanh(attn([hidden; encoder_output])))`` and
    the scores are softmax-normalised over the source-step dimension.
    """

    def __init__(self, hidden_size):
        super().__init__()
        # Projects the concatenated [decoder state; encoder output] pair.
        self.attn = nn.Linear(2 * hidden_size, hidden_size)
        # Collapses each projected vector to a single scalar score.
        self.v = nn.Linear(hidden_size, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights of shape ``(batch, src_len)``.

        Args:
            hidden: 2-D decoder state, one row per batch element.
            encoder_outputs: 3-D encoder outputs with source steps on dim 1.
        """
        src_len = encoder_outputs.size(1)
        # Copy the decoder state along the source axis so it can be paired
        # with every encoder step.
        tiled_hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        paired = torch.cat((tiled_hidden, encoder_outputs), dim=2)
        energy = torch.tanh(self.attn(paired))
        scores = self.v(energy).squeeze(2)
        return torch.softmax(scores, dim=1)

In [9]:
class Decoder(nn.Module):
    """Single-step attention decoder: one call consumes one token per sequence."""

    def __init__(self, output_size, embedding_size, hidden_size, num_layers=1):
        """
        Args:
            output_size: Target vocabulary size (embedding rows and logit width).
            embedding_size: Width of each token embedding.
            hidden_size: LSTM hidden-state width (also the context width).
            num_layers: Number of stacked LSTM layers.
        """
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=output_size, embedding_dim=embedding_size)
        self.attention = Attention(hidden_size)
        # The LSTM sees the token embedding concatenated with the context vector.
        self.lstm = nn.LSTM(
            input_size=embedding_size + hidden_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x, hidden, cell, encoder_outputs):
        """Run one decoding step.

        Args:
            x: 1-D tensor holding the current token id of each sequence.
            hidden: LSTM hidden state; its last layer drives the attention.
            cell: LSTM cell state.
            encoder_outputs: Encoder outputs to attend over.

        Returns:
            ``(prediction, hidden, cell, attn_weights)`` where ``prediction``
            holds the vocabulary logits for this step.
        """
        # Add a length-1 time axis so the token can go through the batch-first LSTM.
        token_embedding = self.embedding(x.unsqueeze(1))

        attn_weights = self.attention(hidden[-1], encoder_outputs)
        # Weighted sum of encoder outputs -> one context vector per sequence.
        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs)

        step_input = torch.cat((token_embedding, context), dim=2)
        step_output, (hidden, cell) = self.lstm(step_input, (hidden, cell))

        return self.fc(step_output.squeeze(1)), hidden, cell, attn_weights

In [10]:
class Seq2Seq(nn.Module):
    """Couples an encoder with a step-wise decoder and unrolls the target sequence."""

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_forcing_ratio=0.5):
        """Decode ``target.size(1) - 1`` steps and return the per-step logits.

        Args:
            source: Batch of encoded inputs passed to the encoder.
            target: Batch of target ids; column 0 is fed as the first decoder input.
            teacher_forcing_ratio: Probability, drawn once per step for the
                whole batch, of feeding the ground-truth token instead of the
                decoder's own argmax prediction.

        Returns:
            Tensor of shape ``(batch, target_len, vocab)``. Position 0 is never
            written and stays all zeros.
        """
        batch_size, target_len = target.size(0), target.size(1)
        vocab_size = self.decoder.fc.out_features

        outputs = torch.zeros(batch_size, target_len, vocab_size, device=source.device)
        encoder_outputs, hidden, cell = self.encoder(source)

        # The first token is <SOS>; it seeds the decoder.
        decoder_input = target[:, 0]

        for step in range(1, target_len):
            logits, hidden, cell, _ = self.decoder(decoder_input, hidden, cell, encoder_outputs)
            outputs[:, step] = logits
            if random.random() < teacher_forcing_ratio:
                decoder_input = target[:, step]
            else:
                decoder_input = logits.argmax(1)

        return outputs

In [11]:
# Seed every RNG the run touches so "Restart & Run All" reproduces the same
# training trajectory: `random` drives the teacher-forcing coin flips in
# Seq2Seq.forward, and the torch generator drives weight initialisation and
# DataLoader shuffling. (Some GPU kernels can still be non-deterministic.)
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model size. A smaller configuration (embedding_size=256, hidden_size=512)
# was used previously.
embedding_size = 512
hidden_size = 1024
num_layers = 1

encoder = Encoder(input_vocab_size, embedding_size, hidden_size, num_layers).to(device)
decoder = Decoder(output_vocab_size, embedding_size, hidden_size, num_layers).to(device)
model = Seq2Seq(encoder, decoder).to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001)
# Padded target positions must not contribute to the loss; take the id from
# the vocabulary instead of hard-coding 0 so the two cannot drift apart.
criterion = nn.CrossEntropyLoss(ignore_index=char_to_idx["<PAD>"])

In [12]:
num_epochs = 14

# Standard epoch loop: one optimizer step per mini-batch, then report the
# mean batch loss for the epoch.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for input_batch, target_batch in train_loader:
        input_batch, target_batch = input_batch.to(device), target_batch.to(device)

        optimizer.zero_grad()
        output = model(input_batch, target_batch)

        # Position 0 is the <SOS> slot, which the model never predicts, so it
        # is dropped from both the logits and the gold targets.
        flat_logits = output[:, 1:].reshape(-1, output_vocab_size)
        flat_targets = target_batch[:, 1:].reshape(-1)
        loss = criterion(flat_logits, flat_targets)

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")

Epoch 1/14, Loss: 1.4509
Epoch 2/14, Loss: 0.7705
Epoch 3/14, Loss: 0.6599
Epoch 4/14, Loss: 0.3983
Epoch 5/14, Loss: 0.2021
Epoch 6/14, Loss: 0.0697
Epoch 7/14, Loss: 0.0298
Epoch 8/14, Loss: 0.0111
Epoch 9/14, Loss: 0.0046
Epoch 10/14, Loss: 0.0028
Epoch 11/14, Loss: 0.0013
Epoch 12/14, Loss: 0.0008
Epoch 13/14, Loss: 0.0006
Epoch 14/14, Loss: 0.0005


In [13]:
def predict(model, input_str, max_length=12):
    """Greedily decode the model's output string for one input.

    The input is encoded with ``string_to_tensor``, passed through
    ``model.encoder``, and then ``model.decoder`` is stepped starting from
    ``<SOS>``, feeding back the argmax token each time.

    Differences from a naive loop, both defensive:
      * the input is clipped to ``max_input_length`` characters so an input
        longer than anything seen when sizing the buffer cannot overflow it;
      * ``<PAD>`` / ``<SOS>`` predictions are never rendered into the result
        (otherwise their literal names would appear in the output text).

    Args:
        model: Object exposing ``eval()``, ``encoder`` and ``decoder`` with the
            call signatures used by ``Seq2Seq``. It is switched to eval mode.
        input_str: Raw input; converted with ``str``.
        max_length: Maximum number of decoding steps.

    Returns:
        The decoded string: all predicted characters up to, and excluding,
        the first ``<EOS>``, or up to ``max_length`` steps.
    """
    sos_id = char_to_idx["<SOS>"]
    eos_id = char_to_idx["<EOS>"]
    non_text_ids = {char_to_idx["<PAD>"], sos_id}

    # Clip over-long inputs so encoding cannot index past the fixed-size buffer.
    source_text = str(input_str)[:max_input_length]

    model.eval()
    with torch.no_grad():
        input_tensor = string_to_tensor(source_text, max_input_length).unsqueeze(0).to(device)
        encoder_outputs, hidden, cell = model.encoder(input_tensor)

        decoder_input = torch.tensor([sos_id], device=device)
        pieces = []

        for _ in range(max_length):
            output, hidden, cell, _attn = model.decoder(decoder_input, hidden, cell, encoder_outputs)
            pred_token = output.argmax(1).item()

            if pred_token == eos_id:
                break

            if pred_token not in non_text_ids:
                pieces.append(idx_to_char[pred_token])
            # Feed the prediction back in, whether or not it was rendered.
            decoder_input = torch.tensor([pred_token], device=device)

    return "".join(pieces)

In [14]:
from tqdm import tqdm

# Decode every test input in order, showing a progress bar.
preds = [predict(model, input_str) for input_str in tqdm(test['data'].values)]

100%|██████████| 4676/4676 [00:43<00:00, 106.39it/s]


In [15]:
# Build the submission as a new frame instead of re-binding `test`: the
# previous `test = test.set_index('id')` removed the 'id' column from `test`,
# so running this cell a second time raised KeyError. The written CSV is
# unchanged: 'id' as the index column followed by 'label'.
submission = test.assign(label=preds).set_index('id')[['label']]
submission.to_csv('submission.csv')