In [49]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.nn as nn
from gensim import utils
from gensim.models import FastText
from razdel import tokenize
import pandas as pd
import numpy as np
from gensim.test.utils import get_tmpfile
import torch.optim as optim

In [2]:
# Load every dataset shard exactly once and combine them with a single concat.
# The earlier version seeded `data` with dataset_1.csv and then looped over
# range(1, 5), which read dataset_1.csv a second time and duplicated its rows.
# NOTE(review): only shards 1..4 are read, as before — confirm that no
# dataset_5.csv was meant to be included.
dataset_frames = [
    pd.read_csv(f"/work/hack/datasets/dataset_{i}.csv")
    for i in range(1, 5)
]
data = pd.concat(dataset_frames)

In [3]:
# Build the table of address pairs used for training.
#
# Every building row appears twice on the "first" side:
#   * once next to a randomly chosen building (building_zero), and
#   * once next to itself (building_one).
# No label column is created here; MyDataset derives the label later by
# comparing first_id with second_id, so a random pair that happens to hit the
# same id is still labelled consistently.
#
# Both shuffles are seeded so that the pair table, and the train/validation
# slices cut from it, are identical on every run.
SHUFFLE_SEED = 42

building = pd.read_csv("/work/hack/additional_data/building_20230808.csv")
building = building[["id", "short_address"]]

# Column names for the right-hand member of each pair.
second_side_columns = {"id": "id2", "short_address": "short_address2"}

# Random partner: a shuffled copy of the table, re-indexed so that the
# column-wise concat below pairs rows by position.
building_1 = (
    building.sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
    .rename(columns=second_side_columns)
)
building_zero = pd.concat([building, building_1], axis=1)

# Identical partner: each row next to itself.
building_one = pd.concat(
    [building, building.rename(columns=second_side_columns)],
    axis=1,
)

building_new = pd.concat([building_zero, building_one])
building_new = building_new.rename(columns={
    "id": "first_id",
    "short_address": "first_text",
    "id2": "second_id",
    "short_address2": "second_text"
})
# Mix the two kinds of pairs so that positional slices contain both.
building_new = building_new.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

  building = pd.read_csv("/work/hack/additional_data/building_20230808.csv")


In [6]:
# Load the saved FastText model from disk; `emb` looks tokens up in its `.wv`.
fasttext_model_path = "/work/hack/fasttext.model"
fast_text_model = FastText.load(fasttext_model_path)

In [87]:
def emb(text: str):
    """Embed a text as the L2-normalised mean of its tokens' FastText vectors.

    The text is split with ``razdel.tokenize``; each token is looked up in
    ``fast_text_model.wv``; the token vectors are averaged and the average is
    scaled to unit length.

    Degenerate inputs return an all-zero vector of the model's width instead
    of NaNs or an exception:

    * ``text`` is not a string (e.g. a missing value read by pandas),
    * ``text`` produces no tokens (empty or whitespace-only),
    * the mean vector has zero norm (returned unscaled).

    Args:
        text: the text to embed.

    Returns:
        A 1-D ``numpy`` array of length ``fast_text_model.wv.vector_size``.
    """
    dim = fast_text_model.wv.vector_size

    # Missing cells arrive as float NaN / None; the tokenizer cannot handle them.
    if not isinstance(text, str):
        # float32 is assumed to match the FastText vectors — TODO confirm.
        return np.zeros(dim, dtype=np.float32)

    token_vectors = [fast_text_model.wv[token.text] for token in tokenize(text)]
    if not token_vectors:
        # Averaging an empty list would yield a NaN scalar, not a vector.
        return np.zeros(dim, dtype=np.float32)

    mean_vector = np.mean(np.array(token_vectors), axis=0)
    norm = np.linalg.norm(mean_vector)
    if norm == 0:
        # Avoid 0/0; a zero vector has no direction to normalise.
        return mean_vector

    return mean_vector / norm

In [116]:
class MyDataset(Dataset):
    """Dataset of text pairs for a "same id or not" classifier.

    An item is a pair ``(features, target)``:

    * ``features`` — the embedding of the first text followed by the
      embedding of the second text, as one 1-D tensor;
    * ``target`` — a two-element one-hot tensor with index 1 set when
      ``first_id == second_id`` and index 0 set otherwise.
    """

    def __init__(self, texts: pd.DataFrame):
        """Embed both text columns eagerly and compute the match labels.

        Args:
            texts: frame providing the ``first_text``, ``second_text``,
                ``first_id`` and ``second_id`` columns.
        """
        first_vectors = texts['first_text'].apply(emb)
        second_vectors = texts['second_text'].apply(emb)

        # Stack the per-row vectors into one array per side.
        self.first_emb = np.array(first_vectors.values.tolist())
        self.second_emb = np.array(second_vectors.values.tolist())

        ids_match = texts["first_id"] == texts["second_id"]
        self.labels = ids_match.astype(np.int8)

    def __getitem__(self, ind: int):
        """Return the concatenated embeddings and one-hot target at ``ind``."""
        pair_vector = np.concatenate(
            (self.first_emb[ind], self.second_emb[ind])
        )

        # Positional lookup: the label series keeps the source frame's index.
        class_index = self.labels.iloc[ind]
        target = torch.zeros(2)
        target[class_index] = 1

        return torch.tensor(pair_vector), target

    def __len__(self):
        """Number of pairs held by the dataset."""
        return len(self.first_emb)

In [124]:
class Model(nn.Module):
    """Fully connected classifier: ``inp_shape -> 1000 -> 100 -> 2``.

    ReLU follows each of the first two linear layers; the last layer's raw
    outputs are returned with no activation applied.
    """

    def __init__(self, inp_shape: int):
        """Create the three linear layers.

        Args:
            inp_shape: number of features in one input vector.
        """
        super().__init__()
        wide, narrow, n_outputs = 1000, 100, 2
        self.fc1 = nn.Linear(inp_shape, wide)
        self.fc2 = nn.Linear(wide, narrow)
        self.fc3 = nn.Linear(narrow, n_outputs)

    def forward(self, x):
        """Map a batch of input vectors to two output scores per row."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        return self.fc3(hidden)

In [125]:
# Take the first 10000 rows of the shuffled pair table for training and the
# next 1000 for validation, then wrap each slice in a batched loader.
train_rows = building_new[:10000]
val_rows = building_new[10000:11000]

train_dataset = MyDataset(train_rows)
val_dataset = MyDataset(val_rows)

train_data_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_data_loader = DataLoader(val_dataset, batch_size=128, shuffle=True)

In [126]:
# Size the input layer from a real sample instead of hard-coding 256, so the
# network always matches the width of the concatenated embeddings produced by
# the dataset (two embeddings side by side).
net = Model(train_dataset[0][0].shape[0])

In [127]:
# Loss and optimiser for the two-output classifier. The dataset yields
# two-element float one-hot targets, which are passed to the loss as-is.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=1e-3)

In [128]:
# Train for two epochs; after each epoch report accuracy on the validation set.
for epoch in range(2):  # loop over the dataset multiple times

    # ---- training pass -------------------------------------------------
    net.train()
    running_loss = 0.0
    # Unpack the batch directly: a loop variable named `data` would overwrite
    # the DataFrame of the same name created by the dataset-loading cell.
    for i, (inputs, labels) in enumerate(train_data_loader):
        # zero the parameter gradients
        optimizer.zero_grad()

        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Report the mean loss over each window of 50 batches.
        running_loss += loss.item()
        if i % 50 == 49:
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 50:.3f}')
            running_loss = 0.0

    # ---- validation pass -----------------------------------------------
    net.eval()
    correct = 0
    seen = 0
    with torch.no_grad():
        for inputs, labels in val_data_loader:
            predictions = net(inputs).argmax(dim=1)
            # Targets are one-hot, so argmax recovers the class index.
            targets = labels.argmax(dim=1)
            correct += (predictions == targets).sum().item()
            seen += targets.numel()
    # Divide by the number of samples actually evaluated, not a fixed 1000,
    # so the figure stays right if the validation slice changes size.
    print(f"accuracy: {correct / max(seen, 1)}")
print('Finished Training')

[1,    50] loss: 0.685
accuracy: 0.912
[2,    50] loss: 0.223
accuracy: 0.956
Finished Training
