In [13]:
# Automatic reload of local libraries
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [36]:
fqueries = '../ruwikIR/processed_queries.csv'
fdocs = '../ruwikIR/processed_documents.csv'
fqrels = '../ruwikIR/qrels'

emb_file = '/home/mrim/data/embeddings/cc.ru.300.bin'

In [37]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
import fasttext

model = fasttext.load_model(emb_file)

def embedding_matrix(text, max_len):
    words = text.split()
    matrix = np.empty(())
    dim = model.get_dimension()
    matrix = np.zeros((max_len, dim))
    for i in range(min(len(words), max_len)):
        matrix[i] = model[words[i]]
    return matrix

def build_emb_input(batch):
    output = []
    for triple in batch:
        q, d1, d2 = triple
        q_m = embedding_matrix(q, max_len = 10)
        d1_m = embedding_matrix(d1, max_len = 200)
        d2_m = embedding_matrix(d2, max_len = 200)
        output.append(np.array([q_m, d1_m, d2_m]))
    return np.asarray(output)

def reshape_4d(tensor):
    return torch.from_numpy(tensor).float().view(1, tensor.shape[1], 1, tensor.shape[0])




In [38]:
class Autoencoder(nn.Module):
    def __init__(self, layer_size, dropout_prob=0.6):
        super().__init__()
        self.layer_size = layer_size
        self.fc = nn.ModuleList([])
        for i in range(len(layer_size)-1):
            self.fc.append(nn.Conv2d(layer_size[i], layer_size[i+1], (1, 5 if i == 0 else 1)))
        self.dropout = nn.Dropout(p=dropout_prob, inplace=False)

    def forward(self, x):
        for i in range(len(self.fc)):
            x = self.dropout(F.relu(self.fc[i](x)))
        x=torch.mean(x, 3, keepdim=True)
        return x
    

# TODO:
2. Интегрировать tensorboardx

In [47]:
import torch.optim as optim
from utils import ModelInputGenerator

mi_generator = ModelInputGenerator(fdocs, fqueries, fqrels)
batch_num = 1
autoencoder = Autoencoder([300, 100, 5000])
criterion = nn.MarginRankingLoss(margin=1.0)
optimizer = optim.SGD(autoencoder.parameters(), lr=0.001, momentum=0.9)
reg_lambda = 10e-7 


for epoch in range(1):  # loop over the dataset multiple times
    running_loss = 0.0
    mi_generator.reset()
    for b in range(batch_num):
        batch = mi_generator.generate_batch(size=4)
        out_batch = build_emb_input(batch)
        for i in range(out_batch.shape[0]):
            # get the inputs; data is a list of [inputs, labels]
            query, d1, d2 = out_batch[i]
            # zero the parameter gradients
            optimizer.zero_grad()
            # forward + backward + optimize

            q_out = autoencoder(reshape_4d(query))
            d1_out = autoencoder(reshape_4d(d1))
            d2_out = autoencoder(reshape_4d(d2))
            
            reg_term = torch.cat((q_out, d1_out, d2_out), dim=1).sum(dim=1, keepdim=True)
            x1 = (q_out * d1_out).sum(dim=1, keepdim=True)
            x2 = (q_out * d2_out).sum(dim=1, keepdim=True)

            target = torch.ones(1)
            loss = criterion(x1, x2, target) + reg_lambda * reg_term
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 200 == 199:    # print every 2000 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 2000))
                running_loss = 0.0

print('Finished Training')

Preprocessing data started...
Finished.
Finished Training


In [48]:
def zeros(x):
    return len([i for i, e in enumerate(x) if e == 0])

def get_zeros(x):
    q, d1, d2 = x
    qa = autoencoder(reshape_4d(q)).view(-1)
    d1a = autoencoder(reshape_4d(d1)).view(-1)
    d2a = autoencoder(reshape_4d(d2)).view(-1)
    return zeros(qa), zeros(d1a), zeros(d2a)

In [49]:
mi_generator.reset(4)
batch = mi_generator.generate_batch(size=20)
out_batch = build_emb_input(batch)
    
for x in out_batch:
    q, d1, d2 = get_zeros(x)
    print("Iteration #"+str(i)+ ":")
    print("Zeros in query: ", q)
    print("Zeros in doc1: ", d1)
    print("Zeros in doc2: ", d2)

Iteration #3:
Zeros in query:  990
Zeros in doc1:  796
Zeros in doc2:  596
Iteration #3:
Zeros in query:  987
Zeros in doc1:  817
Zeros in doc2:  796
Iteration #3:
Zeros in query:  989
Zeros in doc1:  819
Zeros in doc2:  716
Iteration #3:
Zeros in query:  995
Zeros in doc1:  823
Zeros in doc2:  782
Iteration #3:
Zeros in query:  992
Zeros in doc1:  792
Zeros in doc2:  800
Iteration #3:
Zeros in query:  993
Zeros in doc1:  800
Zeros in doc2:  800
Iteration #3:
Zeros in query:  993
Zeros in doc1:  791
Zeros in doc2:  826
Iteration #3:
Zeros in query:  994
Zeros in doc1:  824
Zeros in doc2:  768
Iteration #3:
Zeros in query:  997
Zeros in doc1:  792
Zeros in doc2:  836
Iteration #3:
Zeros in query:  992
Zeros in doc1:  795
Zeros in doc2:  819
Iteration #3:
Zeros in query:  994
Zeros in doc1:  806
Zeros in doc2:  768
Iteration #3:
Zeros in query:  989
Zeros in doc1:  807
Zeros in doc2:  804
Iteration #3:
Zeros in query:  990
Zeros in doc1:  830
Zeros in doc2:  797
Iteration #3:
Zeros in qu

In [34]:
torch.save(autoencoder.state_dict(), './autoencoder.pth')

In [17]:
import importlib.util
import argparse
import datetime
import distutils.util
import importlib.util
import json
import os
import pandas as pd
import warnings
import shutil

spec = importlib.util.spec_from_file_location("config", './model/params.py')
print(spec)
cmodule = importlib.util.module_from_spec(spec)
spec.loader.exec_module(cmodule)
print(type(cmodule))
configs = cmodule.params

ModuleSpec(name='config', loader=<_frozen_importlib_external.SourceFileLoader object at 0x000001A6FA7C6588>, origin='./model/params.py')
<class 'module'>


In [28]:
with open('./model/params.json') as f:
    params = json.load(f)
type(params)

dict

In [29]:


for key, val in params.items():
    print(key, val)

docs ../ruwikIR/processed_documents.csv
queries ../ruwikIR/processed_queries.csv
qrels ../ruwikIR/qrels
embeddings /home/mrim/data/embeddings/cc.ru.300.bin
inverted_index ./inverted_index.csv
outmodel ./model.pth
learning_rate {'value': 5, 'power': 0.0001}
epoches 2
batch_size 32
layers [300, 200, 5000]
lambda {'value': 1, 'power': 1e-06}
drop_prob 0.6


In [19]:
from snrm import InvertedIndexConstructor
import torch
import random

In [24]:
index_constr = InvertedIndexConstructor("./file.txt")
repr_tensor = torch.rand(5, 20)
for i in range(5):
    r = random.randint(0, 20)
    repr_tensor[i][r] = 0.0
    
print(repr_tensor[0]) 
index_constr.construct(range(5), repr_tensor)

tensor([0.5257, 0.3658, 0.4391, 0.0659, 0.3592, 0.3121, 0.7426, 0.1992, 0.0000,
        0.5123, 0.3275, 0.0671, 0.9164, 0.8634, 0.9826, 0.2818, 0.8634, 0.0939,
        0.7273, 0.0815])


In [25]:
index_constr.get_index()

{0: [(0, tensor(0.5257)),
  (1, tensor(0.1797)),
  (2, tensor(0.8578)),
  (3, tensor(0.5252)),
  (4, tensor(0.2159))],
 1: [(0, tensor(0.3658)),
  (1, tensor(0.3023)),
  (2, tensor(0.9728)),
  (3, tensor(0.8307)),
  (4, tensor(0.6781))],
 2: [(0, tensor(0.4391)),
  (1, tensor(0.0421)),
  (2, tensor(0.3877)),
  (3, tensor(0.3592))],
 3: [(0, tensor(0.0659)),
  (1, tensor(0.3625)),
  (3, tensor(0.6913)),
  (4, tensor(0.3166))],
 4: [(0, tensor(0.3592)),
  (2, tensor(0.6451)),
  (3, tensor(0.5580)),
  (4, tensor(0.5135))],
 5: [(0, tensor(0.3121)),
  (1, tensor(0.6315)),
  (2, tensor(0.9155)),
  (3, tensor(0.5570)),
  (4, tensor(0.2516))],
 6: [(0, tensor(0.7426)),
  (1, tensor(0.8405)),
  (2, tensor(0.3915)),
  (4, tensor(0.7029))],
 7: [(0, tensor(0.1992)),
  (1, tensor(0.4625)),
  (2, tensor(0.8267)),
  (3, tensor(0.1101)),
  (4, tensor(0.1669))],
 9: [(0, tensor(0.5123)),
  (1, tensor(0.2745)),
  (2, tensor(0.2430)),
  (3, tensor(0.2082)),
  (4, tensor(0.5402))],
 10: [(0, tensor(0.32