In [7]:
# Automatic reload of local libraries
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [36]:
fqueries = '../ruwikIR/processed_queries.csv'
fdocs = '../ruwikIR/processed_documents.csv'
fqrels = '../ruwikIR/qrels'

emb_file = '/home/mrim/data/embeddings/cc.ru.300.bin'

In [37]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
import fasttext

model = fasttext.load_model(emb_file)

def embedding_matrix(text, max_len):
    words = text.split()
    matrix = np.empty(())
    dim = model.get_dimension()
    matrix = np.zeros((max_len, dim))
    for i in range(min(len(words), max_len)):
        matrix[i] = model[words[i]]
    return matrix

def build_emb_input(batch):
    output = []
    for triple in batch:
        q, d1, d2 = triple
        q_m = embedding_matrix(q, max_len = 10)
        d1_m = embedding_matrix(d1, max_len = 200)
        d2_m = embedding_matrix(d2, max_len = 200)
        output.append(np.array([q_m, d1_m, d2_m]))
    return np.asarray(output)

def reshape_4d(tensor):
    return torch.from_numpy(tensor).float().view(1, tensor.shape[1], 1, tensor.shape[0])




In [38]:
class Autoencoder(nn.Module):
    def __init__(self, layer_size, dropout_prob=0.6):
        super().__init__()
        self.layer_size = layer_size
        self.fc = nn.ModuleList([])
        for i in range(len(layer_size)-1):
            self.fc.append(nn.Conv2d(layer_size[i], layer_size[i+1], (1, 5 if i == 0 else 1)))
        self.dropout = nn.Dropout(p=dropout_prob, inplace=False)

    def forward(self, x):
        for i in range(len(self.fc)):
            x = self.dropout(F.relu(self.fc[i](x)))
        x=torch.mean(x, 3, keepdim=True)
        return x
    

# TODO:
2. Интегрировать tensorboardx

In [47]:
import torch.optim as optim
from utils import ModelInputGenerator

mi_generator = ModelInputGenerator(fdocs, fqueries, fqrels)
batch_num = 1
autoencoder = Autoencoder([300, 100, 5000])
criterion = nn.MarginRankingLoss(margin=1.0)
optimizer = optim.SGD(autoencoder.parameters(), lr=0.001, momentum=0.9)
reg_lambda = 10e-7 


for epoch in range(1):  # loop over the dataset multiple times
    running_loss = 0.0
    mi_generator.reset()
    for b in range(batch_num):
        batch = mi_generator.generate_batch(size=4)
        out_batch = build_emb_input(batch)
        for i in range(out_batch.shape[0]):
            # get the inputs; data is a list of [inputs, labels]
            query, d1, d2 = out_batch[i]
            # zero the parameter gradients
            optimizer.zero_grad()
            # forward + backward + optimize

            q_out = autoencoder(reshape_4d(query))
            d1_out = autoencoder(reshape_4d(d1))
            d2_out = autoencoder(reshape_4d(d2))
            
            reg_term = torch.cat((q_out, d1_out, d2_out), dim=1).sum(dim=1, keepdim=True)
            x1 = (q_out * d1_out).sum(dim=1, keepdim=True)
            x2 = (q_out * d2_out).sum(dim=1, keepdim=True)

            target = torch.ones(1)
            loss = criterion(x1, x2, target) + reg_lambda * reg_term
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 200 == 199:    # print every 2000 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 2
                       000))
                running_loss = 0.0

print('Finished Training')

Preprocessing data started...
Finished.
Finished Training


In [48]:
def zeros(x):
    return len([i for i, e in enumerate(x) if e == 0])

def get_zeros(x):
    q, d1, d2 = x
    qa = autoencoder(reshape_4d(q)).view(-1)
    d1a = autoencoder(reshape_4d(d1)).view(-1)
    d2a = autoencoder(reshape_4d(d2)).view(-1)
    return zeros(qa), zeros(d1a), zeros(d2a)

In [49]:
mi_generator.reset(4)
batch = mi_generator.generate_batch(size=20)
out_batch = build_emb_input(batch)
    
for x in out_batch:
    q, d1, d2 = get_zeros(x)
    print("Iteration #"+str(i)+ ":")
    print("Zeros in query: ", q)
    print("Zeros in doc1: ", d1)
    print("Zeros in doc2: ", d2)

Iteration #3:
Zeros in query:  990
Zeros in doc1:  796
Zeros in doc2:  596
Iteration #3:
Zeros in query:  987
Zeros in doc1:  817
Zeros in doc2:  796
Iteration #3:
Zeros in query:  989
Zeros in doc1:  819
Zeros in doc2:  716
Iteration #3:
Zeros in query:  995
Zeros in doc1:  823
Zeros in doc2:  782
Iteration #3:
Zeros in query:  992
Zeros in doc1:  792
Zeros in doc2:  800
Iteration #3:
Zeros in query:  993
Zeros in doc1:  800
Zeros in doc2:  800
Iteration #3:
Zeros in query:  993
Zeros in doc1:  791
Zeros in doc2:  826
Iteration #3:
Zeros in query:  994
Zeros in doc1:  824
Zeros in doc2:  768
Iteration #3:
Zeros in query:  997
Zeros in doc1:  792
Zeros in doc2:  836
Iteration #3:
Zeros in query:  992
Zeros in doc1:  795
Zeros in doc2:  819
Iteration #3:
Zeros in query:  994
Zeros in doc1:  806
Zeros in doc2:  768
Iteration #3:
Zeros in query:  989
Zeros in doc1:  807
Zeros in doc2:  804
Iteration #3:
Zeros in query:  990
Zeros in doc1:  830
Zeros in doc2:  797
Iteration #3:
Zeros in qu

In [34]:
torch.save(autoencoder.state_dict(), './autoencoder.pth')

In [17]:
import importlib.util
import argparse
import datetime
import distutils.util
import importlib.util
import json
import os
import pandas as pd
import warnings
import shutil

spec = importlib.util.spec_from_file_location("config", './model/params.py')
print(spec)
cmodule = importlib.util.module_from_spec(spec)
spec.loader.exec_module(cmodule)
print(type(cmodule))
configs = cmodule.params

ModuleSpec(name='config', loader=<_frozen_importlib_external.SourceFileLoader object at 0x000001A6FA7C6588>, origin='./model/params.py')
<class 'module'>


In [28]:
with open('./model/params.json') as f:
    params = json.load(f)
type(params)

dict

In [29]:


for key, val in params.items():
    print(key, val)

docs ../ruwikIR/processed_documents.csv
queries ../ruwikIR/processed_queries.csv
qrels ../ruwikIR/qrels
embeddings /home/mrim/data/embeddings/cc.ru.300.bin
inverted_index ./inverted_index.csv
outmodel ./model.pth
learning_rate {'value': 5, 'power': 0.0001}
epoches 2
batch_size 32
layers [300, 200, 5000]
lambda {'value': 1, 'power': 1e-06}
drop_prob 0.6


In [9]:
from snrm import InvertedIndex
import torch
import random

In [11]:
index_constr = InvertedIndex("./file.txt")
repr_tensor = torch.rand(5, 20)
for i in range(5):
    r = random.randint(0, 19)
    repr_tensor[i][r] = 0.0
    
print(repr_tensor[0]) 
index_constr.construct(range(5), repr_tensor)

tensor([0.6626, 0.7634, 0.3152, 0.9138, 0.0000, 0.9281, 0.1381, 0.1911, 0.8263,
        0.9297, 0.5980, 0.0308, 0.8186, 0.0843, 0.0141, 0.3353, 0.2179, 0.2511,
        0.4956, 0.3442])


In [12]:
index_constr.flush()

In [14]:
index_constr.read_index()

{'0': [[0, 0.6625607013702393],
  [1, 0.8118937611579895],
  [2, 0.17625075578689575],
  [3, 0.8161836862564087],
  [4, 0.22455036640167236]],
 '1': [[0, 0.7633713483810425],
  [1, 0.602530837059021],
  [2, 0.48982852697372437],
  [3, 0.5182974934577942],
  [4, 0.49064409732818604]],
 '2': [[0, 0.3151988387107849],
  [1, 0.9322848320007324],
  [2, 0.8350390791893005],
  [3, 0.9327800869941711],
  [4, 0.5328416228294373]],
 '3': [[0, 0.9137703776359558],
  [2, 0.32753509283065796],
  [3, 0.5071824193000793],
  [4, 0.06532859802246094]],
 '5': [[0, 0.9281238913536072],
  [1, 0.457541823387146],
  [2, 0.11879563331604004],
  [3, 0.9908499717712402],
  [4, 0.7916132807731628]],
 '6': [[0, 0.1381348967552185],
  [1, 0.9111696481704712],
  [2, 0.9475979804992676],
  [3, 0.02024853229522705],
  [4, 0.04052388668060303]],
 '7': [[0, 0.19107496738433838],
  [1, 0.012347698211669922],
  [2, 0.6920667886734009],
  [3, 0.0009909272193908691],
  [4, 0.5567017793655396]],
 '8': [[0, 0.82629883289337

In [1]:
import torch
import torchvision
from torch.utils.tensorboard import SummaryWriter
from torchvision import datasets, transforms

# Writer will output to ./runs/ directory by default
writer = SummaryWriter()

transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
trainset = datasets.MNIST('mnist_train', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True)
model = torchvision.models.resnet50(False)
# Have ResNet model take in grayscale rather than RGB
model.conv1 = torch.nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
images, labels = next(iter(trainloader))

grid = torchvision.utils.make_grid(images)
writer.add_image('images', grid, 0)
writer.add_graph(model, images)
writer.close()

0it [00:00, ?it/s]

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to mnist_train\MNIST\raw\train-images-idx3-ubyte.gz


100%|███████████████████████████████████████████████████████████████████▉| 9904128/9912422 [01:33<00:00, 113001.65it/s]

Extracting mnist_train\MNIST\raw\train-images-idx3-ubyte.gz to mnist_train\MNIST\raw



0it [00:00, ?it/s][A

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to mnist_train\MNIST\raw\train-labels-idx1-ubyte.gz



  0%|                                                                                        | 0/28881 [00:00<?, ?it/s][A
32768it [00:01, 32292.11it/s]                                                                                          [A

0it [00:00, ?it/s][A

Extracting mnist_train\MNIST\raw\train-labels-idx1-ubyte.gz to mnist_train\MNIST\raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to mnist_train\MNIST\raw\t10k-images-idx3-ubyte.gz



  0%|                                                                                      | 0/1648877 [00:00<?, ?it/s][A
  1%|█                                                                     | 24576/1648877 [00:00<00:10, 157724.90it/s][A
  2%|█▋                                                                    | 40960/1648877 [00:00<00:11, 137429.31it/s][A
  3%|██                                                                    | 49152/1648877 [00:00<00:15, 103515.39it/s][A
  4%|███▏                                                                  | 73728/1648877 [00:01<00:14, 106598.00it/s][A
  5%|███▌                                                                   | 81920/1648877 [00:01<00:21, 72422.51it/s][A
  5%|███▉                                                                   | 90112/1648877 [00:01<00:38, 40804.16it/s][A
 10%|███████▎                                                              | 172032/1648877 [00:02<00:28, 51421.14it/s][A
 12%|████████▎ 

 79%|█████████████████████████████████████████████████████▋              | 1302528/1648877 [00:13<00:02, 121861.86it/s][A
 80%|██████████████████████████████████████████████████████▋             | 1327104/1648877 [00:14<00:02, 140128.35it/s][A
9920512it [01:50, 113001.65it/s]                                                                                       [A
 82%|████████████████████████████████████████████████████████▉            | 1359872/1648877 [00:14<00:03, 91620.65it/s][A
 83%|█████████████████████████████████████████████████████████▌           | 1376256/1648877 [00:14<00:03, 80560.03it/s][A
 84%|██████████████████████████████████████████████████████████▎          | 1392640/1648877 [00:14<00:02, 85716.45it/s][A
 85%|██████████████████████████████████████████████████████████▉          | 1409024/1648877 [00:15<00:03, 75940.32it/s][A
 86%|███████████████████████████████████████████████████████████▋         | 1425408/1648877 [00:15<00:02, 76234.06it/s][A
 87%|███████████

Extracting mnist_train\MNIST\raw\t10k-images-idx3-ubyte.gz to mnist_train\MNIST\raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to mnist_train\MNIST\raw\t10k-labels-idx1-ubyte.gz




8192it [00:00, 26481.66it/s]                                                                                           [A[A


Extracting mnist_train\MNIST\raw\t10k-labels-idx1-ubyte.gz to mnist_train\MNIST\raw
Processing...
Done!



1654784it [00:34, 115981.02it/s]                                                                                       [A

In [2]:
from torch.utils.tensorboard import SummaryWriter
import numpy as np

writer = SummaryWriter()

for n_iter in range(100):
    writer.add_scalar('Loss/train', np.random.random(), n_iter)
    writer.add_scalar('Loss/test', np.random.random(), n_iter)
    writer.add_scalar('Accuracy/train', np.random.random(), n_iter)
    writer.add_scalar('Accuracy/test', np.random.random(), n_iter)


In [8]:
x = 4e-2
x

0.04

In [4]:
x = 1e-2

In [5]:
x

0.01

In [14]:
!python train.py --params=params_local.json

Namespace(batch_size=32, dmax_len=10000, docs='../ruwikIR/processed_documents.csv', drop_prob=0.6, embeddings='/home/mrim/data/embeddings/cc.ru.300.bin', epoches=2, inverted_index='./inverted_index.csv', layers=[300, 200, 5000], learning_rate=5e-05, output_file='./model.pth', params='params_local.json', qmax_len=100, qrels='../ruwikIR/qrels', queries='../ruwikIR/processed_queries.csv', reg_lambda=1e-07)



Traceback (most recent call last):
  File "train.py", line 85, in <module>
    run(args)
  File "train.py", line 64, in run
    dmax_len=args.dmax_len,
  File "C:\Users\sonya\Desktop\snrm-bert\snrm\snrm.py", line 48, in __init__
    self.embeddings = Embeddings(fembeddings)
  File "C:\Users\sonya\Desktop\snrm-bert\snrm\snrm.py", line 14, in __init__
    self.model = fasttext.load_model(emb_file)
  File "C:\Users\sonya\Anaconda3\lib\site-packages\fasttext\FastText.py", line 350, in load_model
    return _FastText(model_path=path)
  File "C:\Users\sonya\Anaconda3\lib\site-packages\fasttext\FastText.py", line 43, in __init__
    self.f.loadModel(model_path)
ValueError: /home/mrim/data/embeddings/cc.ru.300.bin cannot be opened for loading!


In [13]:
!ls

"ls" ­Ґ пў«пҐвбп ў­гваҐ­­Ґ© Ё«Ё ў­Ґи­Ґ©
Є®¬ ­¤®©, ЁбЇ®«­пҐ¬®© Їа®Ја ¬¬®© Ё«Ё Ї ЄҐв­л¬ д ©«®¬.
