In [None]:
from tqdm import tqdm

import numpy as np
import numpy.random as random

import math

import torch
import torch.nn as nn
import torch.nn.functional as F

import torch.optim as optim

from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from torch.nn.utils.rnn import pad_sequence

import io

import pickle

In [None]:
def load_embeddings(fname, get_embeddings=True, get_w2i=False, get_i2w=False, skip_first_line=True):
    """Read a text embedding file with one ``<word> <v1> <v2> ...`` entry per line.

    Fields on a line are split on single spaces. The zero-based position of a
    line (counted after the optional skipped first line) is used as the row
    index of its word.

    Args:
        fname: Path of the embedding file. It is decoded as UTF-8 and
            undecodable bytes are ignored.
        get_embeddings: When true, parse the numeric fields of every line.
        get_w2i: When true, fill the word -> row index mapping.
        get_i2w: When true, fill the row index -> word mapping.
        skip_first_line: When true, discard the first line of the file
            (presumably a header line).

    Returns:
        A tuple ``(embeddings, word2idx, idx2word)``. ``embeddings`` is a
        ``torch.FloatTensor`` built from the parsed vectors (empty when
        ``get_embeddings`` is false); each dict is empty unless requested.

    Raises:
        ValueError: If a numeric field cannot be parsed as a float.
    """
    word2idx = {}
    idx2word = {}
    embeddings = []

    # The context manager closes the file even when a malformed line raises.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        if skip_first_line:
            fin.readline()

        for row_idx, line in enumerate(fin):
            fields = line.rstrip().split(' ')
            word = fields[0]

            if get_w2i:
                word2idx[word] = row_idx
            if get_i2w:
                idx2word[row_idx] = word
            if get_embeddings:
                embeddings.append([float(num) for num in fields[1:]])

    return torch.FloatTensor(embeddings), word2idx, idx2word

In [None]:
# Load the token -> index mapping that `encode_seq` uses to turn tokens into ids.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load a `word2idx.pickle` that comes from a trusted source.
with open(r'word2idx.pickle', 'rb') as _if:
    word2idx = pickle.load(_if)

In [None]:
# Alternative to loading the pickle: rebuild word2idx directly from the raw .vec embedding file.
#word2idx = load_embeddings('../embeddings/wiki-news-300d-1M.vec', get_embeddings=False, get_w2i=True)[1]

In [None]:
class MultiplicativeAttention(nn.Module):
    """Dot-product attention scaled by ``1 / sqrt(d)`` with dropout on the weights."""

    def __init__(self, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` over ``k`` and combine ``v`` with the resulting weights.

        Args:
            q, k, v: Query, key and value tensors; the last two dimensions of
                ``q`` and ``k`` are multiplied as ``q @ k^T``.
            mask: Optional tensor; after an extra dimension is inserted at
                position 1, positions where it equals 1 are excluded.

        Returns:
            ``(output, weights)`` where ``weights`` are the post-dropout
            softmax weights and ``output`` is ``weights @ v``.
        """
        # The 1/sqrt(d) factor is applied to the transposed keys before the product.
        scaled_keys = k.transpose(-2, -1) / math.sqrt(q.size(-1))
        scores = torch.matmul(q, scaled_keys)

        if mask is not None:
            # A large negative score makes the softmax weight of masked slots vanish.
            blocked = mask.unsqueeze(1) == 1
            scores = scores.masked_fill(blocked, -1e9)

        weights = self.dropout(F.softmax(scores, dim=-1))
        return torch.matmul(weights, v), weights

class AdditiveSelfAttention(nn.Module):
    """Pools a sequence into a single vector using learned additive attention.

    Each position is scored as ``tanh(W x) . q``; the softmax of the scores
    weights the sum of the input vectors over the sequence dimension.
    """

    def __init__(self, d_model, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        # Not used by forward(); kept so the module's attributes stay the same.
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)

        self.w = nn.Linear(d_model, d_model)
        query_init = torch.FloatTensor(d_model).uniform_(-0.1, 0.1)
        self.q = torch.nn.Parameter(query_init)

    def forward(self, x, mask=None):
        """Return ``(pooled, weights)`` for an input indexed as (i, j, k).

        Args:
            x: Input tensor; the einsum below sums it over its second
                dimension, weighted by the attention weights.
            mask: Optional tensor; positions where it equals 1 get a large
                negative score before the softmax.
        """
        hidden = torch.tanh(self.dropout(self.w(x)))
        scores = torch.matmul(hidden, self.q)

        if mask is not None:
            scores = scores.masked_fill(mask == 1, -1e9)

        weights = self.dropout(F.softmax(scores, dim=-1))
        pooled = torch.einsum('ijk, ij->ik', x, weights)
        return pooled, weights

    
class MultiHeadAttention(nn.Module):
    """Multi-head attention followed by an output projection, dropout and layer norm.

    Inputs are projected to ``num_heads`` heads of width ``d_qk`` (queries and
    keys) and ``d_v`` (values), attended with ``MultiplicativeAttention``, and
    merged back to ``d_model`` features. No residual connection is added.

    When ``track_agreement`` is true, every forward pass adds to
    ``v_agreement`` the sum of dot products between the L2-normalized value
    projections of all head pairs, divided by ``num_heads ** 2``.
    """

    def __init__(self, d_model, num_heads, d_qk, d_v, track_agreement=False, dropout=0.1):
        super().__init__()

        self.d_model = d_model
        self.d_qk = d_qk
        self.d_v = d_v
        self.num_heads = num_heads

        self.dropout = nn.Dropout(dropout)

        # Bias-free projections into the per-head query/key/value spaces.
        self.w_q = nn.Linear(d_model, num_heads * d_qk, bias=False)
        self.w_k = nn.Linear(d_model, num_heads * d_qk, bias=False)
        self.w_v = nn.Linear(d_model, num_heads * d_v, bias=False)

        # Merges the concatenated head outputs back to d_model features.
        self.w_fc = nn.Linear(num_heads * d_v, d_model, bias=False)

        self.attention = MultiplicativeAttention(dropout=dropout)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)

        self.track_agreement = track_agreement
        self.v_agreement = 0

    def forward(self, q, k, v, mask=None):
        """Run multi-head attention.

        Args:
            q, k, v: Tensors of shape (batch, seq, d_model).
            mask: Optional tensor; an extra dimension is inserted at position 1
                before it is handed to the attention module (presumably a
                (batch, seq) mask with 1 at positions to ignore - confirm).

        Returns:
            ``(output, attn)``: the normalized output of shape
            (batch, query seq, d_model) and the attention weights returned by
            the attention module.
        """
        batch_size, seq_size = q.shape[0], q.shape[1]

        def split_heads(projection, source, d_head):
            # (batch, seq, heads * d_head) -> (batch, seq, heads, d_head)
            return projection(source).view(source.shape[0], source.shape[1], self.num_heads, d_head)

        q_heads = split_heads(self.w_q, q, self.d_qk)
        k_heads = split_heads(self.w_k, k, self.d_qk)
        v_heads = split_heads(self.w_v, v, self.d_v)

        if self.track_agreement:
            self.v_agreement += torch.einsum(
                'bshd, bsnd->',
                F.normalize(v_heads, dim=3),
                F.normalize(v_heads, dim=3),
            ) / self.num_heads**2

        # The attention module expects the head dimension before the sequence one.
        attn_args = [q_heads.transpose(1, 2), k_heads.transpose(1, 2), v_heads.transpose(1, 2)]
        if mask is not None:
            attn_args.append(mask.unsqueeze(1))
        context, attn = self.attention(*attn_args)

        # Back to (batch, seq, heads * d_v) before the output projection.
        context = context.transpose(1, 2).contiguous().view(batch_size, seq_size, -1)
        output = self.layer_norm(self.dropout(self.w_fc(context)))

        return output, attn

    def clear_agreement(self):
        """Reset the accumulated value-agreement statistic to zero."""
        self.v_agreement = 0

class NonlinearFF(nn.Module):
    """Two-layer ReLU feed-forward block followed by dropout and layer norm.

    The block maps ``d_in -> d_hid -> d_in`` on the last dimension. No residual
    connection is added.
    """

    def __init__(self, d_in, d_hid, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_in, d_hid)
        self.w_2 = nn.Linear(d_hid, d_in)
        self.layer_norm = nn.LayerNorm(d_in, eps=1e-6)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply expand -> ReLU -> contract, then dropout and layer norm."""
        hidden = F.relu(self.w_1(x))
        projected = self.dropout(self.w_2(hidden))
        return self.layer_norm(projected)
    
class TitleEmbedding(nn.Module):
    """Encodes a batch of token-id sequences (titles) into one vector per title.

    Pipeline: token embedding -> multi-head self-attention -> feed-forward
    block -> additive attention pooling over the sequence -> layer norm.

    Args:
        num_embeddings: Vocabulary size; only used when ``embeddings`` is None.
        d_model: Embedding / model width.
        num_heads: Number of attention heads.
        d_qk: Per-head query/key width.
        d_v: Per-head value width.
        d_hid: Hidden width of the feed-forward block; defaults to ``4 * d_model``.
        embeddings: Optional pretrained weight tensor. When given, the
            embedding table is built from it (trainable) and ``num_embeddings``
            is ignored.
        track_agreement: Forwarded to ``MultiHeadAttention``.
        padding_idx: Token id treated as padding; those positions are masked
            in both attention layers.
        dropout: Dropout probability used by all sub-modules.
    """

    def __init__(self, num_embeddings, d_model, num_heads, d_qk, d_v, d_hid=None, embeddings=None, track_agreement=False, padding_idx=0, dropout=0.1):
        super().__init__()

        if embeddings is not None:
            self.embeddings = nn.Embedding.from_pretrained(embeddings, freeze=False, sparse=True, padding_idx=padding_idx)
        else:
            # Honour the caller's padding_idx here as well (it used to be hard-coded to 0).
            self.embeddings = nn.Embedding(num_embeddings, d_model, sparse=True, padding_idx=padding_idx)

        self.mh_attn = MultiHeadAttention(d_model, num_heads, d_qk, d_v, track_agreement=track_agreement, dropout=dropout)
        self.nff = NonlinearFF(d_model, d_hid if d_hid is not None else d_model * 4, dropout=dropout)
        self.add_attn = AdditiveSelfAttention(d_model, dropout=dropout)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)

        self.padding_idx = padding_idx

    def forward(self, title):
        """Embed ``title``, a tensor of token ids, into one vector per sequence.

        Args:
            title: Integer tensor of token ids, presumably shaped
                (batch, seq) - confirm against callers.

        Returns:
            The layer-normalized pooled representation returned by the
            additive attention layer.
        """
        # 1 marks padding positions, which both attention layers must ignore.
        mask = (title == self.padding_idx).byte()

        q = k = v = self.embeddings(title)
        title, _ = self.mh_attn(q, k, v, mask=mask)

        title = self.nff(title)
        title, _ = self.add_attn(title, mask=mask)

        return self.layer_norm(title)

    def load_embeddings(self, embeddings):
        """Replace the embedding table with trainable pretrained weights.

        Args:
            embeddings: 2-D float tensor of embedding weights, one row per token id.
        """
        # `self` was missing from the signature, so calling this on an instance
        # raised TypeError. padding_idx is passed for consistency with __init__.
        self.embeddings = nn.Embedding.from_pretrained(embeddings, freeze=False, sparse=True, padding_idx=self.padding_idx)

In [None]:
# Device handed to `race`, which moves the token-id tensors onto it; CPU only here.
device = torch.device('cpu')

In [None]:
# Load the saved model onto the CPU and switch it to eval mode (disables dropout).
# NOTE(review): torch.load unpickles the file, so only load a trusted `model.pt`.
# Presumably the file holds a whole pickled TitleEmbedding module, in which case
# the class definitions above must already be in scope - confirm.
title_embedding = torch.load(r'model.pt', map_location='cpu')
title_embedding.eval()

In [None]:
import mwclient

from mwclient.page import Page
from mwclient import Site


import warnings
# Silences every Python warning for the rest of the session.
warnings.filterwarnings('ignore')



# User-Agent string sent with the Wikipedia requests below.
# NOTE(review): it is empty; Wikimedia's User-Agent policy asks API clients to
# identify themselves, so fill this in before running.
ua = ''

# mwclient handle for English Wikipedia, plus the raw API endpoint that
# `get_page_summary` queries.
wiki = Site('en.wikipedia.org', clients_useragent=ua)
wiki_api = 'http://en.wikipedia.org/w/api.php'

# Shared iterator of random pages consumed by `get_random_page`.
# Presumably the arguments mean "namespace 0, limit 1" - confirm against
# mwclient's Site.random signature.
generator = wiki.random(0, 1)

def get_random_page():
    """Return a ``Page`` for the next entry of the shared random-page iterator.

    Returns ``None`` when the module-level ``generator`` yields nothing more.
    """
    entry = next(iter(generator), None)
    if entry is None:
        return None
    return Page(wiki, entry['title'])

#stopwords = set(nltk.corpus.stopwords.words('english'))
# Tokens treated as noise; only referenced by the commented-out nltk tokenizer
# line inside `parse_text`.
meaningless_tokens = [',', '.', '\'' , '', '"', '-', '_', '–', '&', '\'\'', '""']

# Translation table used by `parse_text`: maps ( ) , _ - / to spaces and
# deletes backslashes and double quotes.
trans = str.maketrans('(),_-/', '      ','\\"')

def parse_text(text, stopwords=[]):
    """Split ``text`` into unique whitespace-separated tokens, in first-seen order.

    The module-level ``trans`` table is applied before splitting.

    Args:
        text: String to tokenize.
        stopwords: Accepted for compatibility; not used by this implementation.

    Returns:
        List of distinct tokens in order of first appearance.
    """
    first_seen = {}
    for token in text.translate(trans).split():
        if token != '':
            # setdefault keeps the position of the first occurrence.
            first_seen.setdefault(token, None)

    return list(first_seen)

def encode_seq(seq, tolerate_miss=True):
    """Map tokens to their ``word2idx`` ids.

    Args:
        seq: Iterable of tokens.
        tolerate_miss: When true, tokens missing from ``word2idx`` are
            skipped; when false, the first missing token aborts the encoding.

    Returns:
        A ``torch.LongTensor`` with the ids of the known tokens, or the empty
        list ``[]`` when ``tolerate_miss`` is false and a token is unknown.
    """
    indices = []

    for token in seq:
        if token not in word2idx:
            if tolerate_miss:
                continue
            return []
        indices.append(word2idx[token])

    return torch.LongTensor(indices)

def get_page_summary(id):
    """Fetch the plain-text intro extract of a Wikipedia page and tokenize it.

    Queries the MediaWiki API at ``wiki_api`` for the ``extracts`` property of
    the page, sending ``ua`` as the User-Agent header.

    Args:
        id: Page id of the article.

    Returns:
        The tokens of the page's intro extract, as produced by ``parse_text``.

    Raises:
        KeyError: If the response does not contain an extract for ``id``.
    """
    # This file never imports `requests` at module level, so the original
    # raised NameError here; import it where it is used. requests is a
    # dependency of mwclient, which the file already imports.
    import requests

    query_params = {
                    'action': 'query',
                    'prop': 'extracts',
                    'exintro': '',
                    'explaintext': '',
                    'pageids': id,
                    'format': 'json'
                   }

    headers = {
        'User-Agent': ua
    }

    r = requests.get(wiki_api, params=query_params, headers=headers).json()

    # The module-level `stopwords` set is commented out and `parse_text`
    # ignores the argument anyway, so rely on its default instead of raising
    # NameError on the undefined global.
    return parse_text(r['query']['pages'][str(id)]['extract'])

In [None]:
def race(a, b, title_embedding, device):
    """Greedily walk Wikipedia links from page ``a`` toward page ``b``.

    At every step the unvisited outgoing link whose normalized title
    embedding has the largest dot product with the normalized embedding of
    ``b``'s title is followed. Each hop is printed. The walk stops when the
    current page's name equals ``b.name`` or when no candidate link is left.

    Args:
        a: Start page (an mwclient ``Page``); redirects are resolved first.
        b: Target page; only its ``name`` is used.
        title_embedding: Model mapping a (1, seq) tensor of token ids to a
            title vector.
        device: Device the token-id tensors are moved to before embedding.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    a = a.resolve_redirect()

    print('starting at page `{}`'.format(a.name))
    print('trying to reach page `{}`'.format(b.name))

    goal_seq = encode_seq(parse_text(b.name))

    # encode_seq returns a (possibly empty) tensor here; comparing it with
    # `[]` never detected the empty case, so test the length instead.
    if len(goal_seq) == 0:
        print('error: goal tokens unknown')
        return

    visited_name = set()
    visited_id = set()

    # Inference only: do not build autograd graphs for the embeddings.
    with torch.no_grad():
        goal_emb = F.normalize(title_embedding(goal_seq.unsqueeze(0).to(device)))

        while a.name != b.name:
            best_link = None
            best_sim = -1e9

            for link in a.links(0):
                # Links without a page id can never be selected (presumably
                # pages that do not exist - confirm); skip them before
                # paying for an embedding.
                if link.pageid is None:
                    continue

                if link.name in visited_name or link.pageid in visited_id:
                    continue

                link_seq = encode_seq(parse_text(link.name))

                # No known token in the title: nothing to embed.
                if len(link_seq) == 0:
                    continue

                link_emb = F.normalize(title_embedding(link_seq.unsqueeze(0).to(device)))
                score = torch.einsum('ij, ij->', link_emb, goal_emb).item()

                if score > best_sim:
                    best_sim = score
                    best_link = link

            if best_link is None:
                print('cannot find candidate link')
                break

            print('{} -> {}'.format(a.name, best_link.name))

            # Remember both the link as listed and the page it redirects to,
            # so neither is followed again.
            visited_name.add(best_link.name)
            visited_id.add(best_link.pageid)

            best_link = best_link.resolve_redirect()

            visited_name.add(best_link.name)
            visited_id.add(best_link.pageid)

            a = best_link
            
            


In [None]:
# Draw two random pages: `a` is the start of the walk, `b` the page to reach.
a = get_random_page()
b = get_random_page()

# Show both titles before running the search.
print(a.name)
print(b.name)

In [None]:
# Walk links greedily from `a` toward `b`; every hop is printed.
race(a, b, title_embedding, device)

In [None]:
def sim(text1, text2):
    """Cosine similarity between the title embeddings of two strings.

    Both strings are tokenized with ``parse_text``, encoded with
    ``encode_seq`` and embedded by the module-level ``title_embedding`` model.
    """
    # Encode both inputs first, then embed them, each as a batch of one.
    batches = [encode_seq(parse_text(text)).unsqueeze(0) for text in (text1, text2)]
    first, second = [title_embedding(batch) for batch in batches]

    return F.cosine_similarity(first, second)

def emb(text):
    """Return the title embedding of ``text`` with the batch dimension removed."""
    token_ids = encode_seq(parse_text(text))
    batch = token_ids.unsqueeze(0)
    return title_embedding(batch).squeeze(0)

def score_links(page, target):
    """Print the outgoing links of ``page`` ranked by similarity to ``target``.

    Every link title is embedded with the module-level ``title_embedding``
    model and scored by the dot product of its normalized embedding with the
    normalized embedding of the target title. ``(score, link)`` pairs are
    printed from most to least similar.

    Args:
        page: Page whose links are scored; must provide
            ``links(generator=False)``, presumably yielding title strings -
            confirm against mwclient's Page.links.
        target: Target title, either a string or an object with a ``name``
            attribute.

    Returns:
        None. Results are reported with ``print``.
    """
    # Accept both a plain title and a page-like object.
    target_title = getattr(target, 'name', target)
    target_enc = encode_seq(parse_text(target_title))

    if len(target_enc) == 0:
        print('error: goal tokens unknown')
        return

    scores = {}

    # Inference only: do not build autograd graphs for the embeddings.
    with torch.no_grad():
        # The original referenced an undefined `bemb`; build the target
        # embedding from the `target` argument instead.
        target_emb = F.normalize(title_embedding(target_enc.unsqueeze(0)))

        # Iterate the links of the `page` argument, not of the global `a`.
        for link in page.links(generator=False):
            link_enc = encode_seq(parse_text(link))

            # No known token in the title: nothing to embed.
            if len(link_enc) == 0:
                continue

            link_emb = F.normalize(title_embedding(link_enc.unsqueeze(0)))
            scores[link] = torch.einsum('ij, ij->', target_emb, link_emb).item()

    ranked = sorted(((score, link) for link, score in scores.items()), reverse=True)

    for el in ranked:
        print(el)