In [None]:
# download the pretrained model and extract
!wget https://nlp.h-its.org/bpemb/fi/fi.wiki.bpe.vs200000.d300.w2v.txt.tar.gz
!tar xvzf fi.wiki.bpe.vs200000.d300.w2v.txt.tar.gz


# This is the pretrained model from facebook research fasttext, damn slow, huge but bad performance
# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fi.300.vec.gz
# !gunzip cc.fi.300.vec.gz

In [1]:
import os
import re
import random
import requests
import pickle
import multiprocessing
import pandas as pd
import numpy as np

from sklearn.manifold import TSNE

from gensim import corpora, downloader
from gensim.utils import simple_preprocess, tokenize
from gensim.models import KeyedVectors, Word2Vec
from gensim.test.utils import datapath
from gensim.models.fasttext import FastText

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import warnings

import matplotlib.pyplot as plt
import matplotlib.cm as cm

%matplotlib inline


warnings.filterwarnings('ignore')


In [2]:
df = pd.read_json('data/topics.json')

In [3]:
df.head()

Unnamed: 0,keywords,topic_id
0,"[[audi, 0.0081285564], [keuhkosyövän, 0.005977...",0
1,"[[sopii, 0.0092158103], [ford, 0.008776377], [...",1
2,"[[jorma, 0.0219070092], [marja, 0.0090980381],...",2
3,"[[duodecim, 0.010550919], [hoito, 0.0075704153...",3
4,"[[kilpirauhasen, 0.051186423700000004], [vajaa...",4


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 2 columns):
keywords    30 non-null object
topic_id    30 non-null int64
dtypes: int64(1), object(1)
memory usage: 560.0+ bytes


In [5]:
# what else we can do with this?
# First, transfer words to id

# create pairs of keyword - topic


In [6]:
%%time
# load pretrained w2v, this would take mins on google collab
gensim_embeds = KeyedVectors.load_word2vec_format('data/fi/fi.wiki.bpe.vs200000.d300.w2v.txt', binary=False)
pretrained_embeds = gensim_embeds.vectors
# To convert words in the input tweet to indices of the embeddings matrix:
word_to_idx = {word: i for i, word in enumerate(gensim_embeds.vocab.keys())}

CPU times: user 1min 25s, sys: 576 ms, total: 1min 26s
Wall time: 1min 25s


In [7]:
BATCH_SIZE = 10
EMBEDDING_SIZE = 300  # Dimension of the embedding vector
HIDDEN_SIZE = 300
N_CLASSES = len(df)

LR = 0.001
N_EPOCH = 30
REPORT_EVERY=1

# check if GPU available
USE_CUDA = torch.cuda.is_available()

WORKERS = multiprocessing.cpu_count()

device = torch.device("cuda" if USE_CUDA else "cpu")
torch.set_num_threads(WORKERS)
np.random.seed(12)

print(device, WORKERS)


cpu 8


In [8]:
#--- model ---

class FFNN(nn.Module):
    # Note that pretrained_embeds is a numpy matrix of shape (num_embeddings, embedding_dim)
    def __init__(self, pretrained_embeds, n_classes, n_hidden):
        super(FFNN, self).__init__()
        # get the dimension of the embedding
        embed_dim = pretrained_embeds.shape[1]
        
        self.embed = nn.Embedding.from_pretrained(torch.FloatTensor(pretrained_embeds))
        print(self.embed)
        self.fc1 = nn.Linear(embed_dim, n_hidden) 
        self.fc2 = nn.Linear(n_hidden, n_classes)

    def forward(self, x):
        embeds = self.embed(x)
        # print('embed: ', embeds.shape)
        h1 = F.relu(self.fc1(embeds))
        
        # print('h1', h1.shape)
        out = self.fc2(h1)

        return F.log_softmax(out, dim=1)

In [9]:
def make_label(topics):
    return torch.FloatTensor(topics)


def make_trainset(df, indices, w2v):
    """
    For each word in trainset, find similar words by w2v model and assign the
    label as the test
    """
    trainset = []
    testset = []
    count_missing = 0
    
    for index, row in df.iterrows():
        topic = row['topic_id']
        for word in row['keywords']:
            if word[0] not in indices:
                count_missing += 1
                continue
                
            word_idx = torch.tensor([indices[word[0]]], dtype=torch.long) 
            target = torch.tensor([topic], dtype=torch.long)
            
            similars = w2v.similar_by_word(word[0], topn=10)
            
            testset += [[torch.tensor([indices[s[0]]], dtype=torch.long), target] for s in similars]
            
            trainset.append([word_idx, target])
            
            
            
    print('Words missing from the w2v model:', count_missing)
    
    return trainset, testset

    

In [10]:
%%time
trainset, testset = make_trainset(df, word_to_idx, gensim_embeds)

Words missing from the w2v model: 611
CPU times: user 1min 58s, sys: 576 ms, total: 1min 58s
Wall time: 15.7 s


In [11]:
len(trainset), len(testset)

(889, 8890)

**As we can see above, the word missing in pretrained model is 611, which is a lot. The reason might be the words in corpus is not in base form, which require lemmatizing. However we don't have a good lemma tool for Finnish yet.**

In [12]:
#--- set up ---
model = FFNN(pretrained_embeds, N_CLASSES, HIDDEN_SIZE).to(device)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=LR)

Embedding(200000, 300)


In [13]:
#--- training ---
for epoch in range(N_EPOCH):
    total_loss = 0
    # correct = 0
    for i in trainset:  
        gold_class = i[1].to(device)
        # print(gold_class.shape)

        word_indice = i[0].to(device)
        # print(word_indice.shape)
        
        log_probs = model(word_indice)
        
        # print('log probs:', log_probs.shape)
        
        loss = loss_function(log_probs, gold_class)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        _, predicted = torch.max(log_probs, 1)

    if ((epoch+1) % REPORT_EVERY) == 0:
        print('epoch: %d, loss: %.4f' % (epoch, total_loss*100/len(trainset)))

epoch: 0, loss: 339.8579
epoch: 1, loss: 338.0556
epoch: 2, loss: 336.3088
epoch: 3, loss: 334.5860
epoch: 4, loss: 332.8649
epoch: 5, loss: 331.1235
epoch: 6, loss: 329.3443
epoch: 7, loss: 327.5109
epoch: 8, loss: 325.6047
epoch: 9, loss: 323.6154
epoch: 10, loss: 321.5284
epoch: 11, loss: 319.3354
epoch: 12, loss: 317.0239
epoch: 13, loss: 314.5836
epoch: 14, loss: 312.0142
epoch: 15, loss: 309.3100
epoch: 16, loss: 306.4664
epoch: 17, loss: 303.4831
epoch: 18, loss: 300.3673
epoch: 19, loss: 297.1202
epoch: 20, loss: 293.7449
epoch: 21, loss: 290.2544
epoch: 22, loss: 286.6546
epoch: 23, loss: 282.9548
epoch: 24, loss: 279.1705
epoch: 25, loss: 275.3135
epoch: 26, loss: 271.3952
epoch: 27, loss: 267.4281
epoch: 28, loss: 263.4205
epoch: 29, loss: 259.3848


In [14]:
#--- test ---
def test_model(testset, n_topics=5):
    """
    This test is the correct counted for any match in top n_topics
    The idea is, if we try to match exact the target, the chance is really low
    Because of the keywords number is small + topics is over lapping.
    The reasonable test is check if predicted value in the top n of target
    """
    correct = 0
    with torch.no_grad():
        for i in testset:
            gold_class = i[1].to(device)

            word_indices = i[0].to(device)

            log_probs = model(word_indices)
            # print(log_probs)
            _, predicted = log_probs.topk(n_topics, 1)
            result = predicted.squeeze(0).data.cpu().numpy()
            target = gold_class.data.cpu().numpy()[0]
            # print(result, target)

            correct += 1 if target in result else 0

    print('test accuracy: %.2f' % (100.0 * correct / len(testset) ))


In [15]:
test_model(testset, n_topics=5)

test accuracy: 54.16


In [16]:
def predict_topics(keyword, model, indices, n_topics=5):
    word_idx = torch.tensor([indices[keyword]], dtype=torch.long)
    
    log_probs = model
    log_probs = model(word_idx)
    # print(log_probs)
    _, predicted = log_probs.topk(n_topics, 1)
    result = predicted.squeeze(0).data.cpu().numpy()
    
    return result

In [17]:
topics_pred = predict_topics('omena', model, word_to_idx)

In [18]:
print(topics_pred)

[ 1  2 29 23  9]


## Horray, we have been working on a long path, all the materials are ready to cook some food.

In [19]:
# First we need to get the doc_topics from previos LDA training

df_doc_topics = pd.read_csv('data/doc_topics.csv')

In [20]:
df_doc_topics.head()

Unnamed: 0,uuid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_20,topic_21,topic_22,topic_23,topic_24,topic_25,topic_26,topic_27,topic_28,topic_29
0,QXJ0aWNsZTpjZDUwZDI5Mi1jYjk5LTRlZmYtYWVmNC0yZm...,0.042724,0.02363,7.5e-05,0.000251,6.5e-05,9.3e-05,0.000148,0.039949,0.000148,...,0.000262,0.033658,8.4e-05,0.034273,0.000102,0.000103,0.00325,0.443131,0.00551,0.084713
1,QXJ0aWNsZTo5OTQ0MjNkYy04ZTljLTQxZTItOGQyNy0xMT...,6.8e-05,7.2e-05,6.9e-05,0.000231,6e-05,0.005728,0.000136,0.000119,0.000136,...,0.000241,0.029914,7.8e-05,0.039052,9.4e-05,9.4e-05,0.000113,0.827827,7.2e-05,0.020712
2,QXJ0aWNsZTplNGZiM2EwOS01MTI2LTRkYTctYjEzOS1jYz...,9.3e-05,9.9e-05,9.5e-05,0.000316,8.2e-05,0.000117,0.000186,0.000163,0.207305,...,0.00033,0.04623,0.000106,0.00012,0.000129,0.000129,0.000154,0.463375,9.9e-05,0.27336
3,QXJ0aWNsZTphOTQyMzU3YS01OGRkLTQ2ZTQtOGIyNC1hZW...,8.3e-05,8.8e-05,8.4e-05,0.000281,7.3e-05,0.000104,0.000165,0.000144,0.000165,...,0.000293,0.000103,9.4e-05,0.031181,0.000114,0.003296,0.000137,0.614437,8.8e-05,0.092755
4,QXJ0aWNsZTpkZDllOTM3ZS1iZWE3LTQxMDEtODAwNS00MT...,6.1e-05,0.013003,6.2e-05,0.000208,5.4e-05,7.7e-05,0.000122,0.000107,0.02073,...,0.000217,0.111349,7e-05,0.007153,8.5e-05,8.5e-05,0.000101,0.793116,6.5e-05,0.00014


In [21]:
# now we the the list of topics from the query
# could use them to find the top N documents relevant

def query_article_by_keyword(keyword, df_docs, model, indices, n_topics=5, n_docs=10):
    # get the n_topics by keywords:
    topics = predict_topics(keyword, model, indices, n_topics)
    # get n_docs for each topics
    list_doc = []
    
    for topic in topics:
        docs = df_docs.sort_values('topic_' + str(topic), ascending=False).head(n_docs)
        list_doc += docs['uuid'].tolist()
        
    return list(set(list_doc))

In [22]:
# First get the article ids from the query
article_ids = query_article_by_keyword("omena", df_doc_topics, model, word_to_idx)

In [23]:
# niceee, now let get the article content from this
df_article = pd.read_json('data/dataset.json')

In [24]:
df_article.head()

Unnamed: 0,body,categories,description,id,mainCategory,published,subCategory,title,type,venue
0,Vuonna 1978 valmistuneen mökin lautaverhoilu o...,[Mökit],Suvin ja Kallen mökki on niin lähellä perheen ...,QXJ0aWNsZTpjZDUwZDI5Mi1jYjk5LTRlZmYtYWVmNC0yZm...,Meidän Mökki,1550043116000,Sisustus,Makkaranpaistoa tulipadassa ja lämpimänä höyry...,Article,meillakotona
1,Yrittäjäpariskunta Anu ja Heikki asuvat Rymätt...,"[Keittiö, Keittiöremontti]","Anu ja Heikki halusivat laadukkaan keittiön, j...",QXJ0aWNsZTo5OTQ0MjNkYy04ZTljLTQxZTItOGQyNy0xMT...,Avotakka,1549969080000,Sisustus,Hirsitalon keittiö remontoitiin tähän päivään ...,Article,meillakotona
2,Sirpan koti sijaitsee vuonna 1907 rakennetussa...,"[Kodit, Skandinaavinen sisustus]","Kun arvostaa elämän pieniä nautintoja, taidett...",QXJ0aWNsZTplNGZiM2EwOS01MTI2LTRkYTctYjEzOS1jYz...,Avotakka,1549960923000,Sisustus,Jugendtalon rappukäytävä lumosi Sirpan – koti ...,Article,meillakotona
3,Moderni hirsitalo\r\nAsukkaat: sisustussuunnit...,"[Rakentaminen, Hirsitalo]","Moderni hirsitalo, entinen perhekoti ja muutto...",QXJ0aWNsZTphOTQyMzU3YS01OGRkLTQ2ZTQtOGIyNC1hZW...,Meidän Talo,1549959061000,Rakenna ja remontoi,Haaveena oma talo? Lue kolme erilaista tarinaa...,Article,meillakotona
4,"Saarekkeen ääressä on kiva juoda aamukahvit, n...","[Kodit, Ennen ja jälkeen, Keittiöremontti, Rem...",Kun Satu ja Kyösti Melametsä rakennuttivat per...,QXJ0aWNsZTpkZDllOTM3ZS1iZWE3LTQxMDEtODAwNS00MT...,Meidän Talo,1549956919000,Rakenna ja remontoi,Keittiöremontti käynnisti ketjureaktion! Teini...,Article,meillakotona


### Finally we made it, now we got the articles based on keyword query, whatever keywords are! (not quite sure)

In [25]:
df_article[df_article['id'].isin(article_ids)]

Unnamed: 0,body,categories,description,id,mainCategory,published,subCategory,title,type,venue
71,"Muodista, kauneudesta ja matkoista bloggaava H...",[],Moni meistä haluaisi raivata kalenteriinsa lis...,QXJ0aWNsZTpiOTcyZmI2NS05NjRlLTQ1NTYtODVlNS05MG...,,1547640000000,Ihmiset,Äänikirjat koukuttivat Lilou’s Crush -blogin H...,Article,meillakotona
195,Joulukuusissa on nykyään paljon valinnanvaraa....,"[Joulu, Joulukukat]",Joulukuusi kotiin kannettuna ja juhlavaksi kor...,QXJ0aWNsZTplZGM2MzhjZS1kNGJlLTRhMTctOGQ3MS00ND...,Viherpiha,1543988100000,Piha ja puutarha,Joulukuusi tekee juhlan! Perinteinen metsäkuus...,Article,meillakotona
330,"1. Kotimaista pellavaa\r\n\nLastu-pyyhe, 16,90...","[Design, Sisustusideat]",Joulun varmin vinkki on antaa lahjaksi kestävä...,QXJ0aWNsZTo1ZTJhN2NmZC1jZGI4LTQzODMtOTkzZS1lNT...,Avotakka,1541071440000,Sisustus,Sisustusarkkitehdin 6 valintaa: joululahjavink...,Article,meillakotona
503,"Kun illat viilenevät ja hämärtyvät, tekee miel...","[Remontointi, Rakentaminen]","Haaveiletko pienestä tulisijasta, joka toisi k...",QXJ0aWNsZTpmOWEzMDhjNi1iZTkzLTRmNTktYmU1Yy04Mz...,,1536829200000,Rakenna ja remontoi,Romanttiset puuhellat ja pienet takat – luo tu...,Article,meillakotona
558,Uuden Hento lumisade -mukin kuvitus pohjautuu ...,[Keittiö],Arabia julkisti uuden muumimukin. Seesteisen t...,QXJ0aWNsZTozMjM1NGM1Ny1kNjY0LTRhYTAtYWZhNi05Nz...,Meillä kotona,1535536731000,Sisustus,Uunituoreessa muumimukissa Muumipeikko ihailee...,Article,meillakotona
575,Miten päädyit veistämään kihlasormuksen tyttöy...,[Askartelu],Santeri oli miettinyt kosintaa pitkin kevättä ...,QXJ0aWNsZTo4MTU2Y2ZlZi0zNmY1LTQyMDMtYmYyMy1hNj...,Meillä kotona,1535020361000,Ihmiset,"Santeri halusi kosia, mutta sormus puuttui – v...",Article,meillakotona
669,Tuplakurkku yllätti voileipäpöydässä\nKaksi vi...,"[Puutarhan kasvit, Kasvimaa]","Miltä porkkanan, kurkun tai tomaatin kuuluu nä...",QXJ0aWNsZTphNjYyMzMyOS1lZWI3LTRhMmItOWY2Yy1hOW...,Meillä kotona,1532334348000,Piha ja puutarha,"Nehän halaavat! Rakastuneet porkkanat, peukutt...",Article,meillakotona
688,Porin asuntomessujen vilinässä valkoisten ja k...,"[Asuntomessut, Kodit, Asuntomessut 2018]","Toni Goltz uskoo, että edulliset minitalot tek...",QXJ0aWNsZToxZGZlMWMyOC05MWRkLTQ3NzItYTMzZS1kOT...,Meillä kotona,1531812180000,Rakenna ja remontoi,"Talo ilman muovia, myrkkyjä ja kulmia – tehora...",Article,meillakotona
717,Mansikoiden pakastaminen viipaleina\nPienen pa...,"[Sadonkorjuu, Marjat, Säilöntä, Ruoanvalmistus]",Kuumin mansikka-aika on käsillä juuri nyt. On ...,QXJ0aWNsZTo0NjRhODMyNC1jMjgyLTQ4YjMtYjQyNy03ND...,Maku,1530875534000,Ruoka,"Viipaloi, sokeroi ja säilö rasiaan – näin onni...",Article,meillakotona
760,"Kohde 26: Talo Haltiatar\r\n\nHuonejako, kerro...","[Asuntomessut, Asuntomessut 2018]",Asuntomessujen kohteissa 26–33 on huomioitu mu...,QXJ0aWNsZTphYWJhMjQxYy1iYThiLTRkMjAtOGRlMi1kNW...,Meillä kotona,1529566500000,Rakenna ja remontoi,Porin asuntomessut: kohteet 26–33,Article,meillakotona
