# Picking related articles.

In [13]:
import pandas as pd

from scipy.spatial.distance import jensenshannon

import joblib

from IPython.display import HTML, display
from ipywidgets import interact, Layout, HBox, VBox, Box
import ipywidgets as widgets
from IPython.display import clear_output

import matplotlib.pyplot as plt

# Apply matplotlib's "dark_background" style sheet to every figure drawn below.
plt.style.use("dark_background")

In [2]:
# NOTE(review): despite the ".csv" suffix, these three files are read with
# joblib.load, i.e. they are joblib/pickle dumps rather than CSV text.
# joblib.load relies on pickle, which can execute code stored in the file,
# so only load artifacts that come from a trusted source.
# Presumably: the fitted topic model, the vectorized corpus it is applied to,
# and the fitted vectorizer (inferred from how later cells use them).
lda = joblib.load('lda_2016.csv')
data_vectorized = joblib.load('data_vectorized_2016.csv')
vectorizer = joblib.load('vectorizer2016.csv')
# Article table; later cells read its 'title' and 'url' columns.
df = pd.read_csv('../csv/2016_03.csv')

In [4]:
# Output of lda.transform on the vectorized corpus, wrapped in a DataFrame
# (the rendered output below shows one row per document and 50 columns).
# Later cells filter this frame with boolean masks built from df, which
# assumes its rows line up one-to-one with the rows of df — TODO confirm.
doc_topic_dist = pd.DataFrame(lda.transform(data_vectorized))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,0.238411,0.000106,0.000106,0.000106,0.000106,0.000106,0.107994,0.000106,0.000106,0.000106,...,0.000106,0.000106,0.000106,0.000106,0.000106,0.000106,0.000106,0.000106,0.000106,0.000106
1,0.000039,0.000039,0.000039,0.000039,0.000039,0.005645,0.000039,0.000039,0.000039,0.135734,...,0.000039,0.000039,0.000039,0.024726,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039
2,0.000053,0.000053,0.000053,0.000053,0.000053,0.000053,0.000053,0.000053,0.090053,0.000053,...,0.000053,0.000053,0.000053,0.008197,0.000053,0.000053,0.000053,0.000053,0.077419,0.000053
3,0.000042,0.000042,0.000042,0.000042,0.000042,0.000042,0.000042,0.000042,0.009310,0.006844,...,0.000042,0.000042,0.000042,0.000042,0.253911,0.000042,0.000042,0.000042,0.000042,0.000042
4,0.000120,0.122083,0.000120,0.000120,0.000120,0.000120,0.151994,0.000120,0.000120,0.019859,...,0.085292,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.000120,0.084412,0.000120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55044,0.000038,0.000038,0.000038,0.000038,0.000038,0.000038,0.000038,0.355069,0.000038,0.000038,...,0.000038,0.187572,0.000038,0.000038,0.000038,0.000038,0.000038,0.000038,0.000038,0.000038
55045,0.000058,0.000058,0.000058,0.000058,0.000058,0.000058,0.000058,0.023087,0.000058,0.000058,...,0.000058,0.000058,0.000058,0.000058,0.081981,0.000058,0.000058,0.000058,0.000058,0.000058
55046,0.381246,0.000102,0.000102,0.106139,0.000102,0.000102,0.392331,0.000102,0.000102,0.000102,...,0.000102,0.000102,0.000102,0.007279,0.000102,0.000102,0.000102,0.000102,0.000102,0.000102
55047,0.000055,0.000055,0.000055,0.405778,0.000055,0.000055,0.066562,0.000055,0.000055,0.031452,...,0.008498,0.000055,0.000055,0.000055,0.000055,0.000055,0.000055,0.000055,0.248498,0.016294


In [19]:
def print_top_words(model, vectorizer, n_top_words):
    '''
    Print the highest-weighted vocabulary terms of every topic in `model`.

    model: object exposing `components_`, iterated as one weight row per topic.
    vectorizer: object whose `get_feature_names_out()` maps column index -> term.
    n_top_words: number of terms listed per topic, heaviest first.

    Each topic is printed on its own line preceded by a blank line; a final
    blank line is printed after the last topic.
    '''
    vocabulary = vectorizer.get_feature_names_out()
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending: reverse it, then keep the leading n_top_words indices.
        heaviest = weights.argsort()[::-1][:n_top_words]
        top_terms = [vocabulary[term_idx] for term_idx in heaviest]
        print("\nTopic #%d: " % topic_idx + " ".join(top_terms))
    print()

In [21]:
# List the 25 highest-weighted words of every topic of the loaded model.
print_top_words(lda, vectorizer, n_top_words=25)


Topic #0: use like make new not technology one also design camera get work app look system vr you device well car google even video build way

Topic #1: say police court case officer charge judge report law arrest lawyer crime justice attorney accord lawsuit tell prosecutor claim file investigation statement prison trial department

Topic #2: song music album band record space release year new track time not first one video last show make singer take fight like go fan tour

Topic #3: health say drug study patient use medical people dr care research doctor find one year also not disease researcher cancer treatment new may percent would

Topic #4: zika option next section indicate icon previous menu sometimes virus chevron navigation expandable case outbreak mosquito say health spread pregnant disease woman infection microcephaly infect

Topic #5: de brazil la brazilian rio mexico le ford paulo et des janeiro rousseff du sao mexican un sa il paris city est les france en

Topic #6: compa

In [6]:
def get_k_nearest_docs(doc_dist, k=5, get_dist=False):
    '''
    Find the k articles whose topic distribution is closest to `doc_dist`.

    doc_dist: topic distribution (sums to 1) of one article
    k: how many neighbours to return
    get_dist: when True, also return the distances themselves

    Closeness is the value of scipy's `jensenshannon` (the Jensen–Shannon
    distance, i.e. the square root of the divergence) between `doc_dist` and
    each row of the module-level `doc_topic_dist`. Rows at a distance of
    exactly 0 are skipped, which drops the query article itself when it is
    one of the rows.

    Returns the index labels of the k nearest rows, nearest first. With
    get_dist=True returns a tuple (index labels, Series of their distances).
    '''
    js_to_query = doc_topic_dist.apply(lambda row: jensenshannon(row, doc_dist), axis=1)

    # Filter and rank once; both return shapes are views of this one result.
    nearest = js_to_query[js_to_query != 0].nsmallest(n=k)

    if not get_dist:
        return nearest.index
    return nearest.index, nearest

In [7]:
def plot_article_dna(title, width=20):
    '''
    Draw the topic distribution of the article(s) titled `title` as a bar chart.

    title: exact value looked up in df['title']; an IndexError is raised when
           no row matches.
    width: figure width passed to figsize (the height is fixed at 4).
    '''
    is_selected = df['title'] == title
    chart_title = df[is_selected].title.values[0]
    # Transpose so that topics run along the x-axis, one bar per topic.
    axes = doc_topic_dist[is_selected].T.plot(
        kind='bar', legend=None, title=chart_title, figsize=(width, 4)
    )
    axes.set_xlabel('Topic')

def compare_dnas(title, recommendation_title, width=20):
    '''
    Plot the topic distributions of a selected article and of a recommended
    article as grouped bars in one chart.

    title: title of the selected article (looked up in df['title']).
    recommendation_title: title of the recommended article; also used as the
                          chart title.
    width: figure width passed to figsize (the height is fixed at 4).
    '''
    chart_title = df[df['title'] == recommendation_title].title.values[0]
    selection = doc_topic_dist[df['title'] == title]
    recommended = doc_topic_dist[df['title'] == recommendation_title]

    # The y-axis is scaled from the selected article only: its largest topic
    # weight plus 25% headroom.
    y_top = selection.max(axis=1).values[0] * 1.25

    both = pd.concat([selection, recommended])
    axes = both.T.plot(kind='bar', title=chart_title, figsize=(width, 4), ylim=[0, y_top])
    axes.set_xlabel('Topic')
    axes.legend(['Selection', 'Recommendation'])

def dna_tabs(titles):
    '''
    Show a tab widget with one tab per article, each holding that article's
    topic bar chart.

    titles: sequence of article titles; each one is passed to plot_article_dna.
            Tabs are labelled 'Paper 1', 'Paper 2', ... in the same order.
    '''
    titles = list(titles)
    outs = [widgets.Output() for _ in titles]

    tab = widgets.Tab(children=outs)
    for i in range(len(outs)):
        tab.set_title(i, 'Paper ' + str(i + 1))
    display(tab)

    for out, title in zip(outs, titles):
        with out:
            plot_article_dna(title)
            # plot_article_dna returns None, and pyplot.show takes no
            # figure/axes argument (its `block` parameter is keyword-only on
            # the standard backends), so call it bare to render the current
            # figure into this Output widget.
            plt.show()

def compare_tabs(title, recommendation_ids):
    '''
    Show a tab widget with one tab per recommendation, each comparing the
    selected article's topic distribution with that recommendation's.

    title: title of the selected article.
    recommendation_ids: sequence of recommended article *titles* (despite the
                        name); each one is passed to compare_dnas as its
                        recommendation_title. Tabs are labelled 'Paper 1',
                        'Paper 2', ... in the same order.
    '''
    recommendation_ids = list(recommendation_ids)
    outs = [widgets.Output() for _ in recommendation_ids]

    tab = widgets.Tab(children=outs)
    for i in range(len(outs)):
        tab.set_title(i, 'Paper ' + str(i + 1))
    display(tab)

    for out, recommendation_title in zip(outs, recommendation_ids):
        with out:
            compare_dnas(title, recommendation_title)
            # compare_dnas returns None, and pyplot.show takes no figure/axes
            # argument (its `block` parameter is keyword-only on the standard
            # backends), so call it bare to render the current figure into
            # this Output widget.
            plt.show()



In [8]:
def recommendation(title, k=5, plot_dna=False):
    '''
    Display the k articles that are closest (topic-wise) to the article `title`.

    The matching row(s) of df are displayed first, followed by an HTML list of
    links to the k nearest articles, each with a score computed as
    1 - distance (distance as returned by get_k_nearest_docs). Nothing is
    returned; all results are rendered through display().

    title: exact title of the query article. If several rows share the title,
           the topic distribution of the first one is used.
    k: number of recommendations to list.
    plot_dna: when True, additionally show compare_tabs for the recommendations.

    Raises IndexError when no article has this title.
    '''
    from html import escape  # function-scope import: only the link rendering needs it

    matches = df['title'] == title
    if not matches.any():
        # Same exception type as the out-of-bounds .iloc[0] lookup would raise,
        # but with a message that names the actual problem.
        raise IndexError('No article titled ' + repr(title))
    display(df[matches])

    recommended, dist = get_k_nearest_docs(doc_topic_dist[matches].iloc[0], k, get_dist=True)
    recommended = df.iloc[recommended].copy()
    recommended['similarity'] = 1 - dist

    links = []
    for url, name, similarity in recommended[['url', 'title', 'similarity']].values:
        # Titles and URLs come from the article data, so escape them before
        # embedding them in markup.
        label = escape(str(name))
        if pd.isna(url):
            # No URL for this row: show the title as plain text instead of failing.
            entry = label
        else:
            entry = '<a href="' + escape(str(url), quote=True) + '" target="_blank">' + label + '</a>'
        links.append(entry + ' (Similarity: ' + "{:.2f}".format(similarity) + ')')
    display(HTML('<br/>'.join(links)))

    if plot_dna:
        compare_tabs(title, recommended['title'].values)

In [18]:
# Example: list the 10 nearest articles to one article and show the
# per-recommendation topic comparison tabs.
recommendation("Health Watch: Back Pain Overview", k=10, plot_dna=True)

Unnamed: 0,year,date,month,day,author,title,article,url,section,publication
7106,2016,2016-08-03,8.0,3,,Health Watch: Back Pain Overview,graphiq i d 1zj8ypuuhbz graphiq i d 86h0dyo1d8...,http://www.reuters.com/article/us-healthwatch-...,Journalists,Reuters


Tab(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output(), Output(), Output(), Output…

In [10]:
def relevant_articles(tasks, k=5):
    '''
    For each free-text query, print it and display links to the k nearest articles.

    tasks: a single string, or a list of strings (one query each). Every query
           is run through vectorizer.transform and lda.transform to obtain its
           topic distribution.
    k: number of articles listed per query.

    Each listed article carries a score computed as 1 - distance (distance as
    returned by get_k_nearest_docs). Nothing is returned.
    '''
    from html import escape  # function-scope import: only the link rendering needs it

    tasks = [tasks] if isinstance(tasks, str) else tasks

    tasks_vectorized = vectorizer.transform(tasks)
    tasks_topic_dist = pd.DataFrame(lda.transform(tasks_vectorized))

    for index, bullet in enumerate(tasks):
        print(bullet)
        recommended, dist = get_k_nearest_docs(tasks_topic_dist.iloc[index], k, get_dist=True)
        recommended = df.iloc[recommended].copy()
        recommended['similarity'] = 1 - dist

        links = []
        for url, name, similarity in recommended[['url', 'title', 'similarity']].values:
            # Titles and URLs come from the article data, so escape them
            # before embedding them in markup.
            label = escape(str(name))
            if pd.isna(url):
                # No URL for this row: show the title as plain text instead of failing.
                entry = label
            else:
                entry = '<a href="' + escape(str(url), quote=True) + '" target="_blank">' + label + '</a>'
            links.append(entry + ' (Similarity: ' + "{:.2f}".format(similarity) + ')')
        display(HTML('<br/>'.join(links)))

In [24]:
def relevant_articles_for_text():
    '''
    Show a small search form: a slider for the number of articles, a text
    area for the query and a Search button.

    Clicking Search passes the text and the slider value to relevant_articles.
    The results are rendered into a dedicated Output widget below the form,
    which is cleared before each new search. Capturing callback output in an
    Output widget keeps it in the cell on every frontend, and the form itself
    is built and displayed only once instead of being rebuilt on each click.
    '''
    textW = widgets.Textarea(
        value='',
        placeholder='Type something',
        description='',
        disabled=False,
        layout=Layout(width='90%', height='200px')
    )

    kWidget = widgets.IntSlider(value=10, description='Number of articles', max=50, min=1, layout=Layout(width='50%'))

    button = widgets.Button(description="Search")

    # Destination for everything relevant_articles prints or displays.
    results = widgets.Output()

    form = VBox([HBox([kWidget], layout=Layout(width='90%', justify_content='space-around')),
        textW, button], layout=Layout(align_items='center'))
    display(form, results)

    def on_button_clicked(b):
        with results:
            # Inside the Output context this clears only the previous results,
            # leaving the form in place.
            clear_output()
            relevant_articles(textW.value, kWidget.value)

    button.on_click(on_button_clicked)

In [25]:
# Launch the free-text search form (slider, text area, Search button).
relevant_articles_for_text()

VBox(children=(HBox(children=(IntSlider(value=10, description='Number of articles', layout=Layout(width='50%')…

hi
