# Picking related articles.

In [13]:
import html
import sys

import ipywidgets as widgets
import joblib
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import HTML, display
from IPython.display import clear_output
from ipywidgets import interact, Layout, HBox, VBox, Box
from scipy.spatial.distance import jensenshannon

# Extend the import search path so the project-local `load` module can be
# imported (presumably it lives in ../datacleaning -- confirm).
sys.path.append("../datacleaning")
from load import load_years

# Apply matplotlib's "dark_background" style sheet to all figures drawn below.
plt.style.use("dark_background")

In [14]:
# Years to work with, read from ../years.txt by the project helper.
years: list[int] = load_years("../years.txt")

In [15]:
def load_all_files(path: str, csv_path: str, years: list[int]):
    """Read every per-year artefact from disk and group them by year.

    For each entry of ``years`` the ``lda_``, ``data_vectorized_``,
    ``vectorizer_`` and ``doc_topic_dist_`` files under ``path`` are read with
    ``joblib.load`` and the ``{year}_03.csv`` table under ``csv_path`` is read
    with ``pd.read_csv``. One progress line is printed per year.

    NOTE(review): the joblib files carry a ``.csv`` suffix although they are
    not read as CSV. joblib deserialises via pickle, so only load files from a
    trusted source.

    Returns:
        Five dicts keyed by year, in this order:
        ``lda, data_vectorized, vectorizer, df, doc_topic_dist``.
    """
    lda, data_vectorized, vectorizer, df, doc_topic_dist = {}, {}, {}, {}, {}

    for year in years:
        lda[year] = joblib.load(f'{path}/lda_{year}.csv')
        data_vectorized[year] = joblib.load(f'{path}/data_vectorized_{year}.csv')
        vectorizer[year] = joblib.load(f'{path}/vectorizer_{year}.csv')
        df[year] = pd.read_csv(f"{csv_path}/{year}_03.csv")
        doc_topic_dist[year] = joblib.load(f"{path}/doc_topic_dist_{year}.csv")
        print(f"{year} done.")

    return lda, data_vectorized, vectorizer, df, doc_topic_dist

In [16]:
# Load all per-year artefacts into dicts keyed by year. Note that `df` is a
# dict of DataFrames here, not a single frame. The trailing slash in "./csv/"
# produces a double slash in the paths built by load_all_files.
lda, data_vectorized, vectorizer, df, doc_topic_dist = load_all_files("./csv/", "../csv", years)

2016 done.
2017 done.
2018 done.
2019 done.
2020 done.


In [17]:
def print_top_words(model, vectorizer, n_top_words, n_topics=3):
    """Print the highest-weighted terms of the leading topics of a topic model.

    model: object exposing ``components_`` (one row of term weights per topic)
    vectorizer: object exposing ``get_feature_names_out()`` (term per column)
    n_top_words: how many terms to list for each topic
    n_topics: how many topics (taken from the start) to print

    Writes one "Topic <id>: ..." line per topic followed by a blank line.
    """
    vocabulary = vectorizer.get_feature_names_out()

    for topic_id, weights in enumerate(model.components_[:n_topics]):
        # argsort is ascending, so reverse it before taking the leading terms.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_terms = " ".join(vocabulary[i] for i in ranked)
        print(f"Topic {topic_id}: " + top_terms)

    print()

In [18]:
# Print the 25 top words of the first 5 topics for every loaded year.
# The loop variable `year` stays bound at module level after the loop (holding
# the last entry of `years`); get_k_nearest_docs reads a module-level `year`.
for year in years:
    print_top_words(lda[year], vectorizer[year], n_top_words=25, n_topics=5)

Topic 0: use like make new not technology one also design camera get work app look system vr you device well car google even video build way
Topic 1: say police court case officer charge judge report law arrest lawyer crime justice attorney accord lawsuit tell prosecutor claim file investigation statement prison trial department
Topic 2: song music album band record space release year new track time not first one video last show make singer take fight like go fan tour
Topic 3: health say drug study patient use medical people dr care research doctor find one year also not disease researcher cancer treatment new may percent would
Topic 4: zika option next section indicate icon previous menu sometimes virus chevron navigation expandable case outbreak mosquito say health spread pregnant disease woman infection microcephaly infect

Topic 0: like not make get one go do good music sound time say come know play well you thing he new back re la way song
Topic 1: percent price stock say high oil

In [19]:
def get_k_nearest_docs(doc_dist, k=5, get_dist=False, year=None):
    '''
    Find the k articles whose topic distribution is closest to ``doc_dist``.

    doc_dist: topic distribution (sums to 1) of one article
    k: number of neighbours to return
    get_dist: when True, also return the distances of those neighbours
    year: key into the module-level ``doc_topic_dist`` selecting the corpus to
          search. When omitted, the module-level variable ``year`` is used
          (legacy behaviour); that variable is whatever the last executed
          ``for year in ...`` loop left behind, so pass ``year`` explicitly
          whenever a specific corpus is intended.

    Closeness is measured with ``scipy.spatial.distance.jensenshannon``, which
    returns the Jensen–Shannon distance (the square root of the divergence).
    Rows at distance exactly 0 are dropped; this removes the query article
    itself when it is part of the corpus, and also any exact duplicates.

    Returns the index labels of the k nearest articles, or a tuple
    ``(index labels, distances)`` when ``get_dist`` is True.
    '''
    if year is None:
        try:
            year = globals()["year"]
        except KeyError:
            raise NameError(
                "name 'year' is not defined; pass year=... explicitly"
            ) from None

    corpus = doc_topic_dist[year]

    distances = corpus.apply(lambda row: jensenshannon(row, doc_dist), axis=1)
    # Filter and rank once; both return values are derived from this Series.
    nearest = distances[distances != 0].nsmallest(n=k)

    if get_dist:
        return nearest.index, nearest
    return nearest.index

In [20]:
def compare_dnas(title, year, recommendation_title, width=20):
    """Draw a bar chart comparing the topic distributions of two articles.

    title: title of the selected article (looked up in ``df[year]``)
    year: key into the module-level ``df`` and ``doc_topic_dist`` dicts
    recommendation_title: title of the article to compare against; also used
        as the chart title
    width: figure width passed to ``figsize`` (the height is fixed at 4)

    The y axis runs from 0 to 1.25 times the selected article's largest topic
    weight. NOTE(review): bars of the recommendation that exceed this limit
    are cut off -- confirm this is the intended scale.

    Raises IndexError when either title has no row in ``df[year]``.
    Returns None; the chart is drawn on a new matplotlib figure.
    """
    articles = df[year]
    topic_dist = doc_topic_dist[year]

    # Titles are not guaranteed to be unique. Keep only the first match of each
    # so the chart holds exactly the two series named in the legend.
    selection = topic_dist[articles['title'] == title].head(1)
    recommended = topic_dist[articles['title'] == recommendation_title].head(1)
    if selection.empty:
        raise IndexError(f"no article titled {title!r} for year {year}")
    if recommended.empty:
        raise IndexError(f"no article titled {recommendation_title!r} for year {year}")

    ymax = selection.max(axis=1).values[0] * 1.25

    # Configure the Axes returned by pandas directly rather than relying on
    # pyplot's implicit "current axes".
    ax = pd.concat([selection, recommended]).T.plot(
        kind='bar', title=recommendation_title, figsize=(width, 4), ylim=[0, ymax]
    )
    ax.set_xlabel('Topic')
    ax.legend(['Selection', 'Recommendation'])

def compare_tabs(title, year, recommendation_ids):
    """Show one tab per recommendation, each holding its comparison chart.

    title: title of the selected article
    year: year forwarded to ``compare_dnas``
    recommendation_ids: sequence whose items are forwarded to ``compare_dnas``
        as its ``recommendation_title`` argument; despite the parameter name,
        the caller in this notebook passes article titles.

    The tab widget (labelled "Paper 1", "Paper 2", ...) is displayed first and
    each chart is then rendered into the Output widget of its tab.
    """
    outs = [widgets.Output() for _ in recommendation_ids]

    tab = widgets.Tab(children=outs)
    for position in range(len(outs)):
        tab.set_title(position, 'Paper ' + str(position + 1))
    display(tab)

    for out, recommendation_title in zip(outs, recommendation_ids):
        with out:
            compare_dnas(title, year, recommendation_title)
            # Flush the figure into this tab. show() takes no figure/axes
            # argument, so nothing is passed to it.
            plt.show()



In [31]:
def recommendation(df, doc_topic_dist, title: str, year: int, k: int=5, plot_dna=False):
    '''
    Display the k articles that are closest (topic-wise) to the article given through title.

    df: dict mapping year -> DataFrame with at least 'title' and 'url' columns
    doc_topic_dist: dict mapping year -> DataFrame of topic distributions whose
        rows share their index with df[year]
    title: title of the selected article; the first matching row is used
    year: which year's articles and topic distributions to search
    k: number of recommendations to show
    plot_dna: when True, also show per-recommendation topic comparison charts

    The neighbour search runs against doc_topic_dist[year] of the arguments
    passed in, so the result does not depend on any module-level `year`.
    Similarity is reported as 1 minus the Jensen–Shannon distance; articles at
    distance exactly 0 (the selection itself and exact duplicates) are skipped.

    Raises IndexError when no article with the given title exists for that year.
    Returns None; the selected row and the recommendation links are displayed.
    '''
    articles = df[year]
    topic_dist = doc_topic_dist[year]

    is_selected = articles['title'] == title
    if not is_selected.any():
        raise IndexError(f"no article titled {title!r} for year {year}")
    display(articles[is_selected])

    query_dist = topic_dist[is_selected].iloc[0]
    distances = topic_dist.apply(lambda row: jensenshannon(row, query_dist), axis=1)
    nearest = distances[distances != 0].nsmallest(n=k)

    # `nearest.index` holds index labels, so select by label (.loc), not position.
    recommended = articles.loc[nearest.index].copy()
    recommended['similarity'] = 1 - nearest

    # Titles and URLs come from scraped data: escape them before embedding in
    # HTML. str() keeps a missing (NaN) value from breaking the concatenation.
    links = [
        '<a href="' + html.escape(str(url)) + '" target="_blank">' + html.escape(str(name)) + '</a>'
        + ' (Similarity: ' + "{:.2f}".format(score) + ')'
        for url, name, score in recommended[['url', 'title', 'similarity']].values
    ]
    display(HTML('<br/>'.join(links)))

    if plot_dna:
        compare_tabs(title, year, recommended['title'].values)

In [32]:
# Example: the 10 nearest 2020 articles to one headline, with a topic
# comparison chart for each recommendation.
recommendation(df, doc_topic_dist, "U.S. manufacturing output rises unexpectedly", 2020, k=10, plot_dna=True)

Unnamed: 0,year,date,month,day,author,title,article,url,section,publication
3,2020,2020-01-17 00:00:00,1.0,17,,U.S. manufacturing output rises unexpectedly,washington reuter we manufacture output rise u...,https://www.reuters.com/article/us-usa-economy...,Business News,Reuters


Tab(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output(), Output(), Output(), Output…

In [None]:
def relevant_articles(tasks, year, k=5):
    """Display, for each free-text query, the k closest articles of a given year.

    tasks: a single string or a sequence of strings
    year: key into the module-level `vectorizer`, `lda`, `df` and
        `doc_topic_dist` dicts; the query is vectorized, projected into topic
        space and compared against the articles of this same year
    k: number of articles to list per query

    Each query is printed, followed by links to its nearest articles.
    Similarity is reported as 1 minus the Jensen–Shannon distance; articles at
    distance exactly 0 are skipped.
    """
    tasks = [tasks] if isinstance(tasks, str) else tasks

    articles = df[year]
    # Search the corpus of the requested year, not of a leftover global `year`.
    corpus_dist = doc_topic_dist[year]

    tasks_vectorized = vectorizer[year].transform(tasks)
    tasks_topic_dist = pd.DataFrame(lda[year].transform(tasks_vectorized))

    for index, bullet in enumerate(tasks):
        print(bullet)
        query_dist = tasks_topic_dist.iloc[index]
        distances = corpus_dist.apply(lambda row: jensenshannon(row, query_dist), axis=1)
        nearest = distances[distances != 0].nsmallest(n=k)

        # `nearest.index` holds index labels, so select by label (.loc).
        recommended = articles.loc[nearest.index].copy()
        recommended['similarity'] = 1 - nearest

        # Escape scraped titles/URLs before embedding them in HTML.
        links = [
            '<a href="' + html.escape(str(url)) + '" target="_blank">' + html.escape(str(name)) + '</a>'
            + ' (Similarity: ' + "{:.2f}".format(score) + ')'
            for url, name, score in recommended[['url', 'title', 'similarity']].values
        ]
        display(HTML('<br/>'.join(links)))

In [None]:
def relevant_articles_for_text():
    """Show a small search form: two sliders, a text area and a Search button.

    Clicking the button clears the current output, redraws the form and calls
    ``relevant_articles`` with the text area's content, the selected year and
    the selected number of articles.
    """
    text_area = widgets.Textarea(
        value='',
        placeholder='Type something',
        description='',
        disabled=False,
        layout=Layout(width='90%', height='200px'),
    )
    year_slider = widgets.IntSlider(
        value=2018, description='Year', max=2020, min=2016, layout=Layout(width='40%')
    )
    count_slider = widgets.IntSlider(
        value=10, description='Number of articles', max=50, min=1, layout=Layout(width='40%')
    )
    search_button = widgets.Button(description="Search")

    def show_form():
        # Fresh containers are built on every call, wrapping the same widgets.
        sliders = HBox(
            [count_slider, year_slider],
            layout=Layout(width='90%', justify_content='space-around'),
        )
        form = VBox([sliders, text_area, search_button], layout=Layout(align_items='center'))
        display(form)

    def on_button_clicked(b):
        # Drop the previous results, then show the form again above the new ones.
        clear_output()
        show_form()
        relevant_articles(text_area.value, year_slider.value, count_slider.value)

    show_form()
    search_button.on_click(on_button_clicked)

In [None]:
# Render the interactive search form; results appear below it after "Search".
relevant_articles_for_text()

VBox(children=(HBox(children=(IntSlider(value=10, description='Number of articles', layout=Layout(width='40%')…

covid19
