# Wrangling the data.
This notebook defines several functions for inspecting the output of the data processing (topic modelling) step, together with the final recommendation tools that are intended for end users.

The functions are designed to be imported into the final `results.ipynb` notebook.

In [33]:
from scipy.spatial.distance import jensenshannon
from IPython.display import HTML, display
from ipywidgets import interact, Layout, HBox, VBox, Box
from IPython.display import clear_output
import numpy as np
import pandas as pd
import sys
import joblib
import matplotlib.pyplot as plt
import ipywidgets as widgets

sys.path.append("../datacleaning")
from load import load_years

In [10]:
# Years to process, read from ../years.txt with the load_years helper
# imported from the ../datacleaning directory.
years: list[int] = load_years("../years.txt")

In [30]:
def load_all_files(path: str, csv_path: str, years: list[int]) -> dict[str, dict[str, pd.DataFrame]]:
    """Load all topic modelling files into a dictionary.

    For every year in ``years`` five files are read:
    ``{path}/lda_{year}.csv``, ``{path}/data_vectorized_{year}.csv``,
    ``{path}/vectorizer_{year}.csv`` and ``{path}/doc_topic_dist_{year}.csv``
    (all four with ``joblib.load``, despite the ``.csv`` extension) and
    ``{csv_path}/{year}_03.csv`` (with ``pd.read_csv``).

    Args:
        path (str): Path to the directory holding the joblib dumps.
        csv_path (str): Path to the directory holding the unprocessed csv files.
        years (list[int]): Years to load.

    Returns:
        A dictionary with the keys "lda", "data_vectorized", "vectorizer",
            "df" and "doc_topic_dist" (in that order). Each value is itself
            a dictionary mapping a year to the object loaded for that year.
            An empty ``years`` list yields five empty dictionaries.
    """
    # NOTE(security): joblib.load is pickle-based and can execute arbitrary
    # code while loading; only point `path` at files from a trusted source.
    #
    # The inner dictionaries are keyed by the year values themselves. What the
    # joblib dumps contain cannot be told from this function, hence `object`.
    lda: dict[int, object] = {}
    data_vectorized: dict[int, object] = {}
    vectorizer: dict[int, object] = {}
    df: dict[int, pd.DataFrame] = {}
    doc_topic_dist: dict[int, object] = {}

    # Fill the dictionaries year by year.
    for year in years:
        lda[year] = joblib.load(f"{path}/lda_{year}.csv")
        data_vectorized[year] = joblib.load(f"{path}/data_vectorized_{year}.csv")
        vectorizer[year] = joblib.load(f"{path}/vectorizer_{year}.csv")
        df[year] = pd.read_csv(f"{csv_path}/{year}_03.csv")
        doc_topic_dist[year] = joblib.load(f"{path}/doc_topic_dist_{year}.csv")
        print(f"{year} done.")

    # Bundle everything in one dictionary to limit the number of variables
    # that have to be passed around as function arguments.
    topic_modelling_files = {
        "lda": lda,
        "data_vectorized": data_vectorized,
        "vectorizer": vectorizer,
        "df": df,
        "doc_topic_dist": doc_topic_dist,
    }

    return topic_modelling_files

In [12]:
# Load the per-year files once; the resulting dictionary is what the
# functions below receive as `topic_modelling_files`.
topic_modelling_files = load_all_files("./csv/", "../csv", years)

2016 done.
2017 done.
2018 done.
2019 done.
2020 done.


In [13]:
def print_top_words(lda: pd.DataFrame, vectorizer: pd.DataFrame, n_top_words: int=10, n_topics: int=3) -> None:
    """Print the highest-weighted words of the first topics of a model.

    One line of the form ``Topic <id>: <word> <word> ...`` is printed per
    topic, followed by a single empty line.

    Args:
        lda: Topic model; only its ``components_`` attribute is read, one row
            of word weights per topic.
        vectorizer: Vectorizer; only ``get_feature_names_out()`` is called, to
            map a word index to the word itself.
        n_top_words (int): Number of words printed per topic. Defaults to 10.
        n_topics (int): Number of topics to print. Defaults to 3.
    """
    vocabulary = vectorizer.get_feature_names_out()
    numbered_topics = list(enumerate(lda.components_))

    for topic_number, weights in numbered_topics[:n_topics]:
        # Word indices ordered from the largest weight to the smallest.
        ranked_word_ids = weights.argsort()[::-1][:n_top_words]
        top_words = " ".join(vocabulary[word_id] for word_id in ranked_word_ids)
        print(f"Topic {topic_number}: " + top_words)
    print()

In [14]:
# For every loaded year, print the 25 highest-weighted words of the first 5 topics.
for year in years:
    print_top_words(topic_modelling_files["lda"][year], topic_modelling_files["vectorizer"][year], n_top_words=25, n_topics=5)

Topic 0: use like make new not technology one also design camera get work app look system vr you device well car google even video build way
Topic 1: say police court case officer charge judge report law arrest lawyer crime justice attorney accord lawsuit tell prosecutor claim file investigation statement prison trial department
Topic 2: song music album band record space release year new track time not first one video last show make singer take fight like go fan tour
Topic 3: health say drug study patient use medical people dr care research doctor find one year also not disease researcher cancer treatment new may percent would
Topic 4: zika option next section indicate icon previous menu sometimes virus chevron navigation expandable case outbreak mosquito say health spread pregnant disease woman infection microcephaly infect

Topic 0: like not make get one go do good music sound time say come know play well you thing he new back re la way song
Topic 1: percent price stock say high oil

In [37]:
def get_n_nearest_docs(doc_topic_dist: pd.DataFrame, doc_dist: pd.Series, number_of_articles: int = 5, get_dist: bool = True) -> pd.Series | tuple[pd.Series, pd.Series.index]:
    """Retrieve the related articles for a specified article.

    Retrieval is based on the Jensen-Shannon divergence in topic space.

    Args:
        doc_topic_dist (pd.DataFrame): Topic distribution of all articles.
        doc_dist: Topic distribution of specified article.
        number_of_articles (int): Number of nearest articles to return.
        get_dist (bool): Whether to return the distances or not. Defaults to True.

    Returns:
        Index of the k nearest articles (as by Jensen-Shannon divergence in topic space).
    """
    # Assigned as temp as apply occurs in-place.
    temp = doc_topic_dist
    distances = temp.apply(lambda x: jensenshannon(x, doc_dist), axis=1)
    k_nearest = distances[distances != 0].nsmallest(n=number_of_articles).index

    if get_dist:
        # Get nearest neighbor distance to given article.
        k_distances = distances[distances != 0].nsmallest(n=number_of_articles)
        return k_nearest, k_distances
    else:
        return k_nearest

In [31]:
def compare_topic(df: pd.DataFrame, doc_topic_dist: pd.DataFrame, title: str, recommendation_title: str, width: int = 20) -> None:
    """Plot the topic distributions of two articles side by side as a bar chart.

    Args:
        df (pd.Dataframe): Dataframe containing corpus of articles; its
            "title" column is used to locate both articles.
        doc_topic_dist (pd.DataFrame): Topic distribution of all articles.
        title (str): Title of the selected article to compare against.
        recommendation_title (str): Title of the recommended article; also
            used as the title of the figure.
        width (int): Width of the figure. Defaults to 20.
    """
    is_recommendation = df["title"] == recommendation_title
    figure_title = df.loc[is_recommendation, "title"].values[0]

    selection = doc_topic_dist[df["title"] == title]
    # Leave 50% headroom above the largest topic share of the selected article.
    upper_limit = selection.max(axis=1).values[0] * 1.5

    both = pd.concat([selection, doc_topic_dist[is_recommendation]])
    # Transpose so that topics end up on the x-axis and articles become the bar groups.
    ax = both.T.plot(
        kind="bar",
        title=figure_title,
        figsize=(width, 4),
        ylim=[0, upper_limit],
        ylabel="Density of topic",
    )
    ax.set_xlabel("Topic")
    ax.legend(["Selection", "Recommendation"])

In [23]:
def compare_tabs(df: pd.DataFrame, doc_topic_dist: pd.DataFrame, title: str, recommendation_titles: np.ndarray) -> None:
    """Compare topic distribution between articles in tabs.

    One tab ("Paper 1", "Paper 2", ...) is created per recommended article.
    Each tab shows the plot produced by ``compare_topic`` for the seeding
    article and that recommendation.

    Args:
        df (pd.Dataframe): Dataframe containing corpus of articles.
        doc_topic_dist (pd.DataFrame): Topic distribution of all articles.
        title (str): Title of seeding article.
        recommendation_titles (np.ndarray): Titles of the recommended articles.
    """
    outputs = [widgets.Output() for _ in range(len(recommendation_titles))]
    tab = widgets.Tab(children=outputs)

    # Set the title of each tab.
    for position in range(len(outputs)):
        tab.set_title(position, f"Paper {position + 1}")
    display(tab)

    # Plot the topic distribution of the specified and recommended articles.
    for output, recommendation_title in zip(outputs, recommendation_titles):
        with output:
            # compare_topic draws on the current figure and returns nothing, so
            # there is no value to forward to plt.show(); the figure is rendered
            # into the Output widget that is active here.
            compare_topic(df, doc_topic_dist, title, recommendation_title)
            plt.show()

In [39]:
def recommendation(topic_modelling_files: dict[str, dict[str, pd.DataFrame]], year: int, title: str, number_of_related_articles: int=5, plot_topics: bool=True) -> None:
    """Display the articles that are closest to the given article.

    The row of the seeding article is displayed first, followed by a list of
    links to the nearest articles with their similarity (1 minus the distance
    returned by ``get_n_nearest_docs``). Nothing is returned.

    Args:
        topic_modelling_files (dict[str, dict[str, pd.DataFrame]]): Dictionary containing topic modelling data.
        year (int): Publishing year of the seeding article.
        title (str): Title of seeding article.
        number_of_related_articles (int): Number nearest articles to return. Defaults to 5.
        plot_topics (bool): Plot topic distribution of seeding and recommended articles. Defaults to True.

    Raises:
        IndexError: If no article in the given year has the given title.
    """
    from html import escape

    df: pd.DataFrame = topic_modelling_files["df"][year]
    doc_topic_dist: pd.DataFrame = topic_modelling_files["doc_topic_dist"][year]
    is_seed = df["title"] == title
    display(df[is_seed])

    # If several articles share the title, the first one is used as the seed.
    recommended, dist = get_n_nearest_docs(
        doc_topic_dist,
        doc_topic_dist[is_seed].iloc[0],
        number_of_related_articles,
        get_dist=True,
    )
    # NOTE(review): the index labels of doc_topic_dist are used as row
    # positions here, while the similarity below is aligned by label; both
    # agree only when the frames carry a default 0..n-1 index - confirm.
    recommended = df.iloc[recommended].copy()
    recommended["similarity"] = 1 - dist

    # Titles and URLs come straight from the csv file, so they are escaped
    # before being placed in the markup; str() keeps a missing (NaN) value
    # from raising a TypeError.
    links = [
        f'<a href="{escape(str(url))}" target="_blank">{escape(str(name))}</a>'
        f" (Similarity: {similarity * 100:.2f}%)"
        for url, name, similarity in recommended[["url", "title", "similarity"]].values
    ]
    display(HTML("<br/>".join(links)))

    # Plot tabs to compare topic distribution of seed and recommended articles.
    if plot_topics:
        compare_tabs(df, doc_topic_dist, title, recommended["title"].values)

In [40]:
# Example: show the 10 articles from 2020 that are closest to the article with this title.
recommendation(topic_modelling_files, 2020, "Russia faces 2020 budget deficit of 0.9% of GDP at current oil prices", number_of_related_articles=10)

Unnamed: 0,year,date,month,day,author,title,article,url,section,publication
14610,2020,2020-03-14 00:00:00,3.0,14,,Russia faces 2020 budget deficit of 0.9% of GD...,moscow march 14 reuter russia budget deficit c...,https://www.reuters.com/article/russia-gdp-oil...,Credit RSS,Reuters


Tab(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output(), Output(), Output(), Output…

In [43]:
def task_recommendation(topic_modelling_files: dict[str, dict[str, pd.DataFrame]], search_query: str, year: int, number_of_articles: int = 5) -> None:
    """Find recommendations for articles based on user input.

    User input can be of any length, e.g. keywords or whole paragraphs. The
    query is printed, transformed with the vectorizer and topic model of the
    given year, and a list of links to the nearest articles is displayed
    together with their similarity (1 minus the distance returned by
    ``get_n_nearest_docs``). Nothing is returned.

    Args:
        topic_modelling_files (dict[str, dict[str, pd.DataFrame]]): Dictionary containing topic modelling data.
        search_query (str): User specified text to search.
        year (int): Year in which search should be performed.
        number_of_articles (int): The number of articles to return. Defaults to 5.
    """
    from html import escape

    vectorizer = topic_modelling_files["vectorizer"][year]
    lda = topic_modelling_files["lda"][year]
    doc_topic_dist: pd.DataFrame = topic_modelling_files["doc_topic_dist"][year]
    df: pd.DataFrame = topic_modelling_files["df"][year]

    # The query is wrapped in a list because transform() is given a
    # collection of documents; the result therefore holds exactly one row.
    query_vectorized = vectorizer.transform([search_query])
    query_topic_dist: pd.DataFrame = pd.DataFrame(lda.transform(query_vectorized))

    # Display a list of recommended articles.
    print(search_query)
    recommended, dist = get_n_nearest_docs(doc_topic_dist, query_topic_dist.iloc[0], number_of_articles)
    # NOTE(review): the index labels of doc_topic_dist are used as row
    # positions here, while the similarity below is aligned by label; both
    # agree only when the frames carry a default 0..n-1 index - confirm.
    recommended = df.iloc[recommended].copy()
    recommended["similarity"] = 1 - dist

    # Titles and URLs come straight from the csv file, so they are escaped
    # before being placed in the markup; str() keeps a missing (NaN) value
    # from raising a TypeError.
    links = [
        f'<a href="{escape(str(url))}" target="_blank">{escape(str(name))}</a>'
        f" (Similarity: {similarity * 100:.2f}%)"
        for url, name, similarity in recommended[["url", "title", "similarity"]].values
    ]
    display(HTML("<br/>".join(links)))

In [41]:
def recommendation_widget(topic_modelling_files: dict[str, dict[str, pd.DataFrame]]) -> None:
    """Show a small search form that recommends articles for a free-text query.

    The form consists of two sliders (number of articles and year), a text
    area and a "Search" button. Clicking the button clears the cell output,
    shows the form again and runs ``task_recommendation`` with the current
    values of the form.

    Args:
        topic_modelling_files (dict[str, dict[str, pd.DataFrame]]): Dictionary containing topic modelling data.
    """
    query_box = widgets.Textarea(
        value="",
        placeholder="Type something",
        description="",
        disabled=False,
        layout=Layout(width="90%", height="200px"),
    )
    year_slider = widgets.IntSlider(
        value=2018,
        description="Year",
        max=2020,
        min=2016,
        layout=Layout(width="40%"),
    )
    count_slider = widgets.IntSlider(
        value=10,
        description="Number of articles",
        max=50,
        min=1,
        layout=Layout(width="40%"),
        style={'description_width': 'initial'},
    )
    search_button = widgets.Button(description="Search")

    def show_form() -> None:
        """Display the sliders, the text area and the button, centred."""
        slider_row = HBox(
            [count_slider, year_slider],
            layout=Layout(width="90%", justify_content="space-around"),
        )
        form = VBox(
            [slider_row, query_box, search_button],
            layout=Layout(align_items="center"),
        )
        display(form)

    def on_button_clicked(b) -> None:
        """Display recommended articles on button click."""
        # Remove the previous results, then put the form back before the new ones.
        clear_output()
        show_form()
        task_recommendation(
            topic_modelling_files,
            query_box.value,
            year_slider.value,
            count_slider.value,
        )

    show_form()
    search_button.on_click(on_button_clicked)

In [44]:
# Show the interactive search form for the loaded topic modelling data.
recommendation_widget(topic_modelling_files)

VBox(children=(HBox(children=(IntSlider(value=10, description='Number of articles', layout=Layout(width='40%')…

trump
