## Load the local DBLP cache
The author, paper, and paper–author tables are read from the tab-separated files in the `snapshot` folder.

In [None]:
import pandas as pd
from datetime import datetime
from itables import init_notebook_mode, show
import ipywidgets as widgets
from IPython.display import display
from lxml import html

# Load the local DBLP snapshot. All three files are tab-separated.
# `authors` holds one row per author.
authors = pd.read_csv('snapshot/dblp_authors.csv', delimiter='\t')
# `papers` holds one row per paper.
papers = pd.read_csv('snapshot/dblp_papers.csv', delimiter='\t')
# `papers_authors` links the two tables (one row per paper/author pair).
papers_authors = pd.read_csv('snapshot/dblp_papers_authors.csv', delimiter='\t')

### Guess Authors from Webpage and/or search Co-Authors directly

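Paste the HTML source of the submission system's page (PCS or EasyChair) into the box below and click **Guess Authors** to fill in the search field, or type author names into the search field yourself (separated by semicolons) and click **Find Co-Authors**.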

In [None]:

# Initial state of the widgets.
search = ""
text_blob = ""
cutoff = 5
explain = False
# Shared label width so the descriptions of all controls line up.
description_width = '200px'

# --- Author-guessing controls: pasted page source, system picker, button ---
page_text = widgets.Textarea(
    value=text_blob,
    description='Webpage Source:',
    layout=widgets.Layout(width='600px', height='150px'),
    style={'description_width': description_width},
)
guesser = widgets.Dropdown(
    options=['PCS', 'EasyChair'],
    value='PCS',
    description='Guesser:',
    layout=widgets.Layout(width='400px'),
    style={'description_width': description_width},
)
guess_button = widgets.Button(
    description="Guess Authors",
    # Left margin presumably pushes the button past the 200px label column.
    layout=widgets.Layout(margin='0px 0px 0px 210px'),
)

# --- Co-author search controls: names, year cutoff, explain flag, button ---
search_input = widgets.Text(
    value=search,
    description='Search (semicolon separated):',
    layout=widgets.Layout(width='600px'),
    style={'description_width': description_width},
)
cutoff_slider = widgets.IntSlider(
    value=cutoff,
    min=1,
    max=10,
    step=1,
    description='Cutoff (years):',
    layout=widgets.Layout(width='400px'),
    style={'description_width': description_width},
)
explain_toggle = widgets.Checkbox(
    value=explain,
    description='Explain',
    style={'description_width': description_width},
)
search_button = widgets.Button(
    description="Find Co-Authors",
    layout=widgets.Layout(margin='0px 0px 0px 210px'),
)

# Shared output area; both button handlers print into it.
output = widgets.Output()

# Render everything top to bottom: guessing controls, search controls, output.
display(
    page_text,
    guesser,
    guess_button,
    search_input,
    cutoff_slider,
    explain_toggle,
    search_button,
    output,
)

def _guess_pcs_authors(tree):
    """Return the author names found on a PCS submission page.

    Uses the first element carrying the class ``authorList`` and takes the
    text of the first ``<span>`` inside each of its ``<li>`` items.

    Args:
        tree: Parsed lxml HTML element for the pasted page source.

    Returns:
        List of non-empty, whitespace-stripped author names; empty when the
        page has no ``authorList`` element.
    """
    author_lists = tree.find_class("authorList")
    if not author_lists:
        return []
    names = []
    for item in author_lists[0].xpath(".//li"):
        span = item.find(".//span")  # first <span> inside the <li>
        if span is None:
            continue
        name = span.text_content().strip()
        if name:
            names.append(name)
    return names


def _guess_easychair_authors(tree):
    """Return the author names found on an EasyChair submission page.

    Scans every element with class ``ct_table`` whose first cell reads
    "Authors" and joins the first two cells of each data row into a name.

    Args:
        tree: Parsed lxml HTML element for the pasted page source.

    Returns:
        List of non-empty author names; empty when no "Authors" table exists.
        If several tables match, the last one wins.
    """
    names = []
    for table in tree.find_class("ct_table"):
        first_row = table.find(".//tr")
        first_cell = first_row.find(".//td") if first_row is not None else None
        if first_cell is None or first_cell.text_content().strip() != "Authors":
            continue
        names = []
        # The first two rows are skipped -- presumably the "Authors" title row
        # and the column-header row (verify against a current EasyChair page).
        for row in table.xpath(".//tr")[2:]:
            cells = row.xpath(".//td")
            if len(cells) < 2:
                continue
            parts = (cells[0].text_content().strip(), cells[1].text_content().strip())
            author = " ".join(part for part in parts if part)
            if author:
                names.append(author)
    return names


def on_guess_button_clicked(b):
    """Fill the search field with author names guessed from the pasted HTML.

    Reads the page source from ``page_text`` and the submission system from
    ``guesser``. On success the names are written to ``search_input``,
    separated by "; ". When nothing can be extracted, a message is printed to
    ``output`` and the current search text is left untouched.

    Args:
        b: The clicked button (unused; required by the ``on_click`` callback).
    """
    with output:
        output.clear_output()
        text = page_text.value
        # html.fromstring raises on an empty document, so bail out early.
        if not text.strip():
            print("Paste the webpage source first.")
            return

        guesser_value = guesser.value
        tree = html.fromstring(text)
        if guesser_value == 'PCS':
            guessed_authors = _guess_pcs_authors(tree)
        elif guesser_value == 'EasyChair':
            guessed_authors = _guess_easychair_authors(tree)
        else:
            guessed_authors = []

        if guessed_authors:
            search_input.value = "; ".join(guessed_authors)
        else:
            print(f"No authors found in the webpage source using the {guesser_value} guesser.")


def _as_link(value):
    """Wrap *value* in an HTML anchor; missing values become an empty string."""
    if pd.isna(value):
        return ""
    return '<a href="{}">{}</a>'.format(value, value)


def on_search_button_clicked(b):
    """Look up the searched authors and show their recent co-authors.

    Reads the semicolon-separated names from ``search_input``, the year window
    from ``cutoff_slider`` and the detail flag from ``explain_toggle``. Each
    name is matched as a case-insensitive literal substring of the ``Name``
    column of ``authors``. All authors who share a paper (published within the
    cutoff window) with any matched author are displayed in ``output``; the
    matched authors themselves are excluded from that list.

    Args:
        b: The clicked button (unused; required by the ``on_click`` callback).
    """
    # Update variables based on UI input
    search = search_input.value
    cutoff = cutoff_slider.value
    explain = explain_toggle.value

    with output:
        output.clear_output()
        cutoff_year = datetime.now().year - cutoff

        # Drop blank terms: an empty substring would match every author.
        terms = [term.strip() for term in search.split(";")]
        terms = [term for term in terms if term]
        if not terms:
            print("Enter at least one author name to search for.")
            return

        names = authors["Name"]
        # regex=False: names may contain characters such as "." or "(" that
        # must be matched literally; na=False: rows without a name never match.
        hits_frame = pd.concat(
            [authors[names.str.contains(term, case=False, regex=False, na=False)] for term in terms],
            ignore_index=True,
        ).drop_duplicates(subset="NumericID")  # an author can match several terms
        if hits_frame.empty:
            print(f"No authors found for {search}")
            return

        for h in hits_frame.itertuples():
            print(f"Author: {h.Name}, NumericID: {h.NumericID}, ORCID: {h.ORCID}, DBLP: {h.DBLP}")
        authors_ids = hits_frame["NumericID"].tolist()

        # All papers of the matched authors, restricted to the cutoff window.
        papers_ids = papers_authors[papers_authors["AuthorID"].isin(authors_ids)]["PaperID"]
        papers_data = papers[papers["NumericID"].isin(papers_ids) & (papers["Year"] >= cutoff_year)]

        # Everyone on those papers except the matched authors themselves
        # (all of them, not just the last one found).
        filtered_papers_ids = papers_data["NumericID"]
        co_authors = papers_authors[papers_authors["PaperID"].isin(filtered_papers_ids)]
        co_authors = co_authors[~co_authors["AuthorID"].isin(authors_ids)]
        co_authors = co_authors["AuthorID"].unique()

        # Look up the co-authors' names.
        co_authors_data = authors[authors["NumericID"].isin(co_authors)]

        if explain:
            # Join the co-authors with the papers that connect them to the
            # matched authors, so each row explains one relation.
            merge = pd.merge(co_authors_data, papers_authors, left_on="NumericID", right_on="AuthorID")
            merge = pd.merge(merge, papers_data, left_on="PaperID", right_on="NumericID")
            # After the second merge, *_x columns come from the author side
            # and *_y columns from the paper side.
            merge = merge[['DBLP_x', 'Name', 'ORCID', 'DBLP_y', 'Title', 'Year', 'PaperID', 'AuthorID', 'NumericID_x', 'NumericID_y']].copy()
            for column in ("DBLP_x", "DBLP_y", "ORCID"):
                merge[column] = [_as_link(d) for d in merge[column]]
            merge = merge.rename(columns={"DBLP_x": "DBLP Author", "DBLP_y": "DBLP Paper"})
            show(merge)
        else:
            cod = co_authors_data.copy()
            for column in ("DBLP", "ORCID"):
                cod[column] = [_as_link(d) for d in cod[column]]
            show(cod)

# Wire each button to its click handler.
guess_button.on_click(on_guess_button_clicked)    # "Guess Authors"
search_button.on_click(on_search_button_clicked)  # "Find Co-Authors"
