In [2]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import numpy as np
import re

### DBLP SPARQL Endpoint

In [1]:
# This notebook requires a SPARQL endpoint loaded with the DBLP RDF dump.
# See the DBLP blog for how to download the latest RDF dump: https://blog.dblp.org/2022/03/02/dblp-in-rdf/
# Any triplestore with SPARQL support can be used to create the endpoint. For example, https://hub.docker.com/r/openlink/virtuoso-opensource-7/

# URL passed to SPARQLWrapper by get_results(). It is left empty here and must be
# set to your own endpoint before running any of the query cells below.
dblp_sparql_endpoint = ""

### SPARQL Query Templates

In [44]:
# Template listing every dblp:Editorship resource whose IRI starts with the text
# substituted for __pattern__ (a plain string replacement performed in
# get_conf_proceedings), ordered by IRI.
conf_proceeding_list_query = """
PREFIX dblp: <https://dblp.org/rdf/schema#> 

SELECT ?p  WHERE {
    ?p a dblp:Editorship .
    FILTER(STRSTARTS(STR(?p), "__pattern__"))
}
ORDER BY ?p
"""

# Template selecting ?paper, ?title and ?year for every dblp:Inproceedings that is
# published as part of the proceedings IRI substituted for __pattern__.
# The DOI and pagination patterns are commented out inside the query text, so
# ?doi and ?pages are not part of the result set.
get_paper_list_query = """
PREFIX dblp: <https://dblp.org/rdf/schema#> 

#SELECT ?paper ?title ?doi ?pages ?year WHERE {
SELECT ?paper ?title ?year WHERE {
    ?paper a dblp:Publication, dblp:Inproceedings;
        dblp:title ?title;
        #dblp:doi ?doi;
        #dblp:pagination ?pages;
        dblp:yearOfPublication ?year;
        dblp:publishedAsPartOf <__pattern__> .
    #FILTER (STRSTARTS(str(?doi), "https://doi.org/"))
}
"""

# Template returning the author signatures (name and ordinal position) of every
# dblp:Inproceedings published as part of the proceedings IRI substituted for
# __pattern__. The ORCID, Wikidata and Google Scholar identifiers of the signing
# person are OPTIONAL, so those variables may be unbound in a result row.
get_author_list_query = """
PREFIX dblp: <https://dblp.org/rdf/schema#> 

SELECT ?paper ?title ?name ?ordinal ?orcid ?wikidata ?scholar {
    ?paper a dblp:Publication, dblp:Inproceedings;
        dblp:title ?title;
        dblp:hasSignature ?sign;
        dblp:publishedAsPartOf <__pattern__> .

    ?sign dblp:signatureDblpName ?name;
        dblp:signatureCreator ?dblp_person;
        dblp:signatureOrdinal ?ordinal .

    OPTIONAL { ?dblp_person dblp:orcid ?orcid }
    OPTIONAL { ?dblp_person dblp:wikidata ?wikidata }
    OPTIONAL { ?dblp_person dblp:webpage ?scholar . FILTER (STRSTARTS(str(?scholar), "https://scholar.google.com/")) }
}
"""

### Utility Methods

In [72]:
def get_results(query):
    """Run a SPARQL query against the endpoint in ``dblp_sparql_endpoint``.

    Args:
        query: Complete SPARQL query text.

    Returns:
        The endpoint response as converted by SPARQLWrapper for the JSON
        return format.
    """
    client = SPARQLWrapper(dblp_sparql_endpoint)
    client.setReturnFormat(JSON)
    client.setQuery(query)
    return client.query().convert()

def parse_results(results, var_list):
    """Flatten a SPARQL JSON result set into a DataFrame.

    Args:
        results: Parsed JSON response; rows are read from
            ``results["results"]["bindings"]``.
        var_list: Variable names to extract, in the desired column order.

    Returns:
        A DataFrame with one row per binding and one column per entry of
        ``var_list``. A variable missing from a binding yields ``None``.
    """
    rows = [
        [binding[var]['value'] if var in binding else None for var in var_list]
        for binding in results["results"]["bindings"]
    ]
    return pd.DataFrame(rows, columns=var_list)


def get_conf_proceedings(url_prefix):
    """List the year-keyed proceedings volumes under a DBLP conference prefix.

    Args:
        url_prefix: IRI prefix the proceedings must start with, e.g.
            ``"https://dblp.org/rec/conf/semweb/"``.

    Returns:
        The proceedings IRIs returned by the endpoint (already sorted by the
        query) whose final path segment is a four-digit year, optionally
        followed by ``-`` and/or one digit (e.g. ``2009`` or ``2010-1``).
        Keys with other suffixes, such as ``2008p``, are dropped.
    """
    query = conf_proceeding_list_query.replace("__pattern__", url_prefix)
    candidates = parse_results(get_results(query), ["p"])["p"].to_list()
    # The conference key used to be hard-coded to "semweb", which silently
    # returned an empty list for any other prefix. The query's STRSTARTS filter
    # already restricts the conference, so accept any single key segment here.
    # Dots are escaped so they only match literal dots.
    volume_iri = re.compile(r"https://dblp\.org/rec/conf/[^/]+/\d{4}-?\d?$")
    return [iri for iri in candidates if volume_iri.match(iri)]

def get_papers(conf_url):
    """Fetch the papers of one proceedings volume as a DataFrame.

    Args:
        conf_url: IRI of the proceedings, substituted into
            ``get_paper_list_query``.

    Returns:
        A DataFrame with columns ``paper``, ``title``, ``doi``, ``pages``,
        ``year`` and ``title_2`` (a copy of the cleaned title). Columns for
        variables the query does not return are filled with ``None`` by
        ``parse_results``.
    """
    sparql = get_paper_list_query.replace("__pattern__", conf_url)
    papers = parse_results(get_results(sparql), ["paper", "title", "doi", "pages", "year"])
    # Reduce full IRIs to bare identifiers by removing the resolver prefixes.
    for column, prefix in (("doi", r"https://doi.org/"), ("paper", r"https://dblp.org/rec/")):
        papers[column] = papers[column].replace(prefix, "", regex=True)
    # Remove trailing periods from the titles.
    papers['title'] = papers['title'].str.rstrip('.')
    papers["title_2"] = papers['title']
    return papers

def get_paper_authors(conf_url):
    """Fetch the author signatures of one proceedings volume as a DataFrame.

    Args:
        conf_url: IRI of the proceedings, substituted into
            ``get_author_list_query``.

    Returns:
        A DataFrame with columns ``paper``, ``title``, ``name``, ``ordinal``,
        ``orcid``, ``wikidata``, ``scholar`` and ``name_2`` (a copy of the
        cleaned name). Identifier columns hold bare IDs instead of full IRIs.
    """
    sparql = get_author_list_query.replace("__pattern__", conf_url)
    columns = ["paper", "title", "name", "ordinal", "orcid", "wikidata", "scholar"]
    authors = parse_results(get_results(sparql), columns)

    # Regex removed from each column. For 'name' every digit is deleted.
    # NOTE(review): only the digits go, so whitespace that preceded a numeric
    # suffix stays in the name - confirm name_map.csv keys expect that.
    strip_patterns = {
        'name': r'\d',
        'orcid': r"https://orcid.org/",
        'scholar': r"https://scholar.google.com/citations\?user=",
        'wikidata': r"http://www.wikidata.org/entity/",
    }
    for column, pattern in strip_patterns.items():
        authors[column] = authors[column].replace(pattern, "", regex=True)

    authors['name_2'] = authors['name']
    return authors


In [6]:
# Load the author-name mapping: the 'name' column is the lookup key and
# 'Name_Matched' the replacement value applied to author names further down.
name_map_df = pd.read_csv("name_map.csv")
name_to_qid = dict(zip(name_map_df['name'], name_map_df['Name_Matched']))
print(f"name map size: {len(name_to_qid)}")

name map size: 1658


In [80]:
# Debug helper: show the kernel's working directory, which the relative paths
# used for reading and writing files in this notebook are resolved against.
!pwd

/Users/nandana/Documents/src/public/conf-data/src


In [73]:
# Collect the year-keyed proceedings volumes under DBLP's conf/semweb key; these
# are the volumes exported further down as the ISWC paper and author lists.
conf_proceedings_list = get_conf_proceedings("https://dblp.org/rec/conf/semweb/")

In [74]:
# Inspect the proceedings IRIs that will be processed.
conf_proceedings_list

['https://dblp.org/rec/conf/semweb/2002',
 'https://dblp.org/rec/conf/semweb/2003',
 'https://dblp.org/rec/conf/semweb/2004',
 'https://dblp.org/rec/conf/semweb/2005',
 'https://dblp.org/rec/conf/semweb/2006',
 'https://dblp.org/rec/conf/semweb/2007',
 'https://dblp.org/rec/conf/semweb/2008',
 'https://dblp.org/rec/conf/semweb/2009',
 'https://dblp.org/rec/conf/semweb/2010-1',
 'https://dblp.org/rec/conf/semweb/2010-2',
 'https://dblp.org/rec/conf/semweb/2011-1',
 'https://dblp.org/rec/conf/semweb/2011-2',
 'https://dblp.org/rec/conf/semweb/2012-1',
 'https://dblp.org/rec/conf/semweb/2012-2',
 'https://dblp.org/rec/conf/semweb/2013-1',
 'https://dblp.org/rec/conf/semweb/2013-2',
 'https://dblp.org/rec/conf/semweb/2014-1',
 'https://dblp.org/rec/conf/semweb/2014-2',
 'https://dblp.org/rec/conf/semweb/2015-1',
 'https://dblp.org/rec/conf/semweb/2015-2',
 'https://dblp.org/rec/conf/semweb/2016-1',
 'https://dblp.org/rec/conf/semweb/2016-2',
 'https://dblp.org/rec/conf/semweb/2017-1',
 'ht

In [67]:
# Pick the first proceedings IRI for ad-hoc inspection; conf_url is reassigned by
# the export loops further down.
# NOTE(review): the saved output of this cell shows a conf/ekaw IRI, which does
# not match the conf/semweb list above - it looks like stale output from an
# earlier run; re-run the notebook top to bottom to refresh it.
conf_url = conf_proceedings_list[0]
conf_url

'https://dblp.org/rec/conf/ekaw/1992'

In [75]:
# Build the paper and author tables for every proceedings volume in
# conf_proceedings_list and export both tables to Excel.
paper_frames = []
author_frames = []
for conf_url in conf_proceedings_list:
    # Last path segment of the IRI, e.g. "2010-1"; the part before "-" is the year.
    year = conf_url.rsplit("/", 1)[1]

    paper_df = get_papers(conf_url)
    # These volumes are the conf/semweb (ISWC) proceedings, so label them as ISWC;
    # the previous "K-CAP" label was a copy-paste leftover.
    paper_df['desc'] = f"scientific article published in ISWC {year.split('-')[0]}"
    # TODO: enable once iswc_year_to_proceeding covers every volume in the list:
    # paper_df['proceedings'] = iswc_year_to_proceeding[year]
    paper_frames.append(paper_df)

    author_df = get_paper_authors(conf_url)
    # Replace names that have an entry in the manual name map; keep the others.
    author_df['name'] = author_df['name'].map(lambda x: name_to_qid.get(x, x))
    author_df['wikidata'] = author_df['wikidata'].fillna('')
    # A Wikidata ID supplied by DBLP takes precedence over the (mapped) name.
    has_qid = author_df['wikidata'].str.startswith('Q')
    author_df.loc[has_qid, 'name'] = author_df.loc[has_qid, 'wikidata']
    author_frames.append(author_df)

# Concatenate once instead of growing a DataFrame inside the loop. pd.concat
# raises on an empty list, so fall back to an empty frame as before.
all_paper_df = pd.concat(paper_frames, ignore_index=True) if paper_frames else pd.DataFrame()
all_author_df = pd.concat(author_frames, ignore_index=True) if author_frames else pd.DataFrame()

all_paper_df.to_excel("../data/iswc/papers/iswc-2002-2023_paper_list.xlsx")
all_author_df.to_excel("../data/iswc/papers/iswc-2002-2023_author_list.xlsx")


In [109]:
# Show the list without its last 23 entries.
# NOTE(review): -23 is a magic number tied to the list length at the time of the
# run; presumably it selects the volumes covered by iswc_year_to_proceeding - confirm.
conf_proceedings_list[0:-23]

['https://dblp.org/rec/conf/semweb/2002',
 'https://dblp.org/rec/conf/semweb/2003',
 'https://dblp.org/rec/conf/semweb/2004',
 'https://dblp.org/rec/conf/semweb/2005',
 'https://dblp.org/rec/conf/semweb/2006',
 'https://dblp.org/rec/conf/semweb/2007',
 'https://dblp.org/rec/conf/semweb/2008',
 'https://dblp.org/rec/conf/semweb/2009',
 'https://dblp.org/rec/conf/semweb/2010-1',
 'https://dblp.org/rec/conf/semweb/2010-2',
 'https://dblp.org/rec/conf/semweb/2011-1']

In [6]:
# Derive additional name-map entries from the poster/demo author list:
# treat empty strings as missing, drop every name that occurs more than once
# (keep=False removes all copies), and keep only rows with a matched value.
df = (
    pd.read_csv('../2008-2023-poster-demo-author-list-xlsx.csv')[['name', 'Name_Matched']]
    .replace("", np.nan)
    .drop_duplicates(subset=['name'], keep=False)
    .dropna(subset=['Name_Matched'])
)
# Skip names that are already present in the existing name map.
already_exist = df["name"].isin(name_to_qid.keys())
df = df[~already_exist]
df.to_csv("name_map_2.csv")


In [110]:
# Maps the last path segment of a conf/semweb proceedings IRI (e.g. "2010-1")
# to an identifier for that proceedings volume (presumably a Wikidata QID - confirm).
# In this notebook it is only referenced from a commented-out line in the
# main-track export loop.
# NOTE(review): "2010-1" and "2010-2" map to the same value, and only part 1 of
# 2011 is listed - confirm both are intended.
iswc_year_to_proceeding = {
	"2011-1": "Q56840148",
	"2010-1": "Q56840159",
	"2010-2": "Q56840159",
	"2009": "Q48733570",
	"2008": "Q98093643",
	"2007": "Q28553532",
	"2006": "Q28916709",
	"2005": "Q125320179",
	"2004": "Q56834677",
	"2003": "Q55903617",
	"2002": "Q113568316"
}

In [118]:
# Map each poster/demo proceedings IRI ('proc' column) to its 'QID' column value.
df = pd.read_csv("../data/iswc/poster_demo/poster_demo_proc_editors.csv")
proc_to_qid = dict(zip(df['proc'], df['QID']))

In [119]:
# Inspect the proceedings-IRI -> QID mapping.
proc_to_qid

{'https://dblp.org/rec/conf/semweb/2008p': 'Q113545745',
 'https://dblp.org/rec/conf/semweb/2010pd': 'Q113545435',
 'https://dblp.org/rec/conf/semweb/2012p': 'Q113545151',
 'https://dblp.org/rec/conf/semweb/2013p': 'Q113545021',
 'https://dblp.org/rec/conf/semweb/2014p': 'Q113544758',
 'https://dblp.org/rec/conf/semweb/2015p': 'Q111517264',
 'https://dblp.org/rec/conf/semweb/2016p': 'Q113544230',
 'https://dblp.org/rec/conf/semweb/2017p': 'Q42309709',
 'https://dblp.org/rec/conf/semweb/2018p': 'Q57393931',
 'https://dblp.org/rec/conf/semweb/2019p': 'Q83489954',
 'https://dblp.org/rec/conf/semweb/2021p': 'Q113287562',
 'https://dblp.org/rec/conf/semweb/2022p': 'Q115053047',
 'https://dblp.org/rec/conf/semweb/2023p': 'Q124407889',
 'https://dblp.org/rec/conf/semweb/2020demo': 'Q101089329'}

In [123]:
# Map each proceedings QID to its lower-cased title; the title is used in the
# description text of the poster/demo papers.
df = pd.read_csv("../data/iswc/poster_demo/iswc_posters_demos.csv")
qid_to_title = {qid: title.lower() for qid, title in zip(df['QID'], df['title'])}

In [124]:
# Inspect the QID -> lower-cased title mapping.
qid_to_title

{'Q113545745': 'iswc2008 posters and demonstrations',
 'Q113545435': 'proceedings of the iswc 2010 posters & demonstrations track: collected abstracts',
 'Q113545151': 'proceedings of the iswc 2012 posters & demonstrations track',
 'Q113545021': 'proceedings of the iswc 2013 posters & demonstrations track',
 'Q113544758': 'proceedings of the iswc 2014 posters & demonstrations track',
 'Q111517264': 'proceedings of the iswc 2015 posters & demonstrations track',
 'Q113544230': 'proceedings of the iswc 2016 posters & demonstrations track',
 'Q42309709': 'proceedings of the iswc 2017 posters & demonstrations and industry tracks',
 'Q57393931': 'proceedings of the iswc 2018 posters & demonstrations, industry and blue sky ideas tracks',
 'Q83489954': 'proceedings of the iswc 2019 satellite tracks (posters & demonstrations, industry, and outrageous ideas)',
 'Q101089329': 'proceedings of the iswc 2020 demos and industry tracks: from novel ideas to industrial practice',
 'Q113287562': 'proceed

In [120]:
# Poster/demo proceedings IRIs to process, in the mapping's insertion order.
proc_urls = list(proc_to_qid)
proc_urls

['https://dblp.org/rec/conf/semweb/2008p',
 'https://dblp.org/rec/conf/semweb/2010pd',
 'https://dblp.org/rec/conf/semweb/2012p',
 'https://dblp.org/rec/conf/semweb/2013p',
 'https://dblp.org/rec/conf/semweb/2014p',
 'https://dblp.org/rec/conf/semweb/2015p',
 'https://dblp.org/rec/conf/semweb/2016p',
 'https://dblp.org/rec/conf/semweb/2017p',
 'https://dblp.org/rec/conf/semweb/2018p',
 'https://dblp.org/rec/conf/semweb/2019p',
 'https://dblp.org/rec/conf/semweb/2021p',
 'https://dblp.org/rec/conf/semweb/2022p',
 'https://dblp.org/rec/conf/semweb/2023p',
 'https://dblp.org/rec/conf/semweb/2020demo']

In [134]:
# Build the paper and author tables for every poster/demo proceedings volume in
# proc_urls and export both tables to Excel.
paper_frames = []
author_frames = []
for conf_url in proc_urls:
    q_id = proc_to_qid[conf_url]
    proc_title = qid_to_title[q_id]

    paper_df = get_papers(conf_url)
    paper_df['desc'] = f"scientific article published in {proc_title}"
    paper_df['proceedings'] = q_id
    paper_frames.append(paper_df)

    author_df = get_paper_authors(conf_url)
    # Replace names that have an entry in the manual name map; keep the others.
    author_df['name'] = author_df['name'].map(lambda x: name_to_qid.get(x, x))
    author_df['wikidata'] = author_df['wikidata'].fillna('')
    # A Wikidata ID supplied by DBLP takes precedence over the (mapped) name.
    has_qid = author_df['wikidata'].str.startswith('Q')
    author_df.loc[has_qid, 'name'] = author_df.loc[has_qid, 'wikidata']
    author_frames.append(author_df)

# Concatenate once instead of growing a DataFrame inside the loop. pd.concat
# raises on an empty list, so fall back to an empty frame as before.
all_paper_df = pd.concat(paper_frames, ignore_index=True) if paper_frames else pd.DataFrame()
all_author_df = pd.concat(author_frames, ignore_index=True) if author_frames else pd.DataFrame()

all_paper_df.to_excel("../data/iswc/papers/2008-2023_poster_demo_list.xlsx")
all_author_df.to_excel("../data/iswc/papers/2008-2023_poster_demo_author_list.xlsx")