In [1]:
import pandas as pd
import rltk
import re
from difflib import SequenceMatcher
from tqdm import tqdm
import json

## Load Data

In [2]:
TOPIC = 'ai'

**Arxiv**

In [3]:
df_arxiv = pd.read_json(f'crawler558/arxiv_crawler_{TOPIC}.jl', lines=True)
print(f'df_arxiv.shape pre  deduplucation: {df_arxiv.shape}')
df_arxiv = df_arxiv.drop_duplicates(subset='title')
print(f'df_arxiv.shape post deduplucation: {df_arxiv.shape}')
df_arxiv.head()

df_arxiv.shape pre  deduplucation: (1780, 7)
df_arxiv.shape post deduplucation: (1780, 7)


Unnamed: 0,url,updated,published,title,summary,authors,categories
0,http://arxiv.org/abs/1409.0473v7,2016-05-19T21:53:22Z,2014-09-01T16:33:02Z,Neural Machine Translation by Jointly Learning...,Neural machine translation is a recently pro...,"[Dzmitry Bahdanau, Kyunghyun Cho, Yoshua Bengio]","[cs.CL, cs.LG, cs.NE, stat.ML]"
1,http://arxiv.org/abs/1511.06434v2,2016-01-07T23:09:39Z,2015-11-19T22:50:32Z,Unsupervised Representation Learning with Deep...,"In recent years, supervised learning with co...","[Alec Radford, Luke Metz, Soumith Chintala]","[cs.LG, cs.CV]"
2,http://arxiv.org/abs/1412.6572v3,2015-03-20T20:19:16Z,2014-12-20T01:17:12Z,Explaining and Harnessing Adversarial Examples,"Several machine learning models, including n...","[Ian J. Goodfellow, Jonathon Shlens, Christian...","[stat.ML, cs.LG]"
3,http://arxiv.org/abs/1609.02907v4,2017-02-22T09:55:36Z,2016-09-09T19:48:41Z,Semi-Supervised Classification with Graph Conv...,We present a scalable approach for semi-supe...,"[Thomas N. Kipf, Max Welling]","[cs.LG, stat.ML]"
4,http://arxiv.org/abs/1509.02971v6,2019-07-05T10:47:27Z,2015-09-09T23:01:36Z,Continuous control with deep reinforcement lea...,We adapt the ideas underlying the success of...,"[Timothy P. Lillicrap, Jonathan J. Hunt, Alexa...","[cs.LG, stat.ML]"


In [4]:
# Generate an id column for RLTK to use
df_arxiv.reset_index(inplace=True)
df_arxiv['index'] = df_arxiv['index'].astype('str')
df_arxiv.rename(columns={'index':'ID'}, inplace=True)
df_arxiv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1780 entries, 0 to 1779
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          1780 non-null   object
 1   url         1780 non-null   object
 2   updated     1780 non-null   object
 3   published   1780 non-null   object
 4   title       1780 non-null   object
 5   summary     1780 non-null   object
 6   authors     1780 non-null   object
 7   categories  1780 non-null   object
dtypes: object(8)
memory usage: 111.4+ KB


**Google Scholar**

In [5]:
df_gscholar = pd.read_json(f'Google_Scholar/articles_{TOPIC}.json')
print(f'df_gscholar.shape pre  deduplication: {df_gscholar.shape}')
df_gscholar = df_gscholar.drop_duplicates(subset='title')
print(f'df_gscholar.shape post deduplication: {df_gscholar.shape}')

# Fix the wrong URLs
df_gscholar['url'] = df_gscholar['url'].apply(lambda x: x[27:])

df_gscholar.head()

df_gscholar.shape pre  deduplication: (1020, 6)
df_gscholar.shape post deduplication: (1016, 6)


Unnamed: 0,title,url,authors,journal,citations,year
0,Neural Machine Translation by Jointly Learning...,http://scholar.google.com/scholar?oi=bibs&clus...,"[D Bahdanau, K Cho, Y Bengio]",ICLR,13000,2015
1,Unsupervised Representation Learning with Deep...,http://scholar.google.com/scholar?oi=bibs&clus...,"[A Radford, L Metz, S Chintala]",ICLR (Poster),6544,2016
2,Explaining and Harnessing Adversarial Examples.,http://scholar.google.com/scholar?oi=bibs&clus...,"[IJ Goodfellow, J Shlens, C Szegedy]",ICLR (Poster),4827,2015
3,Semi-Supervised Classification with Graph Conv...,http://scholar.google.com/scholar?oi=bibs&clus...,"[TN Kipf, M Welling]",ICLR (Poster),4036,2017
4,Continuous control with deep reinforcement lea...,http://scholar.google.com/scholar?oi=bibs&clus...,"[TP Lillicrap, JJ Hunt, A Pritzel, N Heess, T ...",ICLR (Poster),3683,2016


In [6]:
# Generate an id column for RLTK to use
df_gscholar.reset_index(inplace=True)
df_gscholar['index'] = df_gscholar['index'].astype('str')
df_gscholar.rename(columns={'index':'ID'}, inplace=True)

# Also set all columns to string type
df_gscholar['citations'] = df_gscholar['citations'].astype('str')
df_gscholar['year'] = df_gscholar['year'].astype('str')
df_gscholar.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1016 entries, 0 to 1015
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         1016 non-null   object
 1   title      1016 non-null   object
 2   url        1016 non-null   object
 3   authors    1016 non-null   object
 4   journal    1016 non-null   object
 5   citations  1016 non-null   object
 6   year       1016 non-null   object
dtypes: object(7)
memory usage: 55.7+ KB


### Naïve Data Merge

In [7]:
# How many matches can be found with a naÏve identical string approach?
arxiv_titles = df_arxiv['title']
gscholar_titles = df_gscholar['title']
print(len([1 for w in arxiv_titles.values if w in gscholar_titles.values]))

49


In [8]:
# Merge both datasets by "SQL" join
df_merged = df_arxiv.merge(df_gscholar, on='title', how='outer')
print(f'df_merged.shape: {df_merged.shape}')

# Number of match found with pd.merge:
print(f'Number of match found with pd.merge: {df_arxiv.shape[0] + df_gscholar.shape[0] - df_merged.shape[0]}')

df_merged.head(3)

df_merged.shape: (2747, 14)
Number of match found with pd.merge: 49


Unnamed: 0,ID_x,url_x,updated,published,title,summary,authors_x,categories,ID_y,url_y,authors_y,journal,citations,year
0,0,http://arxiv.org/abs/1409.0473v7,2016-05-19T21:53:22Z,2014-09-01T16:33:02Z,Neural Machine Translation by Jointly Learning...,Neural machine translation is a recently pro...,"[Dzmitry Bahdanau, Kyunghyun Cho, Yoshua Bengio]","[cs.CL, cs.LG, cs.NE, stat.ML]",,,,,,
1,1,http://arxiv.org/abs/1511.06434v2,2016-01-07T23:09:39Z,2015-11-19T22:50:32Z,Unsupervised Representation Learning with Deep...,"In recent years, supervised learning with co...","[Alec Radford, Luke Metz, Soumith Chintala]","[cs.LG, cs.CV]",,,,,,
2,2,http://arxiv.org/abs/1412.6572v3,2015-03-20T20:19:16Z,2014-12-20T01:17:12Z,Explaining and Harnessing Adversarial Examples,"Several machine learning models, including n...","[Ian J. Goodfellow, Jonathon Shlens, Christian...","[stat.ML, cs.LG]",,,,,,


## RLTK Data Merge

In [9]:
# RLTK Tokenizer
tokenizer = rltk.CrfTokenizer()

**Arxiv Dataset**

In [10]:
class ArxivRecord(rltk.Record):
    def __init__(self, raw_object):
        super().__init__(raw_object)
        self.name = 'ArxivRecord'
        
    @property
    def id(self):
        return self.raw_object['ID']
    
    @rltk.cached_property
    def authors_string(self):
        return self.raw_object['authors']
    
    @rltk.cached_property
    def title_string(self):
        return self.raw_object['title']
        
    @rltk.cached_property
    def summary_string(self):
        return self.raw_object['summary']
    
    @rltk.cached_property
    def categories_string(self):
        return self.raw_object['categories']
    
    @rltk.cached_property
    def published_string(self):
        return self.raw_object['published']
    
    @rltk.cached_property
    def updated_string(self):
        return self.raw_object['updated']
    
    @rltk.cached_property
    def url_string(self):
        return self.raw_object['url']
    
    @rltk.cached_property
    def blocking_tokens(self):
        tokens = ' '.join([self.title_string])
        tokens = re.sub(r'\bThe\b', '', tokens)
        tokens = re.sub(r'\bthe\b', '', tokens)
        tokens = re.sub(r'\bof\b', '', tokens)
        tokens = re.sub(r"\b's\b", '', tokens)
        tokens = re.sub(r'\band\b', '', tokens)
        tokens = re.sub(r'\bI\b', '', tokens)
        tokens = re.sub(r'\bA\b', '', tokens)
        tokens = re.sub(r'\bin\b', '', tokens)
        tokens = re.sub(r'\bfor\b', '', tokens)
        tokens = re.sub(r'\bon\b', '', tokens)
        tokens = re.sub(r'\bwith\b', '', tokens)
        return set(tokenizer.tokenize(tokens))

In [11]:
ds_arxiv = rltk.Dataset(reader=rltk.DataFrameReader(df_arxiv), record_class=ArxivRecord, adapter=rltk.MemoryKeyValueAdapter())
print(type(ds_arxiv))
ds_arxiv.generate_dataframe().head(3)

<class 'rltk.dataset.Dataset'>


Unnamed: 0,id,authors_string,title_string,summary_string,categories_string,published_string,updated_string,url_string,blocking_tokens
0,0,"[Dzmitry Bahdanau, Kyunghyun Cho, Yoshua Bengio]",Neural Machine Translation by Jointly Learning...,Neural machine translation is a recently pro...,"[cs.CL, cs.LG, cs.NE, stat.ML]",2014-09-01T16:33:02Z,2016-05-19T21:53:22Z,http://arxiv.org/abs/1409.0473v7,"{Translation, by, Neural, Machine, Align, Join..."
1,1,"[Alec Radford, Luke Metz, Soumith Chintala]",Unsupervised Representation Learning with Deep...,"In recent years, supervised learning with co...","[cs.LG, cs.CV]",2015-11-19T22:50:32Z,2016-01-07T23:09:39Z,http://arxiv.org/abs/1511.06434v2,"{Representation, Unsupervised, Deep, Learning,..."
2,2,"[Ian J. Goodfellow, Jonathon Shlens, Christian...",Explaining and Harnessing Adversarial Examples,"Several machine learning models, including n...","[stat.ML, cs.LG]",2014-12-20T01:17:12Z,2015-03-20T20:19:16Z,http://arxiv.org/abs/1412.6572v3,"{Explaining, Examples, Adversarial, Harnessing}"


**Google Scholar Dataset**

In [12]:
class GScholarRecord(rltk.Record):
    def __init__(self, raw_object):
        super().__init__(raw_object)
        self.name = 'GScholarRecord'
        
    @property
    def id(self):
        return self.raw_object['ID']
    
    @rltk.cached_property
    def authors_string(self):
        return self.raw_object['authors']
    
    @rltk.cached_property
    def title_string(self):
        return self.raw_object['title']
        
    @rltk.cached_property
    def journal_string(self):
        return self.raw_object['journal']
    
    @rltk.cached_property
    def citations_string(self):
        return self.raw_object['citations']
    
    @rltk.cached_property
    def year_string(self):
        return self.raw_object['year']
    
    @rltk.cached_property
    def url_string(self):
        return self.raw_object['url']
    
    @rltk.cached_property
    def blocking_tokens(self):
        tokens = ' '.join([self.title_string])
        tokens = re.sub(r'\bThe\b', '', tokens)
        tokens = re.sub(r'\bthe\b', '', tokens)
        tokens = re.sub(r'\bof\b', '', tokens)
        tokens = re.sub(r"\b's\b", '', tokens)
        tokens = re.sub(r'\band\b', '', tokens)
        tokens = re.sub(r'\bI\b', '', tokens)
        tokens = re.sub(r'\bA\b', '', tokens)
        tokens = re.sub(r'\bin\b', '', tokens)
        tokens = re.sub(r'\bfor\b', '', tokens)
        tokens = re.sub(r'\bon\b', '', tokens)
        tokens = re.sub(r'\bwith\b', '', tokens)
        return set(tokenizer.tokenize(tokens))

In [13]:
ds_gscholar = rltk.Dataset(reader=rltk.DataFrameReader(df_gscholar), record_class=GScholarRecord, adapter=rltk.MemoryKeyValueAdapter())
print(type(ds_gscholar))
ds_gscholar.generate_dataframe().head(3)

<class 'rltk.dataset.Dataset'>


Unnamed: 0,id,authors_string,title_string,journal_string,citations_string,year_string,url_string,blocking_tokens
0,0,"[D Bahdanau, K Cho, Y Bengio]",Neural Machine Translation by Jointly Learning...,ICLR,13000,2015,http://scholar.google.com/scholar?oi=bibs&clus...,"{., Translation, by, Neural, Machine, Align, J..."
1,1,"[A Radford, L Metz, S Chintala]",Unsupervised Representation Learning with Deep...,ICLR (Poster),6544,2016,http://scholar.google.com/scholar?oi=bibs&clus...,"{., Representation, Unsupervised, Deep, Learni..."
2,2,"[IJ Goodfellow, J Shlens, C Szegedy]",Explaining and Harnessing Adversarial Examples.,ICLR (Poster),4827,2015,http://scholar.google.com/scholar?oi=bibs&clus...,"{Examples, ., Harnessing, Explaining, Adversar..."


### Blocking

In [14]:
# Generate blocks from tokens
token_blocker = rltk.TokenBlockGenerator()
blocks = token_blocker.generate(
    token_blocker.block(ds_arxiv, property_='blocking_tokens'),
    token_blocker.block(ds_gscholar, property_='blocking_tokens'))
print(type(blocks))

<class 'rltk.blocking.block.Block'>


In [15]:
# Extract all record pairs from the block
record_pairs = rltk.get_record_pairs(ds_arxiv, ds_gscholar, block=blocks)

# Get the total number of record pairs generated
compared_pairs = len(list(record_pairs))

# Get the number of elements in each rltk.Dataset
tally_imdb = ds_arxiv.generate_dataframe().shape[0]
tally_tmd = ds_gscholar.generate_dataframe().shape[0]

# Calculate the total number of pairs if both datasets were to be compared without any blocking (eg: a double for loop)
tally_unblocked = tally_imdb * tally_tmd

# Calculate how much smaller the blocked pairings are
reduction_ratio = compared_pairs / tally_unblocked

# Calculate the reduction ratio (the inverse of the )
reduction_ratio = 1 - reduction_ratio
print(f'Reduction Ratio: {reduction_ratio:.5f}')

Reduction Ratio: 0.60755


### Matching Functions

In [16]:
def title_similarity(arxiv_tuple, gscholar_tuple):
    arxiv_title = arxiv_tuple.title_string.strip().lower()
    gscholar_title = gscholar_tuple.title_string.strip().lower()
    similarity = SequenceMatcher(None, arxiv_title, gscholar_title).ratio()

    penalties = sum([len(arxiv_title)<=6,
                     len(gscholar_title)<=6])

    return similarity * (0.9**penalties)

In [17]:
def author_similarity(arxiv_tuple, gscholar_tuple):
    arxiv_author = ' '.join(arxiv_tuple.authors_string).strip().lower()
    gscholar_author = ' '.join(gscholar_tuple.authors_string).strip().lower()
    similarity = SequenceMatcher(None, arxiv_author, gscholar_author).ratio() 
    return similarity

In [18]:
def year_similarity(arxiv_tuple, gscholar_tuple):
    arxiv_year = int(float(arxiv_tuple.updated_string[0:4]))
    gscholar_year = int(float(gscholar_tuple.year_string))
    similarity = 1 /(1 + abs(arxiv_year-gscholar_year))
    return similarity

In [19]:
def elementwise_similarity(arxiv_tuple, gscholar_tuple, match_threshold=0.75):
    sim_title = title_similarity(arxiv_tuple, gscholar_tuple)
    sim_author = author_similarity(arxiv_tuple, gscholar_tuple)
    sim_year = year_similarity(arxiv_tuple, gscholar_tuple)

    element_similarity = (0.70 * sim_title) + (0.15 * sim_author) + (0.15 * sim_year)

    return element_similarity > match_threshold, element_similarity

In [20]:
# Predict matches for all pairs in the blocked data 
print(f'Arxiv samples: {df_arxiv.shape[0]}')
print(f'GScholar samples: {df_gscholar.shape[0]}')

summary_df = pd.DataFrame()
THRESHOLDS = [T/100 for T in range(0, 101, 5)]

# Iterate through various thresholds to find the most matches without any duplicates
for T in tqdm(THRESHOLDS):

    # Set to store pairs of IDs matched
    ids_matched = set()
    
    # Iterate through candidates on the block
    for block_id, arxiv_id, gscholar_id in blocks.pairwise(ds_arxiv, ds_gscholar):
        
        # Find similarity at a given threshold
        match , similarity = elementwise_similarity(ds_arxiv.get_record(arxiv_id),
                                                    ds_gscholar.get_record(gscholar_id),
                                                    match_threshold=T)
        # If a match is found, add to the set of matches
        if match:
            ids_matched.add((arxiv_id, gscholar_id))
    
    # Count the number of unique elements derived from each source
    set_a = set()
    set_b = set()
    for tp in ids_matched:
        set_a.add(tp[0])
        set_b.add(tp[1])
    
    summary_df.at[T, 'Matches'] = int(len(ids_matched))
    summary_df.at[T, 'Set_A Size'] = int(len(set_a))
    summary_df.at[T, 'Set_B Size'] = int(len(ids_matched))
    summary_df.at[T, 'Duplicates'] = int((len(ids_matched)-len(set_a)) + (len(ids_matched)-len(set_b)))
    
summary_df

  0%|                                                                                           | 0/21 [00:00<?, ?it/s]

Arxiv samples: 1780
GScholar samples: 1016


100%|███████████████████████████████████████████████████████████████████████████████| 21/21 [2:48:52<00:00, 482.49s/it]


Unnamed: 0,Matches,Set_A Size,Set_B Size,Duplicates
0.0,572870.0,1778.0,572870.0,1142947.0
0.05,572870.0,1778.0,572870.0,1142947.0
0.1,572694.0,1778.0,572694.0,1142595.0
0.15,565465.0,1778.0,565465.0,1128137.0
0.2,522940.0,1778.0,522940.0,1043087.0
0.25,410293.0,1777.0,410293.0,817794.0
0.3,247252.0,1777.0,247252.0,491712.0
0.35,119245.0,1769.0,119245.0,235707.0
0.4,49201.0,1661.0,49201.0,95735.0
0.45,17674.0,1399.0,17674.0,33009.0


In [21]:
# Find the lowest threshold which gives no duplicates
optimal_threshold = summary_df[summary_df['Duplicates']==0].index[0]
optimal_threshold

1.0

In [37]:
# Manually set threshold to 0.80 to trade 616 extra True Positives for 4 extra False Positives
optimal_threshold = 0.80

In [38]:
# Generate matches based on the optimal (no-duplicate) threshold
print(f'Arxiv samples: {df_arxiv.shape[0]}')
print(f'GScholar samples: {df_gscholar.shape[0]}')

# Store tuples of matches IDs, as well as singletons witouth a match
ids_matched = set()
singles_arxiv = set()
singles_gscholar = set()

# Write matches (and non-matches) to a CSV
with open(f'Matches_{TOPIC}.csv', 'w') as predictions_full:
    for block_id, arxiv_id, gscholar_id in blocks.pairwise(ds_arxiv, ds_gscholar):

        match , similarity = elementwise_similarity(ds_arxiv.get_record(arxiv_id),
                                                    ds_gscholar.get_record(gscholar_id),
                                                    match_threshold=optimal_threshold)

        if match:
            ids_matched.add((arxiv_id, gscholar_id))
        else:
            singles_arxiv.add(arxiv_id)
            singles_gscholar.add(gscholar_id)
    
    # After finding all matches, write them to a csv
    for match_pair in ids_matched:
        predictions_full.write(f'{match_pair[0]},{match_pair[1]},1\n')
        # And ensure that no item in the matches is counted as a single
        try:
            singles_arxiv.remove(match_pair[0])
        except:
            pass
        try:
            singles_gscholar.remove(match_pair[1])
        except:
            pass
    
    # Then write all the singles which didn't find a match
    NULL = None
    for arxiv_id in singles_arxiv:
        predictions_full.write(f'{arxiv_id},{NULL},0\n')
    for gscholar_id in singles_gscholar:
        predictions_full.write(f'{NULL},{gscholar_id},0\n')        
        
print()
print(f'Matches: {len(ids_matched)}')
print(f'Non-Matches Arxiv: {len(singles_arxiv)}')
print(f'Non-Matches GScholar: {len(singles_gscholar)}')

Arxiv samples: 1780
GScholar samples: 1016

Matches: 616
Non-Matches Arxiv: 1164
Non-Matches GScholar: 401


### Create Merged Dataset

In [39]:
import rdflib
from rdflib import URIRef, Literal, Namespace
from rdflib.namespace import RDF, RDFS, XSD

MYNS = Namespace('http://inf558.org/myfakenamespace#')
SCHEMA = Namespace("https://schema.org/")

# Initliaze the graph
g = rdflib.Graph()

# Bind namespace and prefixes
g.bind('my_ns', MYNS)
g.bind('schema', SCHEMA)
g.bind('rdf', RDF)
g.bind('rdfs', RDFS)
g.bind('xsd', XSD)

In [40]:
# Load predictions to be used in populating the RDF
predictions_df = pd.read_csv(f'Matches_{TOPIC}.csv', header=None, names=['ARXIV_ID', 'GSCHOLAR_ID', 'LABEL'])
print(f'predictions_df.shape: {predictions_df.shape}')
predicted_matches = predictions_df['LABEL'].sum()
print(f'predicted matches: {predicted_matches}  [{100*predicted_matches/predictions_df.shape[0]:.2f} %]')
predictions_df.head()

predictions_df.shape: (2181, 3)
predicted matches: 616  [28.24 %]


Unnamed: 0,ARXIV_ID,GSCHOLAR_ID,LABEL
0,570,204,1
1,1052,297,1
2,1438,475,1
3,1766,919,1
4,1180,354,1


In [41]:
# Dataframe to store the merged datasets
df_merged = pd.DataFrame(columns = ['ID', 'title', 'authors', 'published', 'updated', 
                                    'abstract', 'categories', 'citations', 'arxiv_url', 'gscholar_url'],
                         dtype='object')
json_merged = {}

NEW_ID = 0

# Populate the RDF with predictions with a positive (1) label
for idx, row in tqdm(predictions_df.iterrows(), total=predictions_df.shape[0]):
    
    # Populate the json object
    json_merged[NEW_ID] = {}
    
    ### URI ###
    node_uri = URIRef(str(NEW_ID))
    g.add((node_uri, RDF.type, SCHEMA.ScholarlyArticle))
    df_merged.at[NEW_ID, 'ID'] = NEW_ID
    json_merged[NEW_ID]['ID'] = NEW_ID

    
    ### Title ###
    try:
        title_arxiv = str(df_arxiv[df_arxiv['ID'] == str(row['ARXIV_ID'])]['title'].values[0])
    except:
        title_arxiv = '<___>'
    try:
        title_gscholar = str(df_gscholar[df_gscholar['ID'] == str(row['GSCHOLAR_ID'])]['title'].values[0])
    except:
        title_gscholar = '<___>'
    title = title_arxiv if title_arxiv != '<___>' else title_gscholar if title_gscholar != '<___>' else None
    g.add((node_uri, SCHEMA.headline, Literal(title, datatype=SCHEMA.Text)))
    df_merged.at[NEW_ID, 'title'] = title
    json_merged[NEW_ID]['title'] = title

    
    ### Author(s) ###
    try:
        author_arxiv = df_arxiv[df_arxiv['ID'] == str(row['ARXIV_ID'])]['authors'].values[0]
        author_arxiv = [name.strip() for name in author_arxiv if name != '<___>']
    except:
        author_arxiv = '<___>'
    try:
        author_gscholar = df_gscholar[df_gscholar['ID'] == str(row['GSCHOLAR_ID'])]['authors'].values[0]
        author_gscholar = [name.strip() for name in author_gscholar if name != '<___>']
    except:
        author_gscholar = '<___>'
    if author_arxiv != '<___>':
        authors = list(set(author_arxiv))
    else:
        authors = list(set(author_gscholar))           
    [g.add((node_uri, SCHEMA.author, Literal(author, datatype=SCHEMA.Person))) for author in authors]
    df_merged.at[NEW_ID, 'authors'] = authors
    json_merged[NEW_ID]['authors'] = authors
                       
    ### Published ###
    try:
        published = str(df_arxiv[df_arxiv['ID'] == str(row['ARXIV_ID'])]['published'].values[0])
        g.add((node_uri, SCHEMA.datePublished, Literal(published, datatype=SCHEMA.DateTime)))
        df_merged.at[NEW_ID, 'published'] = published
        json_merged[NEW_ID]['published'] = published
    except:
        pass
        
                       
    ### Updated ###
    try:
        updated = str(df_arxiv[df_arxiv['ID'] == str(row['ARXIV_ID'])]['updated'].values[0])
        g.add((node_uri, SCHEMA.dateModified, Literal(updated, datatype=SCHEMA.DateTime)))
        df_merged.at[NEW_ID, 'updated'] = updated
        json_merged[NEW_ID]['updated'] = updated
    except:
        pass
          
                       
    ### Abstract ###
    try:
        abstract = str(df_arxiv[df_arxiv['ID'] == str(row['ARXIV_ID'])]['summary'].values[0]).strip()
        g.add((node_uri, SCHEMA.abstract, Literal(abstract, datatype=SCHEMA.Text)))
        df_merged.at[NEW_ID, 'abstract'] = abstract
        json_merged[NEW_ID]['abstract'] = abstract
    except:
        pass
       
                       
    ### Categories ###
    try:
        categories = df_arxiv[df_arxiv['ID'] == str(row['ARXIV_ID'])]['categories'].values[0]
        categories = [name.strip() for name in categories if name != '<___>']
        [g.add((node_uri, SCHEMA.genre, Literal(category, datatype=SCHEMA.Text))) for category in categories]
        df_merged.at[NEW_ID, 'categories'] = categories
        json_merged[NEW_ID]['categories'] = categories
    except:
        pass
          
                       
    ### Journal ###
    try:
        journal = str(df_gscholar[df_gscholar['ID'] == str(row['GSCHOLAR_ID'])]['journal'].values[0])
        g.add((node_uri, SCHEMA.publisher, Literal(journal, datatype=SCHEMA.Periodical))) #datatype=SCHEMA.Organisation
        df_merged.at[NEW_ID, 'journal'] = journal
        json_merged[NEW_ID]['journal'] = journal
    except:
        pass
     
                       
    ### Citations ###
    try:
        citations = str(df_gscholar[df_gscholar['ID'] == str(row['GSCHOLAR_ID'])]['citations'].values[0])
        g.add((node_uri, SCHEMA.commentCount, Literal(citations, datatype=SCHEMA.Integer)))
        df_merged.at[NEW_ID, 'citations'] = citations
        json_merged[NEW_ID]['citations'] = citations
    except:
        pass
            
                       
    ### Arxiv URL ###
    try:
        arxiv_url = str(df_arxiv[df_arxiv['ID'] == str(row['ARXIV_ID'])]['url'].values[0])
        g.add((node_uri, SCHEMA.url, Literal(arxiv_url, datatype=SCHEMA.URL)))
        df_merged.at[NEW_ID, 'arxiv_url'] = arxiv_url    
        json_merged[NEW_ID]['arxiv_url'] = arxiv_url
    except:
        pass
        
                       
    ### Google Scholar URL ###
    try:
        gscholar_url = str(df_gscholar[df_gscholar['ID'] == str(row['GSCHOLAR_ID'])]['url'].values[0])
        g.add((node_uri, SCHEMA.url, Literal(gscholar_url, datatype=SCHEMA.URL)))
        df_merged.at[NEW_ID, 'gscholar_url'] = gscholar_url   
        json_merged[NEW_ID]['gscholar_url'] = gscholar_url
    except:
        pass
             
                       
    NEW_ID += 1
    
# Save to disk using turtle format
g.serialize(f'Triples_{TOPIC}.ttl.', format="turtle")

# And save the merged DataFrame as CSV
df_merged.to_csv(f'Merged_{TOPIC}.csv', index=False)

# Also save as Json, just because
with open(f'Json_{TOPIC}.json', 'w') as fout:
    json.dump(json_merged, fout)

100%|██████████████████████████████████████████████████████████████████████████████| 2181/2181 [00:22<00:00, 95.75it/s]


In [42]:
df_merged.head(3)

Unnamed: 0,ID,title,authors,published,updated,abstract,categories,citations,arxiv_url,gscholar_url,journal
0,0,Improved Training of Wasserstein GANs,"[Aaron Courville, Martin Arjovsky, Faruk Ahmed...",2017-03-31T19:25:00Z,2017-12-25T23:03:49Z,Generative Adversarial Networks (GANs) are pow...,"[cs.LG, stat.ML]",3041,http://arxiv.org/abs/1704.00028v3,http://scholar.google.com/scholar?oi=bibs&clus...,Proceedings of the 31st International Conferen...
1,1,RETAIN: An Interpretable Predictive Model for ...,"[Walter F. Stewart, Edward Choi, Mohammad Taha...",2016-08-19T21:54:46Z,2017-02-26T15:13:31Z,Accuracy and interpretability are two dominant...,"[cs.LG, cs.AI, cs.NE]",385,http://arxiv.org/abs/1608.05745v4,http://scholar.google.com/scholar?oi=bibs&clus...,Proceedings of the 30th International Conferen...
2,2,Markov Chain Monte Carlo and Variational Infer...,"[Diederik P. Kingma, Max Welling, Tim Salimans]",2014-10-23T19:23:53Z,2015-05-19T13:53:13Z,Recent advances in stochastic gradient variati...,"[stat.CO, stat.ML]",331,http://arxiv.org/abs/1410.6460v4,http://scholar.google.com/scholar?oi=bibs&clus...,"ICML, 1218-1226"


In [43]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2181 entries, 0 to 2180
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   ID            2181 non-null   object
 1   title         2181 non-null   object
 2   authors       2181 non-null   object
 3   published     1780 non-null   object
 4   updated       1780 non-null   object
 5   abstract      1780 non-null   object
 6   categories    1780 non-null   object
 7   citations     1017 non-null   object
 8   arxiv_url     1780 non-null   object
 9   gscholar_url  1017 non-null   object
 10  journal       1017 non-null   object
dtypes: object(11)
memory usage: 269.0+ KB


In [44]:
json_merged

{0: {'ID': 0,
  'title': 'Improved Training of Wasserstein GANs',
  'authors': ['Aaron Courville',
   'Martin Arjovsky',
   'Faruk Ahmed',
   'Vincent Dumoulin',
   'Ishaan Gulrajani'],
  'published': '2017-03-31T19:25:00Z',
  'updated': '2017-12-25T23:03:49Z',
  'abstract': 'Generative Adversarial Networks (GANs) are powerful generative models, butsuffer from training instability. The recently proposed Wasserstein GAN (WGAN)makes progress toward stable training of GANs, but sometimes can still generateonly low-quality samples or fail to converge. We find that these problems areoften due to the use of weight clipping in WGAN to enforce a Lipschitzconstraint on the critic, which can lead to undesired behavior. We propose analternative to clipping weights: penalize the norm of gradient of the criticwith respect to its input. Our proposed method performs better than standardWGAN and enables stable training of a wide variety of GAN architectures withalmost no hyperparameter tuning, includi

## End