In [1]:
import pandas as pd
import rltk
import re
from difflib import SequenceMatcher
from tqdm import tqdm
import json

## Load Data

In [2]:
TOPIC = 'graphics'

**Arxiv**

In [3]:
df_arxiv = pd.read_json(f'crawler558/arxiv_crawler_{TOPIC}.jl', lines=True)
print(f'df_arxiv.shape pre  deduplucation: {df_arxiv.shape}')
df_arxiv = df_arxiv.drop_duplicates(subset='title')
print(f'df_arxiv.shape post deduplucation: {df_arxiv.shape}')
df_arxiv.head()

df_arxiv.shape pre  deduplucation: (122, 7)
df_arxiv.shape post deduplucation: (61, 7)


Unnamed: 0,url,updated,published,title,summary,authors,categories
0,http://arxiv.org/abs/1801.07829v2,2019-06-11T06:11:21Z,2018-01-24T01:14:04Z,Dynamic Graph CNN for Learning on Point Clouds,Point clouds provide a flexible geometric re...,"[Yue Wang, Yongbin Sun, Ziwei Liu, Sanjay E. S...",[cs.CV]
1,http://arxiv.org/abs/1705.01583v1,2017-05-03T19:13:23Z,2017-05-03T19:13:23Z,VNect: Real-time 3D Human Pose Estimation with...,We present the first real-time method to cap...,"[Dushyant Mehta, Srinath Sridhar, Oleksandr So...","[cs.CV, cs.GR]"
2,http://arxiv.org/abs/1712.01537v1,2017-12-05T09:25:19Z,2017-12-05T09:25:19Z,O-CNN: Octree-based Convolutional Neural Netwo...,"We present O-CNN, an Octree-based Convolutio...","[Peng-Shuai Wang, Yang Liu, Yu-Xiao Guo, Chun-...",[cs.CV]
3,http://arxiv.org/abs/1609.02974v1,2016-09-09T23:33:38Z,2016-09-09T23:33:38Z,Learning-Based View Synthesis for Light Field ...,With the introduction of consumer light fiel...,"[Nima Khademi Kalantari, Ting-Chun Wang, Ravi ...","[cs.CV, cs.GR, I.4.1]"
4,http://arxiv.org/abs/1804.02717v3,2018-07-27T03:44:10Z,2018-04-08T17:04:58Z,DeepMimic: Example-Guided Deep Reinforcement L...,A longstanding goal in character animation i...,"[Xue Bin Peng, Pieter Abbeel, Sergey Levine, M...","[cs.GR, cs.AI, cs.LG]"


In [4]:
# Generate an id column for RLTK to use
df_arxiv.reset_index(inplace=True)
df_arxiv['index'] = df_arxiv['index'].astype('str')
df_arxiv.rename(columns={'index':'ID'}, inplace=True)
df_arxiv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          61 non-null     object
 1   url         61 non-null     object
 2   updated     61 non-null     object
 3   published   61 non-null     object
 4   title       61 non-null     object
 5   summary     61 non-null     object
 6   authors     61 non-null     object
 7   categories  61 non-null     object
dtypes: object(8)
memory usage: 3.9+ KB


**Google Scholar**

In [5]:
df_gscholar = pd.read_json(f'Google_Scholar/articles_{TOPIC}.json')
print(f'df_gscholar.shape pre  deduplication: {df_gscholar.shape}')
df_gscholar = df_gscholar.drop_duplicates(subset='title')
print(f'df_gscholar.shape post deduplication: {df_gscholar.shape}')

# Fix the wrong URLs
df_gscholar['url'] = df_gscholar['url'].apply(lambda x: x[27:])

df_gscholar.head()

df_gscholar.shape pre  deduplication: (605, 6)
df_gscholar.shape post deduplication: (481, 6)


Unnamed: 0,title,url,authors,journal,citations,year
0,SMPL: a skinned multi-person linear model,http://scholar.google.com/scholar?oi=bibs&clus...,"[M Loper, N Mahmood, J Romero, G Pons-Moll, MJ...","ACM Transactions on Graphics (TOG) 34 (6), 1-16",620,2015
1,Dynamic Graph CNN for Learning on Point Clouds,http://scholar.google.com/scholar?oi=bibs&clus...,"[Y Wang, Y Sun, Z Liu, SE Sarma, MM Bronstein,...","ACM Transactions on Graphics (TOG) 38 (5), 1-12",536,2019
2,Let there be color! joint end-to-end learning ...,http://scholar.google.com/scholar?oi=bibs&clus...,"[S Iizuka, E Simo-Serra, H Ishikawa]","ACM Transactions on Graphics (TOG) 35 (4), 1-11",441,2016
3,VNect: real-time 3D human pose estimation with...,http://scholar.google.com/scholar?oi=bibs&clus...,"[D Mehta, S Sridhar, O Sotnychenko, H Rhodin, ...","ACM Transactions on Graphics (TOG) 36 (4), 1-14",400,2017
4,O-CNN: octree-based convolutional neural netwo...,http://scholar.google.com/scholar?oi=bibs&clus...,"[PS Wang, Y Liu, YX Guo, CY Sun, X Tong]","ACM Transactions on Graphics (TOG) 36 (4), 1-11",345,2017


In [6]:
# Generate an id column for RLTK to use
df_gscholar.reset_index(inplace=True)
df_gscholar['index'] = df_gscholar['index'].astype('str')
df_gscholar.rename(columns={'index':'ID'}, inplace=True)

# Also set all columns to string type
df_gscholar['citations'] = df_gscholar['citations'].astype('str')
df_gscholar['year'] = df_gscholar['year'].astype('str')
df_gscholar.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 481 entries, 0 to 480
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         481 non-null    object
 1   title      481 non-null    object
 2   url        481 non-null    object
 3   authors    481 non-null    object
 4   journal    481 non-null    object
 5   citations  481 non-null    object
 6   year       481 non-null    object
dtypes: object(7)
memory usage: 26.4+ KB


### Naïve Data Merge

In [7]:
# How many matches can be found with a naÏve identical string approach?
arxiv_titles = df_arxiv['title']
gscholar_titles = df_gscholar['title']
print(len([1 for w in arxiv_titles.values if w in gscholar_titles.values]))

13


In [8]:
# Merge both datasets by "SQL" join
df_merged = df_arxiv.merge(df_gscholar, on='title', how='outer')
print(f'df_merged.shape: {df_merged.shape}')

# Number of match found with pd.merge:
print(f'Number of match found with pd.merge: {df_arxiv.shape[0] + df_gscholar.shape[0] - df_merged.shape[0]}')

df_merged.head(3)

df_merged.shape: (529, 14)
Number of match found with pd.merge: 13


Unnamed: 0,ID_x,url_x,updated,published,title,summary,authors_x,categories,ID_y,url_y,authors_y,journal,citations,year
0,0,http://arxiv.org/abs/1801.07829v2,2019-06-11T06:11:21Z,2018-01-24T01:14:04Z,Dynamic Graph CNN for Learning on Point Clouds,Point clouds provide a flexible geometric re...,"[Yue Wang, Yongbin Sun, Ziwei Liu, Sanjay E. S...",[cs.CV],1.0,http://scholar.google.com/scholar?oi=bibs&clus...,"[Y Wang, Y Sun, Z Liu, SE Sarma, MM Bronstein,...","ACM Transactions on Graphics (TOG) 38 (5), 1-12",536.0,2019.0
1,1,http://arxiv.org/abs/1705.01583v1,2017-05-03T19:13:23Z,2017-05-03T19:13:23Z,VNect: Real-time 3D Human Pose Estimation with...,We present the first real-time method to cap...,"[Dushyant Mehta, Srinath Sridhar, Oleksandr So...","[cs.CV, cs.GR]",,,,,,
2,2,http://arxiv.org/abs/1712.01537v1,2017-12-05T09:25:19Z,2017-12-05T09:25:19Z,O-CNN: Octree-based Convolutional Neural Netwo...,"We present O-CNN, an Octree-based Convolutio...","[Peng-Shuai Wang, Yang Liu, Yu-Xiao Guo, Chun-...",[cs.CV],,,,,,


## RLTK Data Merge

In [9]:
# RLTK Tokenizer
tokenizer = rltk.CrfTokenizer()

**Arxiv Dataset**

In [10]:
class ArxivRecord(rltk.Record):
    def __init__(self, raw_object):
        super().__init__(raw_object)
        self.name = 'ArxivRecord'
        
    @property
    def id(self):
        return self.raw_object['ID']
    
    @rltk.cached_property
    def authors_string(self):
        return self.raw_object['authors']
    
    @rltk.cached_property
    def title_string(self):
        return self.raw_object['title']
        
    @rltk.cached_property
    def summary_string(self):
        return self.raw_object['summary']
    
    @rltk.cached_property
    def categories_string(self):
        return self.raw_object['categories']
    
    @rltk.cached_property
    def published_string(self):
        return self.raw_object['published']
    
    @rltk.cached_property
    def updated_string(self):
        return self.raw_object['updated']
    
    @rltk.cached_property
    def url_string(self):
        return self.raw_object['url']
    
    @rltk.cached_property
    def blocking_tokens(self):
        tokens = ' '.join([self.title_string])
        tokens = re.sub(r'\bThe\b', '', tokens)
        tokens = re.sub(r'\bthe\b', '', tokens)
        tokens = re.sub(r'\bof\b', '', tokens)
        tokens = re.sub(r"\b's\b", '', tokens)
        tokens = re.sub(r'\band\b', '', tokens)
        tokens = re.sub(r'\bI\b', '', tokens)
        tokens = re.sub(r'\bA\b', '', tokens)
        tokens = re.sub(r'\bin\b', '', tokens)
        tokens = re.sub(r'\bfor\b', '', tokens)
        tokens = re.sub(r'\bon\b', '', tokens)
        tokens = re.sub(r'\bwith\b', '', tokens)
        return set(tokenizer.tokenize(tokens))

In [11]:
ds_arxiv = rltk.Dataset(reader=rltk.DataFrameReader(df_arxiv), record_class=ArxivRecord, adapter=rltk.MemoryKeyValueAdapter())
print(type(ds_arxiv))
ds_arxiv.generate_dataframe().head(3)

<class 'rltk.dataset.Dataset'>


Unnamed: 0,id,authors_string,title_string,summary_string,categories_string,published_string,updated_string,url_string,blocking_tokens
0,0,"[Yue Wang, Yongbin Sun, Ziwei Liu, Sanjay E. S...",Dynamic Graph CNN for Learning on Point Clouds,Point clouds provide a flexible geometric re...,[cs.CV],2018-01-24T01:14:04Z,2019-06-11T06:11:21Z,http://arxiv.org/abs/1801.07829v2,"{Solomon, Bronstein, Sanjay, Yue, Wang, Point,..."
1,1,"[Dushyant Mehta, Srinath Sridhar, Oleksandr So...",VNect: Real-time 3D Human Pose Estimation with...,We present the first real-time method to cap...,"[cs.CV, cs.GR]",2017-05-03T19:13:23Z,2017-05-03T19:13:23Z,http://arxiv.org/abs/1705.01583v1,"{Mehta, -, Theobalt, Sotnychenko, 3D, Xu, Srin..."
2,2,"[Peng-Shuai Wang, Yang Liu, Yu-Xiao Guo, Chun-...",O-CNN: Octree-based Convolutional Neural Netwo...,"We present O-CNN, an Octree-based Convolutio...",[cs.CV],2017-12-05T09:25:19Z,2017-12-05T09:25:19Z,http://arxiv.org/abs/1712.01537v1,"{Yu, O, -, 3D, Neural, based, Xin, Peng, Shape..."


**Google Scholar Dataset**

In [12]:
class GScholarRecord(rltk.Record):
    def __init__(self, raw_object):
        super().__init__(raw_object)
        self.name = 'GScholarRecord'
        
    @property
    def id(self):
        return self.raw_object['ID']
    
    @rltk.cached_property
    def authors_string(self):
        return self.raw_object['authors']
    
    @rltk.cached_property
    def title_string(self):
        return self.raw_object['title']
        
    @rltk.cached_property
    def journal_string(self):
        return self.raw_object['journal']
    
    @rltk.cached_property
    def citations_string(self):
        return self.raw_object['citations']
    
    @rltk.cached_property
    def year_string(self):
        return self.raw_object['year']
    
    @rltk.cached_property
    def url_string(self):
        return self.raw_object['url']
    
    @rltk.cached_property
    def blocking_tokens(self):
        tokens = ' '.join([self.title_string])
        tokens = re.sub(r'\bThe\b', '', tokens)
        tokens = re.sub(r'\bthe\b', '', tokens)
        tokens = re.sub(r'\bof\b', '', tokens)
        tokens = re.sub(r"\b's\b", '', tokens)
        tokens = re.sub(r'\band\b', '', tokens)
        tokens = re.sub(r'\bI\b', '', tokens)
        tokens = re.sub(r'\bA\b', '', tokens)
        tokens = re.sub(r'\bin\b', '', tokens)
        tokens = re.sub(r'\bfor\b', '', tokens)
        tokens = re.sub(r'\bon\b', '', tokens)
        tokens = re.sub(r'\bwith\b', '', tokens)
        return set(tokenizer.tokenize(tokens))

In [13]:
ds_gscholar = rltk.Dataset(reader=rltk.DataFrameReader(df_gscholar), record_class=GScholarRecord, adapter=rltk.MemoryKeyValueAdapter())
print(type(ds_gscholar))
ds_gscholar.generate_dataframe().head(3)

<class 'rltk.dataset.Dataset'>


Unnamed: 0,id,authors_string,title_string,journal_string,citations_string,year_string,url_string,blocking_tokens
0,0,"[M Loper, N Mahmood, J Romero, G Pons-Moll, MJ...",SMPL: a skinned multi-person linear model,"ACM Transactions on Graphics (TOG) 34 (6), 1-16",620,2015,http://scholar.google.com/scholar?oi=bibs&clus...,"{-, J, linear, a, Pons, skinned, Moll, multi, ..."
1,1,"[Y Wang, Y Sun, Z Liu, SE Sarma, MM Bronstein,...",Dynamic Graph CNN for Learning on Point Clouds,"ACM Transactions on Graphics (TOG) 38 (5), 1-12",536,2019,http://scholar.google.com/scholar?oi=bibs&clus...,"{Dynamic, Graph, Y, MM, JM, Solomon, Wang, Poi..."
2,2,"[S Iizuka, E Simo-Serra, H Ishikawa]",Let there be color! joint end-to-end learning ...,"ACM Transactions on Graphics (TOG) 35 (4), 1-11",441,2016,http://scholar.google.com/scholar?oi=bibs&clus...,"{Let, -, to, be, image, there, !, local, Serra..."


### Blocking

In [14]:
# Generate blocks from tokens
token_blocker = rltk.TokenBlockGenerator()
blocks = token_blocker.generate(
    token_blocker.block(ds_arxiv, property_='blocking_tokens'),
    token_blocker.block(ds_gscholar, property_='blocking_tokens'))
print(type(blocks))

<class 'rltk.blocking.block.Block'>


In [15]:
# Extract all record pairs from the block
record_pairs = rltk.get_record_pairs(ds_arxiv, ds_gscholar, block=blocks)

# Get the total number of record pairs generated
compared_pairs = len(list(record_pairs))

# Get the number of elements in each rltk.Dataset
tally_imdb = ds_arxiv.generate_dataframe().shape[0]
tally_tmd = ds_gscholar.generate_dataframe().shape[0]

# Calculate the total number of pairs if both datasets were to be compared without any blocking (eg: a double for loop)
tally_unblocked = tally_imdb * tally_tmd

# Calculate how much smaller the blocked pairings are
reduction_ratio = compared_pairs / tally_unblocked

# Calculate the reduction ratio (the inverse of the )
reduction_ratio = 1 - reduction_ratio
print(f'Reduction Ratio: {reduction_ratio:.5f}')

Reduction Ratio: 0.43673


### Matching Functions

In [16]:
def title_similarity(arxiv_tuple, gscholar_tuple):
    arxiv_title = arxiv_tuple.title_string.strip().lower()
    gscholar_title = gscholar_tuple.title_string.strip().lower()
    similarity = SequenceMatcher(None, arxiv_title, gscholar_title).ratio()

    penalties = sum([len(arxiv_title)<=6,
                     len(gscholar_title)<=6])

    return similarity * (0.9**penalties)

In [17]:
def author_similarity(arxiv_tuple, gscholar_tuple):
    arxiv_author = ' '.join(arxiv_tuple.authors_string).strip().lower()
    gscholar_author = ' '.join(gscholar_tuple.authors_string).strip().lower()
    similarity = SequenceMatcher(None, arxiv_author, gscholar_author).ratio() 
    return similarity

In [18]:
def year_similarity(arxiv_tuple, gscholar_tuple):
    arxiv_year = int(float(arxiv_tuple.updated_string[0:4]))
    gscholar_year = int(float(gscholar_tuple.year_string))
    similarity = 1 /(1 + abs(arxiv_year-gscholar_year))
    return similarity

In [19]:
def elementwise_similarity(arxiv_tuple, gscholar_tuple, match_threshold=0.75):
    sim_title = title_similarity(arxiv_tuple, gscholar_tuple)
    sim_author = author_similarity(arxiv_tuple, gscholar_tuple)
    sim_year = year_similarity(arxiv_tuple, gscholar_tuple)

    element_similarity = (0.70 * sim_title) + (0.15 * sim_author) + (0.15 * sim_year)

    return element_similarity > match_threshold, element_similarity

In [20]:
# Predict matches for all pairs in the blocked data 
print(f'Arxiv samples: {df_arxiv.shape[0]}')
print(f'GScholar samples: {df_gscholar.shape[0]}')

summary_df = pd.DataFrame()
THRESHOLDS = [T/100 for T in range(0, 101, 5)]

# Iterate through various thresholds to find the most matches without any duplicates
for T in tqdm(THRESHOLDS):

    # Set to store pairs of IDs matched
    ids_matched = set()
    
    # Iterate through candidates on the block
    for block_id, arxiv_id, gscholar_id in blocks.pairwise(ds_arxiv, ds_gscholar):
        
        # Find similarity at a given threshold
        match , similarity = elementwise_similarity(ds_arxiv.get_record(arxiv_id),
                                                    ds_gscholar.get_record(gscholar_id),
                                                    match_threshold=T)
        # If a match is found, add to the set of matches
        if match:
            ids_matched.add((arxiv_id, gscholar_id))
    
    # Count the number of unique elements derived from each source
    set_a = set()
    set_b = set()
    for tp in ids_matched:
        set_a.add(tp[0])
        set_b.add(tp[1])
    
    summary_df.at[T, 'Matches'] = int(len(ids_matched))
    summary_df.at[T, 'Set_A Size'] = int(len(set_a))
    summary_df.at[T, 'Set_B Size'] = int(len(set_b))
    summary_df.at[T, 'Duplicates'] = int((len(ids_matched)-len(set_a)) + (len(ids_matched)-len(set_b)))
    
summary_df

  0%|                                                                                           | 0/21 [00:00<?, ?it/s]

Arxiv samples: 61
GScholar samples: 481


100%|██████████████████████████████████████████████████████████████████████████████████| 21/21 [04:18<00:00, 12.31s/it]


Unnamed: 0,Matches,Set_A Size,Set_B Size,Duplicates
0.0,11956.0,61.0,11956.0,23375.0
0.05,11956.0,61.0,11956.0,23375.0
0.1,11954.0,61.0,11954.0,23371.0
0.15,11833.0,61.0,11833.0,23129.0
0.2,11141.0,61.0,11141.0,21745.0
0.25,8863.0,61.0,8863.0,17190.0
0.3,5253.0,61.0,5253.0,9972.0
0.35,2385.0,61.0,2385.0,4263.0
0.4,919.0,60.0,919.0,1447.0
0.45,278.0,56.0,278.0,343.0


In [21]:
# Find the lowest threshold which gives no duplicates
optimal_threshold = summary_df[summary_df['Duplicates']==0].index[0]
optimal_threshold

0.75

In [22]:
# Generate matches based on the optimal (no-duplicate) threshold
print(f'Arxiv samples: {df_arxiv.shape[0]}')
print(f'GScholar samples: {df_gscholar.shape[0]}')

# Store tuples of matches IDs, as well as singletons witouth a match
ids_matched = set()
singles_arxiv = set()
singles_gscholar = set()

# Write matches (and non-matches) to a CSV
with open(f'Matches_{TOPIC}.csv', 'w') as predictions_full:
    for block_id, arxiv_id, gscholar_id in blocks.pairwise(ds_arxiv, ds_gscholar):

        match , similarity = elementwise_similarity(ds_arxiv.get_record(arxiv_id),
                                                    ds_gscholar.get_record(gscholar_id),
                                                    match_threshold=optimal_threshold)

        if match:
            ids_matched.add((arxiv_id, gscholar_id))
        else:
            singles_arxiv.add(arxiv_id)
            singles_gscholar.add(gscholar_id)
    
    # After finding all matches, write them to a csv
    for match_pair in ids_matched:
        predictions_full.write(f'{match_pair[0]},{match_pair[1]},1\n')
        # And ensure that no item in the matches is counted as a single
        try:
            singles_arxiv.remove(match_pair[0])
        except:
            pass
        try:
            singles_gscholar.remove(match_pair[1])
        except:
            pass
    
    # Then write all the singles which didn't find a match
    NULL = None
    for arxiv_id in singles_arxiv:
        predictions_full.write(f'{arxiv_id},{NULL},0\n')
    for gscholar_id in singles_gscholar:
        predictions_full.write(f'{NULL},{gscholar_id},0\n')        
        
print()
print(f'Matches: {len(ids_matched)}')
print(f'Non-Matches Arxiv: {len(singles_arxiv)}')
print(f'Non-Matches GScholar: {len(singles_gscholar)}')

Arxiv samples: 61
GScholar samples: 481

Matches: 52
Non-Matches Arxiv: 9
Non-Matches GScholar: 424


### Create Merged Dataset

In [23]:
import rdflib
from rdflib import URIRef, Literal, Namespace
from rdflib.namespace import RDF, RDFS, XSD

MYNS = Namespace('http://inf558.org/myfakenamespace#')
SCHEMA = Namespace("https://schema.org/")

# Initliaze the graph
g = rdflib.Graph()

# Bind namespace and prefixes
g.bind('my_ns', MYNS)
g.bind('schema', SCHEMA)
g.bind('rdf', RDF)
g.bind('rdfs', RDFS)
g.bind('xsd', XSD)

In [24]:
# Load predictions to be used in populating the RDF
predictions_df = pd.read_csv(f'Matches_{TOPIC}.csv', header=None, names=['ARXIV_ID', 'GSCHOLAR_ID', 'LABEL'])
print(f'predictions_df.shape: {predictions_df.shape}')
predicted_matches = predictions_df['LABEL'].sum()
print(f'predicted matches: {predicted_matches}  [{100*predicted_matches/predictions_df.shape[0]:.2f} %]')
predictions_df.head()

predictions_df.shape: (485, 3)
predicted matches: 52  [10.72 %]


Unnamed: 0,ARXIV_ID,GSCHOLAR_ID,LABEL
0,0,1,1
1,6,23,1
2,45,297,1
3,12,40,1
4,55,447,1


In [25]:
# Dataframe to store the merged datasets
df_merged = pd.DataFrame(columns = ['ID', 'title', 'authors', 'published', 'updated', 
                                    'abstract', 'categories', 'citations', 'arxiv_url', 'gscholar_url'],
                         dtype='object')
json_merged = {}

NEW_ID = 0

# Populate the RDF with predictions with a positive (1) label
for idx, row in tqdm(predictions_df.iterrows(), total=predictions_df.shape[0]):
    
    # Populate the json object
    json_merged[NEW_ID] = {}
    
    ### URI ###
    node_uri = URIRef(str(NEW_ID))
    g.add((node_uri, RDF.type, SCHEMA.ScholarlyArticle))
    df_merged.at[NEW_ID, 'ID'] = NEW_ID
    json_merged[NEW_ID]['ID'] = NEW_ID

    
    ### Title ###
    try:
        title_arxiv = str(df_arxiv[df_arxiv['ID'] == str(row['ARXIV_ID'])]['title'].values[0])
    except:
        title_arxiv = '<___>'
    try:
        title_gscholar = str(df_gscholar[df_gscholar['ID'] == str(row['GSCHOLAR_ID'])]['title'].values[0])
    except:
        title_gscholar = '<___>'
    title = title_arxiv if title_arxiv != '<___>' else title_gscholar if title_gscholar != '<___>' else None
    g.add((node_uri, SCHEMA.headline, Literal(title, datatype=SCHEMA.Text)))
    df_merged.at[NEW_ID, 'title'] = title
    json_merged[NEW_ID]['title'] = title

    
    ### Author(s) ###
    try:
        author_arxiv = df_arxiv[df_arxiv['ID'] == str(row['ARXIV_ID'])]['authors'].values[0]
        author_arxiv = [name.strip() for name in author_arxiv if name != '<___>']
    except:
        author_arxiv = '<___>'
    try:
        author_gscholar = df_gscholar[df_gscholar['ID'] == str(row['GSCHOLAR_ID'])]['authors'].values[0]
        author_gscholar = [name.strip() for name in author_gscholar if name != '<___>']
    except:
        author_gscholar = '<___>'
    if author_arxiv != '<___>':
        authors = list(set(author_arxiv))
    else:
        authors = list(set(author_gscholar))           
    [g.add((node_uri, SCHEMA.author, Literal(author, datatype=SCHEMA.Person))) for author in authors]
    df_merged.at[NEW_ID, 'authors'] = authors
    json_merged[NEW_ID]['authors'] = authors
                       
    ### Published ###
    try:
        published = str(df_arxiv[df_arxiv['ID'] == str(row['ARXIV_ID'])]['published'].values[0])
        g.add((node_uri, SCHEMA.datePublished, Literal(published, datatype=SCHEMA.DateTime)))
        df_merged.at[NEW_ID, 'published'] = published
        json_merged[NEW_ID]['published'] = published
    except:
        pass
        
                       
    ### Updated ###
    try:
        updated = str(df_arxiv[df_arxiv['ID'] == str(row['ARXIV_ID'])]['updated'].values[0])
        g.add((node_uri, SCHEMA.dateModified, Literal(updated, datatype=SCHEMA.DateTime)))
        df_merged.at[NEW_ID, 'updated'] = updated
        json_merged[NEW_ID]['updated'] = updated
    except:
        pass
          
                       
    ### Abstract ###
    try:
        abstract = str(df_arxiv[df_arxiv['ID'] == str(row['ARXIV_ID'])]['summary'].values[0]).strip()
        g.add((node_uri, SCHEMA.abstract, Literal(abstract, datatype=SCHEMA.Text)))
        df_merged.at[NEW_ID, 'abstract'] = abstract
        json_merged[NEW_ID]['abstract'] = abstract
    except:
        pass
       
                       
    ### Categories ###
    try:
        categories = df_arxiv[df_arxiv['ID'] == str(row['ARXIV_ID'])]['categories'].values[0]
        categories = [name.strip() for name in categories if name != '<___>']
        [g.add((node_uri, SCHEMA.genre, Literal(category, datatype=SCHEMA.Text))) for category in categories]
        df_merged.at[NEW_ID, 'categories'] = categories
        json_merged[NEW_ID]['categories'] = categories
    except:
        pass
          
                       
    ### Journal ###
    try:
        journal = str(df_gscholar[df_gscholar['ID'] == str(row['GSCHOLAR_ID'])]['journal'].values[0])
        g.add((node_uri, SCHEMA.publisher, Literal(journal, datatype=SCHEMA.Periodical))) #datatype=SCHEMA.Organisation
        df_merged.at[NEW_ID, 'journal'] = journal
        json_merged[NEW_ID]['journal'] = journal
    except:
        pass
     
                       
    ### Citations ###
    try:
        citations = str(df_gscholar[df_gscholar['ID'] == str(row['GSCHOLAR_ID'])]['citations'].values[0])
        g.add((node_uri, SCHEMA.commentCount, Literal(citations, datatype=SCHEMA.Integer)))
        df_merged.at[NEW_ID, 'citations'] = citations
        json_merged[NEW_ID]['citations'] = citations
    except:
        pass
            
                       
    ### Arxiv URL ###
    try:
        arxiv_url = str(df_arxiv[df_arxiv['ID'] == str(row['ARXIV_ID'])]['url'].values[0])
        g.add((node_uri, SCHEMA.url, Literal(arxiv_url, datatype=SCHEMA.URL)))
        df_merged.at[NEW_ID, 'arxiv_url'] = arxiv_url    
        json_merged[NEW_ID]['arxiv_url'] = arxiv_url
    except:
        pass
        
                       
    ### Google Scholar URL ###
    try:
        gscholar_url = str(df_gscholar[df_gscholar['ID'] == str(row['GSCHOLAR_ID'])]['url'].values[0])
        g.add((node_uri, SCHEMA.url, Literal(gscholar_url, datatype=SCHEMA.URL)))
        df_merged.at[NEW_ID, 'gscholar_url'] = gscholar_url   
        json_merged[NEW_ID]['gscholar_url'] = gscholar_url
    except:
        pass
             
                       
    NEW_ID += 1
    
# Save to disk using turtle format
g.serialize(f'Triples_{TOPIC}.ttl.', format="turtle")

# And save the merged DataFrame as CSV
df_merged.to_csv(f'Merged_{TOPIC}.csv', index=False)

# Also save as Json, just because
with open(f'Json_{TOPIC}.json', 'w') as fout:
    json.dump(json_merged, fout)

100%|███████████████████████████████████████████████████████████████████████████████| 485/485 [00:04<00:00, 110.19it/s]


In [26]:
df_merged.head(3)

Unnamed: 0,ID,title,authors,published,updated,abstract,categories,citations,arxiv_url,gscholar_url,journal
0,0,Dynamic Graph CNN for Learning on Point Clouds,"[Yue Wang, Sanjay E. Sarma, Justin M. Solomon,...",2018-01-24T01:14:04Z,2019-06-11T06:11:21Z,Point clouds provide a flexible geometric repr...,[cs.CV],536,http://arxiv.org/abs/1801.07829v2,http://scholar.google.com/scholar?oi=bibs&clus...,"ACM Transactions on Graphics (TOG) 38 (5), 1-12"
1,1,Looking to Listen at the Cocktail Party: A Spe...,"[Avinatan Hassidim, Oran Lang, Michael Rubinst...",2018-04-10T16:28:59Z,2018-08-09T21:22:37Z,We present a joint audio-visual model for isol...,"[cs.SD, cs.CV, eess.AS]",189,http://arxiv.org/abs/1804.03619v2,http://scholar.google.com/scholar?oi=bibs&clus...,"ACM Transactions on Graphics (TOG) 37 (4), 1-11"
2,2,Convergence of univariate non-stationary subdi...,"[Carla Manni, Costanza Conti, Marie-Laurence M...",2014-10-10T10:30:14Z,2014-10-10T10:30:14Z,A new equivalence notion between non-stationar...,[math.NA],34,http://arxiv.org/abs/1410.2729v1,http://scholar.google.com/scholar?oi=bibs&clus...,"Computer Aided Geometric Design 37, 1-8"


In [27]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 485 entries, 0 to 484
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   ID            485 non-null    object
 1   title         485 non-null    object
 2   authors       485 non-null    object
 3   published     61 non-null     object
 4   updated       61 non-null     object
 5   abstract      61 non-null     object
 6   categories    61 non-null     object
 7   citations     476 non-null    object
 8   arxiv_url     61 non-null     object
 9   gscholar_url  476 non-null    object
 10  journal       476 non-null    object
dtypes: object(11)
memory usage: 61.6+ KB


In [28]:
json_merged

{0: {'ID': 0,
  'title': 'Dynamic Graph CNN for Learning on Point Clouds',
  'authors': ['Yue Wang',
   'Sanjay E. Sarma',
   'Justin M. Solomon',
   'Yongbin Sun',
   'Ziwei Liu',
   'Michael M. Bronstein'],
  'published': '2018-01-24T01:14:04Z',
  'updated': '2019-06-11T06:11:21Z',
  'abstract': 'Point clouds provide a flexible geometric representation suitable forcountless applications in computer graphics; they also comprise the raw outputof most 3D data acquisition devices. While hand-designed features on pointclouds have long been proposed in graphics and vision, however, the recentoverwhelming success of convolutional neural networks (CNNs) for image analysissuggests the value of adapting insight from CNN to the point cloud world. Pointclouds inherently lack topological information so designing a model to recovertopology can enrich the representation power of point clouds. To this end, wepropose a new neural network module dubbed EdgeConv suitable for CNN-basedhigh-level tasks o

## End