In [1]:
import pandas as pd
import rltk
import re
from difflib import SequenceMatcher
from tqdm import tqdm
import json

## Load Data

In [2]:
TOPIC = 'vision'

**Arxiv**

In [3]:
df_arxiv = pd.read_json(f'crawler558/arxiv_crawler_{TOPIC}.jl', lines=True)
print(f'df_arxiv.shape pre  deduplucation: {df_arxiv.shape}')
df_arxiv = df_arxiv.drop_duplicates(subset='title')
print(f'df_arxiv.shape post deduplucation: {df_arxiv.shape}')
df_arxiv.head()

df_arxiv.shape pre  deduplucation: (1153, 7)
df_arxiv.shape post deduplucation: (1151, 7)


Unnamed: 0,url,updated,published,title,summary,authors,categories
0,http://arxiv.org/abs/1409.4842v1,2014-09-17T01:03:11Z,2014-09-17T01:03:11Z,Going Deeper with Convolutions,We propose a deep convolutional neural netwo...,"[Christian Szegedy, Wei Liu, Yangqing Jia, Pie...",[cs.CV]
1,http://arxiv.org/abs/2011.06825v2,2020-12-03T19:50:31Z,2020-11-13T09:33:03Z,LULC classification by semantic segmentation o...,This paper analyses how well a Fast Fully Co...,"[Md. Saif Hassan Onim, Aiman Rafeed Ehtesham, ...","[cs.CV, cs.LG]"
2,http://arxiv.org/abs/2003.11213v1,2020-03-25T04:27:01Z,2020-03-25T04:27:01Z,A New Multiple Max-pooling Integration Module ...,To better retain the deep features of an ima...,"[Hongfeng You, Shengwei Tian, Long Yu, Xiang M...","[cs.CV, cs.LG, eess.IV]"
3,http://arxiv.org/abs/1907.09194v2,2020-04-30T10:39:17Z,2019-07-22T09:19:05Z,FD-FCN: 3D Fully Dense and Fully Convolutional...,"In this paper, a 3D patch-based fully dense ...","[Binbin Yang, Weiwei Zhang]","[eess.IV, cs.CV]"
4,http://arxiv.org/abs/1907.06915v1,2019-07-16T09:41:13Z,2019-07-16T09:41:13Z,Mango Tree Net -- A fully convolutional networ...,This work presents a method for semantic seg...,"[Vikas Agaradahalli Gurumurthy, Ramesh Kestur,...",[cs.CV]


In [4]:
# Generate an id column for RLTK to use
df_arxiv.reset_index(inplace=True)
df_arxiv['index'] = df_arxiv['index'].astype('str')
df_arxiv.rename(columns={'index':'ID'}, inplace=True)
df_arxiv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1151 entries, 0 to 1150
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          1151 non-null   object
 1   url         1151 non-null   object
 2   updated     1151 non-null   object
 3   published   1151 non-null   object
 4   title       1151 non-null   object
 5   summary     1151 non-null   object
 6   authors     1151 non-null   object
 7   categories  1151 non-null   object
dtypes: object(8)
memory usage: 72.1+ KB


**Google Scholar**

In [5]:
df_gscholar = pd.read_json(f'Google_Scholar/articles_{TOPIC}.json')
print(f'df_gscholar.shape pre  deduplication: {df_gscholar.shape}')
df_gscholar = df_gscholar.drop_duplicates(subset='title')
print(f'df_gscholar.shape post deduplication: {df_gscholar.shape}')

# Fix the wrong URLs
df_gscholar['url'] = df_gscholar['url'].apply(lambda x: x[27:])

df_gscholar.head()

df_gscholar.shape pre  deduplication: (746, 6)
df_gscholar.shape post deduplication: (738, 6)


Unnamed: 0,title,url,authors,journal,citations,year
0,Going Deeper With Convolutions,http://scholar.google.com/scholar?oi=bibs&clus...,"[C Szegedy, W Liu, Y Jia, P Sermanet, S Reed, ...",Proceedings of the IEEE Conference on Computer...,22434,2015
1,Fully Convolutional Networks for Semantic Segm...,http://scholar.google.com/scholar?oi=bibs&clus...,"[J Long, E Shelhamer, T Darrell]",Proceedings of the IEEE Conference on Computer...,16664,2015
2,"You Only Look Once: Unified, Real-Time Object ...",http://scholar.google.com/scholar?oi=bibs&clus...,"[J Redmon, S Divvala, R Girshick, A Farhadi]",Proceedings of the IEEE Conference on Computer...,9772,2016
3,Densely Connected Convolutional Networks,http://scholar.google.com/scholar?oi=bibs&clus...,"[G Huang, Z Liu, L van der Maaten, KQ Weinberger]",Proceedings of the IEEE Conference on Computer...,9733,2017
4,Rethinking the Inception Architecture for Comp...,http://scholar.google.com/scholar?oi=bibs&clus...,"[C Szegedy, V Vanhoucke, S Ioffe, J Shlens, Z ...",Proceedings of the IEEE Conference on Computer...,8499,2016


In [6]:
# Generate an id column for RLTK to use
df_gscholar.reset_index(inplace=True)
df_gscholar['index'] = df_gscholar['index'].astype('str')
df_gscholar.rename(columns={'index':'ID'}, inplace=True)

# Also set all columns to string type
df_gscholar['citations'] = df_gscholar['citations'].astype('str')
df_gscholar['year'] = df_gscholar['year'].astype('str')
df_gscholar.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 738 entries, 0 to 737
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         738 non-null    object
 1   title      738 non-null    object
 2   url        738 non-null    object
 3   authors    738 non-null    object
 4   journal    738 non-null    object
 5   citations  738 non-null    object
 6   year       738 non-null    object
dtypes: object(7)
memory usage: 40.5+ KB


### Naïve Data Merge

In [7]:
# How many matches can be found with a naÏve identical string approach?
arxiv_titles = df_arxiv['title']
gscholar_titles = df_gscholar['title']
print(len([1 for w in arxiv_titles.values if w in gscholar_titles.values]))

234


In [8]:
# Merge both datasets by "SQL" join
df_merged = df_arxiv.merge(df_gscholar, on='title', how='outer')
print(f'df_merged.shape: {df_merged.shape}')

# Number of match found with pd.merge:
print(f'Number of match found with pd.merge: {df_arxiv.shape[0] + df_gscholar.shape[0] - df_merged.shape[0]}')

df_merged.head(3)

df_merged.shape: (1655, 14)
Number of match found with pd.merge: 234


Unnamed: 0,ID_x,url_x,updated,published,title,summary,authors_x,categories,ID_y,url_y,authors_y,journal,citations,year
0,0,http://arxiv.org/abs/1409.4842v1,2014-09-17T01:03:11Z,2014-09-17T01:03:11Z,Going Deeper with Convolutions,We propose a deep convolutional neural netwo...,"[Christian Szegedy, Wei Liu, Yangqing Jia, Pie...",[cs.CV],,,,,,
1,1,http://arxiv.org/abs/2011.06825v2,2020-12-03T19:50:31Z,2020-11-13T09:33:03Z,LULC classification by semantic segmentation o...,This paper analyses how well a Fast Fully Co...,"[Md. Saif Hassan Onim, Aiman Rafeed Ehtesham, ...","[cs.CV, cs.LG]",,,,,,
2,2,http://arxiv.org/abs/2003.11213v1,2020-03-25T04:27:01Z,2020-03-25T04:27:01Z,A New Multiple Max-pooling Integration Module ...,To better retain the deep features of an ima...,"[Hongfeng You, Shengwei Tian, Long Yu, Xiang M...","[cs.CV, cs.LG, eess.IV]",,,,,,


## RLTK Data Merge

In [9]:
# RLTK Tokenizer
tokenizer = rltk.CrfTokenizer()

**Arxiv Dataset**

In [10]:
class ArxivRecord(rltk.Record):
    def __init__(self, raw_object):
        super().__init__(raw_object)
        self.name = 'ArxivRecord'
        
    @property
    def id(self):
        return self.raw_object['ID']
    
    @rltk.cached_property
    def authors_string(self):
        return self.raw_object['authors']
    
    @rltk.cached_property
    def title_string(self):
        return self.raw_object['title']
        
    @rltk.cached_property
    def summary_string(self):
        return self.raw_object['summary']
    
    @rltk.cached_property
    def categories_string(self):
        return self.raw_object['categories']
    
    @rltk.cached_property
    def published_string(self):
        return self.raw_object['published']
    
    @rltk.cached_property
    def updated_string(self):
        return self.raw_object['updated']
    
    @rltk.cached_property
    def url_string(self):
        return self.raw_object['url']
    
    @rltk.cached_property
    def blocking_tokens(self):
        tokens = ' '.join([self.title_string])
        tokens = re.sub(r'\bThe\b', '', tokens)
        tokens = re.sub(r'\bthe\b', '', tokens)
        tokens = re.sub(r'\bof\b', '', tokens)
        tokens = re.sub(r"\b's\b", '', tokens)
        tokens = re.sub(r'\band\b', '', tokens)
        tokens = re.sub(r'\bI\b', '', tokens)
        tokens = re.sub(r'\bA\b', '', tokens)
        tokens = re.sub(r'\bin\b', '', tokens)
        tokens = re.sub(r'\bfor\b', '', tokens)
        tokens = re.sub(r'\bon\b', '', tokens)
        tokens = re.sub(r'\bwith\b', '', tokens)
        return set(tokenizer.tokenize(tokens))

In [11]:
ds_arxiv = rltk.Dataset(reader=rltk.DataFrameReader(df_arxiv), record_class=ArxivRecord, adapter=rltk.MemoryKeyValueAdapter())
print(type(ds_arxiv))
ds_arxiv.generate_dataframe().head(3)

<class 'rltk.dataset.Dataset'>


Unnamed: 0,id,authors_string,title_string,summary_string,categories_string,published_string,updated_string,url_string,blocking_tokens
0,0,"[Christian Szegedy, Wei Liu, Yangqing Jia, Pie...",Going Deeper with Convolutions,We propose a deep convolutional neural netwo...,[cs.CV],2014-09-17T01:03:11Z,2014-09-17T01:03:11Z,http://arxiv.org/abs/1409.4842v1,"{Convolutions, Going, Deeper}"
1,1,"[Md. Saif Hassan Onim, Aiman Rafeed Ehtesham, ...",LULC classification by semantic segmentation o...,This paper analyses how well a Fast Fully Co...,"[cs.CV, cs.LG]",2020-11-13T09:33:03Z,2020-12-03T19:50:31Z,http://arxiv.org/abs/2011.06825v2,"{images, by, satellite, semantic, using, segme..."
2,2,"[Hongfeng You, Shengwei Tian, Long Yu, Xiang M...",A New Multiple Max-pooling Integration Module ...,To better retain the deep features of an ima...,"[cs.CV, cs.LG, eess.IV]",2020-03-25T04:27:01Z,2020-03-25T04:27:01Z,http://arxiv.org/abs/2003.11213v1,"{Multiple, Segmentation, New, -, Module, Decon..."


**Google Scholar Dataset**

In [12]:
class GScholarRecord(rltk.Record):
    def __init__(self, raw_object):
        super().__init__(raw_object)
        self.name = 'GScholarRecord'
        
    @property
    def id(self):
        return self.raw_object['ID']
    
    @rltk.cached_property
    def authors_string(self):
        return self.raw_object['authors']
    
    @rltk.cached_property
    def title_string(self):
        return self.raw_object['title']
        
    @rltk.cached_property
    def journal_string(self):
        return self.raw_object['journal']
    
    @rltk.cached_property
    def citations_string(self):
        return self.raw_object['citations']
    
    @rltk.cached_property
    def year_string(self):
        return self.raw_object['year']
    
    @rltk.cached_property
    def url_string(self):
        return self.raw_object['url']
    
    @rltk.cached_property
    def blocking_tokens(self):
        tokens = ' '.join([self.title_string])
        tokens = re.sub(r'\bThe\b', '', tokens)
        tokens = re.sub(r'\bthe\b', '', tokens)
        tokens = re.sub(r'\bof\b', '', tokens)
        tokens = re.sub(r"\b's\b", '', tokens)
        tokens = re.sub(r'\band\b', '', tokens)
        tokens = re.sub(r'\bI\b', '', tokens)
        tokens = re.sub(r'\bA\b', '', tokens)
        tokens = re.sub(r'\bin\b', '', tokens)
        tokens = re.sub(r'\bfor\b', '', tokens)
        tokens = re.sub(r'\bon\b', '', tokens)
        tokens = re.sub(r'\bwith\b', '', tokens)
        return set(tokenizer.tokenize(tokens))

In [13]:
ds_gscholar = rltk.Dataset(reader=rltk.DataFrameReader(df_gscholar), record_class=GScholarRecord, adapter=rltk.MemoryKeyValueAdapter())
print(type(ds_gscholar))
ds_gscholar.generate_dataframe().head(3)

<class 'rltk.dataset.Dataset'>


Unnamed: 0,id,authors_string,title_string,journal_string,citations_string,year_string,url_string,blocking_tokens
0,0,"[C Szegedy, W Liu, Y Jia, P Sermanet, S Reed, ...",Going Deeper With Convolutions,Proceedings of the IEEE Conference on Computer...,22434,2015,http://scholar.google.com/scholar?oi=bibs&clus...,"{With, Convolutions, Going, Deeper}"
1,1,"[J Long, E Shelhamer, T Darrell]",Fully Convolutional Networks for Semantic Segm...,Proceedings of the IEEE Conference on Computer...,16664,2015,http://scholar.google.com/scholar?oi=bibs&clus...,"{Fully, Segmentation, Networks, Semantic, Conv..."
2,2,"[J Redmon, S Divvala, R Girshick, A Farhadi]","You Only Look Once: Unified, Real-Time Object ...",Proceedings of the IEEE Conference on Computer...,9772,2016,http://scholar.google.com/scholar?oi=bibs&clus...,"{Detection, -, Look, You, :, Time, Only, Unifi..."


### Blocking

In [14]:
# Generate blocks from tokens
token_blocker = rltk.TokenBlockGenerator()
blocks = token_blocker.generate(
    token_blocker.block(ds_arxiv, property_='blocking_tokens'),
    token_blocker.block(ds_gscholar, property_='blocking_tokens'))
print(type(blocks))

<class 'rltk.blocking.block.Block'>


In [15]:
# Extract all record pairs from the block
record_pairs = rltk.get_record_pairs(ds_arxiv, ds_gscholar, block=blocks)

# Get the total number of record pairs generated
compared_pairs = len(list(record_pairs))

# Get the number of elements in each rltk.Dataset
tally_imdb = ds_arxiv.generate_dataframe().shape[0]
tally_tmd = ds_gscholar.generate_dataframe().shape[0]

# Calculate the total number of pairs if both datasets were to be compared without any blocking (eg: a double for loop)
tally_unblocked = tally_imdb * tally_tmd

# Calculate how much smaller the blocked pairings are
reduction_ratio = compared_pairs / tally_unblocked

# Calculate the reduction ratio (the inverse of the )
reduction_ratio = 1 - reduction_ratio
print(f'Reduction Ratio: {reduction_ratio:.5f}')

Reduction Ratio: 0.45906


### Matching Functions

In [16]:
def title_similarity(arxiv_tuple, gscholar_tuple):
    arxiv_title = arxiv_tuple.title_string.strip().lower()
    gscholar_title = gscholar_tuple.title_string.strip().lower()
    similarity = SequenceMatcher(None, arxiv_title, gscholar_title).ratio()

    penalties = sum([len(arxiv_title)<=6,
                     len(gscholar_title)<=6])

    return similarity * (0.9**penalties)

In [17]:
def author_similarity(arxiv_tuple, gscholar_tuple):
    arxiv_author = ' '.join(arxiv_tuple.authors_string).strip().lower()
    gscholar_author = ' '.join(gscholar_tuple.authors_string).strip().lower()
    similarity = SequenceMatcher(None, arxiv_author, gscholar_author).ratio() 
    return similarity

In [18]:
def year_similarity(arxiv_tuple, gscholar_tuple):
    arxiv_year = int(float(arxiv_tuple.updated_string[0:4]))
    gscholar_year = int(float(gscholar_tuple.year_string))
    similarity = 1 /(1 + abs(arxiv_year-gscholar_year))
    return similarity

In [19]:
def elementwise_similarity(arxiv_tuple, gscholar_tuple, match_threshold=0.75):
    sim_title = title_similarity(arxiv_tuple, gscholar_tuple)
    sim_author = author_similarity(arxiv_tuple, gscholar_tuple)
    sim_year = year_similarity(arxiv_tuple, gscholar_tuple)

    element_similarity = (0.70 * sim_title) + (0.15 * sim_author) + (0.15 * sim_year)

    return element_similarity > match_threshold, element_similarity

In [20]:
# Predict matches for all pairs in the blocked data 
print(f'Arxiv samples: {df_arxiv.shape[0]}')
print(f'GScholar samples: {df_gscholar.shape[0]}')

summary_df = pd.DataFrame()
THRESHOLDS = [T/100 for T in range(0, 101, 5)]

# Iterate through various thresholds to find the most matches without any duplicates
for T in tqdm(THRESHOLDS):

    # Set to store pairs of IDs matched
    ids_matched = set()
    
    # Iterate through candidates on the block
    for block_id, arxiv_id, gscholar_id in blocks.pairwise(ds_arxiv, ds_gscholar):
        
        # Find similarity at a given threshold
        match , similarity = elementwise_similarity(ds_arxiv.get_record(arxiv_id),
                                                    ds_gscholar.get_record(gscholar_id),
                                                    match_threshold=T)
        # If a match is found, add to the set of matches
        if match:
            ids_matched.add((arxiv_id, gscholar_id))
    
    # Count the number of unique elements derived from each source
    set_a = set()
    set_b = set()
    for tp in ids_matched:
        set_a.add(tp[0])
        set_b.add(tp[1])
    
    summary_df.at[T, 'Matches'] = int(len(ids_matched))
    summary_df.at[T, 'Set_A Size'] = int(len(set_a))
    summary_df.at[T, 'Set_B Size'] = int(len(ids_matched))
    summary_df.at[T, 'Duplicates'] = int((len(ids_matched)-len(set_a)) + (len(ids_matched)-len(set_b)))
    
summary_df

  0%|                                                                                           | 0/21 [00:00<?, ?it/s]

Arxiv samples: 1151
GScholar samples: 738


100%|███████████████████████████████████████████████████████████████████████████████| 21/21 [1:48:31<00:00, 310.09s/it]


Unnamed: 0,Matches,Set_A Size,Set_B Size,Duplicates
0.0,342675.0,1144.0,342675.0,683468.0
0.05,342675.0,1144.0,342675.0,683468.0
0.1,342578.0,1144.0,342578.0,683274.0
0.15,339952.0,1144.0,339952.0,678022.0
0.2,323173.0,1144.0,323173.0,644464.0
0.25,273317.0,1144.0,273317.0,544752.0
0.3,187785.0,1139.0,187785.0,373693.0
0.35,105151.0,1107.0,105151.0,208457.0
0.4,50316.0,1037.0,50316.0,98857.0
0.45,20727.0,900.0,20727.0,39819.0


In [21]:
# Find the lowest threshold which gives no duplicates
optimal_threshold = summary_df[summary_df['Duplicates']==0].index[0]
optimal_threshold

0.9

In [23]:
# Manually set threshold to 0.85 to trade 104 extra True Positives for 3 extra False Positive
optimal_threshold = 0.85

In [24]:
# Generate matches based on the optimal (no-duplicate) threshold
print(f'Arxiv samples: {df_arxiv.shape[0]}')
print(f'GScholar samples: {df_gscholar.shape[0]}')

# Store tuples of matches IDs, as well as singletons witouth a match
ids_matched = set()
singles_arxiv = set()
singles_gscholar = set()

# Write matches (and non-matches) to a CSV
with open(f'Matches_{TOPIC}.csv', 'w') as predictions_full:
    for block_id, arxiv_id, gscholar_id in blocks.pairwise(ds_arxiv, ds_gscholar):

        match , similarity = elementwise_similarity(ds_arxiv.get_record(arxiv_id),
                                                    ds_gscholar.get_record(gscholar_id),
                                                    match_threshold=optimal_threshold)

        if match:
            ids_matched.add((arxiv_id, gscholar_id))
        else:
            singles_arxiv.add(arxiv_id)
            singles_gscholar.add(gscholar_id)
    
    # After finding all matches, write them to a csv
    for match_pair in ids_matched:
        predictions_full.write(f'{match_pair[0]},{match_pair[1]},1\n')
        # And ensure that no item in the matches is counted as a single
        try:
            singles_arxiv.remove(match_pair[0])
        except:
            pass
        try:
            singles_gscholar.remove(match_pair[1])
        except:
            pass
    
    # Then write all the singles which didn't find a match
    NULL = None
    for arxiv_id in singles_arxiv:
        predictions_full.write(f'{arxiv_id},{NULL},0\n')
    for gscholar_id in singles_gscholar:
        predictions_full.write(f'{NULL},{gscholar_id},0\n')        
        
print()
print(f'Matches: {len(ids_matched)}')
print(f'Non-Matches Arxiv: {len(singles_arxiv)}')
print(f'Non-Matches GScholar: {len(singles_gscholar)}')

Arxiv samples: 1151
GScholar samples: 738

Matches: 553
Non-Matches Arxiv: 594
Non-Matches GScholar: 185


### Create Merged Dataset

In [25]:
import rdflib
from rdflib import URIRef, Literal, Namespace
from rdflib.namespace import RDF, RDFS, XSD

MYNS = Namespace('http://inf558.org/myfakenamespace#')
SCHEMA = Namespace("https://schema.org/")

# Initliaze the graph
g = rdflib.Graph()

# Bind namespace and prefixes
g.bind('my_ns', MYNS)
g.bind('schema', SCHEMA)
g.bind('rdf', RDF)
g.bind('rdfs', RDFS)
g.bind('xsd', XSD)

In [26]:
# Load predictions to be used in populating the RDF
predictions_df = pd.read_csv(f'Matches_{TOPIC}.csv', header=None, names=['ARXIV_ID', 'GSCHOLAR_ID', 'LABEL'])
print(f'predictions_df.shape: {predictions_df.shape}')
predicted_matches = predictions_df['LABEL'].sum()
print(f'predicted matches: {predicted_matches}  [{100*predicted_matches/predictions_df.shape[0]:.2f} %]')
predictions_df.head()

predictions_df.shape: (1332, 3)
predicted matches: 553  [41.52 %]


Unnamed: 0,ARXIV_ID,GSCHOLAR_ID,LABEL
0,364,156,1
1,1120,713,1
2,1049,630,1
3,949,533,1
4,527,275,1


In [27]:
# Dataframe to store the merged datasets
df_merged = pd.DataFrame(columns = ['ID', 'title', 'authors', 'published', 'updated', 
                                    'abstract', 'categories', 'citations', 'arxiv_url', 'gscholar_url'],
                         dtype='object')
json_merged = {}

NEW_ID = 0

# Populate the RDF with predictions with a positive (1) label
for idx, row in tqdm(predictions_df.iterrows(), total=predictions_df.shape[0]):
    
    # Populate the json object
    json_merged[NEW_ID] = {}
    
    ### URI ###
    node_uri = URIRef(str(NEW_ID))
    g.add((node_uri, RDF.type, SCHEMA.ScholarlyArticle))
    df_merged.at[NEW_ID, 'ID'] = NEW_ID
    json_merged[NEW_ID]['ID'] = NEW_ID

    
    ### Title ###
    try:
        title_arxiv = str(df_arxiv[df_arxiv['ID'] == str(row['ARXIV_ID'])]['title'].values[0])
    except:
        title_arxiv = '<___>'
    try:
        title_gscholar = str(df_gscholar[df_gscholar['ID'] == str(row['GSCHOLAR_ID'])]['title'].values[0])
    except:
        title_gscholar = '<___>'
    title = title_arxiv if title_arxiv != '<___>' else title_gscholar if title_gscholar != '<___>' else None
    g.add((node_uri, SCHEMA.headline, Literal(title, datatype=SCHEMA.Text)))
    df_merged.at[NEW_ID, 'title'] = title
    json_merged[NEW_ID]['title'] = title

    
    ### Author(s) ###
    try:
        author_arxiv = df_arxiv[df_arxiv['ID'] == str(row['ARXIV_ID'])]['authors'].values[0]
        author_arxiv = [name.strip() for name in author_arxiv if name != '<___>']
    except:
        author_arxiv = '<___>'
    try:
        author_gscholar = df_gscholar[df_gscholar['ID'] == str(row['GSCHOLAR_ID'])]['authors'].values[0]
        author_gscholar = [name.strip() for name in author_gscholar if name != '<___>']
    except:
        author_gscholar = '<___>'
    if author_arxiv != '<___>':
        authors = list(set(author_arxiv))
    else:
        authors = list(set(author_gscholar))           
    [g.add((node_uri, SCHEMA.author, Literal(author, datatype=SCHEMA.Person))) for author in authors]
    df_merged.at[NEW_ID, 'authors'] = authors
    json_merged[NEW_ID]['authors'] = authors
                       
    ### Published ###
    try:
        published = str(df_arxiv[df_arxiv['ID'] == str(row['ARXIV_ID'])]['published'].values[0])
        g.add((node_uri, SCHEMA.datePublished, Literal(published, datatype=SCHEMA.DateTime)))
        df_merged.at[NEW_ID, 'published'] = published
        json_merged[NEW_ID]['published'] = published
    except:
        pass
        
                       
    ### Updated ###
    try:
        updated = str(df_arxiv[df_arxiv['ID'] == str(row['ARXIV_ID'])]['updated'].values[0])
        g.add((node_uri, SCHEMA.dateModified, Literal(updated, datatype=SCHEMA.DateTime)))
        df_merged.at[NEW_ID, 'updated'] = updated
        json_merged[NEW_ID]['updated'] = updated
    except:
        pass
          
                       
    ### Abstract ###
    try:
        abstract = str(df_arxiv[df_arxiv['ID'] == str(row['ARXIV_ID'])]['summary'].values[0]).strip()
        g.add((node_uri, SCHEMA.abstract, Literal(abstract, datatype=SCHEMA.Text)))
        df_merged.at[NEW_ID, 'abstract'] = abstract
        json_merged[NEW_ID]['abstract'] = abstract
    except:
        pass
       
                       
    ### Categories ###
    try:
        categories = df_arxiv[df_arxiv['ID'] == str(row['ARXIV_ID'])]['categories'].values[0]
        categories = [name.strip() for name in categories if name != '<___>']
        [g.add((node_uri, SCHEMA.genre, Literal(category, datatype=SCHEMA.Text))) for category in categories]
        df_merged.at[NEW_ID, 'categories'] = categories
        json_merged[NEW_ID]['categories'] = categories
    except:
        pass
          
                       
    ### Journal ###
    try:
        journal = str(df_gscholar[df_gscholar['ID'] == str(row['GSCHOLAR_ID'])]['journal'].values[0])
        g.add((node_uri, SCHEMA.publisher, Literal(journal, datatype=SCHEMA.Periodical))) #datatype=SCHEMA.Organisation
        df_merged.at[NEW_ID, 'journal'] = journal
        json_merged[NEW_ID]['journal'] = journal
    except:
        pass
     
                       
    ### Citations ###
    try:
        citations = str(df_gscholar[df_gscholar['ID'] == str(row['GSCHOLAR_ID'])]['citations'].values[0])
        g.add((node_uri, SCHEMA.commentCount, Literal(citations, datatype=SCHEMA.Integer)))
        df_merged.at[NEW_ID, 'citations'] = citations
        json_merged[NEW_ID]['citations'] = citations
    except:
        pass
            
                       
    ### Arxiv URL ###
    try:
        arxiv_url = str(df_arxiv[df_arxiv['ID'] == str(row['ARXIV_ID'])]['url'].values[0])
        g.add((node_uri, SCHEMA.url, Literal(arxiv_url, datatype=SCHEMA.URL)))
        df_merged.at[NEW_ID, 'arxiv_url'] = arxiv_url    
        json_merged[NEW_ID]['arxiv_url'] = arxiv_url
    except:
        pass
        
                       
    ### Google Scholar URL ###
    try:
        gscholar_url = str(df_gscholar[df_gscholar['ID'] == str(row['GSCHOLAR_ID'])]['url'].values[0])
        g.add((node_uri, SCHEMA.url, Literal(gscholar_url, datatype=SCHEMA.URL)))
        df_merged.at[NEW_ID, 'gscholar_url'] = gscholar_url   
        json_merged[NEW_ID]['gscholar_url'] = gscholar_url
    except:
        pass
             
                       
    NEW_ID += 1
    
# Save to disk using turtle format
g.serialize(f'Triples_{TOPIC}.ttl.', format="turtle")

# And save the merged DataFrame as CSV
df_merged.to_csv(f'Merged_{TOPIC}.csv', index=False)

# Also save as Json, just because
with open(f'Json_{TOPIC}.json', 'w') as fout:
    json.dump(json_merged, fout)

100%|██████████████████████████████████████████████████████████████████████████████| 1332/1332 [00:16<00:00, 81.36it/s]


In [28]:
df_merged.head(3)

Unnamed: 0,ID,title,authors,published,updated,abstract,categories,citations,arxiv_url,gscholar_url,journal
0,0,HyperNet: Towards Accurate Region Proposal Gen...,"[Fuchun Sun, Anbang Yao, Yurong Chen, Tao Kong]",2016-04-03T06:52:14Z,2016-04-03T06:52:14Z,Almost all of the current top-performing objec...,[cs.CV],485,http://arxiv.org/abs/1604.00600v1,http://scholar.google.com/scholar?oi=bibs&clus...,Proceedings of the IEEE Conference on Computer...
1,1,Nested Hierarchical Dirichlet Processes,"[Michael I. Jordan, David M. Blei, Chong Wang,...",2012-10-25T04:25:00Z,2014-05-02T16:36:57Z,We develop a nested hierarchical Dirichlet pro...,"[stat.ML, cs.LG]",174,http://arxiv.org/abs/1210.6738v4,http://scholar.google.com/scholar?oi=bibs&clus...,IEEE Transactions on Pattern Analysis & Machin...
2,2,An End-to-End Trainable Neural Network for Ima...,"[Cong Yao, Xiang Bai, Baoguang Shi]",2015-07-21T06:26:32Z,2015-07-21T06:26:32Z,Image-based sequence recognition has been a lo...,[cs.CV],807,http://arxiv.org/abs/1507.05717v1,http://scholar.google.com/scholar?oi=bibs&clus...,IEEE Transactions on Pattern Analysis and Mach...


In [29]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1332 entries, 0 to 1331
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   ID            1332 non-null   object
 1   title         1332 non-null   object
 2   authors       1332 non-null   object
 3   published     1147 non-null   object
 4   updated       1147 non-null   object
 5   abstract      1147 non-null   object
 6   categories    1147 non-null   object
 7   citations     738 non-null    object
 8   arxiv_url     1147 non-null   object
 9   gscholar_url  738 non-null    object
 10  journal       738 non-null    object
dtypes: object(11)
memory usage: 157.2+ KB


In [30]:
json_merged

{0: {'ID': 0,
  'title': 'HyperNet: Towards Accurate Region Proposal Generation and Joint Object  Detection',
  'authors': ['Fuchun Sun', 'Anbang Yao', 'Yurong Chen', 'Tao Kong'],
  'published': '2016-04-03T06:52:14Z',
  'updated': '2016-04-03T06:52:14Z',
  'abstract': 'Almost all of the current top-performing object detection networks employregion proposals to guide the search for object instances. State-of-the-artregion proposal methods usually need several thousand proposals to get highrecall, thus hurting the detection efficiency. Although the latest RegionProposal Network method gets promising detection accuracy with several hundredproposals, it still struggles in small-size object detection and preciselocalization (e.g., large IoU thresholds), mainly due to the coarseness of itsfeature maps. In this paper, we present a deep hierarchical network, namelyHyperNet, for handling region proposal generation and object detection jointly.Our HyperNet is primarily based on an elaborately d

## End