In [1]:
import pandas as pd
import rltk
import re
from difflib import SequenceMatcher
from tqdm import tqdm
import json

## Load Data

In [2]:
TOPIC = 'nlp'

**Arxiv**

In [3]:
df_arxiv = pd.read_json(f'crawler558/arxiv_crawler_{TOPIC}.jl', lines=True)
print(f'df_arxiv.shape pre  deduplucation: {df_arxiv.shape}')
df_arxiv = df_arxiv.drop_duplicates(subset='title')
print(f'df_arxiv.shape post deduplucation: {df_arxiv.shape}')
df_arxiv.head()

df_arxiv.shape pre  deduplucation: (423, 7)
df_arxiv.shape post deduplucation: (422, 7)


Unnamed: 0,url,updated,published,title,summary,authors,categories
0,http://arxiv.org/abs/1503.00075v3,2015-05-30T06:51:20Z,2015-02-28T06:31:50Z,Improved Semantic Representations From Tree-St...,Because of their superior ability to preserv...,"[Kai Sheng Tai, Richard Socher, Christopher D....","[cs.CL, cs.AI, cs.LG]"
1,http://arxiv.org/abs/1603.01354v5,2016-05-29T00:42:15Z,2016-03-04T05:55:02Z,End-to-end Sequence Labeling via Bi-directiona...,State-of-the-art sequence labeling systems t...,"[Xuezhe Ma, Eduard Hovy]","[cs.LG, cs.CL, stat.ML]"
2,http://arxiv.org/abs/1904.09076v1,2019-04-19T04:38:32Z,2019-04-19T04:38:32Z,Suggestion Mining from Online Reviews using UL...,In this paper we present our approach and th...,"[Sarthak Anand, Debanjan Mahata, Kartik Aggarw...",[cs.CL]
3,http://arxiv.org/abs/1801.06146v5,2018-05-23T09:23:47Z,2018-01-18T17:54:52Z,Universal Language Model Fine-tuning for Text ...,Inductive transfer learning has greatly impa...,"[Jeremy Howard, Sebastian Ruder]","[cs.CL, cs.LG, stat.ML]"
4,http://arxiv.org/abs/1704.04368v2,2017-04-25T05:47:50Z,2017-04-14T07:55:19Z,Get To The Point: Summarization with Pointer-G...,Neural sequence-to-sequence models have prov...,"[Abigail See, Peter J. Liu, Christopher D. Man...",[cs.CL]


In [4]:
# Generate an id column for RLTK to use
df_arxiv.reset_index(inplace=True)
df_arxiv['index'] = df_arxiv['index'].astype('str')
df_arxiv.rename(columns={'index':'ID'}, inplace=True)
df_arxiv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 422 entries, 0 to 421
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          422 non-null    object
 1   url         422 non-null    object
 2   updated     422 non-null    object
 3   published   422 non-null    object
 4   title       422 non-null    object
 5   summary     422 non-null    object
 6   authors     422 non-null    object
 7   categories  422 non-null    object
dtypes: object(8)
memory usage: 26.5+ KB


**Google Scholar**

In [5]:
df_gscholar = pd.read_json(f'Google_Scholar/articles_{TOPIC}.json')
print(f'df_gscholar.shape pre  deduplication: {df_gscholar.shape}')
df_gscholar = df_gscholar.drop_duplicates(subset='title')
print(f'df_gscholar.shape post deduplication: {df_gscholar.shape}')

# Fix the wrong URLs
df_gscholar['url'] = df_gscholar['url'].apply(lambda x: x[27:])

df_gscholar.head()

df_gscholar.shape pre  deduplication: (547, 6)
df_gscholar.shape post deduplication: (547, 6)


Unnamed: 0,title,url,authors,journal,citations,year
0,Improved Semantic Representations From Tree-St...,http://scholar.google.com/scholar?oi=bibs&clus...,"[KS Tai, R Socher, CD Manning]",Proceedings of the 53rd Annual Meeting of the ...,1943,2015
1,End-to-end Sequence Labeling via Bi-directiona...,http://scholar.google.com/scholar?oi=bibs&clus...,"[X Ma, E Hovy]",Proceedings of the 54th Annual Meeting of the ...,1349,2016
2,Universal Language Model Fine-tuning for Text ...,http://scholar.google.com/scholar?oi=bibs&clus...,"[J Howard, S Ruder]",Proceedings of the 56th Annual Meeting of the ...,998,2018
3,Get To The Point: Summarization with Pointer-G...,http://scholar.google.com/scholar?oi=bibs&clus...,"[A See, PJ Liu, CD Manning]",Proceedings of the 55th Annual Meeting of the ...,951,2017
4,OpenNMT: Open-Source Toolkit for Neural Machin...,http://scholar.google.com/scholar?oi=bibs&clus...,"[G Klein, Y Kim, Y Deng, J Senellart, AM Rush]","ACL (System Demonstrations), 67-72",899,2017


In [6]:
# Generate an id column for RLTK to use
df_gscholar.reset_index(inplace=True)
df_gscholar['index'] = df_gscholar['index'].astype('str')
df_gscholar.rename(columns={'index':'ID'}, inplace=True)

# Also set all columns to string type
df_gscholar['citations'] = df_gscholar['citations'].astype('str')
df_gscholar['year'] = df_gscholar['year'].astype('str')
df_gscholar.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 547 entries, 0 to 546
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         547 non-null    object
 1   title      547 non-null    object
 2   url        547 non-null    object
 3   authors    547 non-null    object
 4   journal    547 non-null    object
 5   citations  547 non-null    object
 6   year       547 non-null    object
dtypes: object(7)
memory usage: 30.0+ KB


### Naïve Data Merge

In [7]:
# How many matches can be found with a naÏve identical string approach?
arxiv_titles = df_arxiv['title']
gscholar_titles = df_gscholar['title']
print(len([1 for w in arxiv_titles.values if w in gscholar_titles.values]))

138


In [8]:
# Merge both datasets by "SQL" join
df_merged = df_arxiv.merge(df_gscholar, on='title', how='outer')
print(f'df_merged.shape: {df_merged.shape}')

# Number of match found with pd.merge:
print(f'Number of match found with pd.merge: {df_arxiv.shape[0] + df_gscholar.shape[0] - df_merged.shape[0]}')

df_merged.head(3)

df_merged.shape: (831, 14)
Number of match found with pd.merge: 138


Unnamed: 0,ID_x,url_x,updated,published,title,summary,authors_x,categories,ID_y,url_y,authors_y,journal,citations,year
0,0,http://arxiv.org/abs/1503.00075v3,2015-05-30T06:51:20Z,2015-02-28T06:31:50Z,Improved Semantic Representations From Tree-St...,Because of their superior ability to preserv...,"[Kai Sheng Tai, Richard Socher, Christopher D....","[cs.CL, cs.AI, cs.LG]",,,,,,
1,1,http://arxiv.org/abs/1603.01354v5,2016-05-29T00:42:15Z,2016-03-04T05:55:02Z,End-to-end Sequence Labeling via Bi-directiona...,State-of-the-art sequence labeling systems t...,"[Xuezhe Ma, Eduard Hovy]","[cs.LG, cs.CL, stat.ML]",1.0,http://scholar.google.com/scholar?oi=bibs&clus...,"[X Ma, E Hovy]",Proceedings of the 54th Annual Meeting of the ...,1349.0,2016.0
2,2,http://arxiv.org/abs/1904.09076v1,2019-04-19T04:38:32Z,2019-04-19T04:38:32Z,Suggestion Mining from Online Reviews using UL...,In this paper we present our approach and th...,"[Sarthak Anand, Debanjan Mahata, Kartik Aggarw...",[cs.CL],,,,,,


## RLTK Data Merge

In [9]:
# RLTK Tokenizer
tokenizer = rltk.CrfTokenizer()

**Arxiv Dataset**

In [10]:
class ArxivRecord(rltk.Record):
    def __init__(self, raw_object):
        super().__init__(raw_object)
        self.name = 'ArxivRecord'
        
    @property
    def id(self):
        return self.raw_object['ID']
    
    @rltk.cached_property
    def authors_string(self):
        return self.raw_object['authors']
    
    @rltk.cached_property
    def title_string(self):
        return self.raw_object['title']
        
    @rltk.cached_property
    def summary_string(self):
        return self.raw_object['summary']
    
    @rltk.cached_property
    def categories_string(self):
        return self.raw_object['categories']
    
    @rltk.cached_property
    def published_string(self):
        return self.raw_object['published']
    
    @rltk.cached_property
    def updated_string(self):
        return self.raw_object['updated']
    
    @rltk.cached_property
    def url_string(self):
        return self.raw_object['url']
    
    @rltk.cached_property
    def blocking_tokens(self):
        tokens = ' '.join([self.title_string])
        tokens = re.sub(r'\bThe\b', '', tokens)
        tokens = re.sub(r'\bthe\b', '', tokens)
        tokens = re.sub(r'\bof\b', '', tokens)
        tokens = re.sub(r"\b's\b", '', tokens)
        tokens = re.sub(r'\band\b', '', tokens)
        tokens = re.sub(r'\bI\b', '', tokens)
        tokens = re.sub(r'\bA\b', '', tokens)
        tokens = re.sub(r'\bin\b', '', tokens)
        tokens = re.sub(r'\bfor\b', '', tokens)
        tokens = re.sub(r'\bon\b', '', tokens)
        tokens = re.sub(r'\bwith\b', '', tokens)
        return set(tokenizer.tokenize(tokens))

In [11]:
ds_arxiv = rltk.Dataset(reader=rltk.DataFrameReader(df_arxiv), record_class=ArxivRecord, adapter=rltk.MemoryKeyValueAdapter())
print(type(ds_arxiv))
ds_arxiv.generate_dataframe().head(3)

<class 'rltk.dataset.Dataset'>


Unnamed: 0,id,authors_string,title_string,summary_string,categories_string,published_string,updated_string,url_string,blocking_tokens
0,0,"[Kai Sheng Tai, Richard Socher, Christopher D....",Improved Semantic Representations From Tree-St...,Because of their superior ability to preserv...,"[cs.CL, cs.AI, cs.LG]",2015-02-28T06:31:50Z,2015-05-30T06:51:20Z,http://arxiv.org/abs/1503.00075v3,"{Structured, Semantic, Improved, Memory, Tree,..."
1,1,"[Xuezhe Ma, Eduard Hovy]",End-to-end Sequence Labeling via Bi-directiona...,State-of-the-art sequence labeling systems t...,"[cs.LG, cs.CL, stat.ML]",2016-03-04T05:55:02Z,2016-05-29T00:42:15Z,http://arxiv.org/abs/1603.01354v5,"{Labeling, Sequence, via, directional, End, en..."
2,2,"[Sarthak Anand, Debanjan Mahata, Kartik Aggarw...",Suggestion Mining from Online Reviews using UL...,In this paper we present our approach and th...,[cs.CL],2019-04-19T04:38:32Z,2019-04-19T04:38:32Z,http://arxiv.org/abs/1904.09076v1,"{Online, ULMFiT, Reviews, using, Suggestion, M..."


**Google Scholar Dataset**

In [12]:
class GScholarRecord(rltk.Record):
    def __init__(self, raw_object):
        super().__init__(raw_object)
        self.name = 'GScholarRecord'
        
    @property
    def id(self):
        return self.raw_object['ID']
    
    @rltk.cached_property
    def authors_string(self):
        return self.raw_object['authors']
    
    @rltk.cached_property
    def title_string(self):
        return self.raw_object['title']
        
    @rltk.cached_property
    def journal_string(self):
        return self.raw_object['journal']
    
    @rltk.cached_property
    def citations_string(self):
        return self.raw_object['citations']
    
    @rltk.cached_property
    def year_string(self):
        return self.raw_object['year']
    
    @rltk.cached_property
    def url_string(self):
        return self.raw_object['url']
    
    @rltk.cached_property
    def blocking_tokens(self):
        tokens = ' '.join([self.title_string])
        tokens = re.sub(r'\bThe\b', '', tokens)
        tokens = re.sub(r'\bthe\b', '', tokens)
        tokens = re.sub(r'\bof\b', '', tokens)
        tokens = re.sub(r"\b's\b", '', tokens)
        tokens = re.sub(r'\band\b', '', tokens)
        tokens = re.sub(r'\bI\b', '', tokens)
        tokens = re.sub(r'\bA\b', '', tokens)
        tokens = re.sub(r'\bin\b', '', tokens)
        tokens = re.sub(r'\bfor\b', '', tokens)
        tokens = re.sub(r'\bon\b', '', tokens)
        tokens = re.sub(r'\bwith\b', '', tokens)
        return set(tokenizer.tokenize(tokens))

In [13]:
ds_gscholar = rltk.Dataset(reader=rltk.DataFrameReader(df_gscholar), record_class=GScholarRecord, adapter=rltk.MemoryKeyValueAdapter())
print(type(ds_gscholar))
ds_gscholar.generate_dataframe().head(3)

<class 'rltk.dataset.Dataset'>


Unnamed: 0,id,authors_string,title_string,journal_string,citations_string,year_string,url_string,blocking_tokens
0,0,"[KS Tai, R Socher, CD Manning]",Improved Semantic Representations From Tree-St...,Proceedings of the 53rd Annual Meeting of the ...,1943,2015,http://scholar.google.com/scholar?oi=bibs&clus...,"{Structured, Semantic, Improved, Memory, Tree,..."
1,1,"[X Ma, E Hovy]",End-to-end Sequence Labeling via Bi-directiona...,Proceedings of the 54th Annual Meeting of the ...,1349,2016,http://scholar.google.com/scholar?oi=bibs&clus...,"{Labeling, Sequence, via, directional, End, en..."
2,2,"[J Howard, S Ruder]",Universal Language Model Fine-tuning for Text ...,Proceedings of the 56th Annual Meeting of the ...,998,2018,http://scholar.google.com/scholar?oi=bibs&clus...,"{Universal, Text, Language, tuning, Model, Fin..."


### Blocking

In [14]:
# Generate blocks from tokens
token_blocker = rltk.TokenBlockGenerator()
blocks = token_blocker.generate(
    token_blocker.block(ds_arxiv, property_='blocking_tokens'),
    token_blocker.block(ds_gscholar, property_='blocking_tokens'))
print(type(blocks))

<class 'rltk.blocking.block.Block'>


In [15]:
# Extract all record pairs from the block
record_pairs = rltk.get_record_pairs(ds_arxiv, ds_gscholar, block=blocks)

# Get the total number of record pairs generated
compared_pairs = len(list(record_pairs))

# Get the number of elements in each rltk.Dataset
tally_imdb = ds_arxiv.generate_dataframe().shape[0]
tally_tmd = ds_gscholar.generate_dataframe().shape[0]

# Calculate the total number of pairs if both datasets were to be compared without any blocking (eg: a double for loop)
tally_unblocked = tally_imdb * tally_tmd

# Calculate how much smaller the blocked pairings are
reduction_ratio = compared_pairs / tally_unblocked

# Calculate the reduction ratio (the inverse of the )
reduction_ratio = 1 - reduction_ratio
print(f'Reduction Ratio: {reduction_ratio:.5f}')

Reduction Ratio: 0.40544


### Matching Functions

In [16]:
def title_similarity(arxiv_tuple, gscholar_tuple):
    arxiv_title = arxiv_tuple.title_string.strip().lower()
    gscholar_title = gscholar_tuple.title_string.strip().lower()
    similarity = SequenceMatcher(None, arxiv_title, gscholar_title).ratio()

    penalties = sum([len(arxiv_title)<=6,
                     len(gscholar_title)<=6])

    return similarity * (0.9**penalties)

In [17]:
def author_similarity(arxiv_tuple, gscholar_tuple):
    arxiv_author = ' '.join(arxiv_tuple.authors_string).strip().lower()
    gscholar_author = ' '.join(gscholar_tuple.authors_string).strip().lower()
    similarity = SequenceMatcher(None, arxiv_author, gscholar_author).ratio() 
    return similarity

In [18]:
def year_similarity(arxiv_tuple, gscholar_tuple):
    arxiv_year = int(float(arxiv_tuple.updated_string[0:4]))
    gscholar_year = int(float(gscholar_tuple.year_string))
    similarity = 1 /(1 + abs(arxiv_year-gscholar_year))
    return similarity

In [19]:
def elementwise_similarity(arxiv_tuple, gscholar_tuple, match_threshold=0.75):
    sim_title = title_similarity(arxiv_tuple, gscholar_tuple)
    sim_author = author_similarity(arxiv_tuple, gscholar_tuple)
    sim_year = year_similarity(arxiv_tuple, gscholar_tuple)

    element_similarity = (0.70 * sim_title) + (0.15 * sim_author) + (0.15 * sim_year)

    return element_similarity > match_threshold, element_similarity

In [20]:
# Predict matches for all pairs in the blocked data 
print(f'Arxiv samples: {df_arxiv.shape[0]}')
print(f'GScholar samples: {df_gscholar.shape[0]}')

summary_df = pd.DataFrame()
THRESHOLDS = [T/100 for T in range(0, 101, 5)]

# Iterate through various thresholds to find the most matches without any duplicates
for T in tqdm(THRESHOLDS):

    # Set to store pairs of IDs matched
    ids_matched = set()
    
    # Iterate through candidates on the block
    for block_id, arxiv_id, gscholar_id in blocks.pairwise(ds_arxiv, ds_gscholar):
        
        # Find similarity at a given threshold
        match , similarity = elementwise_similarity(ds_arxiv.get_record(arxiv_id),
                                                    ds_gscholar.get_record(gscholar_id),
                                                    match_threshold=T)
        # If a match is found, add to the set of matches
        if match:
            ids_matched.add((arxiv_id, gscholar_id))
    
    # Count the number of unique elements derived from each source
    set_a = set()
    set_b = set()
    for tp in ids_matched:
        set_a.add(tp[0])
        set_b.add(tp[1])
    
    summary_df.at[T, 'Matches'] = int(len(ids_matched))
    summary_df.at[T, 'Set_A Size'] = int(len(set_a))
    summary_df.at[T, 'Set_B Size'] = int(len(ids_matched))
    summary_df.at[T, 'Duplicates'] = int((len(ids_matched)-len(set_a)) + (len(ids_matched)-len(set_b)))
    
summary_df

  0%|                                                                                           | 0/21 [00:00<?, ?it/s]

Arxiv samples: 422
GScholar samples: 547


100%|█████████████████████████████████████████████████████████████████████████████████| 21/21 [35:23<00:00, 101.10s/it]


Unnamed: 0,Matches,Set_A Size,Set_B Size,Duplicates
0.0,97935.0,422.0,97935.0,194901.0
0.05,97935.0,422.0,97935.0,194901.0
0.1,97927.0,422.0,97927.0,194885.0
0.15,97457.0,422.0,97457.0,193945.0
0.2,93822.0,422.0,93822.0,186675.0
0.25,81778.0,422.0,81778.0,162587.0
0.3,59305.0,422.0,59305.0,117641.0
0.35,35590.0,421.0,35590.0,70212.0
0.4,17955.0,415.0,17955.0,34951.0
0.45,7662.0,400.0,7662.0,14391.0


In [21]:
# Find the lowest threshold which gives no duplicates
optimal_threshold = summary_df[summary_df['Duplicates']==0].index[0]
optimal_threshold

0.95

In [31]:
# Manually set threshold to 0.85 to trade 76 extra True Positives for 1 extra False Positive
optimal_threshold = 0.85

In [32]:
# Generate matches based on the optimal (no-duplicate) threshold
print(f'Arxiv samples: {df_arxiv.shape[0]}')
print(f'GScholar samples: {df_gscholar.shape[0]}')

# Store tuples of matches IDs, as well as singletons witouth a match
ids_matched = set()
singles_arxiv = set()
singles_gscholar = set()

# Write matches (and non-matches) to a CSV
with open(f'Matches_{TOPIC}.csv', 'w') as predictions_full:
    for block_id, arxiv_id, gscholar_id in blocks.pairwise(ds_arxiv, ds_gscholar):

        match , similarity = elementwise_similarity(ds_arxiv.get_record(arxiv_id),
                                                    ds_gscholar.get_record(gscholar_id),
                                                    match_threshold=optimal_threshold)

        if match:
            ids_matched.add((arxiv_id, gscholar_id))
        else:
            singles_arxiv.add(arxiv_id)
            singles_gscholar.add(gscholar_id)
    
    # After finding all matches, write them to a csv
    for match_pair in ids_matched:
        predictions_full.write(f'{match_pair[0]},{match_pair[1]},1\n')
        # And ensure that no item in the matches is counted as a single
        try:
            singles_arxiv.remove(match_pair[0])
        except:
            pass
        try:
            singles_gscholar.remove(match_pair[1])
        except:
            pass
    
    # Then write all the singles which didn't find a match
    NULL = None
    for arxiv_id in singles_arxiv:
        predictions_full.write(f'{arxiv_id},{NULL},0\n')
    for gscholar_id in singles_gscholar:
        predictions_full.write(f'{NULL},{gscholar_id},0\n')        
        
print()
print(f'Matches: {len(ids_matched)}')
print(f'Non-Matches Arxiv: {len(singles_arxiv)}')
print(f'Non-Matches GScholar: {len(singles_gscholar)}')

Arxiv samples: 422
GScholar samples: 547

Matches: 322
Non-Matches Arxiv: 100
Non-Matches GScholar: 226


### Create Merged Dataset

In [33]:
import rdflib
from rdflib import URIRef, Literal, Namespace
from rdflib.namespace import RDF, RDFS, XSD

MYNS = Namespace('http://inf558.org/myfakenamespace#')
SCHEMA = Namespace("https://schema.org/")

# Initliaze the graph
g = rdflib.Graph()

# Bind namespace and prefixes
g.bind('my_ns', MYNS)
g.bind('schema', SCHEMA)
g.bind('rdf', RDF)
g.bind('rdfs', RDFS)
g.bind('xsd', XSD)

In [34]:
# Load predictions to be used in populating the RDF
predictions_df = pd.read_csv(f'Matches_{TOPIC}.csv', header=None, names=['ARXIV_ID', 'GSCHOLAR_ID', 'LABEL'])
print(f'predictions_df.shape: {predictions_df.shape}')
predicted_matches = predictions_df['LABEL'].sum()
print(f'predicted matches: {predicted_matches}  [{100*predicted_matches/predictions_df.shape[0]:.2f} %]')
predictions_df.head()

predictions_df.shape: (648, 3)
predicted matches: 322  [49.69 %]


Unnamed: 0,ARXIV_ID,GSCHOLAR_ID,LABEL
0,218,271,1
1,229,290,1
2,84,110,1
3,119,164,1
4,215,267,1


In [35]:
# Dataframe to store the merged datasets
df_merged = pd.DataFrame(columns = ['ID', 'title', 'authors', 'published', 'updated', 
                                    'abstract', 'categories', 'citations', 'arxiv_url', 'gscholar_url'],
                         dtype='object')
json_merged = {}

NEW_ID = 0

# Populate the RDF with predictions with a positive (1) label
for idx, row in tqdm(predictions_df.iterrows(), total=predictions_df.shape[0]):
    
    # Populate the json object
    json_merged[NEW_ID] = {}
    
    ### URI ###
    node_uri = URIRef(str(NEW_ID))
    g.add((node_uri, RDF.type, SCHEMA.ScholarlyArticle))
    df_merged.at[NEW_ID, 'ID'] = NEW_ID
    json_merged[NEW_ID]['ID'] = NEW_ID

    
    ### Title ###
    try:
        title_arxiv = str(df_arxiv[df_arxiv['ID'] == str(row['ARXIV_ID'])]['title'].values[0])
    except:
        title_arxiv = '<___>'
    try:
        title_gscholar = str(df_gscholar[df_gscholar['ID'] == str(row['GSCHOLAR_ID'])]['title'].values[0])
    except:
        title_gscholar = '<___>'
    title = title_arxiv if title_arxiv != '<___>' else title_gscholar if title_gscholar != '<___>' else None
    g.add((node_uri, SCHEMA.headline, Literal(title, datatype=SCHEMA.Text)))
    df_merged.at[NEW_ID, 'title'] = title
    json_merged[NEW_ID]['title'] = title

    
    ### Author(s) ###
    try:
        author_arxiv = df_arxiv[df_arxiv['ID'] == str(row['ARXIV_ID'])]['authors'].values[0]
        author_arxiv = [name.strip() for name in author_arxiv if name != '<___>']
    except:
        author_arxiv = '<___>'
    try:
        author_gscholar = df_gscholar[df_gscholar['ID'] == str(row['GSCHOLAR_ID'])]['authors'].values[0]
        author_gscholar = [name.strip() for name in author_gscholar if name != '<___>']
    except:
        author_gscholar = '<___>'
    if author_arxiv != '<___>':
        authors = list(set(author_arxiv))
    else:
        authors = list(set(author_gscholar))           
    [g.add((node_uri, SCHEMA.author, Literal(author, datatype=SCHEMA.Person))) for author in authors]
    df_merged.at[NEW_ID, 'authors'] = authors
    json_merged[NEW_ID]['authors'] = authors
                       
    ### Published ###
    try:
        published = str(df_arxiv[df_arxiv['ID'] == str(row['ARXIV_ID'])]['published'].values[0])
        g.add((node_uri, SCHEMA.datePublished, Literal(published, datatype=SCHEMA.DateTime)))
        df_merged.at[NEW_ID, 'published'] = published
        json_merged[NEW_ID]['published'] = published
    except:
        pass
        
                       
    ### Updated ###
    try:
        updated = str(df_arxiv[df_arxiv['ID'] == str(row['ARXIV_ID'])]['updated'].values[0])
        g.add((node_uri, SCHEMA.dateModified, Literal(updated, datatype=SCHEMA.DateTime)))
        df_merged.at[NEW_ID, 'updated'] = updated
        json_merged[NEW_ID]['updated'] = updated
    except:
        pass
          
                       
    ### Abstract ###
    try:
        abstract = str(df_arxiv[df_arxiv['ID'] == str(row['ARXIV_ID'])]['summary'].values[0]).strip()
        g.add((node_uri, SCHEMA.abstract, Literal(abstract, datatype=SCHEMA.Text)))
        df_merged.at[NEW_ID, 'abstract'] = abstract
        json_merged[NEW_ID]['abstract'] = abstract
    except:
        pass
       
                       
    ### Categories ###
    try:
        categories = df_arxiv[df_arxiv['ID'] == str(row['ARXIV_ID'])]['categories'].values[0]
        categories = [name.strip() for name in categories if name != '<___>']
        [g.add((node_uri, SCHEMA.genre, Literal(category, datatype=SCHEMA.Text))) for category in categories]
        df_merged.at[NEW_ID, 'categories'] = categories
        json_merged[NEW_ID]['categories'] = categories
    except:
        pass
          
                       
    ### Journal ###
    try:
        journal = str(df_gscholar[df_gscholar['ID'] == str(row['GSCHOLAR_ID'])]['journal'].values[0])
        g.add((node_uri, SCHEMA.publisher, Literal(journal, datatype=SCHEMA.Periodical))) #datatype=SCHEMA.Organisation
        df_merged.at[NEW_ID, 'journal'] = journal
        json_merged[NEW_ID]['journal'] = journal
    except:
        pass
     
                       
    ### Citations ###
    try:
        citations = str(df_gscholar[df_gscholar['ID'] == str(row['GSCHOLAR_ID'])]['citations'].values[0])
        g.add((node_uri, SCHEMA.commentCount, Literal(citations, datatype=SCHEMA.Integer)))
        df_merged.at[NEW_ID, 'citations'] = citations
        json_merged[NEW_ID]['citations'] = citations
    except:
        pass
            
                       
    ### Arxiv URL ###
    try:
        arxiv_url = str(df_arxiv[df_arxiv['ID'] == str(row['ARXIV_ID'])]['url'].values[0])
        g.add((node_uri, SCHEMA.url, Literal(arxiv_url, datatype=SCHEMA.URL)))
        df_merged.at[NEW_ID, 'arxiv_url'] = arxiv_url    
        json_merged[NEW_ID]['arxiv_url'] = arxiv_url
    except:
        pass
        
                       
    ### Google Scholar URL ###
    try:
        gscholar_url = str(df_gscholar[df_gscholar['ID'] == str(row['GSCHOLAR_ID'])]['url'].values[0])
        g.add((node_uri, SCHEMA.url, Literal(gscholar_url, datatype=SCHEMA.URL)))
        df_merged.at[NEW_ID, 'gscholar_url'] = gscholar_url   
        json_merged[NEW_ID]['gscholar_url'] = gscholar_url
    except:
        pass
             
                       
    NEW_ID += 1
    
# Save to disk using turtle format
g.serialize(f'Triples_{TOPIC}.ttl.', format="turtle")

# And save the merged DataFrame as CSV
df_merged.to_csv(f'Merged_{TOPIC}.csv', index=False)

# Also save as Json, just because
with open(f'Json_{TOPIC}.json', 'w') as fout:
    json.dump(json_merged, fout)

100%|███████████████████████████████████████████████████████████████████████████████| 648/648 [00:06<00:00, 107.61it/s]


In [36]:
df_merged.head(3)

Unnamed: 0,ID,title,authors,published,updated,abstract,categories,citations,arxiv_url,gscholar_url,journal
0,0,Multi-Source Neural Translation,"[Barret Zoph, Kevin Knight]",2016-01-05T00:49:22Z,2016-01-05T00:49:22Z,We build a multi-source machine translation mo...,[cs.CL],195,http://arxiv.org/abs/1601.00710v1,http://scholar.google.com/scholar?oi=bibs&clus...,"HLT-NAACL, 30-34"
1,1,Deep Communicating Agents for Abstractive Summ...,"[Xiaodong He, Asli Celikyilmaz, Antoine Bossel...",2018-03-27T23:29:23Z,2018-08-15T18:54:22Z,We present deep communicating agents in an enc...,[cs.CL],146,http://arxiv.org/abs/1803.10357v3,http://scholar.google.com/scholar?oi=bibs&clus...,"NAACL-HLT, 1662-1675"
2,2,Selective Encoding for Abstractive Sentence Su...,"[Furu Wei, Ming Zhou, Nan Yang, Qingyu Zhou]",2017-04-24T07:57:37Z,2017-04-24T07:57:37Z,We propose a selective encoding model to exten...,[cs.CL],160,http://arxiv.org/abs/1704.07073v1,http://scholar.google.com/scholar?oi=bibs&clus...,Proceedings of the 55th Annual Meeting of the ...


In [37]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 648 entries, 0 to 647
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   ID            648 non-null    object
 1   title         648 non-null    object
 2   authors       648 non-null    object
 3   published     422 non-null    object
 4   updated       422 non-null    object
 5   abstract      422 non-null    object
 6   categories    422 non-null    object
 7   citations     548 non-null    object
 8   arxiv_url     422 non-null    object
 9   gscholar_url  548 non-null    object
 10  journal       548 non-null    object
dtypes: object(11)
memory usage: 76.9+ KB


In [38]:
json_merged

{0: {'ID': 0,
  'title': 'Multi-Source Neural Translation',
  'authors': ['Barret Zoph', 'Kevin Knight'],
  'published': '2016-01-05T00:49:22Z',
  'updated': '2016-01-05T00:49:22Z',
  'abstract': 'We build a multi-source machine translation model and train it to maximizethe probability of a target English string given French and German sources.Using the neural encoder-decoder framework, we explore several combinationmethods and report up to +4.8 Bleu increases on top of a very strongattention-based neural translation model.',
  'categories': ['cs.CL'],
  'journal': 'HLT-NAACL, 30-34',
  'citations': '195',
  'arxiv_url': 'http://arxiv.org/abs/1601.00710v1',
  'gscholar_url': 'http://scholar.google.com/scholar?oi=bibs&cluster=9798500345837394101&btnI=1&nossl=1&hl=en&oe=ASCII'},
 1: {'ID': 1,
  'title': 'Deep Communicating Agents for Abstractive Summarization',
  'authors': ['Xiaodong He',
   'Asli Celikyilmaz',
   'Antoine Bosselut',
   'Yejin Choi'],
  'published': '2018-03-27T23:29:23

## End