In [None]:
import json
import time

import numpy as np
import pandas as pd
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Maps author id -> list of paper records returned by the Semantic Scholar
# "author papers" endpoint.
author_papers = {}

# Load the paper list first so the file is not kept open during the API calls.
with open("data.json", "r") as file:
    data = json.load(file)

# get all the papers from an author
for paper in data:
    # entries without Semantic Scholar data carry no author list
    if paper['s2data'] is None: continue
    authors = paper['s2data']['authors']
    for author in authors:
        author_id = author['authorId']
        # every author is requested only once
        if author_id in author_papers: continue
        url = ("https://api.semanticscholar.org/graph/v1/author/{}/papers".format(author_id) +
               "?fields=url,title,year,authors,abstract,publicationDate")
        payload = requests.get(url).json()
        # The request has to be re-issued after each back-off: without it the
        # stored rate-limit answer never changes and the loop never terminates.
        while 'message' in payload and payload['message'] == 'Too Many Requests':
            print("--- Waiting")
            time.sleep(300)
            payload = requests.get(url).json()
        if 'data' not in payload:
            # Any other payload without a 'data' field (presumably an API error,
            # e.g. for an unknown author id -- TODO confirm) is reported instead
            # of raising KeyError and aborting the crawl.
            print("--- No papers returned for author {}: {}".format(author_id, payload))
        # The key is always stored so later lookups by author id succeed.
        author_papers[author_id] = payload.get('data', [])

In [None]:
# returns the cosine similarity value of the two given texts
def compute_cosine_similarity(text1, text2):
    """Return the TF-IDF cosine similarity of two texts, rounded to 2 decimals.

    The TF-IDF vocabulary and weights are fitted on the two given texts only,
    with English stop words removed.

    Args:
        text1: first text (a string).
        text2: second text (a string).

    Returns:
        The cosine similarity of the two TF-IDF vectors rounded to two decimal
        places, or 0.0 when the vectorizer rejects the input with ValueError.
    """
    # converts text into vectors with the TF-IDF
    vectorizer = TfidfVectorizer(stop_words='english')
    try:
        # fit_transform already yields one row per text, so the texts do not
        # need to be transformed a second time
        tfidf = vectorizer.fit_transform([text1, text2])
    except ValueError:
        # scikit-learn raises ValueError when no vocabulary can be built
        # (e.g. both texts are empty or contain only stop words); such texts
        # share no terms, so they are scored as not similar
        return 0.0

    # computes the cosine similarity (row slices keep the inputs 2-D)
    cs_score = cosine_similarity(tfidf[0:1], tfidf[1:2])

    return np.round(cs_score[0][0], 2)

In [None]:
# get the paper info in hw2
# Index every fetched paper record by its paperId; when a paper appears in
# several authors' lists, the record seen last is the one that is kept.
paper_dic = {}
for papers in author_papers.values():
    paper_dic.update((entry['paperId'], entry) for entry in papers)

In [None]:
# For each paper, get a list of relevant papers for each author
# Resulting shape: dic[source paperId][authorId] -> list of paper records
# (copies of the paper_dic entries) with an added 'score' key, ordered by
# descending similarity to the source paper's abstract.
dic = {}
with open("data.json", "r") as file:
    data = json.load(file)

for paper in data:
    if paper['s2data'] is None: continue
    source_id = paper['s2data']['paperId']
    abstract = paper['s2data']['abstract']
    authors = paper['s2data']['authors']
    if source_id not in dic:
        dic[source_id] = {}

    for author in authors:
        # check relevancy of the author's papers
        author_id = author['authorId']
        rel_paper_lst = []
        for rel_paper in author_papers[author_id]:
            rel_abstract = rel_paper['abstract']
            # a missing abstract on either side cannot be compared: score 0
            if abstract is None or rel_abstract is None:
                score = 0
            else:
                score = compute_cosine_similarity(abstract, rel_abstract)

            rel_paper_lst.append({
                "source": source_id,
                "target": rel_paper['paperId'],
                "similarity": score
            })
        paper_lst = sorted(rel_paper_lst, key=lambda d: d['similarity'], reverse=True)

        ret = []
        for p in paper_lst:
            # Copy the record before attaching the score. paper_dic holds one
            # shared dict per paper, so writing 'score' into it directly would
            # overwrite the score already stored for every other source paper
            # that lists the same target.
            temp = dict(paper_dic[p['target']])
            temp['score'] = p['similarity']
            ret.append(temp)
        dic[source_id][author_id] = ret

In [None]:
# rank based on the similarity score
# For every source paper and author: drop the source paper itself from the
# candidate list, then order the remaining papers by descending score.
for source_id, per_author in dic.items():
    for author, candidates in per_author.items():
        others = (item for item in candidates if item['paperId'] != source_id)
        per_author[author] = sorted(others, key=lambda item: item['score'], reverse=True)

In [None]:
# Append meta info
import json

# `data`: contents of relevant3.json (later cells iterate it as
# {paper id: {author id: [paper records]}}).
with open("relevant3.json", "r") as relevant_file:
    data = json.load(relevant_file)

# `original`: contents of data.json (iterated as a list of paper entries).
with open("data.json", "r") as original_file:
    original = json.load(original_file)

In [None]:
# get the original paper information
# Fields copied from each paper's 's2data' record into paper_mapper, in the
# order they appear in the resulting dict.
PAPER_FIELDS = (
    "url",
    "title",
    "abstract",
    "year",
    "citationCount",
    "referenceCount",
    "tldr",
    "authors",
    "externalIds",
)

# paper_mapper: paperId -> selected metadata of the paper.
paper_mapper, author_mapper = {}, {}
for p in original:
    info = p['s2data']
    # entries without Semantic Scholar data are skipped
    if info is None: continue

    # The paper id is used directly as the key rather than bound to a
    # module-level variable called `id`, which would shadow the builtin id()
    # for the rest of the kernel session.
    paper_mapper[info['paperId']] = {field: info[field] for field in PAPER_FIELDS}

In [None]:
import time

# author_mapper: author id -> JSON answer of the Semantic Scholar author endpoint.
author_mapper = {}
for author_dic in data.values():
    for author_id in author_dic:
        # each author is looked up only once
        if author_id in author_mapper: continue
        # api call for author info
        author_url = ("https://api.semanticscholar.org/graph/v1/author/{}".format(author_id) +
                      "?fields=name,aliases,affiliations,homepage,paperCount,citationCount,hIndex")
        while True:
            response = requests.get(author_url)
            body = response.json()
            rate_limited = 'message' in body and body['message'] == 'Too Many Requests'
            if not rate_limited:
                break
            # back off for five minutes, then repeat the same request
            print("Waiting ---")
            time.sleep(300)

        author_mapper[author_id] = body

In [None]:
# Persist the collected author information as JSON.
with open("authors.json", "w") as authors_file:
    json.dump(author_mapper, authors_file)

In [None]:
# get all tldr. This is too slow.
# The same paper occurs in many authors' lists, so each paper id is requested
# only once; `unique_papers` records the ids that were already requested.
output = {}
count = 0
unique_papers = set()
# TLDRS: paper id -> value of the 'tldr' field returned by the paper endpoint.
TLDRS = {}
import time
# add tldr for all relevant papers
for p, author_dic in data.items():
    for author_id, lst in author_dic.items():
        for rel_p in lst:
            rel_id = rel_p['paperId']
            # already requested for another source paper / author
            if rel_id in unique_papers: continue
            unique_papers.add(rel_id)

            tldr_url = ("https://api.semanticscholar.org/graph/v1/paper/{}".format(rel_id) +
                        "?fields=tldr")
            response = requests.get(tldr_url)
            payload = response.json()

            # back off for five minutes on rate limiting, then retry the request
            while 'message' in payload and payload['message'] == 'Too Many Requests':
                print("Waiting ----")
                time.sleep(300)
                response = requests.get(tldr_url)
                payload = response.json()

            # answers without a 'tldr' field are not recorded
            if 'tldr' in payload:
                TLDRS[rel_id] = payload['tldr']