In [59]:
import requests
import json

def get_author_details(author_ids, timeout=30):
    """Fetch profile data and paper listings for Semantic Scholar authors.

    For every ID, one request retrieves the author profile and a second
    request retrieves the author's papers (paper ID, corpus ID and title).

    Args:
        author_ids: Iterable of author ID strings to look up.
        timeout: Seconds to wait on each HTTP request before `requests`
            raises a timeout error instead of blocking indefinitely.

    Returns:
        A list with one dict per author whose profile request returned HTTP
        200. Each dict has the keys ``author_id``, ``author_name``,
        ``paper_count``, ``citation_count``, ``hindex``, ``affiliations`` and
        ``papers`` (a list of ``paper_id`` / ``corpus_id`` / ``title`` dicts).
        Authors whose profile request fails are reported on stdout and skipped.
    """
    BASE_URL = "https://api.semanticscholar.org/graph/v1/author/{}"
    FIELDS = "?fields=name,paperCount,citationCount,hIndex,affiliations"
    # NOTE: a single page of at most 1000 papers is requested; no pagination is done.
    PAPER_FIELDS = "/papers?fields=paperId,corpusId,title&limit=1000"

    authors = []

    for author_id in author_ids:
        author_url = BASE_URL.format(author_id)
        response = requests.get(author_url + FIELDS, timeout=timeout)

        if response.status_code != 200:
            print(f"Failed to retrieve data for author ID: {author_id}")
            continue

        data = response.json()

        # `.get(key, default)` passes an explicit JSON null through as None,
        # so name and affiliations are normalised with `or` before use.
        author_data = {
            "author_id": data.get("authorId", ""),
            "author_name": (data.get("name") or "").replace("Jimmy J.", "Jimmy"),
            "paper_count": data.get("paperCount", ""),
            "citation_count": data.get("citationCount", ""),
            "hindex": data.get("hIndex", ""),
            "affiliations": list(data.get("affiliations") or []),
            "papers": [],
        }

        # Fetching author's papers
        papers_response = requests.get(author_url + PAPER_FIELDS, timeout=timeout)
        if papers_response.status_code == 200:
            for paper in papers_response.json().get("data") or []:
                author_data["papers"].append({
                    "paper_id": paper.get("paperId", ""),
                    "corpus_id": paper.get("corpusId", ""),
                    "title": paper.get("title", ""),
                })
        else:
            # The profile is still returned, just with an empty paper list.
            print(f"Failed to retrieve papers for author ID: {author_id}")

        authors.append(author_data)

    return authors

# Example usage
author_ids = ["145580839", "1816753042", "1982950", "144783904", "2166511", "2928777", "2154743364", "2154743381", "2121626141"]  # Add more author IDs to this list as needed

# Alias (duplicate) author profiles and the canonical profile each one belongs to
MAP_WRONG_ID_TO_CORRECT_ID = {"2154743364": "145580839", "2154743381": "145580839", "2121626141": "145580839"}
author_details = get_author_details(author_ids)

# Fold alias profiles into their canonical author record. Records are keyed by
# canonical ID so the result does not depend on whether an alias is listed
# before or after the profile it belongs to.
merged_by_id = {}
seen_profile_ids = set()
for author in author_details:
    profile_id = author["author_id"]
    if profile_id in seen_profile_ids:
        # The same profile was requested twice; counting it again would inflate the totals.
        continue
    seen_profile_ids.add(profile_id)

    canonical_id = MAP_WRONG_ID_TO_CORRECT_ID.get(profile_id, profile_id)
    author["author_id"] = canonical_id
    if canonical_id not in merged_by_id:
        merged_by_id[canonical_id] = author
        continue

    primary, extra = merged_by_id[canonical_id], author
    if profile_id == canonical_id:
        # The canonical profile arrived after one of its aliases: keep the
        # canonical name, h-index and affiliations and fold the alias into it.
        primary, extra = author, merged_by_id[canonical_id]
        merged_by_id[canonical_id] = primary
    primary["paper_count"] += extra["paper_count"]
    primary["citation_count"] += extra["citation_count"]
    primary["papers"] += extra["papers"]

new_authors = list(merged_by_id.values())

# Sort author details by paper_count (ascending)
author_details = sorted(new_authors, key=lambda x: x["paper_count"], reverse=False)
print(json.dumps(author_details, indent=4))

# Per-author summary: the paper listing is reduced to its length
for author in author_details:
    print(author["author_name"])
    print(author["paper_count"])
    print(author["citation_count"])
    print(author["hindex"])
    print(len(author["papers"]))

[
    {
        "author_id": "1816753042",
        "author_name": "Ronak Pradeep",
        "paper_count": 27,
        "citation_count": 856,
        "hindex": 10,
        "affiliations": [
            "University of Waterloo"
        ],
        "papers": [
            {
                "paper_id": "20a7b1e274aff828466bba3760992aa54e14951a",
                "corpus_id": 258822999,
                "title": "How Does Generative Retrieval Scale to Millions of Passages?"
            },
            {
                "paper_id": "532a24fbf508c40234e9126c7a3f80dd92e8545f",
                "corpus_id": 257584922,
                "title": "Pre-processing Matters! Improved Wikipedia Corpora for Open-Domain Question Answering"
            },
            {
                "paper_id": "6a0c18aa1bf69798b15ef2355d5256ae8fa80186",
                "corpus_id": 257584993,
                "title": "PyGaggle: A Gaggle of Resources for Open-Domain Question Answering"
            },
            {
           

In [60]:
# Print title, paper ID and corpus ID (one per line) for each paper of the first author
first_author_papers = author_details[0]["papers"]
for entry in first_author_papers:
    for field in ("title", "paper_id", "corpus_id"):
        print(entry[field])

How Does Generative Retrieval Scale to Millions of Passages?
20a7b1e274aff828466bba3760992aa54e14951a
258822999
Pre-processing Matters! Improved Wikipedia Corpora for Open-Domain Question Answering
532a24fbf508c40234e9126c7a3f80dd92e8545f
257584922
PyGaggle: A Gaggle of Resources for Open-Domain Question Answering
6a0c18aa1bf69798b15ef2355d5256ae8fa80186
257584993
Zero-Shot Listwise Document Reranking with a Large Language Model
8be0ec99f80710887e3a8e6bac5fba51a8fd7186
258461030
ReadProbe: A Demo of Retrieval-Enhanced Large Language Models to Support Lateral Reading
8dec602dcd4df3db500ad36d01fffd53ec92701a
259145284
Vector Search with OpenAI Embeddings: Lucene Is All You Need
a08bde10a47059b0ba1e58b425dd080ec9b42339
261276669
RankVicuna: Zero-Shot Listwise Document Reranking with Open-Source Large Language Models
ba03ca8faa9f01cd9d26b80f08d421376f70de22
262825475
Neural Query Synthesis and Domain-Specific Ranking Templates for Multi-Stage Clinical Trial Matching
35a318073ab5b18cf364699

In [61]:
from tqdm import tqdm
def get_paper_details(paper_ids, author_id, timeout=30):
    """Fetch detailed metadata for each paper ID from the Semantic Scholar API.

    Args:
        paper_ids: Iterable of paper ID strings; one request is made per ID.
        author_id: Author the papers belong to. Used only in the progress-bar
            label and in the failure report.
        timeout: Seconds to wait on each HTTP request before `requests`
            raises a timeout error instead of blocking indefinitely.

    Returns:
        A list with one dict per paper whose request returned HTTP 200, in the
        order of ``paper_ids``. Any field the API returned as null is stored
        as ``{}``; a field missing from the response is stored as ``""``
        (``[]`` for ``authors`` and ``embedding``). Author IDs found in
        ``MAP_WRONG_ID_TO_CORRECT_ID`` are replaced by their mapped ID.
        Papers whose request fails are left out and listed on stdout.
    """
    PAPER_DETAIL_FIELDS = "?fields=title,venue,publicationVenue,year,authors,abstract,referenceCount,citationCount,fieldsOfStudy,s2FieldsOfStudy,publicationDate,journal,embedding.specter_v2,tldr"
    papers_info = []
    failed_paper_ids = []

    for paper_id in tqdm(paper_ids, desc=f"Fetching papers for author {author_id}", unit="paper"):
        paper_detail_url = f"https://api.semanticscholar.org/graph/v1/paper/{paper_id}" + PAPER_DETAIL_FIELDS
        paper_detail_response = requests.get(paper_detail_url, timeout=timeout)
        if paper_detail_response.status_code != 200:
            failed_paper_ids.append(paper_id)
            continue

        # Replace null fields with {} so the chained .get() on "embedding" below
        # cannot hit None.
        paper_detail = {
            key: ({} if value is None else value)
            for key, value in paper_detail_response.json().items()
        }
        paper_info = {
            "paper_id": paper_detail.get("paperId", ""),
            "corpus_id": paper_detail.get("corpusId", ""),
            "title": paper_detail.get("title", ""),
            "venue": paper_detail.get("venue", ""),
            "publication_venue": paper_detail.get("publicationVenue", ""),
            "year": paper_detail.get("year", ""),
            # Only the author ID is remapped; names and other values stay untouched.
            "authors": [
                {
                    key: (MAP_WRONG_ID_TO_CORRECT_ID.get(value, value) if key == "authorId" else value)
                    for key, value in paper_author.items()
                }
                for paper_author in paper_detail.get("authors", [])
            ],
            "abstract": paper_detail.get("abstract", ""),
            "reference_count": paper_detail.get("referenceCount", ""),
            "citation_count": paper_detail.get("citationCount", ""),
            "fields_of_study": paper_detail.get("fieldsOfStudy", ""),
            "s2_fields_of_study": paper_detail.get("s2FieldsOfStudy", ""),
            "publication_date": paper_detail.get("publicationDate", ""),
            "journal": paper_detail.get("journal", ""),
            "embedding": paper_detail.get("embedding", {}).get("vector", []),
            "tldr": paper_detail.get("tldr", ""),
        }
        papers_info.append(paper_info)

    if failed_paper_ids:
        # Reported after the loop so the message does not break up the progress bar.
        print(f"Failed to retrieve {len(failed_paper_ids)} paper(s) for author {author_id}: {failed_paper_ids}")
    return papers_info

# Fetch the detailed record of every listed paper and attach it to its author
for author in author_details:
    print(f"Fetching paper details for author: {author['author_id']} and name {author['author_name']}")

    listed_paper_ids = [entry["paper_id"] for entry in author["papers"]]
    author["paper_details"] = get_paper_details(listed_paper_ids, author["author_id"])
    # Swap an alias ID for its mapped ID; IDs that are not in the map stay as they are
    author["author_id"] = MAP_WRONG_ID_TO_CORRECT_ID.get(author["author_id"], author["author_id"])


Fetching paper details for author: 1816753042 and name Ronak Pradeep


Fetching papers for author 1816753042: 100%|██████████| 27/27 [00:10<00:00,  2.66paper/s]


Fetching paper details for author: 2928777 and name Wenhu Chen


Fetching papers for author 2928777: 100%|██████████| 58/58 [00:20<00:00,  2.87paper/s]


Fetching paper details for author: 2166511 and name R. Socher


Fetching papers for author 2166511: 100%|██████████| 201/201 [01:13<00:00,  2.75paper/s]


Fetching paper details for author: 1982950 and name Luke Zettlemoyer


Fetching papers for author 1982950: 100%|██████████| 281/281 [01:46<00:00,  2.64paper/s]


Fetching paper details for author: 144783904 and name Christopher D. Manning


Fetching papers for author 144783904: 100%|██████████| 525/525 [03:05<00:00,  2.83paper/s]


Fetching paper details for author: 145580839 and name Jimmy Lin


Fetching papers for author 145580839: 100%|██████████| 572/572 [03:25<00:00,  2.78paper/s]


In [62]:
# Filter out any paper with paper_id 059fa9a5578cf1ca241b87455eb75803bc6483c0
# Replace tldr with just its text field

# for author in author_details:
#     for paper in author["paper_details"]:
#         if paper["paper_id"] == "059fa9a5578cf1ca241b87455eb75803bc6483c0":
#             author["paper_details"].remove(paper)
#         else:
#             paper["tldr"] = paper["tldr"].get("text", "")

# Remove the two excluded paper IDs from both the paper list and the detail list of every author
excluded_paper_ids = ["586b9d75afdbb09434c8676fb10a5b0072882bfc", "059fa9a5578cf1ca241b87455eb75803bc6483c0"]

for author in author_details:
    for field in ("papers", "paper_details"):
        author[field] = [entry for entry in author[field] if entry["paper_id"] not in excluded_paper_ids]
    print(len(author["papers"]))

# Walk through the first author's papers and print the matching detail record(s)
first_author = author_details[0]
for paper in first_author["papers"]:
    for key in ("title", "paper_id", "corpus_id"):
        print(paper[key])
    print(len(first_author["paper_details"]))
    matching_details = (
        detail for detail in first_author["paper_details"]
        if detail["paper_id"] == paper["paper_id"]
    )
    for paper_detail in matching_details:
        # abstract, tldr, embedding, then year and publication date
        for key in ("abstract", "tldr", "embedding", "year", "publication_date"):
            print(paper_detail[key])

25
58
201
281
525
572
How Does Generative Retrieval Scale to Millions of Passages?
20a7b1e274aff828466bba3760992aa54e14951a
258822999
25
Popularized by the Differentiable Search Index, the emerging paradigm of generative retrieval re-frames the classic information retrieval problem into a sequence-to-sequence modeling task, forgoing external indices and encoding an entire document corpus within a single Transformer. Although many different approaches have been proposed to improve the effectiveness of generative retrieval, they have only been evaluated on document corpora on the order of 100k in size. We conduct the first empirical study of generative retrieval techniques across various corpus scales, ultimately scaling up to the entire MS MARCO passage ranking task with a corpus of 8.8M passages and evaluating model sizes up to 11B parameters. We uncover several findings about scaling generative retrieval to millions of passages; notably, the central importance of using synthetic queri

In [63]:
# Proportion of papers with embeddings in this set

for author in author_details:
    # Print name
    print(author["author_name"])
    details = author["paper_details"]
    embedded_count = sum(1 for paper in details if len(paper["embedding"]) > 0)
    # Print proportion of papers with embeddings; an author with no fetched
    # paper details reports 0.0 instead of raising ZeroDivisionError.
    print(embedded_count / len(details) if details else 0.0)


Ronak Pradeep
0.92
Wenhu Chen
0.9827586206896551
R. Socher
0.9601990049751243
Luke Zettlemoyer
0.9395017793594306
Christopher D. Manning
0.9314285714285714
Jimmy Lin
0.9370629370629371


In [64]:
# Save whole thing as JSON
import json
# Make data directory
import os
# exist_ok=True creates the directory only when it is missing, without a
# separate existence check that could race with another process.
os.makedirs("data", exist_ok=True)

with open("data/author_details.json", "w") as f:
    json.dump(author_details, f, indent=4)

In [78]:
import json

# Read the author records back from data/author_details.json
with open("data/author_details.json", "r") as source_file:
    author_details = json.load(source_file)
    

In [79]:
import json
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')

# Paper IDs that are excluded from the per-author processing
BLACKLIST_PAPER_IDS = [
    '4e12f790879dcbaa10b43d562f70e926c15b9f50',
    '3c9d137e92ced9bd05c028b39dd33ed3cd6d61fe',
]
# Paper titles that are excluded from the per-author processing
BLACKLIST_PAPER_TITLES = [
    'Reviewers, Volume 33',
    'Facial Emotion Recognition Using PHOG and a Hierarchical Expression Model',
    'IMPROVING TRADITIONAL BUILDING REPAIR CONSTRUCTION QUALITY USING HISTORIC BUILDING INFORMATION MODELING CONCEPT',
]
# Author names that are removed from each paper's author list
BLACKLIST_AUTHORS = [
    "D. Cheriton",
]
# NOTE: the per-author loop further below keeps only papers that have an embedding

def to_string_authors(list_of_authors):
    """Render a list of author names as one display string.

    - More than five names: the first five, comma-separated, then ", et al."
    - Three to five names: comma-separated, with ", and " before the last one
    - Two names or fewer: joined with " and " (an empty list gives "")
    """
    count = len(list_of_authors)
    if count <= 2:
        return " and ".join(list_of_authors)
    if count <= 5:
        leading = list_of_authors[:-1]
        final = list_of_authors[-1]
        return ", ".join(leading) + ", and " + final
    first_five = list_of_authors[:5]
    return ", ".join(first_five) + ", et al."

def add_line_breaks(text):
    """Join the words of ``text`` with spaces, inserting ``<br>`` after every 10 words."""
    words = text.split()
    chunks = [words[i:i+10] for i in range(0, len(words), 10)]
    return '<br>'.join([' '.join(chunk) for chunk in chunks])


# For every author: cluster the paper embeddings, project them to 2-D and write
# one JSON file per author with the coordinates and paper metadata.
for author in author_details:
    print(f"Processing {author['author_name']}")
    # Print initial paper length
    print(f"Initial paper length: {len(author['paper_details'])}")

    # Keep papers that have a non-empty embedding, a usable year, and are not blacklisted
    papers_with_embeddings = [
        paper for paper in author['paper_details']
        if 'embedding' in paper and paper['embedding'] is not None and len(paper['embedding']) > 0
        and paper['paper_id'] not in BLACKLIST_PAPER_IDS
        and paper['title'] not in BLACKLIST_PAPER_TITLES
        and 'year' in paper and paper['year'] is not None and paper['year'] != {}
    ]
    if not papers_with_embeddings:
        print(f"No papers with embeddings for {author['author_name']}")
        continue

    # Extract embeddings and other necessary details
    embeddings = np.array([paper['embedding'] for paper in papers_with_embeddings])
    print("Scaling!")
    scaler = StandardScaler()
    normalized_embeddings = scaler.fit_transform(embeddings)
    # KMeans cannot form more clusters than there are samples, so cap the count
    # for authors with fewer than five usable papers.
    n_clusters = min(5, len(papers_with_embeddings))
    clusterer = KMeans(n_clusters=n_clusters, n_init=50, max_iter=500, random_state=42)
    clusters = clusterer.fit(normalized_embeddings).labels_

    titles = [paper['title'] for paper in papers_with_embeddings]
    years = [paper['year'] for paper in papers_with_embeddings]

    print("PCA!")
    # Adjust n_components based on the dataset and size of array
    # NOTE(review): this evaluates to 0 when only one paper is left — confirm how that case should be handled.
    pca = PCA(n_components=min(50, int(normalized_embeddings.shape[0]/2)), random_state=42)
    pca_embeddings = pca.fit_transform(normalized_embeddings)

    # Apply t-SNE
    print("t-SNE!")
    # NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` and
    # require perplexity < n_samples — confirm against the installed version.
    tsne = TSNE(n_components=2, n_iter=5000, random_state=42)
    reduced_embeddings = tsne.fit_transform(pca_embeddings)

    # Create a DataFrame for plotly express
    df = pd.DataFrame(reduced_embeddings, columns=['x', 'y'])

    # Coerce the coordinates to numeric; values that cannot be parsed become NaN
    df = df.apply(pd.to_numeric, errors='coerce')
    # Print count of df
    print(f"Final dataframe length: {len(df)}")
    df['title'] = titles
    df['year'] = years
    df['cluster'] = clusters

    # Filter paper['authors'] to remove blacklisted authors (updates the paper records in place)
    for paper in papers_with_embeddings:
        paper['authors'] = [coauthor for coauthor in paper['authors'] if coauthor['name'] not in BLACKLIST_AUTHORS]
    df['authors'] = [[coauthor['name'] for coauthor in paper['authors']] for paper in papers_with_embeddings]

    # Boolean flag if first author; a paper left with no authors is flagged False
    # instead of raising IndexError.
    df['first_author'] = [
        bool(paper['authors']) and author['author_id'] == paper['authors'][0]['authorId']
        for paper in papers_with_embeddings
    ]
    # Boolean flag if last author (needs at least two authors, so never also the first author)
    df['last_author'] = [
        len(paper['authors']) > 1 and author['author_id'] == paper['authors'][-1]['authorId']
        for paper in papers_with_embeddings
    ]
    # Boolean flag if middle author
    df['middle_author'] = [not (first_author or last_author) for first_author, last_author in zip(df['first_author'], df['last_author'])]

    # Collapse each author list into one display string
    df['authors'] = df['authors'].apply(lambda authors: authors if authors is not None else [])
    df['authors'] = df['authors'].apply(lambda authors: to_string_authors(authors))
    df['authors'] = df['authors'].apply(lambda authors: authors.replace("Jimmy J.", "Jimmy"))
    # Add author of interest
    df['author_of_interest'] = [author['author_name'].replace("Jimmy J.", "Jimmy") for _ in papers_with_embeddings]
    # Add reference_count and citation_count
    df['reference_count'] = [paper['reference_count'] for paper in papers_with_embeddings]
    df['citation_count'] = [paper['citation_count'] for paper in papers_with_embeddings]

    # Reduce tldr to its text; papers without a tldr text get an empty string
    df['tldr'] = [paper['tldr']['text'] if 'tldr' in paper and paper['tldr'] is not None and 'text' in paper['tldr'] else "" for paper in papers_with_embeddings]
    # Replace None with empty string
    df['tldr'] = df['tldr'].apply(lambda tldr: tldr if (tldr is not None and tldr != {}) else '')
    # Venue
    df['venue'] = [paper['venue'] for paper in papers_with_embeddings]

    # Add <br> every 10 words of the tldr text
    df['tldr'] = df['tldr'].apply(add_line_breaks)
    print(df.head())

    # Save df as JSON in data/authors
    os.makedirs("data/authors", exist_ok=True)
    with open(f"data/authors/{author['author_id']}.json", "w") as f:
        json.dump(df.to_dict(), f, indent=4)

Processing Ronak Pradeep
Initial paper length: 25
Scaling!
PCA!
t-SNE!
Final dataframe length: 22
            x           y                                              title  \
0   78.973122  -54.079910  How Does Generative Retrieval Scale to Million...   
1  146.722122  -83.190521  Zero-Shot Listwise Document Reranking with a L...   
2   -8.229628  139.122910  ReadProbe: A Demo of Retrieval-Enhanced Large ...   
3  -39.414948 -106.893845  Vector Search with OpenAI Embeddings: Lucene I...   
4  -96.595345 -154.117599  RankVicuna: Zero-Shot Listwise Document Rerank...   

   year  cluster                                            authors  \
0  2023        3  Ronak Pradeep, Kai Hui, Jai Gupta, Á. Lelkes, ...   
1  2023        0  Xueguang Ma, Xinyu Crystina Zhang, Ronak Prade...   
2  2023        3                       Dake Zhang and Ronak Pradeep   
3  2023        3  Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and...   
4  2023        0  Ronak Pradeep, Sahel Sharifymoghaddam, and Jim..