In [25]:
import json
import logging
from typing import List

import pandas as pd
import vcr
from semanticscholar import SemanticScholar

# Read the list of professors that will be featured.
# JSON text is UTF-8; pass the encoding explicitly so names containing
# non-ASCII characters decode the same way on every platform instead of
# depending on the locale's default encoding.
with open("professors.json", encoding="utf-8") as f:
    professors = json.load(f)["professors"]

# Module-level Semantic Scholar client; DataParser.get_prof uses it for
# author searches.
# NOTE(review): timeout=1 is presumably in seconds -- confirm against the
# semanticscholar package docs and raise it if live (uncached) lookups fail.
client = SemanticScholar(timeout=1)

class DataParser:
    """Fetch author records and flatten them into rows for database tables.

    On construction the parser looks up every professor name, keeps the
    records that could be fetched in ``self.objs`` (name -> author record),
    and builds two surrogate-key mappings:

    * ``self.prof_id_mapping``: author ``authorId`` -> ``"1"``, ``"2"``, ...
    * ``self.paper_id_mapping``: paper ``paperId`` -> string id

    Each ``get_*_table`` / ``get_*_data`` method returns the rows for one
    table as plain Python containers.
    """

    def __init__(self, professors: List[str]):
        """Fetch all professors and build the id mappings.

        Args:
            professors: Names to search for.
        """
        self.get_prof_objs(professors)
        self.get_prof_id_mapping()
        self.get_paper_id_mapping()

    def get_prof(self, name: str):
        """Return the author search result for ``name`` with the highest hIndex.

        The search runs inside a vcr cassette stored at ``cache/<name>.yaml``.

        Args:
            name: Author name to search for.

        Returns:
            The search result whose ``hIndex`` is largest (the first such
            result if several tie).

        Raises:
            IndexError: If the search yields no results.
        """
        with vcr.use_cassette('cache/{}.yaml'.format(name)):
            # Keep the sort inside the cassette context: iterating the
            # search results happens here, not in search_author() alone.
            candidates = client.search_author(name)
            ranked = sorted(candidates, key=lambda author: -author['hIndex'])
            return ranked[0]

    def get_prof_objs(self, professors: List[str]):
        """Populate ``self.objs`` with one author record per professor name.

        Lookups are best-effort: a professor whose lookup raises is skipped
        and a warning is logged, so one bad name does not abort the run.
        Only ``Exception`` subclasses are treated this way;
        ``KeyboardInterrupt`` and ``SystemExit`` propagate so the run can
        still be interrupted.

        Args:
            professors: Names to search for.
        """
        objs = {}
        for prof in professors:
            try:
                objs[prof] = self.get_prof(prof)
            except Exception as exc:
                logging.getLogger(__name__).warning(
                    "Skipping professor %r: lookup failed (%s: %s)",
                    prof, type(exc).__name__, exc,
                )

        self.objs = objs

    def get_prof_id_mapping(self):
        """Build ``self.prof_id_mapping``: authorId -> 1-based string id.

        Ids follow the iteration order of ``self.objs``.
        """
        self.prof_id_mapping = {
            obj["authorId"]: str(position)
            for position, obj in enumerate(self.objs.values(), start=1)
        }

    def get_paper_id_mapping(self):
        """Build ``self.paper_id_mapping``: paperId -> string id.

        The counter advances for every (professor, paper) occurrence. A
        paper listed under several professors therefore ends up with the id
        of its last occurrence, and the assigned ids can have gaps. Ids are
        still unique per paper.
        """
        self.paper_id_mapping = {}
        counter = 0
        for obj in self.objs.values():
            for paper in obj["papers"]:
                counter += 1
                self.paper_id_mapping[paper["paperId"]] = str(counter)

    def get_author_table(self):
        """Return ``[author_id, professor_name, url]`` rows, one per professor."""
        return [
            [str(self.prof_id_mapping[obj["authorId"]]), prof, obj["url"]]
            for prof, obj in self.objs.items()
        ]

    def get_alias_table(self):
        """Return one single-element row ``[alias]`` per distinct alias.

        Professors whose ``aliases`` field is ``None`` contribute nothing.
        Row order is the iteration order of a set and is not meaningful.
        """
        aliases = set()
        for obj in self.objs.values():
            if obj["aliases"] is None:
                continue
            for alias in obj["aliases"]:
                aliases.add(alias)
        return [[alias] for alias in aliases]

    def get_knownas_table(self):
        """Return ``[author_id, alias]`` rows linking professors to aliases.

        Professors whose ``aliases`` field is ``None`` contribute nothing.
        """
        knownas_data = []
        for obj in self.objs.values():
            if obj["aliases"] is None:
                continue
            author_id = self.prof_id_mapping[obj["authorId"]]
            knownas_data.extend([author_id, alias] for alias in obj["aliases"])
        return knownas_data

    def get_paper_table(self):
        """Return the paper rows as a set of tuples.

        Each tuple is ``(paper_id, title, abstract, publicationDate, url,
        venue)``. Using a set collapses identical rows produced when the
        same paper is listed under more than one professor.
        """
        return {
            (
                self.paper_id_mapping[paper["paperId"]],
                paper["title"],
                paper["abstract"],
                paper["publicationDate"],
                paper["url"],
                paper["venue"],
            )
            for obj in self.objs.values()
            for paper in obj["papers"]
        }

    def get_publication_table(self):
        """Return one single-element row ``[venue]`` per distinct venue.

        Venue values are taken as-is (an empty-string venue yields a
        ``['']`` row). Row order is set iteration order.
        """
        publications = {
            paper["venue"]
            for obj in self.objs.values()
            for paper in obj["papers"]
        }
        return [[venue] for venue in publications]

    def get_authored_table(self):
        """Return ``[author_id, paper_id]`` rows, one per listed paper."""
        authored_data = []
        for obj in self.objs.values():
            author_id = self.prof_id_mapping[obj["authorId"]]
            for paper in obj["papers"]:
                authored_data.append(
                    [author_id, self.paper_id_mapping[paper["paperId"]]]
                )

        return authored_data

    def get_also_identified_by_public(self):
        """Return ``[public_id, public_id_type, paper_id]`` rows.

        One row is produced per entry of each paper's ``externalIds``
        dict. A paper listed under several professors yields its rows once
        per professor, so the result can contain duplicate rows.
        """
        public_id_data = []
        for obj in self.objs.values():
            for paper in obj["papers"]:
                paper_id = self.paper_id_mapping[paper["paperId"]]
                for id_type, public_id in paper["externalIds"].items():
                    public_id_data.append([public_id, id_type, paper_id])
        return public_id_data

    def get_institution_data(self):
        """Return the hard-coded ``[institution_id, name, country]`` rows."""
        return [
                ["1", "Columbia University", "United States"],
                ["2", "Cornell University", "United States"],
                ["3", "Princeton University", "United States"],
                ["4", "Harvard University", "United States"],
                ["5", "University of Pennsylvania", "United States"],
                ["6", "Brown University", "United States"],
                ["7", "Dartmouth College", "United States"],
                ["8", "Yale University", "United States"],
                ["9", "Massachusetts Institute of Technology", "United States"],
                ["10", "Stanford University", "United States"],
                ["11", "Northwestern University", "United States"]
            ]

    def get_affiliation_data(self):
        """Return ``[author_id, institution_id]`` rows.

        Every professor is assigned institution ``"1"``, which is Columbia
        University in ``get_institution_data``.
        """
        return [
            [self.prof_id_mapping[obj["authorId"]], "1"]
            for obj in self.objs.values()
        ]

    def get_citation_data(self):
        """Return the citation rows; currently always an empty list.

        TODO: citation extraction is not implemented yet.
        """
        return []

    def get_user_data(self):
        """Return the hard-coded user rows.

        NOTE(review): the third column is 256 ``"0"`` characters --
        presumably a placeholder credential; confirm against the schema.
        """
        return [
            ["ggt2112@columbia.edu", "Trigozord", "0" * 256]
        ]

    def get_collection_data(self):
        """Return the hard-coded collection rows.

        NOTE(review): the e-mail here differs from the one returned by
        ``get_user_data``; confirm this is intended if the collection is
        meant to reference that user.
        """
        return [
            ["Test Collection", "gabrielguerratrigo20@gmail.com", "2023-10-22"]
        ]

    def get_includes_table(self):
        """Return the hard-coded rows linking the test collection to an item.

        NOTE(review): the second column (``"1"``) is presumably a paper id
        -- confirm against the schema.
        """
        return [
            ["Test Collection", "1", "gabrielguerratrigo20@gmail.com", "2023-10-22"]
        ]

                    

In [26]:
# Build the parser: this looks up every professor (through the vcr cassettes
# under cache/) and builds the author and paper id mappings.
parser = DataParser(professors)

In [27]:
parser.get_includes_table()

[['Test Collection', '1', 'gabrielguerratrigo20@gmail.com', '2023-10-22']]

In [24]:
parser.get_collection_data()

[['Test Collection', 'gabrielguerratrigo20@gmail.com', '2023-10-22']]

In [21]:
parser.get_user_data()

[['ggt2112@columbia.edu',
  'Trigozord',
  '0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000']]

In [16]:
parser.get_institution_data()


[['1', 'Columbia University', 'United States'],
 ['2', 'Cornell University', 'United States'],
 ['3', 'Princeton University', 'United States'],
 ['4', 'Harvard University', 'United States'],
 ['5', 'University of Pennsylvania', 'United States'],
 ['6', 'Brown University', 'United States'],
 ['7', 'Dartmouth College', 'United States'],
 ['8', 'Yale University', 'United States'],
 ['9', 'Massachusetts Institute of Technology', 'United States'],
 ['10', 'Stanford University', 'United States'],
 ['11', 'Northwestern University', 'United States']]

In [15]:
parser.get_affiliation_data()

[['1', '1'],
 ['2', '1'],
 ['3', '1'],
 ['4', '1'],
 ['5', '1'],
 ['6', '1'],
 ['7', '1'],
 ['8', '1'],
 ['9', '1'],
 ['10', '1'],
 ['11', '1'],
 ['12', '1'],
 ['13', '1'],
 ['14', '1'],
 ['15', '1'],
 ['16', '1'],
 ['17', '1'],
 ['18', '1'],
 ['19', '1'],
 ['20', '1'],
 ['21', '1']]

In [85]:
parser.get_author_table()

[['1', 'Luis Gravano', 'https://www.semanticscholar.org/author/1684012'],
 ['2', 'Paul Blaer', 'https://www.semanticscholar.org/author/1897655'],
 ['3',
  'Christos Papadimitriou',
  'https://www.semanticscholar.org/author/144102674'],
 ['4', 'Adam Cannon', 'https://www.semanticscholar.org/author/145051113'],
 ['5', 'Nakul Verma', 'https://www.semanticscholar.org/author/39706047'],
 ['6', 'Mihalis Yannakakis', 'https://www.semanticscholar.org/author/1748179'],
 ['7', 'Martha Kim', 'https://www.semanticscholar.org/author/8991961'],
 ['8', 'Tony Dear', 'https://www.semanticscholar.org/author/3258452'],
 ['9', 'Daniel Bauer', 'https://www.semanticscholar.org/author/87520404'],
 ['10', 'David Blei', 'https://www.semanticscholar.org/author/1796335'],
 ['11', 'Shing-Fu Chang', 'https://www.semanticscholar.org/author/145586362'],
 ['12', 'Eleni Drinea', 'https://www.semanticscholar.org/author/2103183'],
 ['13', 'Stephen Edwards', 'https://www.semanticscholar.org/author/144520714'],
 ['14', 'S

In [86]:
parser.get_alias_table()

[['Daniel K. Hsu'],
 ['M. Yannakakis'],
 ['Steven K Feiner'],
 ['Steve W. Edwards'],
 ['D. Bauer'],
 ['Stephen W. Edwards'],
 ['D. Knowles'],
 ['C Wan Chien'],
 ['Daniel Ian Rubenstein'],
 ['C. H Papadimitriou'],
 ['M Yannakakis'],
 ['S W Edwards'],
 ['S.k. Feiner'],
 ['David A Knowles'],
 ['D I Rubenstein'],
 ['D. A. Knowles'],
 ['Daniel A. Bauer'],
 ['C. Stein'],
 ['S Edwards'],
 ['Daniel K Hsu'],
 ['Steven Feiner'],
 ['Bjarne Stroustrup'],
 ['Stephen W Edwards'],
 ['Daniel I Rubenstein'],
 ['S K Feiner'],
 ['P. Blaer'],
 ['Dan Rubenstein'],
 ['Daniel K Hsu Hsu'],
 ['Chien-lin Chang Chien'],
 ['David Blei'],
 ['Dan Bauer'],
 ['Daniel Hsu'],
 ['S. K. Feiner'],
 ['Cli Ord Stein'],
 ['Steven W Edwards'],
 ['Luis Gravano'],
 ['C H Papadimitriou'],
 ['Daniel Rubenstein'],
 ['D. A. Bauer'],
 ['D.\u2009a. Bauer'],
 ['David M Blei'],
 ['C. Vondrick'],
 ['Christos H. Papadimitriou'],
 ['Martha B. Kim'],
 ['D A Bauer'],
 ['D. Hsu'],
 ['S.w. Edwards'],
 ['B. Stroustrup'],
 ['Christos Harilaos P

In [87]:
parser.get_knownas_table()

[['1', 'L. Gravano'],
 ['1', 'Luis Gravano'],
 ['2', 'P. Blaer'],
 ['2', 'Paul S. Blaer'],
 ['2', 'Paul Blaer'],
 ['3', 'Christos Papadimitriou'],
 ['3', 'C. H. Papadimitriou'],
 ['3', 'C. H Papadimitriou'],
 ['3', 'Christos Harilaos Papadimitriou'],
 ['3', 'C H Papadimitriou'],
 ['3', 'C. Papadimitriou'],
 ['3', 'Christos H. Papadimitriou'],
 ['4', 'A Cannon'],
 ['4', 'A D Cannon'],
 ['4', 'A. Cannon'],
 ['4', 'Adam D. Cannon'],
 ['6', 'M Yannakakis'],
 ['6', 'M. Yannakakis'],
 ['6', 'Mihalis Yannakakis'],
 ['7', 'Marth Kim'],
 ['7', 'Martha B. Kim'],
 ['7', 'Martha Kim'],
 ['9', 'D. A. Bauer'],
 ['9', 'Daniel A. Bauer'],
 ['9', 'Daniel Adams Bauer'],
 ['9', 'Da Bauer'],
 ['9', 'D A Bauer'],
 ['9', 'Dan Bauer'],
 ['9', 'D Bauer'],
 ['9', 'Daniel Bauer'],
 ['9', 'D.\u2009a. Bauer'],
 ['9', 'D. Bauer'],
 ['9', 'D.a. Bauer'],
 ['10', 'David M. Blei'],
 ['10', 'David Blei'],
 ['10', 'D. Blei'],
 ['10', 'David. Blei'],
 ['10', 'David M Blei'],
 ['11', 'Cheng-an Chien'],
 ['11', 'C Wan Chie

In [88]:
parser.objs["Luis Gravano"]

{'authorId': '1684012', 'externalIds': {'DBLP': ['Luis Gravano']}, 'url': 'https://www.semanticscholar.org/author/1684012', 'name': 'L. Gravano', 'aliases': ['L. Gravano', 'Luis Gravano'], 'affiliations': [], 'homepage': None, 'paperCount': 139, 'citationCount': 13459, 'hIndex': 52, 'papers': [{'paperId': '9fafc9896d2b81d1328791e4c2fd7ab096f155f3', 'externalIds': {'ACL': '2021.socialnlp-1.4', 'DBLP': 'conf/acl-socialnlp/CaoLKHG21', 'MAG': '3168872769', 'DOI': '10.18653/V1/2021.SOCIALNLP-1.4', 'CorpusId': 235097238}, 'corpusId': 235097238, 'publicationVenue': {'id': 'c9c1948f-4092-4a54-8e9a-a02519194626', 'name': 'International Workshop on Natural Language Processing for Social Media', 'type': 'conference', 'alternate_names': ['SocialNLP', 'Int Workshop Nat Lang Process Soc Media']}, 'url': 'https://www.semanticscholar.org/paper/9fafc9896d2b81d1328791e4c2fd7ab096f155f3', 'title': 'Quantifying the Effects of COVID-19 on Restaurant Reviews', 'abstract': 'The COVID-19 pandemic has implicat

In [89]:
parser.get_paper_table()

[['1',
  'Quantifying the Effects of COVID-19 on Restaurant Reviews',
  'The COVID-19 pandemic has implications beyond physical health, affecting society and economies. Government efforts to slow down the spread of the virus have had a severe impact on many businesses, including restaurants. Mandatory policies such as restaurant closures, bans on social gatherings, and social distancing restrictions have affected restaurant operations as well as customer preferences (e.g., prompting a demand of stricter hygiene standards). As of now, however, it is not clear how and to what extent the pandemic has affected restaurant reviews, an analysis of which could potentially inform policies for addressing this ongoing situation. In this work, we present our efforts to understand the effects of COVID-19 on restaurant reviews, with a focus on Yelp reviews produced during the pandemic for New York City and Los Angeles County restaurants. Overall, we make the following contributions. First, we assemb

In [96]:
print(parser.get_paper_table())



In [91]:
parser.get_publication_table()

[[''],
 ['Proceedings of the 19th Annual International Conference on Mobile Systems, Applications, and Services'],
 ['ASPLOS X'],
 ['J. Assoc. Inf. Sci. Technol.'],
 ['Pain'],
 ["Proceedings IEEE Symposium on Information Visualization '96"],
 ['Journal of Zhejiang University SCIENCE B'],
 ['Medinfo'],
 ['International Society for Music Information Retrieval Conference'],
 ['American Journal of Clinical Dermatology'],
 ['Algorithms, Probability, Networks, and Games'],
 ['2018 IEEE International Symposium on Mixed and Augmented Reality Adjunct (ISMAR-Adjunct)'],
 ['Blood'],
 ['Circulation'],
 ['INFORMS journal on computing'],
 ["Proceedings Computer Animation '96"],
 ['History of Programming Languages'],
 ['Third IEEE and ACM International Symposium on Mixed and Augmented Reality'],
 ['Proceedings of the 19th Annual IEEE Symposium on Logic in Computer Science, 2004.'],
 ['Rangeland Ecology & Management'],
 ['International Conference on Tools and Algorithms for Construction and Analysis o

In [92]:
parser.get_authored_table()

[['1', '1'],
 ['1', '2'],
 ['1', '3'],
 ['1', '4'],
 ['1', '5'],
 ['1', '6'],
 ['1', '7'],
 ['1', '8'],
 ['1', '9'],
 ['1', '10'],
 ['1', '11'],
 ['1', '12'],
 ['1', '13'],
 ['1', '14'],
 ['1', '15'],
 ['1', '16'],
 ['1', '17'],
 ['1', '18'],
 ['1', '19'],
 ['1', '20'],
 ['1', '21'],
 ['1', '22'],
 ['1', '23'],
 ['1', '24'],
 ['1', '25'],
 ['1', '26'],
 ['1', '27'],
 ['1', '28'],
 ['1', '29'],
 ['1', '30'],
 ['1', '31'],
 ['1', '32'],
 ['1', '33'],
 ['1', '34'],
 ['1', '35'],
 ['1', '36'],
 ['1', '37'],
 ['1', '38'],
 ['1', '39'],
 ['1', '40'],
 ['1', '41'],
 ['1', '42'],
 ['1', '43'],
 ['1', '44'],
 ['1', '45'],
 ['1', '46'],
 ['1', '47'],
 ['1', '48'],
 ['1', '49'],
 ['1', '50'],
 ['1', '51'],
 ['1', '52'],
 ['1', '53'],
 ['1', '54'],
 ['1', '55'],
 ['1', '56'],
 ['1', '57'],
 ['1', '58'],
 ['1', '59'],
 ['1', '60'],
 ['1', '61'],
 ['1', '62'],
 ['1', '63'],
 ['1', '64'],
 ['1', '65'],
 ['1', '66'],
 ['1', '67'],
 ['1', '68'],
 ['1', '69'],
 ['1', '70'],
 ['1', '71'],
 ['1', '72'],
 

In [93]:
parser.get_also_identified_by_public()

[['2021.socialnlp-1.4', 'ACL', '1'],
 ['conf/acl-socialnlp/CaoLKHG21', 'DBLP', '1'],
 ['3168872769', 'MAG', '1'],
 ['10.18653/V1/2021.SOCIALNLP-1.4', 'DOI', '1'],
 [235097238, 'CorpusId', '1'],
 ['2020.louhi-1.15', 'ACL', '2'],
 ['journals/corr/abs-2010-05194', 'DBLP', '2'],
 ['2010.05194', 'ArXiv', '2'],
 ['3098266722', 'MAG', '2'],
 ['10.18653/v1/2020.louhi-1.15', 'DOI', '2'],
 [222291217, 'CorpusId', '2'],
 ['2020.findings-emnlp.323', 'ACL', '3'],
 ['conf/emnlp/Karamanolakis0G20', 'DBLP', '3'],
 ['2010.02562', 'ArXiv', '3'],
 ['3103065189', 'MAG', '3'],
 ['10.18653/v1/2020.findings-emnlp.323', 'DOI', '3'],
 [222142647, 'CorpusId', '3'],
 ['D19-5501', 'ACL', '4'],
 ['2978262124', 'MAG', '4'],
 ['journals/corr/abs-1910-00054', 'DBLP', '4'],
 ['1910.00054', 'ArXiv', '4'],
 ['10.18653/v1/D19-5501', 'DOI', '4'],
 [203610686, 'CorpusId', '4'],
 ['journals/corr/abs-1909-00415', 'DBLP', '5'],
 ['2971994113', 'MAG', '5'],
 ['D19-1468', 'ACL', '5'],
 ['1909.00415', 'ArXiv', '5'],
 ['10.18653/

In [94]:
import numpy as np
from collections import Counter
# Sanity check on shared papers: column 1 of the authored table is the paper
# id, so the inner Counter gives the number of authored rows per paper and the
# outer Counter tallies how many papers have each row count.
Counter(Counter(np.array(parser.get_authored_table())[:, 1]).values())

Counter({1: 3661, 2: 60})

In [97]:
parser.objs["Luis Gravano"]

{'authorId': '1684012', 'externalIds': {'DBLP': ['Luis Gravano']}, 'url': 'https://www.semanticscholar.org/author/1684012', 'name': 'L. Gravano', 'aliases': ['L. Gravano', 'Luis Gravano'], 'affiliations': [], 'homepage': None, 'paperCount': 139, 'citationCount': 13459, 'hIndex': 52, 'papers': [{'paperId': '9fafc9896d2b81d1328791e4c2fd7ab096f155f3', 'externalIds': {'ACL': '2021.socialnlp-1.4', 'DBLP': 'conf/acl-socialnlp/CaoLKHG21', 'MAG': '3168872769', 'DOI': '10.18653/V1/2021.SOCIALNLP-1.4', 'CorpusId': 235097238}, 'corpusId': 235097238, 'publicationVenue': {'id': 'c9c1948f-4092-4a54-8e9a-a02519194626', 'name': 'International Workshop on Natural Language Processing for Social Media', 'type': 'conference', 'alternate_names': ['SocialNLP', 'Int Workshop Nat Lang Process Soc Media']}, 'url': 'https://www.semanticscholar.org/paper/9fafc9896d2b81d1328791e4c2fd7ab096f155f3', 'title': 'Quantifying the Effects of COVID-19 on Restaurant Reviews', 'abstract': 'The COVID-19 pandemic has implicat

# Modifications:
1. publication_id is not needed: the publication name is already unique and can serve as the key.
2. The author h_index is redundant; remove it.
3. The primary key of AlsoIdentifiedByPublicId should be just (public_id, public_id_type).
4. public_id should be VARCHAR(100).
5. The Institution table is missing.

In [1]:
-- Links each author to the institutions they are affiliated with.
-- Rows are removed/updated automatically when the referenced author or
-- institution is deleted/updated.
CREATE TABLE Affiliations(
    author_id VARCHAR(10),
    institution_id VARCHAR(10),
    PRIMARY KEY (author_id, institution_id),
    FOREIGN KEY (author_id) REFERENCES Author
        ON DELETE CASCADE
        ON UPDATE CASCADE,
    FOREIGN KEY (institution_id) REFERENCES Institution
        ON DELETE CASCADE
        ON UPDATE CASCADE
);

SyntaxError: invalid syntax (4175637358.py, line 1)

In [17]:
"2" * 3

'222'