In [None]:
# Install packages
!pip3 install requests
!pip3 install beautifulsoup4
!pip3 install scholarly
!pip3 install pandas

In [None]:
# Import packages
import csv
import getpass
import os
import random
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from scholarly import scholarly

Step 1 (after installing/importing libraries): Generate a CSV of papers with their associated authors.

In [None]:
# Initialize variables
## Updated as of 7/11/2020 11:07AM PST ##
# First and last (inclusive) `showEvent` IDs visited by the crawl
start_idx, end_idx = 5765, 6852
# Base URL; the event ID is appended to it for every request
url_template = "https://icml.cc/Conferences/2020/Schedule?showEvent="
# Collected rows, one [title, authors, id] list per paper
papers = list()

In [None]:
# Define functions

# Check whether an HTTP request succeeded
def assert_equals(request_code):
    """Return True if `request_code` is HTTP 200 (OK), otherwise False.

    Despite its name, this function never raises: callers have to act on the
    returned boolean themselves.

    Parameters:
        request_code: HTTP status code of a response (e.g. `response.status_code`).

    Returns:
        bool: True when the code equals 200, False for any other value.
    """
    return request_code == 200

In [None]:
# Iterate through all papers and find title and author names
for i in range(start_idx, end_idx+1):

  url = url_template + str(i)

  # HTTPS request to the url
  try:
    # A timeout keeps one unresponsive request from stalling the whole crawl
    page = requests.get(url, timeout=30)
  except requests.RequestException:
    # Only network-level failures are handled here, so interrupting the kernel
    # (KeyboardInterrupt) still stops the loop instead of being swallowed
    page = None

  # Find the title, but only on pages that were actually served (status 200)
  title_div = None
  if page is not None and assert_equals(page.status_code):
    soup = BeautifulSoup(page.content, "html.parser")
    title_div = soup.find("div", class_="maincardBody")

  # No response, a non-200 status, or a page without a title card: skip this ID
  if title_div is None:
    print("No page found.")
    print()
    continue

  title = title_div.text
  button_list = soup.find_all("button", class_="btn btn-default")

  # Just so users can see at which step is it on
  print(str(i) + " " + title)

  # Each button's text is trimmed by one leading and two trailing characters
  author_names = [button.text[1:-2] for button in button_list]

  # Adds paper title, comma-separated author names and ID into papers 2D list
  papers.append([title, ", ".join(author_names), i])
  print()

In [None]:
# Writes data into csv file (an existing file is overwritten)
# The encoding is set explicitly so that non-ASCII titles and author names are
# written as UTF-8 on every platform, which is also what pd.read_csv expects by default
with open('ICML2020_papers_authors_ID.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Paper Title", "Authors", "ID"])
    writer.writerows(papers)

Step 2: Parse the papers CSV into individual author names, then look each author up on Google Scholar to obtain their affiliation.

In [None]:
# Initialize
# author_names: individual author names; author_infos: [author, affiliation] rows
author_names, author_infos = [], []

In [None]:
# Loading up papers csv and separating all authors into individual elements for iteration
icml_papers = pd.read_csv("ICML2020_papers_authors_ID.csv")

# Papers saved with an empty author list are read back as NaN, so those rows are dropped.
# dict.fromkeys removes repeated names (an author of several papers) while keeping
# first-seen order, so each person is looked up only once in the slow query step.
author_names = list(dict.fromkeys(
  name
  for row in icml_papers["Authors"].dropna()
  for name in row.split(", ")
))

In [None]:
# Querying all authors in the author_names list in Google Scholar, obtain affiliations
# THIS WILL TAKE A LONG TIME, TRY TO AVOID DOING THIS MANY TIMES
for i, author in enumerate(author_names):
  try:
    search_query = scholarly.search_author(author)
    # Use the first search hit; next() raises StopIteration when there is none
    info = next(search_query)
    info = info.fill(sections=["basic"])
    author_infos.append([author, info.affiliation])
    print(str(i) + " " + author + ", " + info.affiliation)
  except Exception:
    # `except Exception` instead of a bare `except` so that interrupting the
    # kernel (KeyboardInterrupt) can still stop this long-running loop
    print(str(i) + " " + author + " not found, or there might be a disconnection from Google Scholars")
  time.sleep(random.uniform(5,6)) # Pause is required or else Google Scholar will block your IP!

In [None]:
# Add all information into ICML2020_authors_all.csv file (an existing file is overwritten)
# UTF-8 is set explicitly so non-ASCII names and affiliations are written the same way on every platform
with open("ICML2020_authors_all.csv", 'w', newline='', encoding='utf-8') as file:
  writer = csv.writer(file)
  writer.writerow(["Author", "Affiliation"])
  writer.writerows(author_infos)

Step 3: Filter the papers by author affiliation, depending on which institutions/colleges you want to focus on.

In [None]:
# Import csv files
# Both files are read from the current working directory
papers_csv = "ICML2020_papers_authors_ID.csv"
authors_csv = "ICML2020_authors_all.csv"

icml_papers = pd.read_csv(papers_csv)    # columns used later: "Paper Title", "Authors", "ID"
icml_authors = pd.read_csv(authors_csv)  # columns used later: "Author", "Affiliation"

In [None]:
# Initialize variables
# authors_all: names of selected authors; papers_all: [title, id] pairs of selected papers
authors_all, papers_all = [], []

In [None]:
# Institutions and Colleges to filter (we want to read them all!)
# Each entry is a string that is tested as a substring of an author's affiliation;
# leaving the list empty keeps every author
institutions_and_colleges = list()

In [None]:
# Filter authors by desired institutions and colleges
# An author is kept when their affiliation contains at least one of the listed
# names. With an empty list no filtering is applied and every author is kept.
# The list is rebuilt from scratch so that re-running this cell does not
# append the same authors a second time.
authors_all = [
  str(author)
  for author, affiliation in zip(icml_authors["Author"], icml_authors["Affiliation"])
  if not institutions_and_colleges
  or any(ele in str(affiliation) for ele in institutions_and_colleges)
]

In [None]:
# Scan through papers and choose the ones of which the authors you want to read about
# A set gives exact-name matching (a short name no longer matches inside a longer
# one, e.g. "Yi Li" inside "Yi Lin") and constant-time lookups.
wanted_authors = set(authors_all)

# Rebuilt from scratch so that re-running this cell does not duplicate entries
papers_all = []
for paper, authors, paper_id in zip(icml_papers["Paper Title"], icml_papers["Authors"], icml_papers["ID"]):
  # A paper saved without authors is read back from the CSV as NaN (a float); skip it
  if not isinstance(authors, str):
    continue
  if not wanted_authors.isdisjoint(authors.split(", ")):
    papers_all.append([paper, paper_id])

Step 4: Download the PDFs of all the papers you want to read. This requires ICML 2020 login credentials.

In [None]:
# Initialize variables
# Base URL of a paper's page on the ICML 2020 virtual site; the paper ID is appended to it.
# NOTE: this rebinds `url_template`, which Step 1 used for the schedule URL, so
# Step 1 cannot be re-run after this cell without re-running its own initialization.
url_template = "https://icml.cc/virtual/2020/poster/"

In [None]:
# Login
login_url = "https://icml.cc/accounts/login?nextp=/virtual/2020"
# Credentials come from the ICML_EMAIL / ICML_PASSWORD environment variables,
# or are prompted for, so that they are never stored in the notebook itself
EMAIL = os.environ.get("ICML_EMAIL") or input("ICML login username/email: ")
PASSWORD = os.environ.get("ICML_PASSWORD") or getpass.getpass("ICML login password: ")

session = requests.Session()
session.get(login_url, timeout=30)  # sets cookie

# The CSRF cookie is named 'csrftoken' (Django 1.6 and up) or 'csrf' (older versions)
csrftoken = session.cookies.get('csrftoken') or session.cookies.get('csrf')
if csrftoken is None:
    raise RuntimeError("No CSRF cookie received from " + login_url + "; cannot log in")

login_data = dict(username=EMAIL, password=PASSWORD, csrfmiddlewaretoken=csrftoken, next='/')
r = session.post(login_url, data=login_data, headers=dict(Referer=login_url), timeout=30)
# NOTE: only the HTTP status is shown; whether the credentials were accepted is not checked here
print(r.status_code)

In [None]:
# Create a directory of papers
# NOTE: %cd changes the kernel's working directory, so the relative paths used
# by the later cells (the downloaded PDFs and the tar archive) resolve inside Papers/.
# Run this cell once per kernel session: running it again would create and
# enter a nested Papers/Papers directory.
%mkdir Papers
%cd Papers

In [None]:
# Find and download papers with their IDs, requires login!
# A paper that cannot be fetched is reported and skipped, so that one bad page
# does not abort the whole run.

# Characters that are not allowed in file names on common platforms are replaced by "_"
unsafe_chars = str.maketrans({ch: "_" for ch in '\\/:*?"<>|\r\n\t'})

for title, paper_id in papers_all:
  url = url_template + str(paper_id)

  try:
    page = session.get(url, timeout=30)

    # Find PDF link
    soup = BeautifulSoup(page.content, "html.parser")
    pdf_link = soup.find_all("a", class_="card-link")

    if not assert_equals(page.status_code):
      print("Skipping {}: HTTP status {}".format(url, page.status_code))
    elif len(pdf_link) < 2 or not pdf_link[1].get('href'):
      # The second "card-link" anchor is the one used as the PDF link
      print("Skipping {}: no PDF link found".format(url))
    else:
      # Just so users can see the pdf link
      print(pdf_link[1])

      # Download PDF to local directory; urljoin also resolves a relative href
      r = session.get(urljoin(url, pdf_link[1].get('href')), timeout=60)
      if assert_equals(r.status_code):
        # Fall back to the paper ID if nothing usable is left of the title
        safe_title = str(title).translate(unsafe_chars).strip() or str(paper_id)
        with open("{}.pdf".format(safe_title), 'wb') as f:
          f.write(r.content)
      else:
        print("Skipping {}: PDF download returned HTTP status {}".format(url, r.status_code))
  except (requests.RequestException, OSError) as err:
    # Network failures and file-system errors (e.g. an over-long file name)
    print("Skipping {}: {}".format(url, err))

  time.sleep(random.uniform(1,2))

In [None]:
# Zip everything
# Packs every non-hidden entry of the current directory (matched by *) into the
# gzip-compressed archive notebook.tar.gz.
# Flags: c = create, h = follow symlinks, v = verbose, f = archive file name, z = gzip.
!tar chvfz notebook.tar.gz *