In [1]:
from time import sleep
from urllib.parse import quote_plus

# import pandas as pd
from requests_html import HTMLSession

Search for machine learning articles from 1968 to 2003 in English, German, Spanish and Traditional Chinese:

In [2]:
# Open an HTML-aware session and request one page of Scholar results.
session = HTMLSession()
# Query-string parts used here: `lr` is the language filter, `q` the search
# terms, and `as_ylo`/`as_yhi` the first and last publication year.
response = session.get('https://scholar.google.com/scholar?lr=lang_zh-TW|lang_en|lang_de|lang_es&q=machine+learning&hl=en&as_sdt=0,5&as_ylo=1968&as_yhi=2003')
# Bare last expression so the notebook displays the response object.
response

<Response [200]>

Extract all results from the page:

In [3]:
# Select every <div> carrying the three classes gs_r, gs_or and gs_scl;
# each match is treated as one search result by the cells below.
results = response.html.find('div.gs_r.gs_or.gs_scl')
# Display the matched elements together with how many were found.
results, len(results)

([<Element 'div' class=('gs_r', 'gs_or', 'gs_scl') data-cid='HywQwjDdNeQJ' data-did='HywQwjDdNeQJ' data-lid='' data-aid='HywQwjDdNeQJ' data-rp='0'>,
  <Element 'div' class=('gs_r', 'gs_or', 'gs_scl') data-cid='AgfPAvZ8ZNYJ' data-did='AgfPAvZ8ZNYJ' data-lid='' data-aid='AgfPAvZ8ZNYJ' data-rp='1'>,
  <Element 'div' class=('gs_r', 'gs_or', 'gs_scl') data-cid='tHesnpWGq8EJ' data-did='tHesnpWGq8EJ' data-lid='' data-aid='tHesnpWGq8EJ' data-rp='2'>,
  <Element 'div' class=('gs_r', 'gs_or', 'gs_scl') data-cid='A558H9ycTNoJ' data-did='A558H9ycTNoJ' data-lid='' data-aid='A558H9ycTNoJ' data-rp='3'>,
  <Element 'div' class=('gs_r', 'gs_or', 'gs_scl') data-cid='iUQj7JK-7ooJ' data-did='iUQj7JK-7ooJ' data-lid='' data-aid='iUQj7JK-7ooJ' data-rp='4'>,
  <Element 'div' class=('gs_r', 'gs_or', 'gs_scl') data-cid='cRq48wm8IagJ' data-did='cRq48wm8IagJ' data-lid='' data-aid='cRq48wm8IagJ' data-rp='5'>,
  <Element 'div' class=('gs_r', 'gs_or', 'gs_scl') data-cid='69_3C1LkwxUJ' data-did='69_3C1LkwxUJ' data-li

Get the title, authors, source, direct paper link and # of citations for a result:

In [4]:
# Work on the second result of the page as a representative sample.
sample = results[1]
title = sample.find('h3.gs_rt', first=True).text
# Note: this line holds more than the author names (see the output below:
# authors, then publication/year, then publisher).
authors = sample.find('div.gs_a', first=True).text
source = sample.find('h3.gs_rt a', first=True).attrs['href']

# A result may lack a direct full-text link. find(..., first=True) returns
# None when the selector matches nothing (per requests-html), so guard the
# attribute access instead of failing with AttributeError.
paper_link = sample.find('div.gs_or_ggsm a', first=True)
paper = paper_link.attrs['href'] if paper_link is not None else None

# Pick the citation link by its text rather than by its position among the
# footer links: a fixed index silently returns a different link whenever the
# footer has a different set of links. The match relies on the English
# interface text ("Cited by N") requested via hl=en in the URL above.
citations = next(
    (link.text
     for link in sample.find('div.gs_ri div.gs_fl a')
     if link.text.startswith('Cited by')),
    None,  # No "Cited by" link found for this result.
)

title, authors, source, paper, citations

('The boosting approach to machine learning: An overview',
 'RE Schapire\xa0- Nonlinear estimation and classification, 2003 - Springer',
 'https://link.springer.com/chapter/10.1007/978-0-387-21579-2_9',
 'http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.448.3000&rep=rep1&type=pdf',
 'Cited by 2535')

In [5]:
def search_by_params(params: str, *, session=None, max_pages: int = 100,
                     page_size: int = 10, delay: float = .5) -> list:
    """Extracts and returns the results from the Google Scholar search.

    Result pages are requested one after another until a page comes back with
    fewer than ``page_size`` results or ``max_pages`` pages have been fetched.

    Args:
        params: URL query string without the leading ``?`` and without a
            ``start`` offset (the offset is appended per page).
        session: Object whose ``get(url)`` returns a response exposing
            ``.html.find(selector)``. A new ``HTMLSession`` is created when
            omitted.
        max_pages: Upper bound on the number of pages requested. The default
            of 100 is the final page number noted by the original author.
        page_size: Number of results expected on a full page; also the step
            of the ``start`` offset.
        delay: Seconds to wait between two consecutive page requests.

    Returns:
        The matched result elements of every fetched page, in page order,
        including those of a final, partially filled page.
    """
    if session is None:
        session = HTMLSession()
    all_results = []

    for page in range(max_pages):
        response = session.get(f'https://scholar.google.com/scholar?{params}&start={page * page_size}')
        page_results = response.html.find('div.gs_r.gs_or.gs_scl')
        # Store the page before testing whether it is the last one; checking
        # first would throw away the results of a partially filled last page.
        all_results.extend(page_results)
        if len(page_results) < page_size:
            break  # A short or empty page means there is nothing further to fetch.
        if page + 1 < max_pages:
            sleep(delay)  # Google blocks requests if too many are sent too quickly.

    return all_results

In [6]:
%%time
# Run the paginated search for "metaverse" restricted to 1989-2015; the
# %%time cell magic above reports how long the whole crawl takes.
raw_results = search_by_params('q=metaverse&as_sdt=0,5&as_ylo=1989&as_yhi=2015')
# Display the collected elements together with their count.
raw_results, len(raw_results)

CPU times: user 8.23 s, sys: 211 ms, total: 8.44 s
Wall time: 2min 16s


([<Element 'div' class=('gs_r', 'gs_or', 'gs_scl') data-cid='6_xoBxqaXeAJ' data-did='6_xoBxqaXeAJ' data-lid='' data-aid='6_xoBxqaXeAJ' data-rp='0'>,
  <Element 'div' class=('gs_r', 'gs_or', 'gs_scl') data-cid='zfBpRW3HIB4J' data-did='zfBpRW3HIB4J' data-lid='' data-aid='zfBpRW3HIB4J' data-rp='1'>,
  <Element 'div' class=('gs_r', 'gs_or', 'gs_scl') data-cid='eN7Fq0mkHlMJ' data-did='eN7Fq0mkHlMJ' data-lid='' data-aid='eN7Fq0mkHlMJ' data-rp='2'>,
  <Element 'div' class=('gs_r', 'gs_or', 'gs_scl') data-cid='919uH2RP2xYJ' data-did='919uH2RP2xYJ' data-lid='' data-aid='919uH2RP2xYJ' data-rp='3'>,
  <Element 'div' class=('gs_r', 'gs_or', 'gs_scl') data-cid='7V20oHDut4YJ' data-did='7V20oHDut4YJ' data-lid='' data-aid='7V20oHDut4YJ' data-rp='4'>,
  <Element 'div' class=('gs_r', 'gs_or', 'gs_scl') data-cid='tv0nCcYBSm0J' data-did='tv0nCcYBSm0J' data-lid='' data-aid='tv0nCcYBSm0J' data-rp='5'>,
  <Element 'div' class=('gs_r', 'gs_or', 'gs_scl') data-cid='Ja7hQB7qnRYJ' data-did='Ja7hQB7qnRYJ' data-li

In [3]:
def build_query(**kwargs) -> str:
    """Builds a Google Scholar query string based on the given arguments.

    Keyword Args:
        keywords (str): Search terms; required. They are URL-encoded, with
            spaces written as ``+``.
        start_year: First publication year (``as_ylo``); omitted when absent.
        end_year: Last publication year (``as_yhi``); omitted when absent.
        languages: Language codes such as ``'en'`` or ``'zh-TW'``, given as an
            iterable or as a single string; omitted when absent or empty.

    Returns:
        The query string without a leading ``?``, e.g.
        ``'q=metaverse+privacy&as_ylo=1800&as_yhi=2012&lr=lang_en|lang_zh-TW'``.

    Raises:
        TypeError: If ``keywords`` is not supplied.
    """
    keywords = kwargs.get('keywords')
    if keywords is None:
        raise TypeError("build_query() missing required keyword argument: 'keywords'")

    # quote_plus writes spaces as '+' and also escapes characters such as
    # '&', '+' and '#', which would otherwise corrupt the query string.
    parts = [f'q={quote_plus(keywords)}']

    # Leave out a year bound that was not given instead of emitting "None".
    year_bounds = (('as_ylo', kwargs.get('start_year')),
                   ('as_yhi', kwargs.get('end_year')))
    parts.extend(f'{name}={year}' for name, year in year_bounds if year is not None)

    languages = kwargs.get('languages')
    if isinstance(languages, str):
        # A bare string would otherwise be iterated character by character.
        languages = [languages]
    if languages:
        parts.append('lr=' + '|'.join(f'lang_{code}' for code in languages))

    return '&'.join(parts)

In [2]:
# Example call: keywords, start_year, end_year and languages are the keyword
# names that build_query reads from its **kwargs.
build_query(keywords='metaverse privacy', start_year=1800, end_year=2012, languages=['en', 'zh-TW'])

'q=metaverse+privacy&as_ylo=1800&as_yhi=2012&lr=lang_en|lang_zh-TW'