In [63]:
from time import sleep

import pandas as pd
from requests_html import HTMLSession

In [2]:
session = HTMLSession()

Search for machine learning articles from 1968 to 2003 in English, German, Spanish and Traditional Chinese:

In [6]:
response = session.get('https://scholar.google.com/scholar?lr=lang_zh-TW|lang_en|lang_de|lang_es&q=machine+learning&hl=en&as_sdt=0,5&as_ylo=1968&as_yhi=2003')
response

<Response [200]>

Extract all results from the page:

In [7]:
results = response.html.find('div.gs_r.gs_or.gs_scl')
results, len(results)

([<Element 'div' class=('gs_r', 'gs_or', 'gs_scl') data-cid='AgfPAvZ8ZNYJ' data-did='AgfPAvZ8ZNYJ' data-lid='' data-aid='AgfPAvZ8ZNYJ' data-rp='0'>,
  <Element 'div' class=('gs_r', 'gs_or', 'gs_scl') data-cid='T84oNxNViJYJ' data-did='T84oNxNViJYJ' data-lid='' data-aid='T84oNxNViJYJ' data-rp='1'>,
  <Element 'div' class=('gs_r', 'gs_or', 'gs_scl') data-cid='A558H9ycTNoJ' data-did='A558H9ycTNoJ' data-lid='' data-aid='A558H9ycTNoJ' data-rp='2'>,
  <Element 'div' class=('gs_r', 'gs_or', 'gs_scl') data-cid='qtAhQGrhJtMJ' data-did='qtAhQGrhJtMJ' data-lid='' data-aid='qtAhQGrhJtMJ' data-rp='3'>,
  <Element 'div' class=('gs_r', 'gs_or', 'gs_scl') data-cid='iUQj7JK-7ooJ' data-did='iUQj7JK-7ooJ' data-lid='' data-aid='iUQj7JK-7ooJ' data-rp='4'>,
  <Element 'div' class=('gs_r', 'gs_or', 'gs_scl') data-cid='cRq48wm8IagJ' data-did='cRq48wm8IagJ' data-lid='' data-aid='cRq48wm8IagJ' data-rp='5'>,
  <Element 'div' class=('gs_r', 'gs_or', 'gs_scl') data-cid='69_3C1LkwxUJ' data-did='69_3C1LkwxUJ' data-li

Get the title, authors, source, direct paper link and # of citations for a result:

In [32]:
sample = results[6]
title = sample.find('h3.gs_rt', first=True).text
authors = sample.find('div.gs_a', first=True).text
source = sample.find('h3.gs_rt a', first=True).attrs['href']
paper = sample.find('div.gs_or_ggsm a', first=True).attrs['href']
citations = sample.find('div.gs_ri div.gs_fl a')[2].text

title, authors, source, paper, citations

('“Memo” functions and machine learning',
 'D Michie\xa0- Nature, 1968 - nature.com',
 'https://www.nature.com/articles/218019a0',
 'https://stacks.stanford.edu/file/druid:dn905ks9646/dn905ks9646.pdf',
 'Cited by 762')

Extract elements using XPath:

In [33]:
sample = results[2]
title = sample.xpath('.//h3[@class="gs_rt"]/a | .//h3[@class="gs_rt"]/span[2]', first=True).text
authors = sample.xpath('.//div[@class="gs_a"]', first=True).text
source = sample.xpath('.//h3[@class="gs_rt"]/a/@href', first=True)
paper = sample.xpath('.//div[@class="gs_or_ggsm"]/a/@href', first=True)
citations = sample.xpath('.//div[@class="gs_ri"]/div[@class="gs_fl"]/a[3]', first=True).text

title, authors, source, paper, citations

('Elements of machine learning',
 'P Langley - 1996 - books.google.com',
 'https://books.google.com/books?hl=en&lr=lang_zh-TW|lang_en|lang_de|lang_es&id=TNg5qVoqRtUC&oi=fnd&pg=PR9&dq=machine+learning&ots=Q4ppXyx-Ok&sig=uBfm0ZKaKv7GOr8RoqgGtw5gJ6Y',
 None,
 'Cited by 896')

In [5]:
def search_by_params(params: str) -> list:
    """Extracts and returns the results from the Google Scholar search."""
    session = HTMLSession()
    all_results = []

    for i in range(100):  # The final page number is 100 and each page has 10 results.
        response = session.get(f'https://scholar.google.com/scholar?{params}&start={i*10}')
        page_results = response.html.find('div.gs_r.gs_or.gs_scl')
        if len(page_results) < 10: break
        all_results.extend(page_results)
        sleep(.5)  # Google blocks requests if too many are sent too quickly.

    return all_results

In [6]:
%%time
raw_results = search_by_params('q=metaverse&as_sdt=0,5&as_ylo=1989&as_yhi=2015')
raw_results, len(raw_results)

CPU times: user 8.23 s, sys: 211 ms, total: 8.44 s
Wall time: 2min 16s


([<Element 'div' class=('gs_r', 'gs_or', 'gs_scl') data-cid='6_xoBxqaXeAJ' data-did='6_xoBxqaXeAJ' data-lid='' data-aid='6_xoBxqaXeAJ' data-rp='0'>,
  <Element 'div' class=('gs_r', 'gs_or', 'gs_scl') data-cid='zfBpRW3HIB4J' data-did='zfBpRW3HIB4J' data-lid='' data-aid='zfBpRW3HIB4J' data-rp='1'>,
  <Element 'div' class=('gs_r', 'gs_or', 'gs_scl') data-cid='eN7Fq0mkHlMJ' data-did='eN7Fq0mkHlMJ' data-lid='' data-aid='eN7Fq0mkHlMJ' data-rp='2'>,
  <Element 'div' class=('gs_r', 'gs_or', 'gs_scl') data-cid='919uH2RP2xYJ' data-did='919uH2RP2xYJ' data-lid='' data-aid='919uH2RP2xYJ' data-rp='3'>,
  <Element 'div' class=('gs_r', 'gs_or', 'gs_scl') data-cid='7V20oHDut4YJ' data-did='7V20oHDut4YJ' data-lid='' data-aid='7V20oHDut4YJ' data-rp='4'>,
  <Element 'div' class=('gs_r', 'gs_or', 'gs_scl') data-cid='tv0nCcYBSm0J' data-did='tv0nCcYBSm0J' data-lid='' data-aid='tv0nCcYBSm0J' data-rp='5'>,
  <Element 'div' class=('gs_r', 'gs_or', 'gs_scl') data-cid='Ja7hQB7qnRYJ' data-did='Ja7hQB7qnRYJ' data-li

In [3]:
def build_query(**kwargs) -> str:
    """Builds a Google Scholar query based the given arguments."""
    keywords = f'q={kwargs.get("keywords").replace(" ", "+")}'
    year_range = f'as_ylo={kwargs.get("start_year")}&as_yhi={kwargs.get("end_year")}'
    languages = f'lr={"|".join([f"lang_{l}" for l in kwargs.get("languages")])}'
    return f'{keywords}&{year_range}&{languages}'

In [2]:
build_query(keywords='metaverse privacy', start_year=1800, end_year=2012, languages=['en', 'zh-TW'])

'q=metaverse+privacy&as_ylo=1800&as_yhi=2012&lr=lang_en|lang_zh-TW'

Time to fetch the articles of a profile by the user ID:

In [4]:
response = session.get('https://scholar.google.com/citations?user=VfYhf2wAAAAJ&hl=en')
response

<Response [200]>

In [5]:
show_more_button = response.html.xpath('.//button[@class="gs_btnPD gs_in_ib gs_btn_flat gs_btn_lrge gs_btn_lsu"]', first=True)
show_more_button

<Element 'button' type='button' id='gsc_bpf_more' class=('gs_btnPD', 'gs_in_ib', 'gs_btn_flat', 'gs_btn_lrge', 'gs_btn_lsu')>

In [6]:
show_more_button.attrs

{'type': 'button',
 'id': 'gsc_bpf_more',
 'class': ('gs_btnPD', 'gs_in_ib', 'gs_btn_flat', 'gs_btn_lrge', 'gs_btn_lsu')}

In [8]:
articles = response.html.xpath('.//tr[@class="gsc_a_tr"]')
articles

[<Element 'tr' class=('gsc_a_tr',)>,
 <Element 'tr' class=('gsc_a_tr',)>,
 <Element 'tr' class=('gsc_a_tr',)>,
 <Element 'tr' class=('gsc_a_tr',)>,
 <Element 'tr' class=('gsc_a_tr',)>,
 <Element 'tr' class=('gsc_a_tr',)>,
 <Element 'tr' class=('gsc_a_tr',)>,
 <Element 'tr' class=('gsc_a_tr',)>,
 <Element 'tr' class=('gsc_a_tr',)>,
 <Element 'tr' class=('gsc_a_tr',)>,
 <Element 'tr' class=('gsc_a_tr',)>,
 <Element 'tr' class=('gsc_a_tr',)>,
 <Element 'tr' class=('gsc_a_tr',)>,
 <Element 'tr' class=('gsc_a_tr',)>,
 <Element 'tr' class=('gsc_a_tr',)>,
 <Element 'tr' class=('gsc_a_tr',)>,
 <Element 'tr' class=('gsc_a_tr',)>,
 <Element 'tr' class=('gsc_a_tr',)>,
 <Element 'tr' class=('gsc_a_tr',)>,
 <Element 'tr' class=('gsc_a_tr',)>]

The extracting method here is a little different to the articles and case law sections...

In [16]:
sample = articles[0]
title = sample.xpath('.//a[@class="gsc_a_at"]/text()', first=True)
authors = sample.xpath('.//div[@class="gs_gray"][1]/text()', first=True)
journal = sample.xpath('.//div[@class="gs_gray"][2]/text()', first=True)
publish_year = sample.xpath('.//a[@class="gsc_a_ac gs_ibl"]/text()', first=True)
citations_no = sample.xpath('.//span[@class="gsc_a_h gsc_a_hc gs_ibl"]/text()', first=True)

title, authors, journal, publish_year, citations_no

('Keras', 'F Chollet', 'keras.io 10', '14883', '2015')

In [18]:
sample = articles[4]
title = sample.xpath('.//a[@class="gsc_a_at"]/text()', first=True)
authors = sample.xpath('.//div[@class="gs_gray"][1]/text()', first=True)
journal = sample.xpath('.//div[@class="gs_gray"][2]/text()', first=True)
publish_year = sample.xpath('.//a[@class="gsc_a_ac gs_ibl"]/text()', first=True)
citations_no = sample.xpath('.//span[@class="gsc_a_h gsc_a_hc gs_ibl"]/text()', first=True)

title, authors, journal, publish_year, citations_no

('Depthwise separable convolutions for neural machine translation',
 'L Kaiser, AN Gomez, F Chollet',
 'arXiv preprint arXiv:1706.03059',
 '218',
 '2017')

At the bottom of the profile page we will see the "SHOW MORE" option, but we don't need Selenium to click that sh\*t... Just follow the network requests!

In [14]:
articles = []

response = session.get('https://scholar.google.com/citations?hl=en&user=L5xeTxkAAAAJ&cstart=800&pagesize=100')
articles.extend(response.html.xpath('.//tr[@class="gsc_a_tr"]'))
print(response.html.xpath('.//tr[@class="gsc_a_tr"]'))    

show_more_button = response.html.xpath('.//button[@class="gs_btnPD gs_in_ib gs_btn_flat gs_btn_lrge gs_btn_lsu"]/@disabled', first=True)
print('Is show more button disabled?', 'No' if show_more_button is None else 'Yes')

[<Element 'tr' class=('gsc_a_tr',)>, <Element 'tr' class=('gsc_a_tr',)>, <Element 'tr' class=('gsc_a_tr',)>, <Element 'tr' class=('gsc_a_tr',)>, <Element 'tr' class=('gsc_a_tr',)>, <Element 'tr' class=('gsc_a_tr',)>, <Element 'tr' class=('gsc_a_tr',)>, <Element 'tr' class=('gsc_a_tr',)>, <Element 'tr' class=('gsc_a_tr',)>, <Element 'tr' class=('gsc_a_tr',)>, <Element 'tr' class=('gsc_a_tr',)>, <Element 'tr' class=('gsc_a_tr',)>, <Element 'tr' class=('gsc_a_tr',)>, <Element 'tr' class=('gsc_a_tr',)>, <Element 'tr' class=('gsc_a_tr',)>, <Element 'tr' class=('gsc_a_tr',)>, <Element 'tr' class=('gsc_a_tr',)>, <Element 'tr' class=('gsc_a_tr',)>, <Element 'tr' class=('gsc_a_tr',)>, <Element 'tr' class=('gsc_a_tr',)>, <Element 'tr' class=('gsc_a_tr',)>, <Element 'tr' class=('gsc_a_tr',)>, <Element 'tr' class=('gsc_a_tr',)>, <Element 'tr' class=('gsc_a_tr',)>, <Element 'tr' class=('gsc_a_tr',)>, <Element 'tr' class=('gsc_a_tr',)>, <Element 'tr' class=('gsc_a_tr',)>, <Element 'tr' class=('gsc_a

In [17]:
results = []

for article in articles:
    title = article.xpath('.//a[@class="gsc_a_at"]/text()', first=True)
    authors = article.xpath('.//div[@class="gs_gray"][1]/text()', first=True)
    journal = article.xpath('.//div[@class="gs_gray"][2]/text()', first=True)
    publish_year = article.xpath('.//a[@class="gsc_a_ac gs_ibl"]/text()', first=True)
    citations_no = article.xpath('.//span[@class="gsc_a_h gsc_a_hc gs_ibl"]/text()', first=True)
    results.append({
        'title': title,
        'authors': authors,
        'journal': journal,
        'publish_year': publish_year,
        'citations_no': citations_no,
    })

results

[{'title': 'Intentional injuries in the Eastern Mediterranean Region, 1990–2015: findings from the Global Burden of Disease 2015 study',
  'authors': 'GBDEMRII Collaborators',
  'journal': 'International Journal of Public Health 63 (1), 39-46',
  'publish_year': None,
  'citations_no': '2018'},
 {'title': 'ELLICOTT C. MATTHAY, PHD MPH',
  'authors': 'J Ahern, M Petersen, J Skeem, B Hernández-Prado, A Mokdad, S Lim, ...',
  'journal': 'University of California, Berkeley',
  'publish_year': None,
  'citations_no': '2018'},
 {'title': 'Abortion in the United States: Trends, Cross-State Care-Seeking, and Unmet Need for in-State Services',
  'authors': 'A Wollum, N Kassebaum, A Mokdad, I Ahluwalia, C Gerdts',
  'journal': 'PAA 2018 Annual Meeting',
  'publish_year': None,
  'citations_no': '2018'},
 {'title': 'Measuring progress from 1990 to 2017 and projecting attainment to 2030 of the health-related Sustainable Development Goals for 195 countries and territories: a systematic\xa0…',
  'au

To bypassing the Google's CAPTCHA we should use a website named [free-proxy-list.net](https://free-proxy-list.net)... The first step is fetching the proxy board from the homepage and then testing them.

In [30]:
response = session.get('https://free-proxy-list.net')
response

<Response [200]>

In [32]:
proxy_table = response.html.xpath('.//table[@class="table table-striped table-bordered"]', first=True)
proxy_table

<Element 'table' class=('table', 'table-striped', 'table-bordered')>

In [33]:
ip_address = proxy_table.xpath('//td[1]/text()')
port = proxy_table.xpath('//td[2]/text()')
country = proxy_table.xpath('//td[4]/text()')
google = proxy_table.xpath('//td[6]/text()')
https = proxy_table.xpath('//td[7]/text()')

proxies = list(zip(ip_address, port, country, google, https))
proxies

[('208.163.39.218', '53281', 'Jamaica', 'no', 'no'),
 ('05.189.229.42', '1081', 'Unknown', 'no', 'no'),
 ('181.204.9.182', '9812', 'Colombia', 'no', 'no'),
 ('173.82.149.243', '8080', 'United States', 'no', 'no'),
 ('103.108.75.10', '9812', 'India', 'no', 'no'),
 ('93.185.3.161', '9812', 'Czech Republic', 'no', 'no'),
 ('181.129.183.19', '53281', 'Colombia', 'no', 'no'),
 ('67.212.83.54', '1080', 'Canada', 'no', 'no'),
 ('213.230.97.10', '3128', 'Uzbekistan', 'yes', 'no'),
 ('103.109.195.187', '9812', 'Indonesia', 'no', 'no'),
 ('47.89.253.218', '80', 'United States', 'yes', 'no'),
 ('203.198.207.253', '80', 'Hong Kong', 'no', 'no'),
 ('15.165.18.201', '80', 'Korea', 'no', 'no'),
 ('91.234.127.222', '53281', 'Poland', 'no', 'no'),
 ('3.128.120.252', '80', 'United States', 'yes', 'no'),
 ('164.155.145.0', '80', 'United States', 'no', 'no'),
 ('103.115.14.180', '80', 'Afghanistan', 'no', 'no'),
 ('103.115.14.158', '80', 'Afghanistan', 'no', 'no'),
 ('103.115.14.116', '80', 'Afghanistan',

In [34]:
working_proxies = []

for proxy in proxies:
    ip_address = f'http://{proxy[0]}:{proxy[1]}'
    try:
        response = session.get('https://httpbin.org/ip', proxies={'http': ip_address, 'https': ip_address}, timeout=1)
        working_proxies.append(ip_address)
        print(f'{ip_address} is working!')
    except Exception as e:
        pass

http://169.57.1.85:8123 is working!


The above method is not good, because it is too slow... The proxies table has a column called "Google" which indicates whether the proxy is used for Google or not.

In [35]:
for proxy in proxies:
    if proxy[3] == 'yes':
        ip_address = f'http://{proxy[0]}:{proxy[1]}'
        try:
            response = session.get('https://httpbin.org/ip', proxies={'http': ip_address, 'https': ip_address}, timeout=2)
            working_proxies.append(ip_address)
            print(f'{ip_address} is working!')
        except Exception as e:
            print(e)

HTTPSConnectionPool(host='httpbin.org', port=443): Max retries exceeded with url: /ip (Caused by ProxyError('Cannot connect to proxy.', ConnectionResetError(104, 'Connection reset by peer')))
HTTPSConnectionPool(host='httpbin.org', port=443): Max retries exceeded with url: /ip (Caused by ProxyError('Cannot connect to proxy.', timeout('timed out')))
HTTPSConnectionPool(host='httpbin.org', port=443): Max retries exceeded with url: /ip (Caused by ProxyError('Cannot connect to proxy.', ConnectionResetError(104, 'Connection reset by peer')))
HTTPSConnectionPool(host='httpbin.org', port=443): Max retries exceeded with url: /ip (Caused by ProxyError('Cannot connect to proxy.', OSError('Tunnel connection failed: 407 Proxy Authentication Required')))
HTTPSConnectionPool(host='httpbin.org', port=443): Max retries exceeded with url: /ip (Caused by ProxyError('Cannot connect to proxy.', timeout('timed out')))
HTTPSConnectionPool(host='httpbin.org', port=443): Max retries exceeded with url: /ip (Ca

Okay... That column is not reliable... Maybe the "Https" column is better...

In [36]:
for proxy in proxies:
    if proxy[4] == 'yes':
        ip_address = f'http://{proxy[0]}:{proxy[1]}'
        try:
            response = session.get('https://httpbin.org/ip', proxies={'http': ip_address, 'https': ip_address}, timeout=2)
            working_proxies.append(ip_address)
            print(f'{ip_address} is working!')
        except Exception as e:
            print(e)

HTTPSConnectionPool(host='httpbin.org', port=443): Max retries exceeded with url: /ip (Caused by ProxyError('Cannot connect to proxy.', timeout('_ssl.c:1112: The handshake operation timed out')))
HTTPSConnectionPool(host='httpbin.org', port=443): Max retries exceeded with url: /ip (Caused by ProxyError('Cannot connect to proxy.', ConnectionResetError(104, 'Connection reset by peer')))
HTTPSConnectionPool(host='httpbin.org', port=443): Max retries exceeded with url: /ip (Caused by ProxyError('Cannot connect to proxy.', timeout('_ssl.c:1112: The handshake operation timed out')))


Almost no one works... We hope using the proxy to fetch all the articles without the CAPTCHA.

In [9]:
from concurrent.futures import ThreadPoolExecutor

In [40]:
def fetch_proxies() -> list:
    response = session.get('https://free-proxy-list.net')
    proxy_table = response.html.xpath('.//table[@class="table table-striped table-bordered"]', first=True)
    ip_address = proxy_table.xpath('//td[1]/text()')
    port = proxy_table.xpath('//td[2]/text()')
    country = proxy_table.xpath('//td[4]/text()')
    google = proxy_table.xpath('//td[6]/text()')
    https = proxy_table.xpath('//td[7]/text()')

    return list(zip(ip_address, port, country, google, https))

In [52]:
def test_proxy(proxy: tuple) -> bool:
    ip_address = f'http://{proxy[0]}:{proxy[1]}'
    try:
        response = session.get('https://httpbin.org/ip', proxies={'http': ip_address, 'https': ip_address}, timeout=1)
        return ip_address
    except Exception as e:
        pass

In [53]:
with ThreadPoolExecutor(max_workers=30) as executor:
    print(list(executor.map(test_proxy, fetch_proxies())))

[None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'http://18.118.139.123:3128', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]


In [58]:
response = session.get('https://scholar.google.com/scholar?start=10&q=history&hl=en&as_sdt=0,5',
                       proxies={'http': 'http://18.118.139.123:3128', 'https': 'http://18.118.139.123:3128'})
response.html.xpath('.//div[@class="gs_r gs_or gs_scl"]')

[]

Free proxy maybe is not key to bypassing the CAPTCHA...

I need to design a good and also simple structure as the scraper base... Using decorators is the first idea:

In [60]:
def crawl(func):    
    def wrapper(**kwargs):
        params = func(**kwargs)
        # print(params)
        # print(params['urls'])
        results = []
        
        for url in params['urls']:
            response = session.get(url)
            result = params['parser'](response)
            print(f'{url} results is:', result)
            print('------------------------------------------')
            results.extend(result)
            if not params['condition'](result):
                break
    
        return results
    
    return wrapper

The following code may look dirty, but it has been tried to be flexible...

In [61]:
@crawl
def fetch_articles(**kwargs):
    query = build_query(**kwargs)
    urls = [f'https://scholar.google.com/scholar?hl=en&as_sdt=0,5&{query}&start={i}' for i in range(0, 1000, 10)]
    
    def parser(response):
        articles = response.html.xpath('.//div[@class="gs_r gs_or gs_scl"]')
        results = []
        
        for article in articles:
            title = article.xpath('.//h3[@class="gs_rt"]/a | .//h3[@class="gs_rt"]/span[2]', first=True).text
            authors = article.xpath('.//div[@class="gs_a"]', first=True).text
            source = article.xpath('.//h3[@class="gs_rt"]/a/@href', first=True)
            paper = article.xpath('.//div[@class="gs_or_ggsm"]/a/@href', first=True)
            citations = article.xpath('.//div[@class="gs_ri"]/div[@class="gs_fl"]/a[3]', first=True).text

            results.append({
                'title': title,
                'authors': authors,
                'source': source,
                'paper': paper,
                'citations': citations,
            })
        return results

    def condition(result):
        if len(result) != 10:
            return False
        return True
    
    return {
        'urls': urls,
        'parser': parser,
        'condition': condition,
    }

In [62]:
articles = fetch_articles(keywords='history', languages=['en'])
articles

https://scholar.google.com/scholar?hl=en&as_sdt=0,5&q=history&as_ylo=None&as_yhi=None&lr=lang_en&start=0 results is: [{'title': 'On history', 'authors': 'F Braudel - 1982 - books.google.com', 'source': 'https://books.google.com/books?hl=en&lr=lang_en&id=Aus0BEQZFXIC&oi=fnd&pg=PP13&dq=history&ots=SZHBC6NsYy&sig=YgiDUtpHOnzU6C6_5vtXtUMyJe0', 'paper': None, 'citations': 'Cited by 2089'}, {'title': 'What is history?', 'authors': 'EH Carr - 2018 - books.google.com', 'source': 'https://books.google.com/books?hl=en&lr=lang_en&id=w8hcDwAAQBAJ&oi=fnd&pg=PT6&dq=history&ots=2mD-7SbLE5&sig=EUEUy4CYnYsyTsS-L6fsabuCrjc', 'paper': 'http://www.math.chalmers.se/~ulfp/Review/history.pdf', 'citations': 'Cited by 6190'}, {'title': 'A history of hypnotism.', 'authors': 'A Gauld - 1992 - psycnet.apa.org', 'source': 'https://psycnet.apa.org/record/1993-97236-000', 'paper': None, 'citations': 'Cited by 776'}, {'title': 'Choreographing history', 'authors': 'SL Foster - 1995 - books.google.com', 'source': 'http

https://scholar.google.com/scholar?hl=en&as_sdt=0,5&q=history&as_ylo=None&as_yhi=None&lr=lang_en&start=30 results is: [{'title': 'The history of the world', 'authors': 'JM Roberts, OA Westad - 2013 - books.google.com', 'source': 'https://books.google.com/books?hl=en&lr=lang_en&id=A2cfZkU5aQgC&oi=fnd&pg=PP2&dq=history&ots=dgtYPCQkNR&sig=yhZL9gFCdG38G_y3O1TXxCI_WRg', 'paper': None, 'citations': 'Cited by 386'}, {'title': 'On the concept of history', 'authors': 'W Benjamin - 2009 - sfu.ca', 'source': 'http://www.sfu.ca/~andrewf/books/Concept_History_Benjamin.pdf', 'paper': 'http://www.sfu.ca/~andrewf/books/Concept_History_Benjamin.pdf', 'citations': 'Cited by 2206'}, {'title': 'The history of the world', 'authors': 'W Raleigh - 1829 - books.google.com', 'source': 'https://books.google.com/books?hl=en&lr=lang_en&id=UboxAQAAMAAJ&oi=fnd&pg=PA1&dq=history&ots=F3bUYBO1MH&sig=CTKVqAC9YioJ-Bftl642j-j9AjA', 'paper': 'https://books.google.com/books?hl=en&lr=lang_en&id=UboxAQAAMAAJ&oi=fnd&pg=PA1&dq

https://scholar.google.com/scholar?hl=en&as_sdt=0,5&q=history&as_ylo=None&as_yhi=None&lr=lang_en&start=60 results is: [{'title': 'Science: A history', 'authors': 'J Gribbin - 2003 - Penguin UK', 'source': None, 'paper': None, 'citations': 'Cited by 351'}, {'title': 'The structures of history', 'authors': 'C Lloyd - 1993 - philpapers.org', 'source': 'https://philpapers.org/rec/LLOTSO', 'paper': None, 'citations': 'Cited by 337'}, {'title': 'English History 1914-1945', 'authors': 'AJP Taylor - 2001 - books.google.com', 'source': 'https://books.google.com/books?hl=en&lr=lang_en&id=KL5IMdG8rDEC&oi=fnd&pg=PP2&dq=history&ots=2X-I9yHVX5&sig=lUTm3OXcopDkwKge2ADNpXQ-vj4', 'paper': None, 'citations': 'Cited by 1886'}, {'title': 'What is cultural history?', 'authors': 'P Burke - 2019 - books.google.com', 'source': 'https://books.google.com/books?hl=en&lr=lang_en&id=hKukDwAAQBAJ&oi=fnd&pg=PT6&dq=history&ots=oV0PNZd-Ho&sig=HyQr6bC69ejY7u7ZBi_G0Fm_buA', 'paper': 'https://publications.hse.ru/pubs/sha

https://scholar.google.com/scholar?hl=en&as_sdt=0,5&q=history&as_ylo=None&as_yhi=None&lr=lang_en&start=90 results is: [{'title': 'A history of India', 'authors': 'R Thapar - 1990 - books.google.com', 'source': 'https://books.google.com/books?hl=en&lr=lang_en&id=1Y02AiEu1kcC&oi=fnd&pg=PT3&dq=history&ots=6XOziCp1HE&sig=5UCwSF4q_seGHZ2goD0Ue4py_eM', 'paper': None, 'citations': 'Cited by 1216'}, {'title': 'A history of God', 'authors': 'K Armstrong - 1994 - Random House Digital, Inc.', 'source': None, 'paper': None, 'citations': 'Cited by 2328'}, {'title': 'History and presence', 'authors': 'RA Orsi\xa0- History and Presence, 2016 - degruyter.com', 'source': 'https://www.degruyter.com/document/doi/10.4159/9780674969056/html', 'paper': 'https://www.degruyter.com/document/doi/10.4159/9780674969056/pdf', 'citations': 'Cited by 302'}, {'title': 'Biography and history', 'authors': 'B Caine - 2018 - books.google.com', 'source': 'https://books.google.com/books?hl=en&lr=lang_en&id=h3dvDwAAQBAJ&oi=

https://scholar.google.com/scholar?hl=en&as_sdt=0,5&q=history&as_ylo=None&as_yhi=None&lr=lang_en&start=120 results is: [{'title': 'The pursuit of history: Aims, methods and new directions in the study of history', 'authors': 'J Tosh - 2013 - taylorfrancis.com', 'source': 'https://www.taylorfrancis.com/books/mono/10.4324/9781315835341/pursuit-history-john-tosh', 'paper': 'http://abuss.narod.ru/Biblio/tosh.pdf', 'citations': 'Cited by 2238'}, {'title': 'A history of Brazil', 'authors': 'EB Burns - 1993 - Columbia University Press', 'source': None, 'paper': None, 'citations': 'Cited by 860'}, {'title': 'Is history fiction?', 'authors': 'A Curthoys - 2010 - books.google.com', 'source': 'https://books.google.com/books?hl=en&lr=lang_en&id=DnOuYd3XlyIC&oi=fnd&pg=PR1&dq=history&ots=j0NTLdXh7n&sig=Qmq7QeeZTqszjjwpKV0D-2OL3vQ', 'paper': None, 'citations': 'Cited by 392'}, {'title': 'A history of medicine', 'authors': 'LN Magner, OJ Kim - 2017 - taylorfrancis.com', 'source': 'https://www.taylorfr

https://scholar.google.com/scholar?hl=en&as_sdt=0,5&q=history&as_ylo=None&as_yhi=None&lr=lang_en&start=150 results is: [{'title': 'History of the Christian church', 'authors': 'W Walker - 2014 - books.google.com', 'source': 'https://books.google.com/books?hl=en&lr=lang_en&id=0kWAAwAAQBAJ&oi=fnd&pg=PR9&dq=history&ots=n5aKvV1jBY&sig=6ACAtihHvvSwZVjaqSZQslIdr5E', 'paper': None, 'citations': 'Cited by 1056'}, {'title': 'Is literary history possible?', 'authors': 'D Perkins - 1992 - booksfree.org', 'source': 'https://www.booksfree.org/wp-content/uploads/2022/02/Is-literary-history-possible-by-david-perkins-pdf.pdf', 'paper': 'https://www.booksfree.org/wp-content/uploads/2022/02/Is-literary-history-possible-by-david-perkins-pdf.pdf', 'citations': 'Cited by 676'}, {'title': 'Professing literature: An institutional history', 'authors': 'G Graff - 2008 - books.google.com', 'source': 'https://books.google.com/books?hl=en&lr=lang_en&id=GT8c60fpavoC&oi=fnd&pg=PR5&dq=history&ots=IPaykzmf6B&sig=8BBR

https://scholar.google.com/scholar?hl=en&as_sdt=0,5&q=history&as_ylo=None&as_yhi=None&lr=lang_en&start=180 results is: [{'title': 'History of economic analysis', 'authors': 'JA Schumpeter - 2006 - taylorfrancis.com', 'source': 'https://www.taylorfrancis.com/books/mono/10.4324/9780203983911/history-economic-analysis-joseph-schumpeter', 'paper': 'https://www.academia.edu/download/38660850/History_of_Economic_Analysis-J.A.Schumpeter.pdf', 'citations': 'Cited by 14504'}, {'title': 'The Cambridge illustrated history of medicine', 'authors': 'R Porter - 2001 - books.google.com', 'source': 'https://books.google.com/books?hl=en&lr=lang_en&id=VsyYXczSmhgC&oi=fnd&pg=PA6&dq=history&ots=MuYj07ZAW4&sig=yUGjB6uMj987_HYXlIP1TMYeeZI', 'paper': None, 'citations': 'Cited by 537'}, {'title': 'Doing oral history', 'authors': 'DA Ritchie - 2014 - books.google.com', 'source': 'https://books.google.com/books?hl=en&lr=lang_en&id=3OtjBAAAQBAJ&oi=fnd&pg=PP1&dq=history&ots=SOrZ9Jk6I4&sig=nfh2JzujECh5_C7Cch-b2o1E

https://scholar.google.com/scholar?hl=en&as_sdt=0,5&q=history&as_ylo=None&as_yhi=None&lr=lang_en&start=210 results is: [{'title': 'London, a social history', 'authors': 'R Porter - 1995 - books.google.com', 'source': 'https://books.google.com/books?hl=en&lr=lang_en&id=DyZfYaLXsuUC&oi=fnd&pg=PR9&dq=history&ots=Ccnz1nhLL0&sig=ZodTTHR0u7h1fwl_5dOV3sSlF4I', 'paper': None, 'citations': 'Cited by 1012'}, {'title': 'From Plataea to Potidaea: studies in the history and historiography of the Pentecontaetia', 'authors': 'E Badian - 1993 - books.google.com', 'source': 'https://books.google.com/books?hl=en&lr=lang_en&id=nj5OiXQG12QC&oi=fnd&pg=PR9&dq=history&ots=LE_4vsIwuf&sig=SRJ1Fpj9TMbntr190kQn_ov1y4E', 'paper': None, 'citations': 'Cited by 293'}, {'title': 'History of Australia', 'authors': 'M Clark - 1993 - books.google.com', 'source': 'https://books.google.com/books?hl=en&lr=lang_en&id=PYpNYjr_xQEC&oi=fnd&pg=PR9&dq=history&ots=UPFFXzUKE9&sig=glANLCGh-a8hprc7nWTUvPhwlNY', 'paper': None, 'citat

https://scholar.google.com/scholar?hl=en&as_sdt=0,5&q=history&as_ylo=None&as_yhi=None&lr=lang_en&start=240 results is: [{'title': 'Evolution: the history of an idea', 'authors': 'PJ Bowler - 1989 - books.google.com', 'source': 'https://books.google.com/books?hl=en&lr=lang_en&id=e2b5B0po8fwC&oi=fnd&pg=PP13&dq=history&ots=bmwNpWy5wC&sig=ZAYTsDjzQv_rbKFEH0ImKqelNL0', 'paper': 'https://pdfcoffee.com/download/the-history-of-science-and-religion-in-the-western-tradition-an-encyclopedia-pdf-free.html#page=550', 'citations': 'Cited by 2339'}, {'title': 'A short history of linguistics', 'authors': 'RH Robins - 2013 - taylorfrancis.com', 'source': 'https://www.taylorfrancis.com/books/mono/10.4324/9781315843186/short-history-linguistics-robins', 'paper': None, 'citations': 'Cited by 2179'}, {'title': 'A history of Africa', 'authors': 'J Fage - 2013 - books.google.com', 'source': 'https://books.google.com/books?hl=en&lr=lang_en&id=mXa4AQAAQBAJ&oi=fnd&pg=PP1&dq=history&ots=cX6QQ_t9lf&sig=zDXiXFUlYF

https://scholar.google.com/scholar?hl=en&as_sdt=0,5&q=history&as_ylo=None&as_yhi=None&lr=lang_en&start=270 results is: [{'title': 'Vietnam: A history', 'authors': 'S Karnow - 1994 - Random House', 'source': None, 'paper': None, 'citations': 'Cited by 2572'}, {'title': 'The history of management: a global perspective', 'authors': 'W Pindur, SE Rogers, PS Kim\xa0- Journal of management history, 1995 - emerald.com', 'source': 'https://www.emerald.com/insight/content/doi/10.1108/13552529510082831/full/html', 'paper': 'https://e-tarjome.com/storage/btn_uploaded/2019-04-16/1555409051_9297e-tarjome-English.pdf', 'citations': 'Cited by 193'}, {'title': 'America: A narrative history', 'authors': 'GB Tindall, DE Shi - 2016 - books.google.com', 'source': 'https://books.google.com/books?hl=en&lr=lang_en&id=QvZ3DAAAQBAJ&oi=fnd&pg=PR11&dq=history&ots=hqlTJFuHY4&sig=pQOrD4J8OjZW6zEDF7o1_Kz3dXQ', 'paper': 'https://books.google.com/books?hl=en&lr=lang_en&id=QvZ3DAAAQBAJ&oi=fnd&pg=PR11&dq=history&ots=hq

https://scholar.google.com/scholar?hl=en&as_sdt=0,5&q=history&as_ylo=None&as_yhi=None&lr=lang_en&start=300 results is: [{'title': 'History and memory', 'authors': 'J Le Goff - 1992 - Columbia University Press', 'source': None, 'paper': None, 'citations': 'Cited by 1749'}, {'title': 'Women with polycystic ovary syndrome wedge resected in 1956 to 1965: a long-term follow-up focusing on natural history and circulating hormones', 'authors': 'E Dahlgren, S Johansson, G Lindstedt, F Knutsson…\xa0- Fertility and sterility, 1992 - Elsevier', 'source': 'https://www.sciencedirect.com/science/article/pii/S0015028216548924', 'paper': None, 'citations': 'Cited by 793'}, {'title': 'Truth in history', 'authors': 'O Handlin - 2018 - taylorfrancis.com', 'source': 'https://www.taylorfrancis.com/books/mono/10.4324/9781351301046/truth-history-oscar-handlin', 'paper': None, 'citations': 'Cited by 254'}, {'title': 'The natural history of Selborne', 'authors': 'G White - 1890 - books.google.com', 'source': '

https://scholar.google.com/scholar?hl=en&as_sdt=0,5&q=history&as_ylo=None&as_yhi=None&lr=lang_en&start=330 results is: [{'title': 'California: A history', 'authors': 'K Starr - 2007 - Modern Library', 'source': None, 'paper': None, 'citations': 'Cited by 282'}, {'title': 'The gateway to history', 'authors': 'A Nevins - 2018 - taylorfrancis.com', 'source': 'https://www.taylorfrancis.com/books/mono/10.4324/9781315640860/gateway-history-allan-nevins', 'paper': None, 'citations': 'Cited by 392'}, {'title': 'The history of money', 'authors': 'J Weatherford - 2009 - books.google.com', 'source': 'https://books.google.com/books?hl=en&lr=lang_en&id=v8oxCs1eWgsC&oi=fnd&pg=PR11&dq=history&ots=sHp5f_QQT3&sig=aZU1gu-c_AHlZhKn8y9LGqvGjuI', 'paper': None, 'citations': 'Cited by 506'}, {'title': 'A social history of American technology', 'authors': 'RS Cowan\xa0- OUP Catalogue, 1997 - ideas.repec.org', 'source': 'https://ideas.repec.org/b/oxp/obooks/9780195046052.html', 'paper': None, 'citations': 'Ci

https://scholar.google.com/scholar?hl=en&as_sdt=0,5&q=history&as_ylo=None&as_yhi=None&lr=lang_en&start=360 results is: [{'title': 'The end of social history?', 'authors': 'P Joyce\xa0- Social History, 1995 - Taylor & Francis', 'source': 'https://www.tandfonline.com/doi/pdf/10.1080/03071029508567926', 'paper': None, 'citations': 'Cited by 269'}, {'title': 'History of the Jews', 'authors': 'P Johnson - 2013 - books.google.com', 'source': 'https://books.google.com/books?hl=en&lr=lang_en&id=gbUzAAAAQBAJ&oi=fnd&pg=PT11&dq=history&ots=NzOcZ9_DnX&sig=Z6YFR8c3TQrMreQNePhSlPcvBXI', 'paper': 'http://www.exodusbooks.com/samples/Harper/7427Sample.pdf', 'citations': 'Cited by 1031'}, {'title': 'A history of architectural conservation', 'authors': 'J Jokilehto - 2017 - taylorfrancis.com', 'source': 'https://www.taylorfrancis.com/books/mono/10.4324/9781315636931/history-architectural-conservation-jukka-jokilehto', 'paper': None, 'citations': 'Cited by 1541'}, {'title': 'The history of mankind', 'auth

[{'title': 'On history',
  'authors': 'F Braudel - 1982 - books.google.com',
  'source': 'https://books.google.com/books?hl=en&lr=lang_en&id=Aus0BEQZFXIC&oi=fnd&pg=PP13&dq=history&ots=SZHBC6NsYy&sig=YgiDUtpHOnzU6C6_5vtXtUMyJe0',
  'paper': None,
  'citations': 'Cited by 2089'},
 {'title': 'What is history?',
  'authors': 'EH Carr - 2018 - books.google.com',
  'source': 'https://books.google.com/books?hl=en&lr=lang_en&id=w8hcDwAAQBAJ&oi=fnd&pg=PT6&dq=history&ots=2mD-7SbLE5&sig=EUEUy4CYnYsyTsS-L6fsabuCrjc',
  'paper': 'http://www.math.chalmers.se/~ulfp/Review/history.pdf',
  'citations': 'Cited by 6190'},
 {'title': 'A history of hypnotism.',
  'authors': 'A Gauld - 1992 - psycnet.apa.org',
  'source': 'https://psycnet.apa.org/record/1993-97236-000',
  'paper': None,
  'citations': 'Cited by 776'},
 {'title': 'Choreographing history',
  'authors': 'SL Foster - 1995 - books.google.com',
  'source': 'https://books.google.com/books?hl=en&lr=lang_en&id=TAL61XwNf_8C&oi=fnd&pg=PP11&dq=history&

Convert the results to a Pandas DataFrame:

In [75]:
df = pd.DataFrame(articles)
df['citations'] = df['citations'].str.replace('Cited by ', '')
df['citations'] = pd.to_numeric(df['citations'], errors='coerce').fillna('n/a').astype('int64')
df.fillna('n/a', inplace=True)
df.head(10)

Unnamed: 0,title,authors,source,paper,citations
0,On history,F Braudel - 1982 - books.google.com,https://books.google.com/books?hl=en&lr=lang_e...,,2089
1,What is history?,EH Carr - 2018 - books.google.com,https://books.google.com/books?hl=en&lr=lang_e...,http://www.math.chalmers.se/~ulfp/Review/histo...,6190
2,A history of hypnotism.,A Gauld - 1992 - psycnet.apa.org,https://psycnet.apa.org/record/1993-97236-000,,776
3,Choreographing history,SL Foster - 1995 - books.google.com,https://books.google.com/books?hl=en&lr=lang_e...,,503
4,On history,E Hobsbawm - 2011 - books.google.com,https://books.google.com/books?hl=en&lr=lang_e...,,1248
5,Why history?,K Jenkins - 2005 - taylorfrancis.com,https://www.taylorfrancis.com/books/mono/10.43...,,349
6,History of the Goths,H Wolfram - 1990 - books.google.com,https://books.google.com/books?hl=en&lr=lang_e...,,741
7,History and tropology,"FR Ankersmit - History and Tropology, 1994 - d...",https://www.degruyter.com/document/doi/10.1525...,https://www.degruyter.com/document/doi/10.1525...,557
8,History of shit,D Laporte - 2002 - books.google.com,https://books.google.com/books?hl=en&lr=lang_e...,,446
9,The cunning of history,R Rubenstein - 1978 - libraryofsocialscience.com,https://www.libraryofsocialscience.com/assets/...,https://www.libraryofsocialscience.com/assets/...,435


In [76]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 370 entries, 0 to 369
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      370 non-null    object
 1   authors    370 non-null    object
 2   source     370 non-null    object
 3   paper      370 non-null    object
 4   citations  370 non-null    int64 
dtypes: int64(1), object(4)
memory usage: 14.6+ KB
