# Intro

ref: https://dev.to/dmitryzub/scrape-google-scholar-with-python-32oh 
$ pip install requests
$ pip install lxml 
$ pip install beautifulsoup4
$ pip install google-search-results 

# Import Libraries

In [None]:
from bs4 import BeautifulSoup
import requests, lxml, os, json

# Headers & Parameters

In [None]:
# HTTP headers sent with the request; the User-agent value is a desktop
# browser identification string.
headers = {
    'User-agent':
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582",
}

# Values that requests encodes into the URL query string.
params = {
    "q": "reinforcement learning",
    "hl": "en",
}

# requests has no default timeout: without one, a server that never answers
# would block this cell forever. 30 seconds is applied to connect and read.
html = requests.get(
    'https://scholar.google.com/scholar?start=0',
    headers=headers,
    params=params,
    timeout=30,
).text

# Parse the returned markup with the lxml parser; `soup` is reused by the
# scraping cells further down.
soup = BeautifulSoup(html, 'lxml')

# Print the text of every '.gs_ab_mdw' element (in the recorded run this is
# the "Articles/Scholar" header and the "About N results" line).
for result in soup.select('.gs_ab_mdw'):
    print(result.text)

ArticlesScholar
About 3,160,000 results (0.04 sec)


# Web Scraping

## Scrape just PDF links

In [None]:
# Print the href of every anchor nested in a '.gs_or_ggsm' element, one URL
# per line (these are the PDF links shown next to the search results).
for anchor in soup.select('.gs_or_ggsm a'):
    print(anchor['href'])

https://www.academia.edu/download/54674740/Reinforcement_Learning.pdf
https://www.jair.org/index.php/jair/article/download/10166/24110/
http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.308.549&rep=rep1&type=pdf
https://www-anw.cs.umass.edu/~barto/courses/cs687/Chapter%201-printable.pdf
https://www.academia.edu/download/6985553/ivry_rev.pdf
https://arxiv.org/pdf/1312.5602.pdf?source=post_page---------------------------
https://arxiv.org/pdf/1811.12560.pdf%3C/p%3E
https://ojs.aaai.org/index.php/AAAI/article/download/11694/11553


In [None]:
# Prefix that turns the site-relative hrefs found on the page into full URLs.
_SCHOLAR_BASE_URL = 'https://scholar.google.com'


def _text_or_none(node, selector):
    """Return the text of the first element under *node* matching *selector*.

    Returns None when nothing matches, so a result that lacks the element
    does not abort the whole scrape.
    """
    match = node.select_one(selector)
    return match.text if match is not None else None


def _href_or_none(node, selector, prefix=''):
    """Return *prefix* + the href of the first element matching *selector*.

    Returns None when nothing matches or the element has no href attribute,
    instead of producing a string such as 'https://scholar.google.comNone'.
    """
    match = node.select_one(selector)
    if match is None or not match.has_attr('href'):
        return None
    return f'{prefix}{match["href"]}'


# JSON data will be collected here
data = []

# Container where all needed data is located
for result in soup.select('.gs_ri'):
    data.append({
        'title': _text_or_none(result, '.gs_rt'),
        # The title link is stored as found on the page (no prefix added).
        'title_link': _href_or_none(result, '.gs_rt a'),
        'publication_info': _text_or_none(result, '.gs_a'),
        'snippet': _text_or_none(result, '.gs_rs'),
        # The footer links are picked by a fragment of their href. The
        # attribute values are quoted because they contain '/', '?', '=' and
        # ':' which are not valid in an unquoted CSS attribute value.
        'cited_by': _href_or_none(
            result, 'a[href*="/scholar?cites="]', _SCHOLAR_BASE_URL),
        'related_articles': _href_or_none(
            result, 'a[href*="/scholar?q=related:"]', _SCHOLAR_BASE_URL),
        'all_article_versions': _href_or_none(
            result, 'a[href*="/scholar?cluster="]', _SCHOLAR_BASE_URL),
    })

# Missing fields are emitted as JSON null.
print(json.dumps(data, indent=2, ensure_ascii=False))

# Part of the JSON Output:
'''
[
  {
    "title": "“What? I thought Samsung was Japanese”: accurate or not, perceived country of origin matters",
    "title_link": "https://www.emerald.com/insight/content/doi/10.1108/02651331111167589/full/html",
    "publication_info": "P Magnusson, SA Westjohn… - International Marketing …, 2011 - emerald.com",
    "snippet": "Purpose–Extensive research has shown that country‐of‐origin (COO) information significantly affects product evaluations and buying behavior. Yet recently, a competing perspective has emerged suggesting that COO effects have been inflated in prior research …",
    "cited_by": "https://scholar.google.com/scholar?cites=341074171610121811&as_sdt=2005&sciodt=0,5&hl=en",
    "related_articles": "https://scholar.google.com/scholar?q=related:U8bh6Ca9uwQJ:scholar.google.com/&scioq=samsung&hl=en&as_sdt=0,5",
    "all_article_versions": "https://scholar.google.com/scholar?cluster=341074171610121811&hl=en&as_sdt=0,5"
  }
]
'''

# Part of PDF Links Output:
'''
https://www.researchgate.net/profile/Peter_Magnusson/publication/232614407_What_I_thought_Samsung_was_Japanese_Accurate_or_not_perceived_country_of_origin_matters/links/09e4150881184a6ad2000000/What-I-thought-Samsung-was-Japanese-Accurate-or-not-perceived-country-of-origin-matters.pdf
https://www.researchgate.net/profile/Hong_Mo_Yang/publication/235291000_Supply_chain_management_six_sigma_A_management_innovation_methodology_at_the_Samsung_Group/links/56e03d0708aec4b3333d0445.pdf
https://www.academia.edu/download/54053930/The_Strategic_Localization_of_Transnatio20170803-32468-4ntcqr.pdf
https://mathsci2.appstate.edu/~wmcb/Class/5340/ClassNotes141/EdelmanAwards/Interfaces2002-S.pdf
'''

[
  {
    "title": "“What? I thought Samsung was Japanese”: accurate or not, perceived country of origin matters",
    "title_link": "https://www.emerald.com/insight/content/doi/10.1108/02651331111167589/full/html",
    "publication_info": "P Magnusson, SA Westjohn… - International Marketing …, 2011 - emerald.com",
    "snippet": "… toward Samsung. When the consumer learns that Samsung is actually a South Korean \nbrand, the consumer’s attitude toward Samsung is … That is, a less favorable image of South \nKorea, in comparison with Japan, leads to a less favorable attitude toward Samsung, whereas a …",
    "cited_by": "https://scholar.google.comCited by 345",
    "related_articles": "https://scholar.google.comNone",
    "all_article_versions": "https://scholar.google.comNone"
  },
  {
    "title": "[BOOK][B] Sony vs Samsung: The Inside Story of the Electronics Giants' Battle For Global Supremacy",
    "title_link": "https://books.google.com/books?hl=en&lr=&id=blWb6tslZb8C&oi=fnd&pg=PT9

'\nhttps://www.researchgate.net/profile/Peter_Magnusson/publication/232614407_What_I_thought_Samsung_was_Japanese_Accurate_or_not_perceived_country_of_origin_matters/links/09e4150881184a6ad2000000/What-I-thought-Samsung-was-Japanese-Accurate-or-not-perceived-country-of-origin-matters.pdf\nhttps://www.researchgate.net/profile/Hong_Mo_Yang/publication/235291000_Supply_chain_management_six_sigma_A_management_innovation_methodology_at_the_Samsung_Group/links/56e03d0708aec4b3333d0445.pdf\nhttps://www.academia.edu/download/54053930/The_Strategic_Localization_of_Transnatio20170803-32468-4ntcqr.pdf\nhttps://mathsci2.appstate.edu/~wmcb/Class/5340/ClassNotes141/EdelmanAwards/Interfaces2002-S.pdf\n'