# Google Scholar web scraper

Google Scholar is a freely accessible web search engine that indexes the full text or metadata of scholarly literature across an array of publishing formats and disciplines.

This notebook contains **five** key elements:

1. Importing the libraries and initializing a Chrome webdriver.

2. Using Selenium to click the "show more" button until every publication is listed.

3. Creating a beautiful soup object

4. Performing the parsing.

5. Creating a pandas DataFrame (detailed below) for each researcher.



In [None]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.remote.webelement import WebElement
import selenium.webdriver.support.ui as ui
import selenium.webdriver.support.expected_conditions as EC
import time
import os
import pandas as pd
import random
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup

In [None]:
# Absolute XPaths used while scraping a Google Scholar profile page.
# Close ("x") control of the per-publication 'more details' view.
x_button_path = '/html/body/div/div[8]/div/div[1]/a'
# Button that loads further publications into the table.
show_more_button_path = '/html/body/div/div[13]/div[2]/div/div[4]/form/div[2]/div/button'
# Link inside the third header cell of the publication table
# (the year column, judging by the variable name).
year_button_path = '/html/body/div/div[13]/div[2]/div/div[4]/form/div[1]/table/thead/tr[2]/th[3]/span/a'

# Start a Chrome session and pause briefly before it is first used.
driver = webdriver.Chrome()
time.sleep(2)

## The resulting dataframe will have these columns:

| Column name | description |
|---:|:---|
| Research name | The researcher's name.|
| Publication Title | The title of a paper. |
| Author List | The authors of a paper. |
| Conf/Journal Details  | Details about a publication. |
| Citation count | The citation count garnered by a particular paper. |
| Year  | Year of a paper. |

In [None]:
# Load the researcher roster. The scraping loop later in the notebook reads the
# 'Researcher' column for the name and takes the Google Scholar profile link
# from the second column of this table.
researchers = pd.read_csv("../data/SOC_Researchers.csv")

In [None]:
def add_capitals(l):
    """Return the author name `l` in title case.

    Normalising the capitalisation cuts down the number of aliases that
    have to be listed for each author. A name written entirely in upper
    case is lower-cased first and then title-cased; any other name is
    title-cased directly.
    """
    base = l.lower() if l.isupper() else l
    return base.title()
# Read the alias -> canonical author table and trim surrounding whitespace
# from both columns before building the lookup dictionary.
alias_authors = pd.read_excel("../data/Neo4j/alias_author_excel.xlsx").assign(
    Alias=lambda frame: frame["Alias"].apply(lambda alias: alias.strip()),
    Author=lambda frame: frame["Author"].apply(lambda author: author.strip()),
)
# Maps every known alias to the canonical author name.
alias_dict = dict(zip(alias_authors["Alias"], alias_authors["Author"]))

In [None]:
# --- Settings for the scraping loop -------------------------------------------
# Only publications whose year is strictly greater than this are written out.
MIN_PUBLICATION_YEAR = 2008
# Folder that receives one CSV per researcher.
OUTPUT_DIR = "../data/Google Scholar Publications"
# Column order of the per-researcher CSV.
OUTPUT_COLUMNS = ["Research name", "Publication Title", "Author List",
                  "Conf/Journal Details", "Citation count", "Year"]
# XPath of the title link in the i-th (1-based) row of the publication table;
# clicking it opens the 'more details' view for that publication.
MORE_DETAILS_PATH_TEMPLATE = "/html/body/div/div[13]/div[2]/div/div[4]/form/div[1]/table/tbody/tr[{}]/td[1]/a"


def _clean_text(tag):
    """Return the tag's text with non-breaking spaces replaced and edges stripped."""
    return tag.get_text().replace(u'\xa0', u' ').strip()


def _authors_from_details(details_soup):
    """Return the "Authors"/"Inventors" value of the details view, or None if absent.

    If several matching fields are present the last one wins.
    """
    authors = None
    for field_row in details_soup.find_all("div", class_="gs_scl"):
        label = field_row.find('div', class_="gsc_vcd_field")
        value = field_row.find('div', class_="gsc_vcd_value")
        if label is None or value is None:
            continue
        if label.get_text() in ("Authors", "Inventors"):
            authors = _clean_text(value)
    return authors


def _parse_publication_row(row):
    """Extract (title, year, journal, citations) from one `tr.gsc_a_tr` row.

    Missing pieces are reported as None (title, year, journal) or 0
    (citations) instead of raising, so one odd row cannot abort a scrape.
    """
    title_tag = row.find("a", class_="gsc_a_at")
    title = _clean_text(title_tag) if title_tag is not None else None

    # The year cell may be missing, empty or non-numeric.
    year_tag = row.find("span", class_="gsc_a_h gsc_a_hc gs_ibl")
    year_text = _clean_text(year_tag) if year_tag is not None else ""
    try:
        year = int(year_text)
    except ValueError:
        year = None

    # The journal details are read from the second 'gs_gray' div of the row.
    gray_divs = row.find_all('div', class_='gs_gray')
    if len(gray_divs) > 1:
        journal_text = _clean_text(gray_divs[1])
        if year is not None:
            # Drop the last comma-separated segment (the trailing year). A bs4
            # Tag is always truthy, so this must test the parsed year and not
            # the tag. The remaining segments are joined without a separator,
            # as in the original output.
            journal = "".join(journal_text.split(",")[:-1])
        else:
            journal = journal_text
    else:
        journal = None

    # The citation link carries an extra class when the count is struck through.
    citation_tag = row.find('a', class_="gsc_a_ac gs_ibl")
    if citation_tag is None:
        citation_tag = row.find('a', class_="gsc_a_ac gs_ibl gsc_a_acm")
    citation_text = citation_tag.get_text().strip() if citation_tag is not None else ""
    citations = citation_text if citation_text else 0

    return title, year, journal, citations


def _canonical_author_list(authors):
    """Normalise every author in a ", "-separated string and map aliases to canonical names."""
    canonical_names = []
    for author in authors.split(", "):
        # Unify the apostrophe variant before the alias lookup.
        author = add_capitals(author.replace("'", "’").strip()).strip()
        canonical_names.append(alias_dict.get(author, author))
    return ", ".join(canonical_names)


# --- Scrape every researcher that has a Google Scholar link -------------------
# Names and links are paired by position, so a non-default index on
# `researchers` cannot pair a name with the wrong link.
for raw_name, google_scholar_link in zip(researchers['Researcher'], researchers.iloc[:, 1]):
    if pd.isna(google_scholar_link):
        continue  # no profile to scrape for this researcher
    name = raw_name.strip()

    driver.get(google_scholar_link)
    time.sleep(5 + random.uniform(0, 20))  # randomised pause between profiles

    # Keep clicking "show more" until the button reports itself as disabled.
    show_more_button = driver.find_element(By.XPATH, show_more_button_path)
    time.sleep(2)
    while show_more_button.get_attribute('disabled') is None:
        WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, show_more_button_path))).click()
        time.sleep(3 + random.uniform(0, 1))
        show_more_button = driver.find_element(By.XPATH, show_more_button_path)

    # Click the year header link, then snapshot the fully expanded table.
    WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, year_button_path))).click()
    time.sleep(2)
    soup = BeautifulSoup(driver.page_source, "lxml")

    records = []
    for row_number, instance in enumerate(soup.find_all('tr', class_='gsc_a_tr'), start=1):
        # The row itself only shows a shortened author list; open the details
        # view of this publication to read the "Authors"/"Inventors" field.
        more_details_button = MORE_DETAILS_PATH_TEMPLATE.format(row_number)
        WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, more_details_button))).click()
        time.sleep(5 + random.uniform(0, 5))
        authors = _authors_from_details(BeautifulSoup(driver.page_source, "lxml"))
        WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, x_button_path))).click()
        time.sleep(1)
        if authors is None:
            authors = name  # fall back to the researcher when no author field is shown

        title, year, journal, citations = _parse_publication_row(instance)
        records.append({
            "Research name": name,
            "Publication Title": title,
            "Author List": authors,
            "Conf/Journal Details": journal,
            "Citation count": citations,
            "Year": year,
        })

    df = pd.DataFrame(records, columns=OUTPUT_COLUMNS)
    # to_numeric returns a new Series, so the result has to be assigned back.
    df["Year"] = pd.to_numeric(df["Year"], errors='coerce')
    df = df.loc[df["Year"] > MIN_PUBLICATION_YEAR].copy()
    # Rows without a year are gone now, so the column can be integer again;
    # this keeps the CSV from showing "2019.0" for some researchers only.
    df["Year"] = df["Year"].astype(int)
    # Replace aliases with canonical author names.
    df["Author List"] = df["Author List"].apply(_canonical_author_list)
    df = df.sort_values(by=['Year'], ascending=False)

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    filename = name.replace(" ", "_")
    df.to_csv("{}/{}.csv".format(OUTPUT_DIR, filename), index=False, header=True)

In [None]:
# Quit the Chrome WebDriver session that was opened near the top of the notebook.
driver.quit()