In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

In [2]:
# Load the data
df_idgs = pd.read_excel('/Users/rudyhendrawan/miniforge3/datasets/DTPR-UPDATE-3.xlsx')

# Draw a small, reproducible subset for trial runs of the scraper.
# A fixed random_state makes Restart & Run All pick the same rows every time,
# and min() keeps the cell from raising when the sheet has fewer than 3 rows.
df_samples = df_idgs.sample(min(3, len(df_idgs)), random_state=42)
df_samples

Unnamed: 0,NO,NIDN,NAMA,NAMA DOSEN DENGAN GELAR,JAFUNG,SERDOS,scholar_id
4,5,805098701.0,DANDY PRAMANA HOSTIADI,"Dr. DANDY PRAMANA HOSTIADI, S.Kom., M.T",Lektor 300,2016.0,igzQmigAAAAJ
61,62,807098501.0,YOHANES PRIYO ATMOJO,"YOHANES PRIYO ATMOJO, S.Kom., M.Eng",Lektor 300,2017.0,rsgrUx8AAAAJ
3,4,819128701.0,PUTU DESIANA WULANING AYU,"Dr. PUTU DESIANA WULANING AYU, S.T., M.T",Lektor Kepala,2017.0,fFjHpfAAAAAJ


In [None]:
# Function to get publication data
def _node_text(node):
    """Return the text of a parsed HTML node, or '' when the node is missing."""
    return node.text if node is not None else ''


def get_publication_data(scholar_id, delay_times=1.0):
    """Scrape the publication list of one Google Scholar profile.

    Uses the module-level ``browser`` WebDriver, so it must not be called
    concurrently from several threads against the same driver.

    Parameters
    ----------
    scholar_id : str
        Value of the ``user=`` query parameter of the profile URL.
    delay_times : float, default 1.0
        Seconds to sleep after loading the page and after each click on the
        "show more" control, giving the page time to render.

    Returns
    -------
    list of dict
        One dict per publication row with the keys ``scholar_id``, ``title``,
        ``authors``, ``publisher``, ``year``, ``citations``, ``article_url``
        and ``article_id``. Fields that cannot be found are '' (text fields)
        or 'N/A' (``article_url`` / ``article_id``).
    """
    url = f'https://scholar.google.com/citations?user={scholar_id}&hl=en'
    browser.get(url)
    time.sleep(delay_times)

    # Click the "Show more" control until it stops producing new rows.
    # The row-count comparison guarantees termination even if the control
    # stays displayed after the last page has been loaded.
    while True:
        try:
            show_more_button = browser.find_element('id', 'gsc_bpf')
            if not (show_more_button.is_displayed() and show_more_button.is_enabled()):
                break
            rows_before = len(browser.find_elements('class name', 'gsc_a_tr'))
            show_more_button.click()
            time.sleep(delay_times)
            if len(browser.find_elements('class name', 'gsc_a_tr')) <= rows_before:
                break
        except Exception as e:
            # Best effort: keep whatever has been loaded so far, but say why we stopped.
            print(f'Stopped expanding publications for {scholar_id}: {e}')
            break

    html = browser.page_source
    soup = BeautifulSoup(html, 'html.parser')

    publications = []
    for row in soup.find_all('tr', class_='gsc_a_tr'):
        title_elem = row.find('a', class_='gsc_a_at')
        if title_elem is None:
            # A row without a title link is not a publication entry
            # (presumably a placeholder row -- confirm against live markup).
            continue

        # The two gray lines under the title: first authors, then venue/publisher.
        authors_publisher = row.find_all('div', class_='gs_gray')
        authors = authors_publisher[0].text if len(authors_publisher) > 0 else ''
        publisher = authors_publisher[1].text if len(authors_publisher) > 1 else ''

        article_url = title_elem.get('href') or 'N/A'
        # Everything after the marker is kept as the id; 'N/A' when the marker is absent.
        _, marker, article_id = article_url.partition('citation_for_view=')
        if not marker:
            article_id = 'N/A'

        publications.append({
            'scholar_id': scholar_id,
            'title': title_elem.text,
            'authors': authors,
            'publisher': publisher,
            'year': _node_text(row.find('span', class_='gsc_a_hc')),
            'citations': _node_text(row.find('a', class_='gsc_a_ac')),
            'article_url': article_url,
            'article_id': article_id,
        })

    return publications

def fetch_publication_data(scholar_ids):
    """Collect publication rows for several Google Scholar profiles.

    ``get_publication_data`` drives a single shared WebDriver, so the profiles
    are scraped one after another. Running them in a thread pool would let one
    thread navigate the browser while another reads ``page_source``, attaching
    publications to the wrong ``scholar_id``.

    Parameters
    ----------
    scholar_ids : iterable
        Scholar profile ids; missing values (None / NaN) are skipped.

    Returns
    -------
    list of dict
        Concatenated results of ``get_publication_data`` in input order.
        A profile that raises is reported and skipped so that the rows
        already collected are not lost.
    """
    publications = []
    for scholar_id in scholar_ids:
        if pd.isna(scholar_id):
            print('Skipping missing scholar_id')
            continue
        try:
            publications.extend(get_publication_data(scholar_id))
        except Exception as e:
            print(f'Error fetching publications for {scholar_id}: {e}')
    return publications

# Chrome flags shared by every WebDriver session started in this notebook.
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')
options.add_argument('--no-sandbox')

# Initialize the WebDriver
browser = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)

# Scrape the sampled profiles; rows without a scholar_id cannot be looked up.
scholar_ids = df_samples['scholar_id'].dropna().tolist()
try:
    publications = fetch_publication_data(scholar_ids)
finally:
    # Close the browser even when scraping fails, so no headless Chrome process is left behind.
    browser.quit()

# Persist the scraped rows
df_publications = pd.DataFrame(publications)
df_publications.to_csv('/Users/rudyhendrawan/miniforge3/datasets/publications.csv', index=False)

In [None]:
def get_citation_data(article_id, article_url, delay_times=2.0):
	"""Scrape the citations-per-year chart of one Google Scholar article page.

	Uses the module-level ``browser`` WebDriver.

	Parameters
	----------
	article_id : str
		Identifier of the article; only used in log messages.
	article_url : str
		Site-relative URL of the article page (appended to
		``https://scholar.google.com``). Non-string, empty or 'N/A' values
		are skipped without a request.
	delay_times : float, default 2.0
		Seconds to sleep after navigation so the page can load.

	Returns
	-------
	dict
		Maps year (str) to citation count (int). Empty when the URL is
		unusable, the chart is missing, or any error occurs; errors are
		printed, never raised.
	"""
	# 'N/A' is the placeholder stored for publications scraped without a link.
	if not isinstance(article_url, str) or not article_url or article_url == 'N/A':
		print(f'Skipping article {article_id}: no usable URL ({article_url!r})')
		return {}

	url = f'https://scholar.google.com{article_url}'
	try:
		browser.get(url)
		time.sleep(delay_times)  # Wait for the page to load
		soup = BeautifulSoup(browser.page_source, 'html.parser')

		citations_per_year = {}

		# Find the graph wrapper
		graph_bars = soup.find('div', id='gsc_oci_graph_bars')
		if not graph_bars:
			print(f"No graph bars found for {url}")
			return citations_per_year

		# Year labels and citation bars are separate element lists
		years = graph_bars.find_all('span', class_='gsc_oci_g_t')
		citation_bars = graph_bars.find_all('a', class_='gsc_oci_g_a')

		if len(years) != len(citation_bars):
			# Pairing by position is only trustworthy when both lists line up.
			print(f'{len(years)} year labels but {len(citation_bars)} bars for {url}; '
				'taking years from bar links where possible')

		for position, bar in enumerate(citation_bars):
			count_elem = bar.find('span', class_='gsc_oci_g_al')
			count_text = count_elem.text.strip() if count_elem else ''
			if not count_text.isdigit():
				# Skip a malformed bar instead of discarding the whole article.
				continue

			# NOTE(review): assumes each bar links to a search restricted to its
			# own year via `as_ylo=<year>` -- confirm against the live markup.
			href = bar.get('href') or ''
			_, marker, tail = href.partition('as_ylo=')
			if marker and tail[:4].isdigit():
				year_value = tail[:4]
			elif position < len(years):
				# Fallback: positional pairing with the year labels.
				year_value = years[position].text.strip()
			else:
				continue

			citations_per_year[year_value] = int(count_text)

		return citations_per_year
	except Exception as e:
		print(f'Error accessing {url}: {e}')
		return {}

browser = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)

# One record per (article, year); the DataFrame is built once at the end
# instead of being re-concatenated on every iteration.
citation_records = []

try:
	# Process each article scraped into df_publications
	for article_id, article_url in zip(df_publications['article_id'], df_publications['article_url']):
		print(f"Accessing article ID {article_id} with URL {article_url}")

		citations_per_year = get_citation_data(article_id, article_url)

		for year, citations in citations_per_year.items():
			citation_records.append({'article_id': article_id, 'year': year, 'citations': citations})
finally:
	# Close the browser even if the loop is interrupted or raises
	browser.quit()

# DataFrame to store citation information
citation_infos_df = pd.DataFrame(citation_records, columns=['article_id', 'year', 'citations'])

# Display the DataFrame
citation_infos_df.head()

In [None]:
# Peek at the last five per-year citation rows to confirm the scrape ran to the end.
citation_infos_df.tail(n=5)

In [None]:
# Merge the per-year citation counts with the publication metadata on article_id.
# Both frames carry `year` and `citations` columns, so pandas suffixes them:
# `*_x` comes from citation_infos_df (citing year / count in that year),
# `*_y` from df_publications (the values scraped from the profile row).
merged_df = pd.merge(citation_infos_df, df_publications, on='article_id', how='inner')

# Attach the lecturer attributes from df_samples via scholar_id
final_df = pd.merge(merged_df, df_samples, on='scholar_id', how='inner')

# Display the final DataFrame
final_df.head()