In [None]:
# import libraries
import time
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from urllib.parse import urlparse, parse_qs
import pandas as pd
import numpy as np
import random
import sqlite3
import time

In [None]:
# Chrome configuration shared by every scraping session in this notebook.
chrome_options = webdriver.ChromeOptions()

# Command-line switches, applied in the order listed.
_chrome_flags = (
    # -- general --
    '--no-sandbox',                                     # required for certain environments
    '--disable-dev-shm-usage',                          # address shared memory issue
    "--disable-extensions",                             # no extensions, to look like a normal browser
    "--disable-blink-features=AutomationControlled",    # bypass automation detection
    # -- mimic a regular user --
    "--start-maximized",                                # start the browser maximized
    '--disable-infobars',                               # hide the "controlled by automation" bar
    "--window-size=1920,1080",                          # set window size
    "--enable-javascript",                              # ensure JavaScript is enabled
    "--incognito",                                      # use incognito mode
    # -- user-agent override --
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    # -- optional headless mode (comment these two out to see the browser UI) --
    '--headless',                                       # run without GUI
    '--disable-gpu',                                    # necessary in headless mode for some systems
    # -- language and proxy handling --
    "--lang=en-US",                                     # set browser language
    '--proxy-server="direct://"',                       # disable proxy
    '--proxy-bypass-list=*',                            # bypass all proxies
)
for _flag in _chrome_flags:
    chrome_options.add_argument(_flag)

# Experimental options that help prevent detection as an automated browser.
_chrome_experimental = {
    "excludeSwitches": ["enable-automation"],
    "useAutomationExtension": False,
}
for _name, _value in _chrome_experimental.items():
    chrome_options.add_experimental_option(_name, _value)


In [None]:
# Load the authors to scrape and the ids that already have publications stored.
# NOTE(review): this cell opens the database through an absolute path, while
# save_author_metrics_and_publications() defaults to "dataset.sqlite3" relative to
# the working directory — confirm both point at the same file.
con = sqlite3.connect("/home/somir/Desktop/CitationDataset/dataset.sqlite3")
try:
    picked_authors = pd.read_sql_query("SELECT scholar_id,profile_link FROM authors", con)
    # Alias the column explicitly: SQLite leaves result-column names unspecified
    # without AS, and the scraping loop looks this frame up by 'scholar_id'.
    already_collected = pd.read_sql_query(
        "SELECT DISTINCT scholar_id AS scholar_id FROM publications", con
    )
finally:
    # Release the connection even when one of the queries fails.
    con.close()

In [None]:
# Report how many scholars are already stored, then show a bounded rich preview
# instead of printing the whole frame as plain text.
print(f"{len(already_collected)} scholar ids already have publications stored")
already_collected.head()

In [None]:
# Bounded preview of the authors table (scholar_id, profile_link) rather than
# rendering the entire frame.
picked_authors.head()

In [None]:
def save_author_metrics_and_publications(scholar_id, info, db_path="dataset.sqlite3"):
    """Store one author's bibliometrics and publication URLs in a single transaction.

    The author's row in ``authors`` is updated and one row per entry of
    ``info['works']`` is inserted into ``publications``. If anything fails the
    whole transaction is rolled back, the error is printed, and nothing is raised.

    Args:
        scholar_id: Identifier of the author; used in the UPDATE's WHERE clause
            and stored with every inserted publication.
        info: Mapping with the keys ``h_index``, ``i10_index``,
            ``total_publications``, ``total_citations`` and ``works``
            (an iterable of publication URLs).
        db_path: SQLite database file to write to. Defaults to
            ``"dataset.sqlite3"`` in the current working directory.

    Returns:
        True when the transaction was committed, False when it was rolled back.
    """
    con = sqlite3.connect(db_path)
    try:
        con.execute('BEGIN TRANSACTION')
        con.execute('''UPDATE authors 
                      SET h_index = ?, i10_index = ?, total_publications = ?, total_citations = ?, bibliometrics_added_at = ?
                      WHERE scholar_id = ?''', (info['h_index'], info['i10_index'], info['total_publications'], info['total_citations'], time.time(), scholar_id))

        # Each publication row gets its own insertion timestamp.
        con.executemany("""
        INSERT INTO
            publications
            (publication_url, scholar_id, pub_added_at)
        VALUES
            (?,?,?)""", ((work, scholar_id, time.time()) for work in info['works']))
        con.commit()
        return True
    except Exception as error:
        # Best-effort by design: undo the partial write and report, do not raise.
        con.rollback()
        print(str(error))
        print("An exception occurred:", type(error).__name__)
        return False
    finally:
        # Always release the connection, including when rollback itself fails.
        con.close()

In [None]:
def _expand_publication_list(driver):
    """Click the 'gsc_bpf_more' button until its ``disabled`` attribute is 'true'."""
    # NOTE(review): the button is re-read immediately after each click with no
    # wait in between — confirm the page updates ``disabled`` quickly enough,
    # otherwise a short pause or an explicit wait may be needed here.
    more_button = driver.find_element(By.ID, 'gsc_bpf_more')
    while more_button.get_attribute("disabled") != 'true':
        more_button.click()
        more_button = driver.find_element(By.ID, 'gsc_bpf_more')


def _read_bibliometrics(driver):
    """Return (citations, h_index, i10_index) read from rows 1-3 of '#gsc_rsb_st'.

    If any of the three cells cannot be read, all three values fall back to 0.
    """
    cell_selector = "#gsc_rsb_st tbody tr:nth-child({}) td:nth-child(2)"
    try:
        citation, h_index, i10_index = (
            driver.find_element(By.CSS_SELECTOR, cell_selector.format(row)).text
            for row in (1, 2, 3)
        )
    except Exception as e:
        print("An exception occurred after retries:", str(e))
        citation = h_index = i10_index = 0
    return citation, h_index, i10_index


def _collect_publication_links(driver):
    """Return the href of the 'gsc_a_at' link inside every 'gsc_a_tr' row."""
    rows = driver.find_elements(By.CLASS_NAME, "gsc_a_tr")
    return [
        row.find_element(By.CLASS_NAME, "gsc_a_at").get_attribute("href")
        for row in rows
    ]


def get_scholar_info(url): ## this method collects scholar's bibliometrics and all publication's urls
    """Scrape a scholar profile for its bibliometrics and publication URLs.

    Opens a new Chrome session configured with the notebook-level
    ``chrome_options``, loads ``url`` with the works-list query parameters
    appended, expands the publication list, and reads the metrics table and
    the publication links. The browser is always closed before returning.

    Args:
        url: Profile link; ``"&pagesize=100&view_op=list_works"`` is appended
            to it, so it is expected to already carry a query string.

    Returns:
        dict with the keys ``url``, ``total_citations``, ``h_index``,
        ``works`` (list of publication hrefs), ``total_publications`` (number
        of publication rows found) and ``i10_index``. The three metrics are
        the page text of their cells, or 0 when the metrics table could not be
        read. On any other failure the dict additionally has ``error`` set to
        1, ``works`` is empty and the remaining values are None.
    """
    try:
        driver = webdriver.Chrome(options=chrome_options)
        try:
            driver.get(url+"&pagesize=100&view_op=list_works")
            print(driver.current_url)

            # Keep clicking "show more" until all the publications are displayed.
            _expand_publication_list(driver)

            citation, h_index, i10_index = _read_bibliometrics(driver)
            works = _collect_publication_links(driver)
        finally:
            # Close the browser on failure too; otherwise every author that
            # errors out leaves a Chrome process behind.
            driver.quit()

        return {
            'url': url,
            'total_citations': citation,
            'h_index': h_index,
            'works': works,
            'total_publications': len(works),
            'i10_index': i10_index
        }
    except Exception as e:
        print("An exception occurred after retries:", str(e))
        return {
            'error':1,
            'url':url,
            'total_citations': None,
            'h_index': None,
            'works': [],
            'total_publications': None,
            'i10_index': None
        }






In [None]:
from tqdm import tqdm

# Scrape every author that has no publications stored yet; authors whose
# profile could not be scraped are collected and written to a CSV afterwards.
error_list = []

# Build the lookup once so each author is checked with a constant-time test
# instead of rescanning the column on every iteration.
collected_ids = set(already_collected['scholar_id'])

# Wrapping the iterable lets tqdm advance on every author, including the ones
# skipped or failed via `continue`.
author_rows = zip(picked_authors['scholar_id'], picked_authors['profile_link'])
for scholar_id, profile_link in tqdm(author_rows, total=len(picked_authors)):
    if scholar_id in collected_ids:
        continue
    info = get_scholar_info(profile_link)
    if info.get('error')==1:
        error_list.append(scholar_id)
        print(info)
        continue
    save_author_metrics_and_publications(scholar_id, info)

error_df = pd.DataFrame(error_list, columns=['scholar_id'])

# Save the DataFrame to a CSV file
error_df.to_csv('2nd_attempt_error_scholar_list.csv', index=False)

print('CSV file saved successfully.')