In [None]:
# import libraries
import time
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from urllib.parse import urlparse, parse_qs
import pandas as pd
import numpy as np
import random
import sqlite3
import time

In [None]:
# Build the single ChromeOptions instance that every webdriver.Chrome(...) call
# in this notebook receives. Switches are applied in exactly the order listed.
chrome_options = webdriver.ChromeOptions()

_switches_first = [
    # General options
    '--no-sandbox',
    '--disable-dev-shm-usage',
    "--disable-extensions",
    "--disable-blink-features=AutomationControlled",
    # Window / session setup intended to resemble an ordinary user's browser
    "--start-maximized",
    '--disable-infobars',
    "--window-size=1920,1080",
    "--enable-javascript",
    "--incognito",
    # Fixed desktop user-agent string
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
]
for _switch in _switches_first:
    chrome_options.add_argument(_switch)

# Experimental options meant to hide the automation markers
for _name, _value in (("excludeSwitches", ["enable-automation"]),
                      ("useAutomationExtension", False)):
    chrome_options.add_experimental_option(_name, _value)

_switches_last = [
    # Headless mode; remove these two entries to see the browser UI
    '--headless',
    '--disable-gpu',
    # Language and proxy handling
    "--lang=en-US",
    '--proxy-server="direct://"',
    '--proxy-bypass-list=*',
]
for _switch in _switches_last:
    chrome_options.add_argument(_switch)


In [None]:
# Load the sub_field column of every stored author row. The scraping functions
# consult `df` to skip sub-fields that already have rows in the table.
# NOTE(review): this opens "dataset.sqlite3" relative to the working directory,
# while save_author_data defaults to an absolute path - confirm both point at
# the same database file.
con = sqlite3.connect("dataset.sqlite3")
try:
    df = pd.read_sql_query("SELECT sub_field FROM authors", con)
finally:
    # Release the handle even when the query fails (e.g. the table is missing).
    con.close()

In [None]:
# Show the sub_field values read from the authors table in the previous cell.
print(df)

In [None]:
def save_author_data(authors, db_path="/home/somir/Desktop/CitationDataset/dataset.sqlite3"):
    """Insert scraped author records into the ``authors`` table.

    Rows are written best-effort: a record whose INSERT fails is reported on
    stdout and skipped, and the remaining records are still attempted. All
    successful inserts are committed together at the end.

    Args:
        authors: Iterable of mappings supplying the named SQL parameters
            ``scholar_id``, ``author_name``, ``profile_link``, ``field``,
            ``sub_field``, ``research_areas``, ``page_number``, ``page_rank``,
            ``page_url`` and ``created_at``.
        db_path: SQLite database file to write to. Defaults to the location
            that was previously hard-coded, so existing callers are unaffected.

    Returns:
        None
    """
    con = sqlite3.connect(db_path)
    try:
        for author in authors:
            try:
                con.execute("""
    INSERT INTO
        authors
        (scholar_id, name, profile_link, field, sub_field,research_areas, page_number, page_rank, page_url,created_at)
    VALUES
        (:scholar_id, :author_name, :profile_link, :field, :sub_field, :research_areas, :page_number, :page_rank, :page_url, :created_at)""", author)
            except Exception as error:
                # Deliberately broad: one bad record must not abort the batch.
                print(str(error))
                print("An exception occurred:", type(error).__name__)
        con.commit()
    finally:
        # Close the connection even if the commit itself raises.
        con.close()

In [None]:
from bs4 import BeautifulSoup

def get_authors(field_label: str, sub_field_label: str, total_pages=200):
    """Collect one randomly chosen author per Google Scholar result page.

    Opens the author search for ``label:<sub_field_label>``, and on each result
    page picks one profile at random, records it, then follows the "Next"
    button. Between pages the browser is closed and a fresh one is started on
    the next page's URL, followed by a 5 second pause. Collected records are
    written with ``save_author_data`` once paging ends.

    Args:
        field_label: Parent field stored with every record.
        sub_field_label: Scholar label to search for; also stored with every
            record.
        total_pages: Maximum number of result pages to visit.

    Returns:
        ``"Already Done"`` if ``sub_field_label`` is already present in the
        module-level ``df``; otherwise ``"Saved <field>-<sub_field>"``.

    Notes:
        A page without any profile entries ends paging (what was collected so
        far is still saved) instead of raising ``ValueError`` from
        ``random.randint``. The browser is always shut down, including when an
        exception propagates.
    """
    if sub_field_label in df['sub_field'].values:
        return "Already Done"

    search_url = 'http://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=label:' + sub_field_label
    author_list = []
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(search_url)
        print(driver.current_url)

        for i in range(0, total_pages):
            elems = driver.find_elements(By.CLASS_NAME, "gsc_1usr")
            if not elems:
                # Nothing to sample from on this page; randint(1, 0) would raise.
                print("No profiles found on page", i + 1)
                break

            # 1-based position of the sampled profile within the page.
            rank = random.randint(1, len(elems))
            elem = elems[rank - 1]
            name = elem.find_element(By.CLASS_NAME, "gs_ai_name")
            profile_link = name.find_element(By.TAG_NAME, "a").get_attribute("href")
            scholar_id = parse_qs(urlparse(profile_link).query)['user'][0]
            page_url = driver.current_url

            # Extract research areas from the profile entry's inner HTML.
            soup = BeautifulSoup(elem.get_attribute('innerHTML'), 'html.parser')
            research_areas = [a.text for a in soup.find_all('a', class_='gs_ai_one_int')]
            research_areas_text = ", ".join(research_areas)

            author_list.append({
                'field': field_label,
                'sub_field': sub_field_label,
                'scholar_id': scholar_id,
                'author_name': name.text,
                'page_number': i + 1,
                'page_url': page_url,
                'page_rank': rank,
                'profile_link': profile_link,
                'research_areas': research_areas_text,
                'created_at': time.time()
            })

            # Follow the "Next" button unless it is disabled (last page).
            next_button = driver.find_element(By.CSS_SELECTOR, '[aria-label="Next"]')
            if next_button.get_attribute("disabled"):
                break
            next_button.click()
            driver.switch_to.window(driver.window_handles[-1])
            next_url = driver.current_url
            # Restart the browser on the next page's URL, as the original flow did.
            driver.quit()
            driver = webdriver.Chrome(options=chrome_options)
            driver.get(next_url)
            time.sleep(5)
    finally:
        # Never leave a Chrome process behind, whether we finished or failed.
        driver.quit()

    save_author_data(author_list)
    return "Saved " + field_label + "-" + sub_field_label

In [None]:
from bs4 import BeautifulSoup
from random import sample 

def get_authors_custom(field_label: str, sub_field_label: str, total_pages=200,selection_number=1):
    """Collect a configurable number of random authors per Google Scholar page.

    Opens the author search for ``label:<sub_field_label>`` and, on each result
    page, samples ``selection_number`` distinct profiles. For sub-fields listed
    in the internal ``expected_value_calculation`` table the count is re-drawn
    for every page via ``random_decision(**params)``. Collected records are
    written with ``save_author_data`` once paging ends.

    Args:
        field_label: Parent field stored with every record.
        sub_field_label: Scholar label to search for; also stored with every
            record.
        total_pages: Maximum number of result pages to visit.
        selection_number: Profiles to sample per page when the sub-field has no
            entry in ``expected_value_calculation``.

    Returns:
        ``"Already Done"`` if ``sub_field_label`` is already present in the
        module-level ``df``; otherwise ``"Saved <field>-<sub_field>"``.

    Notes:
        * A page with fewer profiles than requested is skipped, but paging
          still advances (previously the same page was re-read until
          ``total_pages`` was exhausted).
        * ``page_rank`` is stored 1-based, matching ``get_authors``.
        * The browser is always shut down, including when an exception
          propagates.
    """
    if sub_field_label in df['sub_field'].values:
        return "Already Done"

    # Per-sub-field arguments for random_decision(threshold, param1, param2).
    expected_value_calculation = {
        'economic_sociology': {'threshold': 53, 'param1': 2, 'param2': 3},
        'brand_management': {'threshold': 43, 'param1': 2, 'param2': 3},
        'marketing_strategy': {'threshold': 30, 'param1': 2, 'param2': 3},
        'microelectronics': {'threshold': 83, 'param1': 1, 'param2': 2},
        'microeconomics': {'threshold': 60, 'param1': 1, 'param2': 2},
        'algebra': {'threshold': 67, 'param1': 1, 'param2': 2},
        'number_theory': {'threshold': 40, 'param1': 1, 'param2': 2},
        'combinatorics': {'threshold': 67, 'param1': 1, 'param2': 2}
    }
    params = expected_value_calculation.get(sub_field_label)

    search_url = 'http://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=label:' + sub_field_label
    author_list = []
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(search_url)
        print(driver.current_url)

        for i in range(0, total_pages):
            elems = driver.find_elements(By.CLASS_NAME, "gsc_1usr")
            if not elems:
                # No profile entries at all: there is nothing to sample.
                print("No profiles found on page", i + 1)
                break

            if params is not None:
                selection_number = random_decision(**params)

            if len(elems) < selection_number:
                # Skip sampling on this page, but still move on to the next one.
                print("Not enough profiles on this page to select.")
            else:
                page_url = driver.current_url
                # Distinct random 0-based positions within the page.
                for index in sample(range(len(elems)), selection_number):
                    elem = elems[index]
                    name = elem.find_element(By.CLASS_NAME, "gs_ai_name")
                    profile_link = name.find_element(By.TAG_NAME, "a").get_attribute("href")
                    scholar_id = parse_qs(urlparse(profile_link).query)['user'][0]

                    # Extract research areas from the profile entry's inner HTML.
                    soup = BeautifulSoup(elem.get_attribute('innerHTML'), 'html.parser')
                    research_areas = [a.text for a in soup.find_all('a', class_='gs_ai_one_int')]
                    research_areas_text = ", ".join(research_areas)

                    author_list.append({
                        'field': field_label,
                        'sub_field': sub_field_label,
                        'scholar_id': scholar_id,
                        'author_name': name.text,
                        'page_number': i + 1,
                        'page_url': page_url,
                        # Stored 1-based so values agree with get_authors.
                        'page_rank': index + 1,
                        'profile_link': profile_link,
                        'research_areas': research_areas_text,
                        'created_at': time.time()
                    })

            # Follow the "Next" button unless it is disabled (last page).
            next_button = driver.find_element(By.CSS_SELECTOR, '[aria-label="Next"]')
            if next_button.get_attribute("disabled"):
                break
            next_button.click()
            driver.switch_to.window(driver.window_handles[-1])
    finally:
        # Never leave a Chrome process behind, whether we finished or failed.
        driver.quit()

    save_author_data(author_list)
    return "Saved " + field_label + "-" + sub_field_label

In [None]:
# Scholar labels to scrape, grouped by their parent field.
keywords = {
    'computer_science': ['software_engineering', 'cyber_security', 'computer_networks'],
    'biology': ['genetics', 'microbiology', 'molecular_biology'],
    'electrical_engineering': ['signal_processing', 'microelectronics', 'power_electronics'],
    'civil_engineering': ['environmental_engineering', 'geotechnical_engineering', 'structural_engineering'],
    'psychology': ['cognitive_psychology', 'developmental_psychology', 'social_psychology'],
    'sociology': ['demography', 'criminology', 'economic_sociology'],
    'marketing': ['consumer_behavior', 'brand_management', 'marketing_strategy'],
    'economics': ['econometrics', 'microeconomics', 'macroeconomics'],
    'mathematics': ['algebra', 'combinatorics', 'number_theory'],
}

# NOTE: get_authors_custom calls random_decision for some of these sub-fields,
# and random_decision is defined in a later cell - run that cell first.
_field_subfield_pairs = [
    (field, subfield)
    for field, subfields in keywords.items()
    for subfield in subfields
]
# Scrape each (field, sub-field) pair in dictionary order and echo the status string.
for field, subfield in _field_subfield_pairs:
    print(get_authors_custom(field, subfield))

In [None]:
import random

def random_decision(threshold,param1,param2):
    """Return ``param1`` with probability ``threshold`` percent, else ``param2``.

    A number is drawn uniformly from 1..100; ``param2`` is returned when the
    draw exceeds ``threshold`` and ``param1`` otherwise. Drawing from 1..100
    (rather than 0..100) gives exactly 100 equally likely outcomes, so
    ``threshold=0`` never yields ``param1`` and ``threshold=100`` always does.

    Args:
        threshold: Percentage chance (0 to 100) of returning ``param1``.
        param1: Value returned when the draw is at or below ``threshold``.
        param2: Value returned when the draw is above ``threshold``.

    Returns:
        Either ``param1`` or ``param2``.

    Raises:
        ValueError: If ``threshold`` is outside the range 0..100.
    """
    if not 0 <= threshold <= 100:
        raise ValueError("Threshold must be between 0 and 100.")

    random_number = random.randint(1, 100)

    if random_number > threshold:
        return param2
    return param1

# Smoke check: one draw with a threshold of 50 between the values 1 and 2.
result = random_decision(threshold=50, param1=1, param2=2)
print(f"Result: {result}")


In [None]:
# Reduced label set: only the sub-fields to be scraped in this follow-up run.
keywords = {
    'electrical_engineering': ['microelectronics'],
    'sociology': ['economic_sociology'],
    'marketing': ['brand_management', 'marketing_strategy'],
    'economics': ['microeconomics'],
}

_field_subfield_pairs = [
    (field, subfield)
    for field, subfields in keywords.items()
    for subfield in subfields
]
# Scrape each (field, sub-field) pair in dictionary order and echo the status string.
for field, subfield in _field_subfield_pairs:
    print(get_authors_custom(field, subfield))