In [20]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import pickle
import time
import re
import pandas as pd


In [21]:

def setup_driver():
    """Create and return a Chrome WebDriver configured for scraping.

    The browser is started with the ``--no-sandbox`` and
    ``--disable-dev-shm-usage`` switches. Headless mode is currently not
    enabled; add ``'--headless'`` to ``launch_flags`` to turn it on.

    Returns:
        The ``webdriver.Chrome`` instance built from those options.
    """
    launch_flags = ('--no-sandbox', '--disable-dev-shm-usage')

    options = webdriver.ChromeOptions()
    for flag in launch_flags:
        options.add_argument(flag)

    return webdriver.Chrome(options=options)

In [22]:

def login_sinta(driver, username, password):
    """Log in to SINTA through the web form and persist the session cookies.

    Opens the login page, fills the ``username`` and ``password`` inputs,
    clicks the submit button, and pickles ``driver.get_cookies()`` to
    ``sinta_session.pkl`` in the current working directory.

    Args:
        driver: Selenium WebDriver used to drive the browser.
        username: Account name typed into the ``username`` field.
        password: Password typed into the ``password`` field.

    Raises:
        Whatever ``driver.find_element`` raises when a form control is
        missing (the original fallback lookup could not succeed in that
        case either, so the failure mode is unchanged).

    Note:
        The function does not verify that the login succeeded; cookies are
        saved regardless of what page the site shows after the click.
    """
    driver.get("https://sinta.kemdikbud.go.id/logins")
    time.sleep(3)  # give the login form time to render

    # Locating by NAME already matches any element with that name attribute,
    # so a second XPath lookup for input[@name=...] adds nothing.
    username_field = driver.find_element(By.NAME, "username")
    password_field = driver.find_element(By.NAME, "password")

    username_field.send_keys(username)
    password_field.send_keys(password)

    print("Username field value: ", username_field.get_attribute("value"))
    # Never echo the real password: notebook output is saved with the file.
    # Showing only its length still confirms that the field was filled.
    typed_password = password_field.get_attribute("value") or ""
    print("Password field value: ", "*" * len(typed_password))

    login_button = driver.find_element(By.CSS_SELECTOR, 'button.btn.btn-block.btn-info')
    login_button.click()
    time.sleep(5)  # wait for the post-login redirect before reading cookies

    with open("sinta_session.pkl", "wb") as session_file:
        pickle.dump(driver.get_cookies(), session_file)

    print("Session saved successfully!")


In [23]:
def load_session(driver):
    """Restore cookies from ``sinta_session.pkl`` into the browser session.

    Every cookie stored in the pickle file is handed to
    ``driver.add_cookie``; the page is then refreshed and a confirmation
    message is printed.

    Args:
        driver: Selenium WebDriver that receives the cookies.
    """
    # NOTE(review): pickle.load executes arbitrary code embedded in the file.
    # Only ever point this at the session file written by this notebook.
    with open("sinta_session.pkl", "rb") as session_file:
        saved_cookies = pickle.load(session_file)

    for saved_cookie in saved_cookies:
        driver.add_cookie(saved_cookie)

    driver.refresh()
    print("Session loaded successfully!")

def navigate_to_tab(driver, researcher_id, view):
    """Open one view (tab) of a researcher's SINTA profile page.

    Args:
        driver: Selenium WebDriver used to load the page.
        researcher_id: SINTA author ID interpolated into the profile URL.
        view: Value of the ``view`` query parameter (e.g. ``"scopus"``);
            an empty string yields a bare ``?view=``.
    """
    profile_url = f"https://sinta.kemdikbud.go.id/authors/profile/{researcher_id}/?view={view}"
    driver.get(profile_url)
    print(f"Navigated to {view} tab for researcher profile with ID: {researcher_id}")
    # Fixed pause so the tab content can load before the caller scrapes it.
    time.sleep(3)


In [24]:
def scrape_researcher_info(driver):
    """Read the researcher's name, affiliation and department from the page.

    Each field is looked up with its own CSS selector on the page currently
    loaded in ``driver``. A field whose lookup fails is reported as
    ``"N/A"`` so that one missing element does not abort the scrape.

    Args:
        driver: Selenium WebDriver positioned on a researcher profile page.

    Returns:
        dict with the keys ``"Name"``, ``"Affiliation"`` and
        ``"Department"``, each mapped to the element text or ``"N/A"``.
    """
    field_selectors = {
        "Name": 'h3 > a',
        "Affiliation": 'a[href*="affiliations/profile"]',
        "Department": 'a[href*="departments/profile"]',
    }

    info = {}
    for field, selector in field_selectors.items():
        try:
            info[field] = driver.find_element(By.CSS_SELECTOR, selector).text
        except Exception:
            # `except Exception` (not a bare except) keeps the best-effort
            # behaviour while letting KeyboardInterrupt/SystemExit through,
            # so a long scrape can still be interrupted.
            info[field] = "N/A"
    return info

In [25]:
def _element_text_or_na(node, by, selector):
    """Return the text of the first match under ``node``, or "N/A" on failure."""
    try:
        return node.find_element(by, selector).text
    except Exception:
        return "N/A"


def _read_total_pages(driver):
    """Parse the page count from the 'Page X of Y' pagination text (default 1)."""
    try:
        pagination_text = driver.find_element(By.CSS_SELECTOR, 'div.pagination-text').text
    except Exception:
        print("Pagination element not found, assuming only one page.")
        return 1

    match = re.search(r'Page \d+ of (\d+)', pagination_text)
    if not match:
        return 1
    total_pages = int(match.group(1))
    print(f"Total number of pages: {total_pages}")
    return total_pages


def _scrape_article_card(div, is_garuda):
    """Extract one article record (dict) from a single article list item."""
    title = _element_text_or_na(div, By.CSS_SELECTOR, 'div.ar-title > a')
    try:
        link = div.find_element(By.CSS_SELECTOR, 'div.ar-title > a').get_attribute("href")
    except Exception:
        link = "N/A"

    publisher = _element_text_or_na(div, By.CSS_SELECTOR, 'div.ar-meta > a.ar-pub')

    # The journal is taken from the last `a.ar-pub` link inside the card.
    try:
        pub_links = div.find_elements(By.CSS_SELECTOR, 'a.ar-pub')
        journal = pub_links[-1].text if pub_links else "N/A"
    except Exception:
        journal = "N/A"

    # The leading "." keeps the XPath relative to this card. A bare "//..."
    # searches the whole document even when called on an element, which made
    # every article report the first "Author Order" found on the page.
    author_order = _element_text_or_na(div, By.XPATH, './/a[contains(text(), "Author Order")]')

    try:
        if is_garuda:
            # On the Garuda view the authors are the second link in the meta row.
            meta_links = div.find_elements(By.CSS_SELECTOR, 'div.ar-meta > a')
            authors = meta_links[1].text if len(meta_links) > 1 else "N/A"
        else:
            # All other views label the link with an "Authors :" prefix.
            meta = div.find_element(By.CSS_SELECTOR, 'div.ar-meta')
            if "Authors :" in meta.text:
                authors = meta.find_element(By.XPATH, './/a[contains(text(), "Authors :")]').text
            else:
                authors = "N/A"
    except Exception:
        authors = "N/A"

    year = _element_text_or_na(div, By.CSS_SELECTOR, 'a.ar-year')

    # `a.ar-cited` only counts as a DOI when its text actually mentions "DOI".
    cited_text = _element_text_or_na(div, By.CSS_SELECTOR, 'a.ar-cited')
    doi = cited_text if "DOI" in cited_text else "N/A"

    quartile = _element_text_or_na(div, By.CSS_SELECTOR, 'a.ar-quartile')

    return {
        "Title": title,
        "Link": link,
        "Publisher": publisher,
        "Journal": journal,
        "Author Order": author_order,
        "Authors": authors,
        "Year": year,
        "DOI": doi,
        "Quartile": quartile
    }


def scrape_articles(driver):
    """Scrape every article listed on the currently opened profile tab.

    Reads the page count from the pagination text, then walks the pages by
    clicking the "Next" link, collecting one record per
    ``div.ar-list-item.mb-5`` element. Fields that cannot be located are
    recorded as ``"N/A"`` rather than aborting the scrape.

    Args:
        driver: Selenium WebDriver positioned on a profile tab
            (see ``navigate_to_tab``).

    Returns:
        list of dicts with the keys ``Title``, ``Link``, ``Publisher``,
        ``Journal``, ``Author Order``, ``Authors``, ``Year``, ``DOI`` and
        ``Quartile``.
    """
    articles = []
    total_pages = _read_total_pages(driver)

    for page in range(1, total_pages + 1):
        print(f"Scraping page {page} of {total_pages}")
        time.sleep(3)  # let the (next) page finish loading before querying it

        # The view does not change between cards, so read the URL once per page.
        is_garuda = "garuda" in driver.current_url
        div_elements = driver.find_elements(By.CSS_SELECTOR, 'div.ar-list-item.mb-5')
        for index, div in enumerate(div_elements):
            print(f"Scraping article {index + 1} on page {page}")
            articles.append(_scrape_article_card(div, is_garuda))

        if page < total_pages:
            try:
                driver.find_element(By.LINK_TEXT, "Next").click()
            except Exception:
                print("No next button found, stopping navigation.")
                break

    return articles

In [26]:
def save_articles_to_files(articles, researcher_info, researcher_id, filename_prefix):
    """Tabulate scraped articles, tag them with researcher data, and save them.

    The records are turned into a DataFrame, four researcher columns are
    appended, and the result is written to ``<filename_prefix>_articles.csv``
    and ``<filename_prefix>_articles.xlsx``. The frame is also printed.

    Args:
        articles: List of article dicts (one per row).
        researcher_info: Mapping providing ``"Name"``, ``"Affiliation"`` and
            ``"Department"``.
        researcher_id: Value stored in the ``Researcher ID`` column.
        filename_prefix: Prefix used for both output file names.

    Returns:
        The DataFrame that was written to disk.
    """
    table = pd.DataFrame(articles)

    # Scalar assignment broadcasts each value to every row, in this order.
    researcher_columns = (
        ("Researcher Name", researcher_info["Name"]),
        ("Researcher ID", researcher_id),
        ("Affiliation", researcher_info["Affiliation"]),
        ("Department", researcher_info["Department"]),
    )
    for column_name, value in researcher_columns:
        table[column_name] = value

    table.to_csv(f"{filename_prefix}_articles.csv", index=False)
    table.to_excel(f"{filename_prefix}_articles.xlsx", index=False)

    print(table)
    return table

In [27]:
def generate_statistics(scopus_df, wos_df, garuda_df, googlescholar_df, researcher_id):
    """Summarise a researcher's articles per source and per year.

    The four per-source frames are concatenated under a ``Source`` index
    level, counted per source and per ``Year`` value, and the two summaries
    are written to ``statistics_summary_source_<researcher_id>.csv`` and
    ``statistics_summary_year_<researcher_id>.csv``. The summaries are also
    printed.

    Args:
        scopus_df: Articles from the Scopus view.
        wos_df: Articles from the WOS view.
        garuda_df: Articles from the Garuda view.
        googlescholar_df: Articles from the Google Scholar view.
        researcher_id: ID used in the output file names and log line.

    Returns:
        None. Results are written to disk and printed.
    """
    # Combine all dataframes into one, labelling rows with their source.
    combined_df = pd.concat(
        [scopus_df, wos_df, garuda_df, googlescholar_df],
        keys=['Scopus', 'WOS', 'Garuda', 'GoogleScholar'],
        names=['Source'],
    )

    # General statistics
    total_articles = combined_df.shape[0]
    articles_per_source = combined_df.groupby(level='Source').size()
    if 'Year' in combined_df.columns:
        articles_per_year = combined_df['Year'].value_counts()
    else:
        # When every source returned zero articles the frames carry no
        # "Year" column at all; report an empty table instead of raising
        # KeyError and aborting the whole run.
        articles_per_year = pd.Series(dtype='int64')

    # Convert statistics to DataFrames. Building the year table explicitly
    # avoids depending on the column names `reset_index()` produces, which
    # differ between pandas versions.
    stats_df = pd.DataFrame({
        "Source": articles_per_source.index,
        "Articles Count": articles_per_source.values
    })
    stats_df_year = pd.DataFrame({
        "Year": articles_per_year.index,
        "Articles Count": articles_per_year.values
    })

    # Save statistics to CSV
    stats_df.to_csv(f"statistics_summary_source_{researcher_id}.csv", index=False)
    stats_df_year.to_csv(f"statistics_summary_year_{researcher_id}.csv", index=False)

    print("Total number of articles for researcher_id", researcher_id, ":", total_articles)
    print("Number of articles per source:")
    print(stats_df)
    print("Number of articles per year:")
    print(stats_df_year)


In [28]:
def main():
    """Scrape all article views for every researcher listed in the input CSV.

    Reads ``authors_data_teknikindustri.csv`` (columns ``SINTA_ID``,
    ``Author_Name``, ``University``, ``Department``), logs in to SINTA, and
    for each researcher scrapes the profile header plus the Scopus, WOS,
    Garuda and Google Scholar tabs. Each tab is saved through
    ``save_articles_to_files`` and a per-researcher summary is produced by
    ``generate_statistics``. The browser is always closed at the end.

    Credentials are read from the ``SINTA_USERNAME`` / ``SINTA_PASSWORD``
    environment variables so they never need to be written into the
    notebook; both default to an empty string when unset.
    """
    import os  # local import: only used for the credential lookup below

    username = os.environ.get("SINTA_USERNAME", "")
    password = os.environ.get("SINTA_PASSWORD", "")

    # Load the dataset with researcher information
    file_path = 'authors_data_teknikindustri.csv'
    authors_df = pd.read_csv(file_path)

    # Extract the necessary columns: SINTA_ID, Author_Name, University, Department
    researcher_data = authors_df[['SINTA_ID', 'Author_Name', 'University', 'Department']]
    researcher_rows = zip(
        researcher_data['SINTA_ID'].tolist(),
        researcher_data['Author_Name'].tolist(),
        researcher_data['University'].tolist(),
        researcher_data['Department'].tolist(),
    )

    # Profile tabs to scrape; the view name doubles as the output file prefix.
    views = ("scopus", "wos", "garuda", "googlescholar")

    driver = setup_driver()
    try:
        login_sinta(driver, username, password)
        load_session(driver)

        for researcher_id, researcher_name, university, department in researcher_rows:
            csv_info = {
                "Name": researcher_name,
                "Affiliation": university,
                "Department": department
            }
            print(f"Starting scraping for researcher ID: {researcher_id}")

            # Scrape researcher information from the profile header. The
            # scraped value wins; the CSV value is used only for fields the
            # page did not yield (previously the CSV data was built and then
            # discarded, leaving "N/A" in the output).
            navigate_to_tab(driver, researcher_id, "")
            scraped_info = scrape_researcher_info(driver)
            researcher_info = {
                field: (
                    scraped_info.get(field)
                    if scraped_info.get(field) not in (None, "", "N/A")
                    else csv_value
                )
                for field, csv_value in csv_info.items()
            }

            # Scrape and save each source tab in turn.
            frames = {}
            for view in views:
                navigate_to_tab(driver, researcher_id, view)
                view_articles = scrape_articles(driver)
                frames[view] = save_articles_to_files(
                    view_articles, researcher_info, researcher_id, f"{view}_{researcher_id}"
                )

            # Generate statistics from all sources for each researcher
            generate_statistics(
                frames["scopus"],
                frames["wos"],
                frames["garuda"],
                frames["googlescholar"],
                researcher_id,
            )
            print(f"Finished scraping for researcher ID: {researcher_id}\n")

    finally:
        driver.quit()

# Entry point: start the scrape only when this code is run directly.
# In a notebook kernel `__name__` is "__main__", so executing this cell
# launches the full scrape; importing the code as a module does not.
if __name__ == "__main__":
    main()

: 