In [1]:
import os
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import urllib.parse
import time
import random
import re



def format_search_query(input_string):
    """Turn a paper filename into a URL-encoded search query.

    The filename is treated as underscore-separated fields. A trailing
    ``.pdf`` extension (any letter case) is removed, and when there are more
    than three fields the third one is dropped. The remaining fields are
    joined with spaces, ``&amp`` is replaced by ``&`` and every ``;`` is
    removed. The readable query is printed, and its ``quote_plus``-encoded
    form is returned.
    """
    stem = input_string
    if stem.lower().endswith('.pdf'):
        stem = stem[:-len('.pdf')]

    fields = stem.split("_")
    if len(fields) > 3:
        # Discard the field that sits between the second and third underscore.
        del fields[2]

    # Joining with spaces is the same as re-joining with "_" and then
    # replacing every underscore, because no field can contain one.
    query_text = " ".join(fields)

    # Undo the HTML entity for an ampersand: "&amp;" -> "&;" -> "&".
    query_text = query_text.replace("&amp", "&").replace(";", "")

    # Show the human-readable query before it is encoded.
    print("Formatted search query:", query_text)

    return urllib.parse.quote_plus(query_text)


def _parse_citation_line(citation_line):
    """Split a result's byline text into ``(authors, journal_name, year)``.

    Two layouts are handled:

    * ``"<authors> - <venue>[, <year>] - <source>"``: the year is the last
      four-digit 19xx/20xx number in the venue segment, and the journal is
      the text before it. Without a year the whole segment is the journal.
    * ``"<authors>\\n<venue> <year> ..."`` (no `` - `` separator): the year is
      the first 19xx/20xx number on the second line, and the journal is the
      text before it. Without a year both are ``None``.

    Any other text yields ``(None, None, None)``.
    """
    year_pattern = r'\b(?:19|20)\d{2}\b'

    dash_parts = citation_line.split(' - ')
    if len(dash_parts) >= 2:
        authors = dash_parts[0].strip()
        venue = dash_parts[1]
        # Locate the year by pattern rather than by comma position, so a
        # journal name containing commas, or a venue that is only a year,
        # is still parsed correctly.
        year_matches = list(re.finditer(year_pattern, venue))
        if year_matches:
            last_year = year_matches[-1]
            year = last_year.group()
            journal_name = venue[:last_year.start()].rstrip(', ').strip() or None
        else:
            year = None
            journal_name = venue.strip() or None
        return authors, journal_name, year

    line_parts = citation_line.split('\n')
    if len(line_parts) > 1:
        authors = line_parts[0].strip()
        remaining_info = line_parts[1]
        year_match = re.search(year_pattern, remaining_info)
        if year_match:
            journal_name = remaining_info[:year_match.start()].strip(', ')
            return authors, journal_name, year_match.group()
        return authors, None, None

    return None, None, None


def _sleep_between_requests(suffix=""):
    """Pause for a random 10-30 seconds and print the chosen duration."""
    sleep_duration = random.uniform(10, 30)
    print(f"Sleeping for {sleep_duration:.2f} seconds{suffix}")
    time.sleep(sleep_duration)


def extract_paper_info(filename):
    """Look up a paper on Google Scholar and scrape its first search result.

    The search query is derived from ``filename`` via ``format_search_query``
    and loaded in the module-level Selenium ``driver``.

    Returns:
        ``None`` if the results container (``#gs_res_ccl_mid``) does not
        appear within 30 seconds; the caller treats this as a CAPTCHA.
        Otherwise a list
        ``[title, authors, journal_name, year, cited_by, filename, year_in_filename]``.
        If scraping the result fails, every field except ``filename`` is
        ``None`` and the last element is ``False``.

    A random 10-30 second pause follows every scrape attempt, successful or
    not. No pause is made on the CAPTCHA path.
    """
    formatted_query = format_search_query(filename)
    scholar_url = f"https://scholar.google.com/scholar?hl=en&q={formatted_query}&as_sdt=0,5"
    driver.get(scholar_url)
    try:
        # Wait up to 30 seconds for the results container; if it never
        # shows up, assume a CAPTCHA page was served instead.
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.ID, "gs_res_ccl_mid"))
        )
    except Exception:
        # `except Exception` (not a bare `except`) so Ctrl-C still interrupts.
        print("CAPTCHA REACHED")
        return None
    try:
        result = driver.find_element(By.CSS_SELECTOR, '.gs_ri')
        title = result.find_element(By.CSS_SELECTOR, 'h3.gs_rt a').text

        try:
            authors_and_journal_div = result.find_element(By.CSS_SELECTOR, 'div.gs_a.gs_fma_p').text
        except Exception:
            # Fallback selector
            authors_and_journal_div = result.find_element(By.CSS_SELECTOR, 'div.gs_a').text

        print(f"Raw div content: {authors_and_journal_div}")

        authors, journal_name, year = _parse_citation_line(authors_and_journal_div)

        print(f"Extracted authors: {authors}")
        print(f"Extracted journal: {journal_name}")
        print(f"Extracted year: {year}")

        try:
            cited_by_text = result.find_element(By.PARTIAL_LINK_TEXT, 'Cited by').text
            # Link text looks like "Cited by 123"; keep the number.
            cited_by = cited_by_text.split(' ')[2]
        except Exception:
            cited_by = None
            print("No 'Cited by' info available.")

        # Flag whether the scraped year also appears in the filename.
        year_in_filename = year in filename if year else False

        _sleep_between_requests()

        return [title, authors, journal_name, year, cited_by, filename, year_in_filename]
    except Exception as e:
        print(f"Error: {e}")
        _sleep_between_requests(" due to error")
        return [None, None, None, None, None, filename, False]


def load_existing_entries(year):
    """Return the previously saved citation table for ``year``.

    Reads ``../paper_citation_counts/{year}_paper_citation_info.xlsx``. If
    that workbook does not exist, an empty DataFrame with the expected
    column layout is returned instead.
    """
    workbook_path = f'../paper_citation_counts/{year}_paper_citation_info.xlsx'
    expected_columns = [
        'Title',
        'Authors',
        'Journal',
        'Year',
        'Cited By',
        'File Name',
        'Year in Filename',
    ]
    try:
        return pd.read_excel(workbook_path)
    except FileNotFoundError:
        # Nothing saved yet for this year: start from an empty table.
        return pd.DataFrame(columns=expected_columns)


def should_process_file(df, filename):
    """Decide whether ``filename`` still needs to be looked up.

    ``df`` is searched for rows whose 'File Name' equals ``filename``,
    ignoring surrounding whitespace and letter case. Returns ``False`` only
    when the first such row has non-missing 'Title', 'Authors', 'Journal'
    and 'Cited By' values; returns ``True`` when no row matches or the
    matching row is incomplete. The decision is printed either way.
    """
    print(f"Checking if processing is needed for: {filename}")
    known_names = df['File Name'].str.strip().str.lower()
    matches = df[known_names == filename.strip().lower()]

    if matches.empty:
        print(f"Processing {filename}: no existing record found.")
        return True

    record = matches.iloc[0]
    required_columns = ('Title', 'Authors', 'Journal', 'Cited By')
    # A record is complete only if every required field is filled in.
    if all(pd.notna(record[column]) for column in required_columns):
        print(f"Skipping {filename}: complete record exists.")
        return False

    print(f"Processing {filename}: incomplete record.")
    return True

# Setup Selenium WebDriver (visible browser; uncomment the line below for headless mode)
options = Options()
# options.add_argument("--headless")
service = Service('/usr/local/bin/chromedriver')
driver = webdriver.Chrome(service=service, options=options)

try:
    year = 2001
    directory = f'../papers_by_year/{year}'
    # Match the extension case-insensitively, consistent with how
    # format_search_query strips it.
    filenames = [f for f in os.listdir(directory) if f.lower().endswith('.pdf')]

    # Load existing entries
    existing_entries = load_existing_entries(year)

    # Rows of paper info gathered during this processing batch
    data = []

    try:
        # Process each file
        for filename in filenames:
            if should_process_file(existing_entries, filename):
                print("Gathering:", filename)
                info = extract_paper_info(filename)
                if info is not None:
                    print(info)
                    data.append(info)
                else:
                    print("CAPTCHA or other issue encountered, stopping further processing.")
                    break
            else:
                print(f"Skipping file {filename} as it already has complete information.")
    finally:
        # Persist whatever was gathered even if the loop was aborted by an
        # exception or Ctrl-C; each row costs a 10-30 second wait to obtain.
        if data:  # Check if any new data was collected
            # Convert new data to DataFrame
            new_df = pd.DataFrame(data, columns=['Title', 'Authors', 'Journal', 'Year', 'Cited By', 'File Name', 'Year in Filename'])

            # Combine with existing entries
            if not existing_entries.empty:
                combined_df = pd.concat([existing_entries, new_df], ignore_index=True)
            else:
                combined_df = new_df

            # Remove duplicates, keeping the newest row for re-processed files
            combined_df.drop_duplicates(subset=['File Name'], keep='last', inplace=True)

            # Make sure the output folder exists so the save cannot fail on
            # a missing directory.
            os.makedirs('../paper_citation_counts', exist_ok=True)

            # Save combined DataFrame to Excel, replacing the old file
            combined_df.to_excel(f'../paper_citation_counts/{year}_paper_citation_info.xlsx', index=False)
        else:
            print("No data collected; no new file saved.")

finally:
    # Always close the browser, whatever happened above.
    driver.quit()


Checking if processing is needed for: Labianca et al._2001_OrgSci_Emulation in Academia_Quant.pdf
Processing Labianca et al._2001_OrgSci_Emulation in Academia_Quant.pdf: no existing record found.
Gathering: Labianca et al._2001_OrgSci_Emulation in Academia_Quant.pdf
Formatted search query: Labianca et al. 2001 Emulation in Academia Quant
Raw div content: G Labianca, JF Fairbank, JB Thomas… - Organization …, 2001 - pubsonline.informs.org
Extracted authors: G Labianca, JF Fairbank, JB Thomas…
Extracted journal: Organization …
Extracted year: 2001
Sleeping for 24.87 seconds
