## Import libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import json
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd

## Extract publisher details from Google Scholar

In [6]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd
import re


# Function for extracting details from Google Scholar
def extract_pub_detls_google_schlr(publisher, subject):
    # Set up the WebDriver (use the ChromeDriver or your preferred driver)
    driver = webdriver.Chrome()

    # Open Google Scholar
    driver.get("https://scholar.google.com/")

    search_query = f"site:{publisher} {subject}"
    search_bar = driver.find_element(By.NAME, "q")
    search_bar.send_keys(search_query)
    search_bar.send_keys(Keys.RETURN)

    time.sleep(3)  # Wait for the results to load

    # Initialize lists to store data
    titles = []
    links = []
    authors = []
    publication_info = []
    citations = []
    years = []

    # Restrict to a maximum of 5 pages
    max_pages = 10
    current_page = 1

    # Extract results from the pages
    try:
        while current_page <= max_pages:
            # Find all result containers
            results = driver.find_elements(By.CLASS_NAME, "gs_ri")

            for result in results:
                try:
                    # Extract title
                    title_element = result.find_element(By.TAG_NAME, "h3")
                    title = title_element.text
                    link = title_element.find_element(By.TAG_NAME, "a").get_attribute("href")

                    # Extract authors and publication info
                    author_info = result.find_element(By.CLASS_NAME, "gs_a").text

                    # Extract citation count
                    try:
                        citation_info = result.find_element(By.CLASS_NAME, "gs_fl").text
                        if "Cited by" in citation_info:
                            citation_count = int(citation_info.split("Cited by ")[1].split()[0])
                        else:
                            citation_count = 0
                    except Exception:
                        citation_count = 0  # Handle cases where citations are not available

                    # Extract publication year using regex
                    try:
                        year_match = re.search(r"\b(19|20)\d{2}\b", author_info)
                        year = int(year_match.group()) if year_match else None
                    except Exception:
                        year = None

                    # Add to lists
                    titles.append(title)
                    links.append(link)
                    authors.append(author_info)
                    publication_info.append(author_info)
                    citations.append(citation_count)
                    years.append(year)
                except Exception as e:
                    print(f"Error extracting result: {e}")

            # Check if there is a "Next" button and click it
            try:
                next_button = driver.find_element(By.LINK_TEXT, "Next")
                next_button.click()
                time.sleep(3)  # Wait for the next page to load
                current_page += 1
            except Exception as e:
                print(f"No 'Next' button or error navigating pages: {e}")
                break  # Exit loop if no "Next" button is found or an error occurs
    finally:
        driver.quit()

    # Create a DataFrame from the scraped data
    data = {
        "Title": titles,
        "Link": links,
        "Authors/Info": authors,
        "Publication Info": publication_info,
        "Citations": citations,
        "Publication Year": years
    }
    df = pd.DataFrame(data)
    
    df=df.sort_values(by='Publication Year',ascending=False)

    # Adjust keywords for post-scraping filtering based on the subject
    subject_keywords = {
        "biology": ["biology", "genetics", "ecology"],
        "computer science": ["algorithm", "AI", "machine learning", "data"],
        "climate": ["climate", "global warming", "sustainability"]
    }.get(subject.lower(), [])

    if subject_keywords:
        df_filtered = df[
            df["Title"].str.contains("|".join(subject_keywords), case=False, na=False) |
            df["Publication Info"].str.contains("|".join(subject_keywords), case=False, na=False)
        ]
    else:
        print("No filtering keywords found for the specified subject.")
        df_filtered = pd.DataFrame()  # Return an empty DataFrame if no filtering keywords
        
        
     # Handle the case where df_filtd is empty
    if df_filtered.empty:
        print("No records matched.")
        return df, df_filtered, None  # Return the full DataFrame and None for the filtered DataFrame
    
    else:
        
        ## Create summary :     
        gs_smry = pd.DataFrame({
        'Website' : 'google scholar',
        'Publisher_Name' : publisher,
        'Subject' : subject,
        'Total_Publications': [len(df_filtered)],
        'Latest_Year_of_Pub': [max(df_filtered['Publication Year'])],
        'Avg_Pub_evry_yr': [len(df_filtered) / (max(df_filtered['Publication Year']) - min(df_filtered['Publication Year']))],
        'Total_Citations': [df_filtered['Citations'].sum()]
        })



    return df, df_filtered, gs_smry


In [7]:
## Try the Google Scholar extractor on one publisher/subject pair
# Alternative publisher to try: "gatesfoundation.org"
publisher = "open-research-europe.ec.europa.eu"
subject = "Climate"

# Returns the full results, the keyword-filtered results and the summary.
df_gs, df_gs_filtd, gs_smry = extract_pub_detls_google_schlr(
    publisher=publisher,
    subject=subject,
)

In [8]:
# Display the one-row summary (None when no record matched the filter).
gs_smry

Unnamed: 0,Website,Publisher_Name,Subject,Total_Publications,Latest_Year_of_Pub,Avg_Pub_evry_yr,Total_Citations
0,google scholar,open-research-europe.ec.europa.eu,Climate,10,2024,3.333333,4


In [42]:
# Display the full, unfiltered scrape results.
# NOTE(review): the saved output below links to gatesfoundation.org, so it
# presumably comes from an earlier run with that publisher; re-run to refresh.
df_gs

Unnamed: 0,Title,Link,Authors/Info,Publication Info,Citations,Publication Year
0,[PDF] Mapping the terrain: Year 1 of the evalu...,https://docs.gatesfoundation.org/Documents/Map...,"L Shear, B Smerdon - Annual Meeting of the Ame...","L Shear, B Smerdon - Annual Meeting of the Ame...",11,2003
