In [None]:
import pandas as pd
import sqlite3

# Load the whole `publications` table into a DataFrame.
# The connection is released in `finally` so that a failing query (missing file,
# missing table, ...) does not leave the database handle open in the kernel.
con = sqlite3.connect("dataset.sqlite3")
try:
    df = pd.read_sql_query("SELECT * from publications", con)
finally:
    con.close()

# Verify that result of SQL query is stored in the dataframe
print(df.head())

In [None]:
# Rows where both flags are still 0: is_processed == 0 and failed_attempt == 0.
selected_rows = df.loc[df['is_processed'].eq(0) & df['failed_attempt'].eq(0)]

In [None]:
# Rows whose failed_attempt flag is set to 1 (regardless of is_processed).
failed_rows = df.loc[df['failed_attempt'].eq(1)]

In [None]:
# Rows flagged as both processed (is_processed == 1) and failed (failed_attempt == 1).
failed_but_processed_rows = df.loc[df['is_processed'].eq(1) & df['failed_attempt'].eq(1)]

In [None]:
# Number of rows in selected_rows (shown as the cell output).
selected_rows.shape[0]

In [None]:
# Number of rows in failed_rows (shown as the cell output).
failed_rows.shape[0]

In [None]:
# import libraries
import pandas as pd
import numpy as np
from tqdm import tqdm
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from urllib.parse import urlparse, parse_qs
import requests
from bs4 import BeautifulSoup
import re

In [None]:
# Build the Chrome options from a list of command-line switches.
# NOTE: a later cell rebinds `chrome_options` to a fresh Options() object
# (with '--headless=new' and a custom user agent), which replaces this one.
chrome_options = webdriver.ChromeOptions()
for _switch in (
    '--no-sandbox',
    '--headless',
    '--disable-gpu',
    '--disable-dev-shm-usage',
    "--window-size=1920,1080",
):
    chrome_options.add_argument(_switch)

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, WebDriverException
from bs4 import BeautifulSoup
import time
import re
import sys

# Chrome options used by get_pub_details: headless browser, fixed window size,
# and an explicit desktop user-agent string.
_CHROME_SWITCHES = [
    '--no-sandbox',
    '--headless=new',
    '--disable-gpu',
    '--disable-dev-shm-usage',
    '--window-size=1920,1080',
    'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
]

chrome_options = Options()
for _switch in _CHROME_SWITCHES:
    chrome_options.add_argument(_switch)

def _field_value(soup, label):
    """Return the ``gsc_oci_value`` div that follows the field labelled ``label``, or None."""
    field = soup.find('div', class_='gsc_oci_field', string=label)
    if field is None:
        return None
    return field.find_next_sibling('div', class_='gsc_oci_value')


def _field_text(soup, label):
    """Return the text of the value div for ``label``, or None when the field is absent."""
    value = _field_value(soup, label)
    return None if value is None else value.text


def _citation_count(soup):
    """Return the first integer found in the 'Total citations' value, or 0 if unavailable."""
    value = _field_value(soup, 'Total citations')
    inner = value.find('div') if value is not None else None
    if inner is None:
        # A page without a citation block is treated as zero citations (no log noise).
        return 0
    match = re.search(r'\d+', inner.get_text())
    return int(match.group()) if match else 0


def get_pub_details(work_url):
    """Scrape one publication detail page and return its fields as a dict.

    A new Chrome driver (configured by the module-level ``chrome_options``) is
    started for every attempt and always quit afterwards. Only a
    ``TimeoutException`` triggers a retry (up to 3 attempts, 5 s apart); any
    other ``WebDriverException`` or unexpected exception stops retrying.

    Args:
        work_url: URL of the publication page to load.

    Returns:
        dict with keys ``error``, ``publication_url``, ``title``, ``authors``,
        ``publication_date``, ``publication_type``, ``citations`` and
        ``details_added_at`` (``time.time()`` at return).
        ``error`` is 0 when the page was loaded and a title was found, and 1
        when the title element is missing or the page could not be retrieved
        (in which case the text fields are None and ``citations`` is 0).

    Raises:
        SystemExit: when the browser ends up on Google's ``/sorry/index``
            verification page, or when the WebDriver error message contains
            ``ERR_INTERNET_DISCONNECTED``.
    """
    retry_attempts = 3
    attempts_made = 0

    for attempt in range(retry_attempts):
        attempts_made = attempt + 1
        driver = webdriver.Chrome(options=chrome_options)
        try:
            driver.get(work_url)

            # Check if redirected to the human verification page
            current_url = driver.current_url
            if "https://www.google.com/sorry/index" in current_url:
                print(f"Blocked by Google at URL: {current_url}")
                # SystemExit is not caught by the handlers below; `finally` still quits the driver.
                sys.exit("Blocked by Google. Change your IP and restart.")

            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # A missing title is the only parsing problem reported as an error.
            title_element = soup.find('div', id='gsc_oci_title')
            title = None if title_element is None else title_element.text
            error = 1 if title_element is None else 0

            # Authors, falling back to the 'Inventors' field when 'Authors' is absent.
            authors = _field_text(soup, 'Authors')
            if authors is None:
                authors = _field_text(soup, 'Inventors')

            publication_date = _field_text(soup, 'Publication date')

            # The publication type is read from the label of the third row of the details table.
            work_type_elem = soup.select_one("#gsc_oci_table > div:nth-child(3) > div:nth-child(1)")
            typ = None if work_type_elem is None else work_type_elem.text

            return {
                'error': error,
                'publication_url': work_url,
                'title': title,
                'authors': authors,
                'publication_date': publication_date,
                'publication_type': typ,
                'citations': _citation_count(soup),
                'details_added_at': time.time()
            }

        except TimeoutException:
            # Must precede WebDriverException, which is its base class.
            if attempts_made < retry_attempts:
                print(f"Timeout while trying to access {work_url}. Retrying...")
                time.sleep(5)  # Delay before retrying
            else:
                print(f"Timeout while trying to access {work_url}.")
        except WebDriverException as e:
            print(f"WebDriver error while accessing {work_url}: {str(e)}")
            if "ERR_INTERNET_DISCONNECTED" in str(e):
                print("Internet disconnected. Stopping the program.")
                sys.exit(1)
            break  # Stop retrying if WebDriver error occurs
        except Exception as e:
            print(f"Unexpected error occurred: {str(e)}")
            break
        finally:
            driver.quit()

    # Every attempt failed (or retrying was abandoned): report how many were actually made.
    print(f"Failed to retrieve details for {work_url} after {attempts_made} attempt(s).")
    return {
        'error': 1,
        'publication_url': work_url,
        'title': None,
        'authors': None,
        'publication_date': None,
        'publication_type': None,
        'citations': 0,
        'details_added_at': time.time()
    }


In [None]:
# from selenium import webdriver
# from selenium.webdriver.chrome.options import Options
# from selenium.common.exceptions import TimeoutException, WebDriverException
# from bs4 import BeautifulSoup
# import time
# import re

# # Define Chrome options for Selenium
# chrome_options = Options()
# chrome_options.add_argument('--no-sandbox')
# chrome_options.add_argument('--headless=new')
# chrome_options.add_argument('--disable-gpu')
# chrome_options.add_argument('--disable-dev-shm-usage')
# chrome_options.add_argument('--window-size=1920,1080')
# chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36')

# def get_pub_details(work_url):
#     error=0
#     driver = webdriver.Chrome(options=chrome_options)

#     driver.get(work_url)
#     page_source = driver.page_source
#     soup = BeautifulSoup(page_source, 'html.parser')
    
#     try:
#         title_element = soup.find('div', id='gsc_oci_title')
#         title = title_element.text
#     except Exception as e:
#         title = None
#         error = 1

#     try:
#         try:
#             author_field = soup.find('div', class_='gsc_oci_field', string='Authors')
#             author_element = author_field.find_next_sibling('div', class_='gsc_oci_value')
#             authors = author_element.text
#         except Exception as e:
#             author_field = soup.find('div', class_='gsc_oci_field', string='Inventors')
#             author_element = author_field.find_next_sibling('div', class_='gsc_oci_value')
#             authors = author_element.text
#     except Exception as e:
#         authors = None

#     try:
#         published_date_field = soup.find('div', class_='gsc_oci_field', string='Publication date')
#         publication_date_element = published_date_field.find_next_sibling('div', class_='gsc_oci_value')
#         publication_date = publication_date_element.text
#     except Exception as e:
#         publication_date = None

#     try:
#         work_type_elem = soup.select_one("#gsc_oci_table > div:nth-child(3) > div:nth-child(1)")
#         typ = work_type_elem.text
#     except Exception as e:
#         typ = None

#     try:
#         citation_field = soup.find('div', class_='gsc_oci_field', string='Total citations')
#         citation_field_element = citation_field.find_next_sibling('div', class_='gsc_oci_value')
#         citation_value = citation_field_element.find('div')
#         citation_count = int(re.search(r'\d+', citation_value.get_text()).group())
#     except Exception as e:
#         # print(str(e))
#         citation_count = 0
#     time.sleep(1.5)
#     driver.quit()
#     if error ==1:
#         print(driver.current_url)
    
        
#     # print({
#     #     'error': error,
#     #     'link': work_url,
#     #     'title': title,
#     #     'authors': authors,
#     #     'publication_date': publication_date,
#     #     'type': typ,
#     #     'citation': citation_count
#     # })
#     return {
#         'error': error,
#         'publication_url': work_url,
#         'title': title,
#         'authors': authors,
#         'publication_date': publication_date,
#         'publication_type': typ,
#         'citations': citation_count,
#         'details_added_at': time.time()
#     }

In [None]:
# Scrape details for every URL in `selected_rows` and write the outcome back to the
# `publications` table. Each row is committed individually so that progress made
# before an interruption is kept.
conn = sqlite3.connect("dataset.sqlite3")
try:
    cursor = conn.cursor()

    for publication_url in selected_rows['publication_url']:
        details = get_pub_details(publication_url)
        if details['error'] == 0:
            # Success: store the scraped fields and clear the failure flag.
            cursor.execute(
                "UPDATE publications SET title = :title, author_names = :authors, "
                "publication_date = :publication_date, publication_type = :publication_type, "
                "citations = :citations, is_processed = 1, failed_attempt = 0, "
                "details_added_at = :details_added_at "
                "WHERE publication_url = :publication_url",
                details,
            )
        else:
            # Failure: only mark the row so it can be retried later.
            cursor.execute(
                "UPDATE publications SET failed_attempt = 1 WHERE publication_url = :publication_url",
                details,
            )
        conn.commit()

    print('code finished')
finally:
    # get_pub_details may abort with sys.exit(); close the connection in every case.
    conn.close()

In [None]:
# Retry the rows in `failed_rows` (failed_attempt == 1) and write the outcome back
# to the `publications` table. Each row is committed individually so that progress
# made before an interruption is kept.
conn = sqlite3.connect("dataset.sqlite3")
try:
    cursor = conn.cursor()

    for publication_url in failed_rows['publication_url']:
        details = get_pub_details(publication_url)
        if details['error'] == 0:
            # Success: store the scraped fields and clear the failure flag.
            cursor.execute(
                "UPDATE publications SET title = :title, author_names = :authors, "
                "publication_date = :publication_date, publication_type = :publication_type, "
                "citations = :citations, is_processed = 1, failed_attempt = 0, "
                "details_added_at = :details_added_at "
                "WHERE publication_url = :publication_url",
                details,
            )
        else:
            # Still failing: keep the row flagged for another retry.
            cursor.execute(
                "UPDATE publications SET failed_attempt = 1 WHERE publication_url = :publication_url",
                details,
            )
        conn.commit()

    print('code finished')
finally:
    # get_pub_details may abort with sys.exit(); close the connection in every case.
    conn.close()

In [None]:
# Retry the rows in `failed_but_processed_rows` (is_processed == 1 and
# failed_attempt == 1) and write the outcome back to the `publications` table.
# Each row is committed individually so that progress made before an interruption
# is kept.
conn = sqlite3.connect("dataset.sqlite3")
try:
    cursor = conn.cursor()

    for publication_url in failed_but_processed_rows['publication_url']:
        details = get_pub_details(publication_url)
        print(details)
        if details['error'] == 0:
            # Success: store the scraped fields and clear the failure flag.
            cursor.execute(
                "UPDATE publications SET title = :title, author_names = :authors, "
                "publication_date = :publication_date, publication_type = :publication_type, "
                "citations = :citations, is_processed = 1, failed_attempt = 0, "
                "details_added_at = :details_added_at "
                "WHERE publication_url = :publication_url",
                details,
            )
        else:
            # Still failing: keep the row flagged for another retry.
            cursor.execute(
                "UPDATE publications SET failed_attempt = 1 WHERE publication_url = :publication_url",
                details,
            )
        conn.commit()

    print('code finished')
finally:
    # get_pub_details may abort with sys.exit(); close the connection in every case.
    conn.close()

In [None]:
# Manual smoke test: scrape one known publication page and print the resulting dict.
sample_details = get_pub_details('https://scholar.google.com/citations?view_op=view_citation&hl=en&user=jtDUXJkAAAAJ&pagesize=100&citation_for_view=jtDUXJkAAAAJ:dhFuZR0502QC')
print(sample_details)

In [None]:
# Manual smoke test: scrape a second publication page; the dict is shown as the cell output.
second_sample_url = 'https://scholar.google.com/citations?view_op=view_citation&hl=en&user=3vo8WTYAAAAJ&pagesize=100&citation_for_view=3vo8WTYAAAAJ:CLhUwle04lcC'
get_pub_details(second_sample_url)