In [57]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time

In [58]:
# Request headers: a browser-like User-Agent is sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'
}

# Google Scholar search URL (query: "inherited retinal diseases", as_ylo=2023).
url = 'https://scholar.google.com/scholar?as_ylo=2023&q=inherited+retinal+diseases&hl=en&as_sdt=0,5'

# Fetch the first results page. The timeout (seconds) keeps the cell from
# hanging forever if the server never answers.
response = requests.get(url, headers=headers, timeout=30)

# Show the HTTP status code: 200 means the page was fetched,
# 429 means the server answered "Too Many Requests".
print(response.status_code)


429


In [59]:
# Parse the body of `response` into a BeautifulSoup tree.
# NOTE(review): the status code is not checked here, so the body of an error
# response is parsed exactly like a results page.
page_contents = response.text
doc = BeautifulSoup(markup=page_contents, features='html.parser')

In [60]:
# Download a web page and return it as a parsed BeautifulSoup document.
# NOTE: a later cell defines another `get_paperinfo` that returns None on
# failure instead of raising; whichever cell runs last wins.
def get_paperinfo(paper_url, timeout=30):
  """Fetch `paper_url` and parse the response body as HTML.

  Args:
    paper_url: URL of the page to download.
    timeout: seconds to wait for the server before `requests` gives up.

  Returns:
    A BeautifulSoup document built from the response text.

  Raises:
    Exception: if the response status code is not 200.
  """
  # download the page that was asked for (uses the module-level `headers`);
  # previously the global `url` was fetched and `paper_url` was ignored
  response = requests.get(paper_url, headers=headers, timeout=timeout)

  # check successful response
  if response.status_code != 200:
    print('Status code:', response.status_code)
    raise Exception('Failed to fetch web page ')

  # parse using beautiful soup
  paper_doc = BeautifulSoup(response.text, 'html.parser')

  return paper_doc

In [61]:
# Collect the four groups of elements that the extractor helpers consume.
def get_tags(doc):
    """Run the four element queries against a parsed page.

    Args:
        doc: object exposing `select` and `find_all` (a BeautifulSoup document).

    Returns:
        A 4-tuple `(paper_tag, cite_tag, link_tag, author_tag)`:
        elements carrying a `data-lid` attribute, the `<a>` immediately
        following each element titled "Cite", every `<h3 class="gs_rt">`,
        and every `<div class="gs_a">`.
    """
    return (
        doc.select('[data-lid]'),
        doc.select('[title=Cite] + a'),
        doc.find_all('h3', attrs={"class": "gs_rt"}),
        doc.find_all("div", attrs={"class": "gs_a"}),
    )

In [62]:
# Return the title text of every paper element.
def get_papertitle(paper_tag):
    """Extract the text of the first `<h3>` found inside each element.

    Args:
        paper_tag: iterable of elements exposing `select`.

    Returns:
        A list of title strings, one per element, in input order.

    Raises:
        IndexError: if an element contains no `<h3>`.
    """
    return [entry.select('h3')[0].get_text() for entry in paper_tag]

In [63]:
# Return the citation count of every paper.
def get_citecount(cite_tag):
  """Extract the first integer found in the text of each element.

  Args:
    cite_tag: iterable of elements exposing a `text` attribute; an entry
      may also be None.

  Returns:
    A list of ints, one per entry. An entry counts as 0 when it is None,
    has no text, or its text contains no digits.
  """
  cite_count = []
  for tag in cite_tag:
    # Guard before touching `.text`: the original read `.text` first, so its
    # None check could never run and a None entry raised AttributeError.
    text = None if tag is None else tag.text
    # Keep only the number, dropping any surrounding words.
    match = re.search(r'\d+', text) if text else None
    cite_count.append(int(match.group()) if match is not None else 0)

  return cite_count

In [64]:
# Return the link of every paper heading.
def get_link(link_tag):
  """Extract the `href` of the `<a>` found inside each heading.

  Args:
    link_tag: iterable of heading elements whose `.a` attribute is the
      contained anchor, or None when the heading has no anchor.

  Returns:
    A list with one entry per heading, in input order: the `href` value,
    or None when the heading has no anchor or the anchor has no `href`.
    Emitting None (instead of raising) keeps this list the same length as
    the input.
  """
  links = []

  for heading in link_tag:
    anchor = heading.a
    links.append(anchor.get('href') if anchor is not None else None)

  return links

In [65]:
def get_author_year_publi_info(authors_tag):
    """Split the text of each author/venue element into three fields.

    For every element the text is tokenised on whitespace and:
      * year        - the first run of four digits in the text as an int,
                      or None when there is none;
      * publication - the last token, or "Unknown" with fewer than two tokens;
      * author      - the first two tokens joined by a space, with commas
                      removed from the second, or "Unknown" with fewer than
                      two tokens.

    Args:
        authors_tag: sequence of elements exposing a `text` attribute.

    Returns:
        A tuple `(years, publications, authors)` of three parallel lists.
    """
    years, publications, authors = [], [], []

    for entry in authors_tag:
        raw_text = entry.text
        tokens = raw_text.split()

        # Year: first four consecutive digits anywhere in the text.
        year_match = re.search(r'\d{4}', raw_text)
        years.append(None if year_match is None else int(year_match.group()))

        # Publication and author both need at least two tokens.
        if len(tokens) > 1:
            publications.append(tokens[-1])
            authors.append(tokens[0] + ' ' + tokens[1].replace(',', ''))
        else:
            publications.append("Unknown")
            authors.append("Unknown")

    return years, publications, authors



In [66]:
def get_doi(doc):
    """Return the `href` of the first link whose text mentions "doi".

    Args:
        doc: object exposing `find` (a BeautifulSoup document or tag).

    Returns:
        The `href` of the first `<a>` that has an `href` and whose string
        matches "doi" case-insensitively, or None when no such link exists.
    """
    doi_pattern = re.compile(r'doi', re.IGNORECASE)
    anchor = doc.find('a', href=True, string=doi_pattern)
    return anchor['href'] if anchor else None


def get_abstract(doc):
    """Return the text of the first `<div class="gs_rs">` in `doc`.

    Args:
        doc: object exposing `find` (a BeautifulSoup document or tag).

    Returns:
        The text of the first matching div, or None when there is none.
    """
    snippet = doc.find('div', class_='gs_rs')
    if not snippet:
        return None
    return snippet.get_text()

In [67]:
# Module-level repository: one list per output column. It is created empty when
# this cell runs and grows every time add_in_paper_repo() is called, so re-run
# this cell before re-running the scraper to avoid duplicated rows.
paper_repos_dict = {
    'Paper Title': [],
    'Year': [],
    'Author': [],
    'Citation': [],
    'Publication': [],
    'Url of paper': [],
    'DOI': [],
    'Abstract': []
}


# Append one batch of papers to the repository.
def add_in_paper_repo(papername, year, author, cite, publi, link, doi, abstract):
    """Append a batch of papers to `paper_repos_dict` and return it as a frame.

    Each argument is an iterable with one value per paper; all eight must
    have the same number of items.

    Args:
        papername: paper titles.
        year: publication years.
        author: author names.
        cite: citation counts.
        publi: publication names.
        link: paper URLs.
        doi: DOIs.
        abstract: abstracts.

    Returns:
        A pandas DataFrame built from the whole repository (all batches
        added so far, including this one).

    Raises:
        ValueError: if the arguments do not all have the same length. The
            repository is left untouched in that case.
    """
    batch = {
        'Paper Title': list(papername),
        'Year': list(year),
        'Author': list(author),
        'Citation': list(cite),
        'Publication': list(publi),
        'Url of paper': list(link),
        'DOI': list(doi),
        'Abstract': list(abstract),
    }

    # Validate before mutating: extending first and letting DataFrame() fail
    # would leave the shared dict with columns of different lengths, breaking
    # every later call as well.
    lengths = {column: len(values) for column, values in batch.items()}
    if len(set(lengths.values())) > 1:
        raise ValueError(f"All columns must have the same length, got {lengths}")

    for column, values in batch.items():
        paper_repos_dict[column].extend(values)

    return pd.DataFrame(paper_repos_dict)


In [68]:
import time
import random
import requests
from bs4 import BeautifulSoup

def get_paperinfo(url, timeout=30):
    """Download `url` and return it as a parsed document, or None on failure.

    Args:
        url: URL of the page to download.
        timeout: seconds to wait for the server before `requests` gives up.

    Returns:
        A BeautifulSoup document when the response status is 200. None when
        the status is anything else, or when the request itself fails
        (connection error, timeout, ...). A message describing the failure
        is printed in those cases.
    """
    request_headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36'}

    try:
        response = requests.get(url, headers=request_headers, timeout=timeout)
    except requests.RequestException as exc:
        # Report network failures through the same "return None" contract as
        # bad status codes, so callers have a single failure path to handle.
        print(f"Request error: {exc} - Failed to fetch web page")
        return None

    if response.status_code == 200:
        return BeautifulSoup(response.text, 'html.parser')

    if response.status_code == 429:
        print("Status code: 429 - Too many requests")
    else:
        print(f"Status code: {response.status_code} - Failed to fetch web page")
    return None

# Scrape the result pages (offsets 0, 10, ..., 100) and accumulate every paper
# in the repository; `final` holds the resulting DataFrame.
MAX_ATTEMPTS_PER_PAGE = 5      # fetch attempts per page before giving up
INITIAL_BACKOFF_SECONDS = 30   # wait after the first failed fetch
MAX_BACKOFF_SECONDS = 600      # upper bound for the doubled wait

# Defined up front so `final` exists even if no page is ever scraped.
final = pd.DataFrame(paper_repos_dict)

for i in range(0, 110, 10):
    # Get the URL for each page
    url = f"https://scholar.google.com/scholar?start={i}&q=object+detection+in+aerial+image+&hl=en&as_sdt=0,5"

    # --- Fetch, retrying with exponential backoff a bounded number of times ---
    doc = None
    backoff_time = INITIAL_BACKOFF_SECONDS
    for attempt in range(1, MAX_ATTEMPTS_PER_PAGE + 1):
        try:
            doc = get_paperinfo(url)
        except Exception as e:
            print(f"Error processing page starting at {i}: {e}")
            doc = None
        else:
            if doc is None:
                print(f"Error processing page starting at {i}: Failed to fetch web page")

        if doc is not None:
            break

        # Every failed fetch waits before the next try; previously an
        # exception retried immediately and without any limit.
        if attempt < MAX_ATTEMPTS_PER_PAGE:
            time.sleep(backoff_time)
            backoff_time = min(backoff_time * 2, MAX_BACKOFF_SECONDS)

    if doc is None:
        # Repeated failures most likely mean the client is being rate limited,
        # so requesting further pages would only make things worse.
        print(f"Giving up at page starting at {i} after {MAX_ATTEMPTS_PER_PAGE} attempts. Stopping.")
        break

    # --- Parse. A parsing error is not retried: re-downloading the same page
    # would hit the same error again. ---
    try:
        # Extract tags from the page
        paper_tag, cite_tag, link_tag, author_tag = get_tags(doc)

        # Extract paper titles
        papername = get_papertitle(paper_tag)

        # Extract year, author, and publication information
        year, publication, author = get_author_year_publi_info(author_tag)

        # Extract citation count
        cite = get_citecount(cite_tag)

        # Extract paper URLs
        link = get_link(link_tag)

        # Extract DOI and abstract per paper. Looking them up on the whole
        # page returned only the first match, which was then copied onto
        # every paper of the page.
        # NOTE(review): assumes each `[data-lid]` element wraps one complete
        # result, including its `gs_rs` snippet - confirm against live markup.
        doi = [get_doi(tag) or "Not Available" for tag in paper_tag]
        abstract = [get_abstract(tag) or "Not Available" for tag in paper_tag]

        # Check if all lists are the same length
        columns = (papername, year, author, cite, publication, link, doi, abstract)
        if len({len(column) for column in columns}) == 1:
            # Add data to the repository, including DOIs and abstracts
            final = add_in_paper_repo(*columns)
        else:
            print(f"Length mismatch on page starting at {i}. Skipping this page.")
    except Exception as e:
        print(f"Error processing page starting at {i}: {e}")

    # Sleep to avoid rate limiting with randomized delay
    time.sleep(random.uniform(30, 60))

Status code: 429 - Too many requests
Error processing page starting at 0: Failed to fetch web page
Status code: 429 - Too many requests
Error processing page starting at 0: Failed to fetch web page


KeyboardInterrupt: 