This notebook searches Google Scholar for the search term "Machine Learning lncRNA",
attempts to download the PDF file associated with each result, and records the language detected in each downloaded file.


The aim is to download a set of 250 papers whose associated PDF file is written in English.

**NOTE:**

Searching Google Scholar takes a long time. Additionally, Google Scholar limits the number of search results that can be processed and raises an error once that limit is reached. The code that searches and downloads files has therefore been modified to stop after 3 exceptions are raised.

In [None]:
import pickle

import numpy as np
import pandas as pd

import requests
import mimetypes
import os

import uuid

import time
from scholarly import scholarly

import PyPDF2
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

In [None]:
# Folder (relative to this notebook) where the resulting dataset is written.
DATA_FOLDER = "../../../data/"

In [None]:
# Folder (relative to this notebook) where downloaded PDF files are stored.
DOWNLOAD_FOLDER = "../../../downloads/"

In [None]:
def detect_language(file_path):
    """Detect the language of the text contained in a PDF file.

    The text of every page is extracted with ``PyPDF2`` and passed to
    ``langdetect.detect``. Failures are reported with ``print`` and never
    raised, so the caller can keep processing other files.

    Args:
        file_path: Path of the PDF file to inspect, or ``None``.

    Returns:
        The language code returned by ``detect`` (for example ``'en'``), or
        ``None`` when ``file_path`` is ``None``, no text could be extracted,
        language detection failed, or any other error occurred.
    """
    if file_path is None:
        return None

    try:
        with open(file_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            # Join once instead of repeated ``+=``; ``or ""`` keeps a page
            # that yields no text from aborting the whole document.
            text = "".join(page.extract_text() or "" for page in reader.pages)

        # Only attempt detection when there is something to analyse.
        if not text.strip():
            print(f"No text found in {file_path}")
            return None

        return detect(text)
    except LangDetectException:
        print(f"Language detection failed for {file_path}")
        return None
    except Exception as e:
        # Best-effort: unreadable or corrupt PDFs are reported, not raised.
        print(f"Error processing {file_path}: {e}")
        return None

In [None]:
def _download_pdf(url, output_dir, timeout):
    """Download ``url`` into ``output_dir`` if the server reports a PDF.

    Returns:
        A ``(file_name, file_path)`` tuple for the saved file, or ``None``
        when the response content type is not PDF.

    Raises:
        Any exception raised by ``requests`` (including HTTP error statuses)
        or by writing the file; the caller decides how to report it.
    """
    # The context manager releases the streamed connection on every path.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        content_type = response.headers.get('Content-Type', '').lower()
        is_pdf = (
            'application/pdf' in content_type
            or mimetypes.guess_extension(content_type) == '.pdf'
        )
        if not is_pdf:
            return None

        # Random name avoids collisions between papers with similar titles.
        unique_filename = str(uuid.uuid4()) + '.pdf'
        file_path = os.path.join(output_dir, unique_filename)
        with open(file_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)

    return unique_filename, file_path


def search_and_download_papers(query, limit, output_dir, *, error_limit=3,
                               delay=2, timeout=30):
    """Search Google Scholar and download the PDFs of the results.

    Results are consumed one at a time from ``scholarly.search_pubs``. For
    every result with an ``eprint_url`` the file is downloaded when the
    server reports it as a PDF, and its language is detected with
    ``detect_language``. The loop ends when ``limit`` English PDFs have been
    collected, the search results are exhausted, or ``error_limit``
    exceptions have been raised while processing the search.

    Args:
        query: Search term passed to Google Scholar.
        limit: Number of English-language PDFs to collect before stopping.
        output_dir: Directory where the PDFs are written; created if missing.
        error_limit: Number of search-level exceptions after which the loop
            stops.
        delay: Seconds to sleep before requesting each search result.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        A list with one dict per processed search result (downloaded or not)
        holding the keys ``title``, ``abstract``, ``year``, ``url``,
        ``author_id``, ``query``, ``file_name``, ``file_path`` and
        ``language``.
    """
    # Without the directory every download would fail and only be printed.
    os.makedirs(output_dir, exist_ok=True)

    search_query = scholarly.search_pubs(query)
    papers = []
    counter = 0
    error_count = 0

    while counter < limit:
        try:
            time.sleep(delay)  # Introduce delay between search requests
            paper = next(search_query)
            bib = paper.get("bib", {})
            paper_info = {
                "title": bib.get("title"),
                "abstract": bib.get("abstract"),
                "year": bib.get("pub_year"),
                "url": paper.get("eprint_url", ""),
                "author_id": paper.get("author_id", []),
                "query": query,
                "file_name": "",
                "file_path": None,
                "language": None
            }

            # Download the file if a link is available
            url = paper_info['url']
            if url:
                try:
                    downloaded = _download_pdf(url, output_dir, timeout)
                    if downloaded is None:
                        print(f"non-PDF content: {url}")
                    else:
                        unique_filename, file_path = downloaded
                        print(f"Downloaded: {file_path}")

                        language = detect_language(file_path=file_path)

                        paper_info['file_name'] = unique_filename
                        paper_info['file_path'] = file_path
                        paper_info['language'] = language

                        # Only English files count towards the target.
                        if language == 'en':
                            counter += 1
                except Exception as e:
                    # A failed download must not stop the overall search.
                    print(f"Error occurred while processing {url}: {e}")

            papers.append(paper_info)
            print(paper_info)
        except StopIteration:
            # No more search results.
            break
        except Exception as e:
            print(f"Error processing search : {e}")

            # Count first, then compare, so the loop stops on exactly the
            # ``error_limit``-th exception.
            error_count += 1
            if error_count >= error_limit:
                break

    return papers

In [None]:
# Run the search; this is slow and stops early if Google Scholar keeps erroring.
papers = search_and_download_papers(
    query="Machine Learning lncRNA",
    limit=250,
    output_dir=DOWNLOAD_FOLDER,
)

In [None]:
# One row per processed search result, one column per key of the paper dicts.
large_ml_lncRNA_search_df = pd.DataFrame(data=papers)

In [None]:
# Display every search result that was processed.
large_ml_lncRNA_search_df

In [None]:
# Display only the rows whose downloaded PDF was detected as English.
large_ml_lncRNA_search_df.loc[large_ml_lncRNA_search_df['language'] == 'en']

In [None]:
# Persist the full result table (all languages) for later notebooks.
large_ml_lncRNA_search_df.to_parquet(
    path=f'{DATA_FOLDER}large_ml_lncRNA_search_df.parquet',
)