This notebook attempts to download at least five pdf files which in english for each machine learning model term described in ML-Model-Categorization.ods.

To do this:
1. Recover from previous run if required
2. Find papers matching the ML search term
3. Iterate through the search results and download files until five pdf files in the English language are collected
4. Save progress

NOTE: Google Scholar limits how many search terms are accessed which could result in the search being blocked. In this case, the search should be resumed a few days later.

In [1]:
import pandas as pd

#import spacy
#from spacy.matcher import PhraseMatcher

from scholarly import scholarly
import time

import requests
import mimetypes
import os

import uuid

#import PyPDF2

In [2]:
ROOT_FOLDER = '../../../'

In [3]:
DATA_FOLDER = '../../../data/'

In [4]:
DETECT_ML_MODEL_FILES_FOLDER = '../../../detect_ml_model_files/'

In [5]:
model_df = pd.read_excel(f'{ROOT_FOLDER}ML-Model-Categorization.ods', sheet_name='Sheet1', usecols=['MODEL'])

In [6]:
model_df

Unnamed: 0,MODEL
0,Linear Regression
1,Polynomial Regression
2,Ridge Regression
3,Lasso Regression
4,Elastic Net Regression
...,...
97,Claude
98,LLaMA
99,Contrastive Language-Image Pretraining
100,DALL-E


In [7]:
models = model_df['MODEL'].values.tolist()

In [8]:
import PyPDF2
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

def detect_language(file_path):
    if file_path is None:
        return None
    else:
        try:
            # Open the PDF file
            with open(file_path, 'rb') as pdf_file:
                reader = PyPDF2.PdfReader(pdf_file)
                text = ""
                
                # Extract text from each page
                for page in reader.pages:
                    text += page.extract_text()
                
                # Detect the language of the extracted text
                if text.strip():  # Ensure there's text to analyze
                    language = detect(text)
                    return language
                else:
                    print(f"No text found in {file_path}")
                    return None
        except LangDetectException:
            print(f"Language detection failed for {file_path}")
            return None
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
            return None

In [9]:
def search_and_download_papers(query, limit, output_dir):
    search_query = scholarly.search_pubs(query)
    papers = []
    counter = 0

    while(counter < limit):
        try:
            time.sleep(2)  # Introduce delay
            paper = next(search_query)
            paper_info = {
                "title": paper.get("bib", {}).get("title"),
                "abstract": paper.get("bib", {}).get("abstract"),
                "year": paper.get("bib", {}).get("pub_year"),
                "url": paper.get("eprint_url", ""),
                "author_id": paper.get("author_id", []),
                "query": query,
                "file_name": "",
                "file_path": None,
                "language": None
            }

            # lets download the file if the url exists
            if paper_info['url']:
                url = paper_info['url']
                try:
                    # Download the file if a link is available
                    response = requests.get(url, stream=True)
                    content_type = response.headers.get('Content-Type', '')
                    # Check if the content type is PDF
                    if 'application/pdf' in content_type or mimetypes.guess_extension(content_type) == '.pdf':
                        unique_filename = str(uuid.uuid4()) + '.pdf'
                        file_path = os.path.join(output_dir, unique_filename)

                        with open(file_path, "wb") as file:
                            for chunk in response.iter_content(chunk_size=8192):
                                file.write(chunk)
                        
                        print(f"Downloaded: {file_path}")                            

                        language = detect_language(file_path=file_path)
                        
                        paper_info['file_name'] = unique_filename
                        paper_info['file_path'] = file_path
                        paper_info['language'] = language

                        # the aim is to collect files that are in english
                        if language == 'en':
                            counter += 1
                    else:
                        print(f"non-PDF content: {url}")             
                except Exception as e:
                    print(f"Error occurred while processing {url}: {e}")                                
            papers.append(paper_info)
        except StopIteration:
            break
        except Exception as e:
            print(f"Error processing paper : {e}")
            continue

    return papers

In [10]:
merge_with_previous = False

if os.path.isfile(f'{DATA_FOLDER}detect_ml_models.parquet'):
    # recover from previous run
    
    saved_data_df = pd.read_parquet(f'{DATA_FOLDER}detect_ml_models.parquet')

    merge_with_previous = True    
    
    processed_models = saved_data_df['query'].unique().tolist()
    
    unprocessed_models = list()
    
    for model in models:
        if model not in processed_models:
            unprocessed_models.append(model)
else:
    unprocessed_models = models

In [11]:
unprocessed_models

['Linear Regression',
 'Polynomial Regression',
 'Bayesian Linear Regression',
 'Quantile Regression',
 'Support Vector Regression',
 'Decision Trees',
 'LightGBM',
 'Co-Training',
 'LeNet',
 'VGGNet',
 'Kalman Filters',
 'Vector Autoregression',
 'Prophet',
 'Stacking',
 'ChatGPT',
 'LLaMA']

In [12]:
search_results = list()
for model in unprocessed_models:
    print(f"Searching for: {model}")    
    papers = search_and_download_papers(query=model, limit=5, output_dir=DETECT_ML_MODEL_FILES_FOLDER)
    search_results.append(papers)

Searching for: Linear Regression
Downloaded: ../../../detect_ml_model_files/4b55ea85-209f-4ff8-9657-5d6d08ed6ad2.pdf
Downloaded: ../../../detect_ml_model_files/fcfc4d57-a217-4ba2-8cc9-eb7f40b92540.pdf
No text found in ../../../detect_ml_model_files/fcfc4d57-a217-4ba2-8cc9-eb7f40b92540.pdf
Downloaded: ../../../detect_ml_model_files/fba635a5-3044-4e07-b058-8e4d9f947c32.pdf
Downloaded: ../../../detect_ml_model_files/b143697b-982f-42a3-91bb-8337dc5d9b50.pdf
non-PDF content: https://www.sciencedirect.com/science/article/pii/S1877042813046429/pdf?md5=da62147e64e2f356bfe6696b4f8031c1&pid=1-s2.0-S1877042813046429-main.pdf&_valck=1
Downloaded: ../../../detect_ml_model_files/d2c3221d-0479-4e05-95ca-f081d5b8fe09.pdf
non-PDF content: https://www.ajodo.org/article/S0889-5406(15)01379-7/fulltext
Downloaded: ../../../detect_ml_model_files/f8ef9814-9fd2-417d-afba-7c28bd6ac17b.pdf
Searching for: Polynomial Regression
non-PDF content: https://www.sciencedirect.com/science/article/pii/S1877705812046085/p

FloatObject (b'145.4927.471') invalid; use 0.0 instead
FloatObject (b'122.5144..673') invalid; use 0.0 instead
FloatObject (b'1418147.8804.673') invalid; use 0.0 instead
FloatObject (b'146.21128.543') invalid; use 0.0 instead
FloatObject (b'144.7421128.543') invalid; use 0.0 instead
FloatObject (b'125.9144.888') invalid; use 0.0 instead
FloatObject (b'15.631.489') invalid; use 0.0 instead
FloatObject (b'147.82.884') invalid; use 0.0 instead
FloatObject (b'141.4147.65') invalid; use 0.0 instead
FloatObject (b'147.28.043754') invalid; use 0.0 instead
FloatObject (b'6.6.745') invalid; use 0.0 instead
FloatObject (b'3147.319785.174') invalid; use 0.0 instead
FloatObject (b'14.319785.174') invalid; use 0.0 instead
FloatObject (b'147.488824.888') invalid; use 0.0 instead
FloatObject (b'130.7688837.248') invalid; use 0.0 instead
FloatObject (b'147.65.72941.99') invalid; use 0.0 instead
FloatObject (b'147.65.7793141.4147.65.77931416') invalid; use 0.0 instead
FloatObject (b'147.5345.77931416')

Error processing ../../../detect_ml_model_files/09bbf18a-eb17-4536-b12b-469c9c5d6c75.pdf: list index out of range
non-PDF content: https://www.academia.edu/download/52484748/Kalman_CoursePack_08.pdf
Downloaded: ../../../detect_ml_model_files/1eccd87a-3d78-4094-9a34-763bdc12bd20.pdf
non-PDF content: https://www.academia.edu/download/52364275/Kalman_filtering_using_Matlab.pdf
Downloaded: ../../../detect_ml_model_files/cd8540f6-d543-4bb5-8890-c69181c06ce4.pdf
Downloaded: ../../../detect_ml_model_files/3d129e04-de6d-4a8a-bb63-d4074926f968.pdf
Downloaded: ../../../detect_ml_model_files/a5ebfcd7-8863-4490-8ac2-9d99e8de4f02.pdf
No text found in ../../../detect_ml_model_files/a5ebfcd7-8863-4490-8ac2-9d99e8de4f02.pdf
Downloaded: ../../../detect_ml_model_files/bef412b7-6e2a-4188-8dea-40d3f33f999e.pdf
Downloaded: ../../../detect_ml_model_files/92a12e97-b2fa-4c03-9864-756ebbe575e3.pdf
Searching for: Vector Autoregression
non-PDF content: https://pubs.aeaweb.org/doi/pdf/10.1257/jep.15.4.101
Downloa



Downloaded: ../../../detect_ml_model_files/7e835186-822f-41a4-9582-436bd07f021a.pdf
Downloaded: ../../../detect_ml_model_files/7ef6e5d6-fd77-456f-9da4-96b4395fe610.pdf
Searching for: Prophet
Downloaded: ../../../detect_ml_model_files/f2b7be36-ce2a-4242-9fa5-51f9cdeabf9e.pdf
Downloaded: ../../../detect_ml_model_files/d04a8869-9754-4265-92dc-739ec3a23a50.pdf
Downloaded: ../../../detect_ml_model_files/9ed433e8-d53c-4868-8746-f6a8409430f4.pdf
non-PDF content: https://marxists.architexturez.net/francais/harman/1994/00/propheteproletariat.pdf
non-PDF content: https://books.google.com/books?hl=en&lr=&id=L2AzAQAAMAAJ&oi=fnd&pg=PR34&dq=Prophet&ots=JiZY9RrmYN&sig=ZerdjpO0hVBGhcro3er195KrwPM
Downloaded: ../../../detect_ml_model_files/7bd8e37c-23ff-4f66-aaf8-335171ac036d.pdf
No text found in ../../../detect_ml_model_files/7bd8e37c-23ff-4f66-aaf8-335171ac036d.pdf
Downloaded: ../../../detect_ml_model_files/2cef7b22-1e52-43cf-a6c0-0dab614fccff.pdf
non-PDF content: https://classes.matthewjbrown.net/te

In [13]:
# can do it in one line but is it easier to read?
# flattened_search_results = [paper for paper in papers for papers in search_results]
flattened_search_results = list()
for papers in search_results:
    for paper in papers:
        flattened_search_results.append(paper)    

In [14]:
data_df = pd.DataFrame(flattened_search_results)

In [15]:
data_df

Unnamed: 0,title,abstract,year,url,author_id,query,file_name,file_path,language
0,Linear regression,Linear regression plays a fundamental role in ...,2012,https://www.cs.columbia.edu/~djhsu/coms4771-f2...,"[wcgZZP8AAAAJ, , El-UNYoAAAAJ]",Linear Regression,4b55ea85-209f-4ff8-9657-5d6d08ed6ad2.pdf,../../../detect_ml_model_files/4b55ea85-209f-4...,en
1,Introduction to linear regression analysis,Equation (1.2) is called a linear regression m...,2021,http://sutlib2.sut.ac.th/sut_contents/H133678.pdf,"[5PboKNAAAAAJ, , y2eb6cQAAAAJ]",Linear Regression,fcfc4d57-a217-4ba2-8cc9-eb7f40b92540.pdf,../../../detect_ml_model_files/fcfc4d57-a217-4...,
2,Linear regression,"linear regression, a very simple approach for ...",2023,https://datamineaz.org/readings/ISL_chp3.pdf,"[KUIjZqgAAAAJ, bHZf-c8AAAAJ, tQVe-fAAAAAJ, ZpG...",Linear Regression,fba635a5-3044-4e07-b058-8e4d9f947c32.pdf,../../../detect_ml_model_files/fba635a5-3044-4...,en
3,Applied linear regression,Applied linear regression Applied linear regre...,2005,https://www.stat.cmu.edu/~brian/valerie/617-20...,[wlu6jZQAAAAJ],Linear Regression,b143697b-982f-42a3-91bb-8337dc5d9b50.pdf,../../../detect_ml_model_files/b143697b-982f-4...,en
4,Linear regression,In linear regression the ordinary least square...,2012,,[vBWC0XIAAAAJ],Linear Regression,,,
...,...,...,...,...,...,...,...,...,...
387,"Características de canal, calidad de carne y c...",La carne de llama presenta un alto nivel prote...,2014,http://www.scielo.org.pe/scielo.php?pid=S1609-...,"[, , ]",LLaMA,,,
388,Características físico-químicas del charqui de...,,2011,https://www.redalyc.org/pdf/3718/371838943002.pdf,"[, ]",LLaMA,e8b85264-2cac-4c48-81f4-6180297047df.pdf,../../../detect_ml_model_files/e8b85264-2cac-4...,es
389,Antología de la prensa periódica isabelina esc...,"""Antología de la prensa periódica isabelina es...",2001,https://rodin.uca.es/bitstream/handle/10498/26...,[],LLaMA,123fb0f1-66c6-4aca-be6c-4ac275a094db.pdf,../../../detect_ml_model_files/123fb0f1-66c6-4...,es
390,"Sistemática, taxonomía y domesticación de alpa...",", grouping guanaco with llama and vicuña with ...",2007,https://www.scielo.cl/scielo.php?pid=S0716-078...,"[, , zTlcXB4AAAAJ, AcJZnfEAAAAJ]",LLaMA,,,


In [16]:
if merge_with_previous:
    data_df = pd.concat([saved_data_df, data_df], ignore_index=True)

In [17]:
data_df

Unnamed: 0,title,abstract,year,url,author_id,query,file_name,file_path,language
0,Ridge regression,Ridge regression is a popular parameter estima...,2009,,[i-68VW8AAAAJ],Ridge Regression,,,
1,Ridge regression in practice,In regular use of ridge regression we display ...,1975,https://www.researchgate.net/profile/David-Boo...,"[, SelQvxgAAAAJ]",Ridge Regression,,,
2,New ridge parameters for ridge regression,"In ridge regression, ridge parameter plays an ...",2014,https://www.sciencedirect.com/science/article/...,[],Ridge Regression,,,
3,Ridge regression: applications to nonorthogona...,This paper is an exposition of the use of ridg...,1970,https://scholar.archive.org/work/zedzhecp75hdj...,"[, ]",Ridge Regression,520946c9-4f85-4fd1-bfe5-2305c315cb1b.pdf,../../../detect_ml_model_files/520946c9-4f85-4...,en
4,Lecture notes on ridge regression,well-known results on ridge regression. The cu...,2015,https://arxiv.org/pdf/1509.09169,[qglWXEQAAAAJ],Ridge Regression,7f813b9f-7b00-4610-b3fe-30b85ad26b51.pdf,../../../detect_ml_model_files/7f813b9f-7b00-4...,en
...,...,...,...,...,...,...,...,...,...
1324,"Características de canal, calidad de carne y c...",La carne de llama presenta un alto nivel prote...,2014,http://www.scielo.org.pe/scielo.php?pid=S1609-...,"[, , ]",LLaMA,,,
1325,Características físico-químicas del charqui de...,,2011,https://www.redalyc.org/pdf/3718/371838943002.pdf,"[, ]",LLaMA,e8b85264-2cac-4c48-81f4-6180297047df.pdf,../../../detect_ml_model_files/e8b85264-2cac-4...,es
1326,Antología de la prensa periódica isabelina esc...,"""Antología de la prensa periódica isabelina es...",2001,https://rodin.uca.es/bitstream/handle/10498/26...,[],LLaMA,123fb0f1-66c6-4aca-be6c-4ac275a094db.pdf,../../../detect_ml_model_files/123fb0f1-66c6-4...,es
1327,"Sistemática, taxonomía y domesticación de alpa...",", grouping guanaco with llama and vicuña with ...",2007,https://www.scielo.cl/scielo.php?pid=S0716-078...,"[, , zTlcXB4AAAAJ, AcJZnfEAAAAJ]",LLaMA,,,


Do some basic checks to see if there are exactly five files in the english language for each model

In [25]:
check_df = data_df.dropna()

In [26]:
check_df

Unnamed: 0,title,abstract,year,url,author_id,query,file_name,file_path,language
3,Ridge regression: applications to nonorthogona...,This paper is an exposition of the use of ridg...,1970,https://scholar.archive.org/work/zedzhecp75hdj...,"[, ]",Ridge Regression,520946c9-4f85-4fd1-bfe5-2305c315cb1b.pdf,../../../detect_ml_model_files/520946c9-4f85-4...,en
4,Lecture notes on ridge regression,well-known results on ridge regression. The cu...,2015,https://arxiv.org/pdf/1509.09169,[qglWXEQAAAAJ],Ridge Regression,7f813b9f-7b00-4610-b3fe-30b85ad26b51.pdf,../../../detect_ml_model_files/7f813b9f-7b00-4...,en
5,A critique of some ridge regression methods,These techniques are consequently all subject ...,1980,http://economics-files.pomona.edu/garysmith/pa...,"[8CzBuBYAAAAJ, ]",Ridge Regression,5ed3ef6d-6030-47a6-9ddf-363a2f09571c.pdf,../../../detect_ml_model_files/5ed3ef6d-6030-4...,en
6,Ridge regression learning algorithm in dual va...,of the Ridge Regression procedure. It allows u...,1998,https://eprints.soton.ac.uk/258942/1/Dualrr_IC...,"[OEd0rvkAAAAJ, uoWoR4gAAAAJ, GJE29ekAAAAJ]",Ridge Regression,0acd09a1-4322-48eb-907d-b48498c2d89c.pdf,../../../detect_ml_model_files/0acd09a1-4322-4...,en
7,Ridge regression: Biased estimation for nonort...,"For this reason, estimation and analysis built...",1970,https://mineracaodedados.wordpress.com/wp-cont...,"[, ]",Ridge Regression,578f8b95-0643-4dcf-8565-00c5d655caa2.pdf,../../../detect_ml_model_files/578f8b95-0643-4...,en
...,...,...,...,...,...,...,...,...,...
1318,Code llama: Open foundation models for code,from Llama 2 to Code Llama in Llama models cl...,2023,https://arxiv.org/pdf/2308.12950,"[6vht2iwAAAAJ, jOwTwm4AAAAJ, , UAx_woYAAAAJ, T...",LLaMA,094c84c1-c9d6-4302-a100-311f513862da.pdf,../../../detect_ml_model_files/094c84c1-c9d6-4...,en
1321,Los días de llamas de la revolución,"Ya a la altura del verano de 1938, cuando el f...",2003,https://ifc.dpz.es/recursos/publicaciones/24/8...,[frwKQH4AAAAJ],LLaMA,dff07229-a282-469e-a7ae-63a74bc6f911.pdf,../../../detect_ml_model_files/dff07229-a282-4...,es
1325,Características físico-químicas del charqui de...,,2011,https://www.redalyc.org/pdf/3718/371838943002.pdf,"[, ]",LLaMA,e8b85264-2cac-4c48-81f4-6180297047df.pdf,../../../detect_ml_model_files/e8b85264-2cac-4...,es
1326,Antología de la prensa periódica isabelina esc...,"""Antología de la prensa periódica isabelina es...",2001,https://rodin.uca.es/bitstream/handle/10498/26...,[],LLaMA,123fb0f1-66c6-4aca-be6c-4ac275a094db.pdf,../../../detect_ml_model_files/123fb0f1-66c6-4...,es


In [27]:
check_df = check_df.query('language=="en"').groupby(['query'])['query'].count().reset_index(name='count')

In [28]:
check_df

Unnamed: 0,query,count
0,Actor-Critic Models,5
1,AdaBoost,5
2,Advantage Actor-Critic,5
3,AlexNet,5
4,Apriori Algorithm,5
...,...,...
94,Vision Transformers,5
95,XGBoost,5
96,k-Means Clustering,5
97,k-Nearest Neighbors,5


In [29]:
check_df.query('count != 5')

Unnamed: 0,query,count


So all search terms have five files associated with them

In [30]:
data_df.to_parquet(f'{DATA_FOLDER}detect_ml_models.parquet')