1. Recover from previous run
2. Find papers matching the ML search term
3. Download first five available files
4. Save progress

In [25]:
import pandas as pd

#import spacy
#from spacy.matcher import PhraseMatcher

from scholarly import scholarly
import time

import requests
import mimetypes
import os

import uuid

#import PyPDF2

In [26]:
# Base folder (relative path) under which the model spreadsheet is looked up.
ROOT_FOLDER = "../../../"

In [27]:
# Folder holding the saved results file (detect_ml_models.parquet).
DATA_FOLDER = "../../../data/"

In [28]:
# Folder the downloaded PDF files are written into.
DETECT_ML_MODEL_FILES_FOLDER = "../../../detect_ml_model_files/"

In [29]:
# Load only the MODEL column from sheet 'Sheet1' of the categorization spreadsheet.
model_df = pd.read_excel(
    f'{ROOT_FOLDER}ML-Model-Categorization.ods',
    sheet_name='Sheet1',
    usecols=['MODEL'],
)

In [30]:
# Display the loaded model names for a quick visual check.
model_df

Unnamed: 0,MODEL
0,Linear Regression
1,Polynomial Regression
2,Ridge Regression
3,Lasso Regression
4,Elastic Net Regression
...,...
97,Claude
98,LLaMA
99,Contrastive Language-Image Pretraining
100,DALL-E


In [7]:
# Plain Python list of the model names; each one is used as a search query.
models = model_df['MODEL'].tolist()

In [8]:
def _download_pdf(url, output_dir, timeout):
    """Stream ``url`` into ``output_dir`` when the server labels the body as a PDF.

    Returns:
        ``(file_name, file_path)`` of the saved file, or ``None`` when the
        response's Content-Type is not PDF.

    Raises:
        Any network, HTTP-status or file error; the caller decides how to report it.
    """
    # The context manager releases the streamed connection on every exit path.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Reject HTTP error responses instead of saving their body as a "paper".
        response.raise_for_status()
        content_type = response.headers.get('Content-Type', '')
        is_pdf = ('application/pdf' in content_type
                  or mimetypes.guess_extension(content_type) == '.pdf')
        if not is_pdf:
            return None

        # Random file name so that papers with identical titles never collide.
        unique_filename = str(uuid.uuid4()) + '.pdf'
        file_path = os.path.join(output_dir, unique_filename)
        try:
            with open(file_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
        except Exception:
            # A download that dies mid-stream must not leave a truncated PDF behind.
            if os.path.exists(file_path):
                os.remove(file_path)
            raise
    return unique_filename, file_path


def search_and_download_papers(query, limit, output_dir, delay=2, timeout=30,
                               max_consecutive_errors=5):
    """Search for papers matching ``query`` and download up to ``limit`` PDFs.

    Search results are consumed one by one until ``limit`` PDF files have been
    saved, the results run out, or the search keeps failing. Every result that
    was read is recorded, whether or not a file could be downloaded for it.

    Args:
        query: Search term passed to ``scholarly.search_pubs``.
        limit: Number of successfully downloaded PDFs after which to stop.
        output_dir: Folder the PDFs are written into (created if missing).
        delay: Seconds to sleep before fetching each search result.
        timeout: Timeout in seconds passed to ``requests.get`` for each download.
        max_consecutive_errors: Stop after this many failures in a row while
            reading search results; without a cap, an iterator that keeps
            raising would loop forever because no download ever succeeds.

    Returns:
        A list of dicts with keys ``title``, ``abstract``, ``year``, ``url``,
        ``author_id``, ``query``, ``file_name`` and ``file_path``.
        ``file_name`` is ``""`` and ``file_path`` is ``None`` for results
        without a downloaded PDF.
    """
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    search_query = scholarly.search_pubs(query)
    papers = []
    counter = 0  # number of PDFs saved so far
    consecutive_errors = 0

    while counter < limit:
        try:
            time.sleep(delay)  # throttle requests to the search backend
            paper = next(search_query)
            bib = paper.get("bib", {})
            paper_info = {
                "title": bib.get("title"),
                "abstract": bib.get("abstract"),
                "year": bib.get("pub_year"),
                "url": paper.get("eprint_url", ""),
                "author_id": paper.get("author_id", []),
                "query": query,
                "file_name": "",
                "file_path": None
            }

            # Only results that expose a direct link can be downloaded.
            url = paper_info['url']
            if url:
                try:
                    downloaded = _download_pdf(url, output_dir, timeout)
                    if downloaded is None:
                        print(f"non-PDF content: {url}")
                    else:
                        paper_info['file_name'], paper_info['file_path'] = downloaded
                        print(f"Downloaded: {paper_info['file_path']}")
                        counter += 1
                except Exception as e:
                    # A failed download is reported; the paper is still recorded below.
                    print(f"Error occurred while processing {url}: {e}")
            papers.append(paper_info)
            consecutive_errors = 0
        except StopIteration:
            # No more search results for this query.
            break
        except Exception as e:
            print(f"Error processing paper : {e}")
            consecutive_errors += 1
            if consecutive_errors >= max_consecutive_errors:
                print(f"Giving up on '{query}' after {consecutive_errors} consecutive errors")
                break
            continue

    return papers

In [42]:
# Resume support: when a results file from an earlier run exists, load it and
# search only for the models that do not yet appear in its 'query' column.
merge_with_previous = False
unprocessed_models = models

if os.path.isfile(f'{DATA_FOLDER}detect_ml_models.parquet'):
    saved_data_df = pd.read_parquet(f'{DATA_FOLDER}detect_ml_models.parquet')
    merge_with_previous = True

    processed_models = saved_data_df['query'].unique().tolist()
    unprocessed_models = [
        candidate for candidate in models if candidate not in processed_models
    ]

In [10]:
# One list of paper records per model, in the order the models were searched.
search_results = []
for model in unprocessed_models:
    print(f"Searching for: {model}")
    papers = search_and_download_papers(
        query=model,
        limit=5,
        output_dir=DETECT_ML_MODEL_FILES_FOLDER,
    )
    search_results.append(papers)

Searching for: BERT
Downloaded: ../../../detect_ml_model_files/91ebf298-65bd-4429-8c60-b17cb80306fe.pdf
Downloaded: ../../../detect_ml_model_files/65f04db1-ad62-4f48-9e80-3ae5a11bf147.pdf
Downloaded: ../../../detect_ml_model_files/03d43387-8413-413c-8ce8-92b3512cedfb.pdf
Downloaded: ../../../detect_ml_model_files/a60911d8-b83b-4c2a-8555-2ac3b09b5025.pdf
Downloaded: ../../../detect_ml_model_files/98aa5e73-a662-448f-bca7-ff87bc1cc5bb.pdf


In [44]:
# Flatten the per-model lists into a single list of paper records.
# The outer loop (over search_results) must come first in the comprehension.
flattened_search_results = [
    paper
    for papers in search_results
    for paper in papers
]

In [45]:
# One row per collected paper record; the columns are the keys of the record dicts.
data_df = pd.DataFrame(flattened_search_results)

In [46]:
# Display the papers collected in this run.
data_df

Unnamed: 0,title,abstract,year,url,author_id,query,file_name,file_path
0,BERT: a review of applications in natural lang...,"relative to the original BERT model, which is ...",2021,https://arxiv.org/pdf/2103.11943,[],BERT,91ebf298-65bd-4429-8c60-b17cb80306fe.pdf,../../../detect_ml_model_files/91ebf298-65bd-4...
1,What does BERT learn about the structure of la...,language structure learned by BERT. We first s...,2019,https://inria.hal.science/hal-02131630/document,"[X7SMP1EAAAAJ, HXUT9ZkAAAAJ, P7EtARsAAAAJ]",BERT,65f04db1-ad62-4f48-9e80-3ae5a11bf147.pdf,../../../detect_ml_model_files/65f04db1-ad62-4...
2,Visualizing and understanding the effectivenes...,trajectories of fine-tuning BERT on specific d...,2019,https://arxiv.org/pdf/1908.05620,"[cqOLO7IAAAAJ, wEfQgPgAAAAJ, G-V1VpwAAAAJ, ]",BERT,03d43387-8413-413c-8ce8-92b3512cedfb.pdf,../../../detect_ml_model_files/03d43387-8413-4...
3,A comprehensive survey on pretrained foundatio...,Pretrained Foundation Models (PFMs) are regard...,2024,https://arxiv.org/pdf/2302.09419,"[HWx73DcAAAAJ, AHg-JGIAAAAJ, nPvWFpkAAAAJ, fh1...",BERT,a60911d8-b83b-4c2a-8555-2ac3b09b5025.pdf,../../../detect_ml_model_files/a60911d8-b83b-4...
4,Bertje: A dutch bert model,a monolingual Dutch BERT model called BERTje. ...,2019,https://arxiv.org/pdf/1912.09582,"[gZkWURYAAAAJ, Y745fFYAAAAJ, biQvUhcAAAAJ]",BERT,98aa5e73-a662-448f-bca7-ff87bc1cc5bb.pdf,../../../detect_ml_model_files/98aa5e73-a662-4...


In [47]:
# Put the rows recovered from the previous run in front of this run's rows and
# renumber the index.
# NOTE: not idempotent - re-running this cell adds the saved rows a second time.
if merge_with_previous:
    data_df = pd.concat(objs=[saved_data_df, data_df], axis=0, ignore_index=True)

In [48]:
# Display the combined frame (includes the previous run's rows when they were merged in).
data_df

Unnamed: 0,title,abstract,year,url,author_id,query,file_name,file_path
0,Linear regression,Linear regression plays a fundamental role in ...,2012,https://www.cs.columbia.edu/~djhsu/coms4771-f2...,"[wcgZZP8AAAAJ, , El-UNYoAAAAJ]",Linear Regression,7cedd2b0-ac03-49fb-9ef1-398b1a5d7fe1.pdf,../../../detect_ml_model_files/7cedd2b0-ac03-4...
1,Introduction to linear regression analysis,Equation (1.2) is called a linear regression m...,2021,http://sutlib2.sut.ac.th/sut_contents/H133678.pdf,"[5PboKNAAAAAJ, , y2eb6cQAAAAJ]",Linear Regression,91374318-7135-4f51-a3bc-720ececd88de.pdf,../../../detect_ml_model_files/91374318-7135-4...
2,Linear regression,"linear regression, a very simple approach for ...",2023,https://datamineaz.org/readings/ISL_chp3.pdf,"[KUIjZqgAAAAJ, bHZf-c8AAAAJ, tQVe-fAAAAAJ, ZpG...",Linear Regression,6d3627b9-6b0b-423c-8898-3185837a7617.pdf,../../../detect_ml_model_files/6d3627b9-6b0b-4...
3,Linear regression,In linear regression the ordinary least square...,2012,,[vBWC0XIAAAAJ],Linear Regression,,
4,Applied linear regression,Applied linear regression Applied linear regre...,2005,https://www.stat.cmu.edu/~brian/valerie/617-20...,[wlu6jZQAAAAJ],Linear Regression,c7fed73c-8e13-4770-a589-67708aab0b7e.pdf,../../../detect_ml_model_files/c7fed73c-8e13-4...
...,...,...,...,...,...,...,...,...
1227,BERT: a review of applications in natural lang...,"relative to the original BERT model, which is ...",2021,https://arxiv.org/pdf/2103.11943,[],BERT,91ebf298-65bd-4429-8c60-b17cb80306fe.pdf,../../../detect_ml_model_files/91ebf298-65bd-4...
1228,What does BERT learn about the structure of la...,language structure learned by BERT. We first s...,2019,https://inria.hal.science/hal-02131630/document,"[X7SMP1EAAAAJ, HXUT9ZkAAAAJ, P7EtARsAAAAJ]",BERT,65f04db1-ad62-4f48-9e80-3ae5a11bf147.pdf,../../../detect_ml_model_files/65f04db1-ad62-4...
1229,Visualizing and understanding the effectivenes...,trajectories of fine-tuning BERT on specific d...,2019,https://arxiv.org/pdf/1908.05620,"[cqOLO7IAAAAJ, wEfQgPgAAAAJ, G-V1VpwAAAAJ, ]",BERT,03d43387-8413-413c-8ce8-92b3512cedfb.pdf,../../../detect_ml_model_files/03d43387-8413-4...
1230,A comprehensive survey on pretrained foundatio...,Pretrained Foundation Models (PFMs) are regard...,2024,https://arxiv.org/pdf/2302.09419,"[HWx73DcAAAAJ, AHg-JGIAAAAJ, nPvWFpkAAAAJ, fh1...",BERT,a60911d8-b83b-4c2a-8555-2ac3b09b5025.pdf,../../../detect_ml_model_files/a60911d8-b83b-4...


Run a basic check that exactly five files have been downloaded for each search term.

In [49]:
# Keep only the rows that have a downloaded file. Restricting dropna to
# 'file_path' matters: a bare dropna() would also discard downloaded papers
# whose abstract, year or title is missing, and so undercount the files.
check_df = data_df.dropna(subset=['file_path'])

In [50]:
# Display the rows kept for the check.
check_df

Unnamed: 0,title,abstract,year,url,author_id,query,file_name,file_path
0,Linear regression,Linear regression plays a fundamental role in ...,2012,https://www.cs.columbia.edu/~djhsu/coms4771-f2...,"[wcgZZP8AAAAJ, , El-UNYoAAAAJ]",Linear Regression,7cedd2b0-ac03-49fb-9ef1-398b1a5d7fe1.pdf,../../../detect_ml_model_files/7cedd2b0-ac03-4...
1,Introduction to linear regression analysis,Equation (1.2) is called a linear regression m...,2021,http://sutlib2.sut.ac.th/sut_contents/H133678.pdf,"[5PboKNAAAAAJ, , y2eb6cQAAAAJ]",Linear Regression,91374318-7135-4f51-a3bc-720ececd88de.pdf,../../../detect_ml_model_files/91374318-7135-4...
2,Linear regression,"linear regression, a very simple approach for ...",2023,https://datamineaz.org/readings/ISL_chp3.pdf,"[KUIjZqgAAAAJ, bHZf-c8AAAAJ, tQVe-fAAAAAJ, ZpG...",Linear Regression,6d3627b9-6b0b-423c-8898-3185837a7617.pdf,../../../detect_ml_model_files/6d3627b9-6b0b-4...
4,Applied linear regression,Applied linear regression Applied linear regre...,2005,https://www.stat.cmu.edu/~brian/valerie/617-20...,[wlu6jZQAAAAJ],Linear Regression,c7fed73c-8e13-4770-a589-67708aab0b7e.pdf,../../../detect_ml_model_files/c7fed73c-8e13-4...
8,A review on linear regression comprehensive in...,We discuss linear regression and polynomial re...,2020,https://jastt.org/index.php/jasttpath/article/...,"[G9U01kwAAAAJ, aBdgHxkAAAAJ]",Linear Regression,f8d408d7-9c86-4d72-b6af-6538ed46970f.pdf,../../../detect_ml_model_files/f8d408d7-9c86-4...
...,...,...,...,...,...,...,...,...
1227,BERT: a review of applications in natural lang...,"relative to the original BERT model, which is ...",2021,https://arxiv.org/pdf/2103.11943,[],BERT,91ebf298-65bd-4429-8c60-b17cb80306fe.pdf,../../../detect_ml_model_files/91ebf298-65bd-4...
1228,What does BERT learn about the structure of la...,language structure learned by BERT. We first s...,2019,https://inria.hal.science/hal-02131630/document,"[X7SMP1EAAAAJ, HXUT9ZkAAAAJ, P7EtARsAAAAJ]",BERT,65f04db1-ad62-4f48-9e80-3ae5a11bf147.pdf,../../../detect_ml_model_files/65f04db1-ad62-4...
1229,Visualizing and understanding the effectivenes...,trajectories of fine-tuning BERT on specific d...,2019,https://arxiv.org/pdf/1908.05620,"[cqOLO7IAAAAJ, wEfQgPgAAAAJ, G-V1VpwAAAAJ, ]",BERT,03d43387-8413-413c-8ce8-92b3512cedfb.pdf,../../../detect_ml_model_files/03d43387-8413-4...
1230,A comprehensive survey on pretrained foundatio...,Pretrained Foundation Models (PFMs) are regard...,2024,https://arxiv.org/pdf/2302.09419,"[HWx73DcAAAAJ, AHg-JGIAAAAJ, nPvWFpkAAAAJ, fh1...",BERT,a60911d8-b83b-4c2a-8555-2ac3b09b5025.pdf,../../../detect_ml_model_files/a60911d8-b83b-4...


In [51]:
# Number of remaining rows per search term, as a two-column frame (query, count).
# NOTE: this overwrites check_df, so re-running the cell regroups the counts themselves.
check_df = check_df.groupby('query').size().reset_index(name='count')

In [52]:
# Display the per-query counts.
check_df

Unnamed: 0,query,count
0,Actor-Critic Models,5
1,AdaBoost,5
2,Advantage Actor-Critic,5
3,AlexNet,5
4,Apriori Algorithm,5
...,...,...
94,Vision Transformers,5
95,XGBoost,5
96,k-Means Clustering,5
97,k-Nearest Neighbors,5


In [53]:
# Show every search term whose count differs from the expected five.
check_df[check_df['count'] != 5]

Unnamed: 0,query,count


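So every search term that appears in the check has exactly five downloaded files. (A search term with no downloaded files at all would not show up in this table.)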

In [54]:
# Save the combined results; this is the file the recovery cell looks for on the next run.
data_df.to_parquet(f'{DATA_FOLDER}detect_ml_models.parquet')