1. Recover progress from the previous run (if a saved results file exists)
2. Find papers matching the ML search term
3. Download the first five available PDF files for each search term
4. Save progress

In [1]:
import pandas as pd

#import spacy
#from spacy.matcher import PhraseMatcher

from scholarly import scholarly
import time

import requests
import mimetypes
import os

import uuid

#import PyPDF2

In [2]:
# Relative path prefix used to locate the model spreadsheet.
ROOT_FOLDER = "../../../"

In [3]:
# Folder holding the saved results (parquet checkpoint).
# Derived from ROOT_FOLDER so that changing the root keeps both paths in sync.
DATA_FOLDER = f'{ROOT_FOLDER}data/'

In [4]:
# Folder the downloaded PDF files are written to.
# Derived from ROOT_FOLDER so that changing the root keeps both paths in sync.
DETECT_ML_MODEL_FILES_FOLDER = f'{ROOT_FOLDER}detect_ml_model_files/'

In [5]:
# Read only the MODEL column from the first sheet of the categorization spreadsheet.
model_df = pd.read_excel(
    f'{ROOT_FOLDER}ML-Model-Categorization.ods',
    sheet_name='Sheet1',
    usecols=['MODEL'],
)

In [6]:
# Show the loaded model names as the cell output.
model_df

Unnamed: 0,MODEL
0,Linear Regression
1,Polynomial Regression
2,Ridge Regression
3,Lasso Regression
4,Elastic Net Regression
...,...
97,Claude
98,LLaMA
99,Contrastive Language-Image Pretraining
100,DALL-E


In [7]:
# Plain Python list of the model names; each one is later used as a search query.
models = model_df.loc[:, 'MODEL'].to_numpy().tolist()

In [8]:
def _download_pdf(url, output_dir, timeout):
    """Download ``url`` into ``output_dir`` if the server reports it as a PDF.

    Args:
        url: Address of the candidate file.
        output_dir: Directory the file is written to.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(file_name, file_path)`` tuple for the saved file, or ``None``
        when the response is not PDF content.

    Raises:
        Exception: Any network, HTTP-status or file-system error. A partially
            written file is removed before the error is re-raised.
    """
    # The context manager releases the streamed connection on every exit path.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Do not store an HTTP error page under a .pdf name.
        response.raise_for_status()

        content_type = response.headers.get('Content-Type', '')
        is_pdf = 'application/pdf' in content_type or mimetypes.guess_extension(content_type) == '.pdf'
        if not is_pdf:
            return None

        # Random name: avoids collisions and unsafe characters from paper titles.
        unique_filename = str(uuid.uuid4()) + '.pdf'
        file_path = os.path.join(output_dir, unique_filename)

        try:
            with open(file_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
        except BaseException:
            # Do not leave a truncated PDF behind (also on KeyboardInterrupt).
            if os.path.exists(file_path):
                os.remove(file_path)
            raise

    return unique_filename, file_path


def search_and_download_papers(query, limit, output_dir, delay=2, timeout=30, max_consecutive_errors=5):
    """Search publications for ``query`` via ``scholarly`` and download their PDFs.

    Search results are consumed one at a time until ``limit`` PDFs have been
    saved or the search is exhausted. Every result that was read is recorded,
    whether or not a PDF could be downloaded for it.

    Args:
        query: Search term passed to ``scholarly.search_pubs``.
        limit: Number of successfully downloaded PDFs after which to stop.
        output_dir: Directory the PDFs are written to (created if missing).
        delay: Seconds to sleep before fetching each search result.
        timeout: Seconds to wait for a download server before giving up.
        max_consecutive_errors: Stop after this many back-to-back failures, so
            a search that keeps raising cannot loop forever.

    Returns:
        list[dict]: One record per search result with the keys ``title``,
        ``abstract``, ``year``, ``url``, ``author_id``, ``query``,
        ``file_name`` and ``file_path``. ``file_name`` is ``""`` and
        ``file_path`` is ``None`` when no PDF was saved for the result.
    """
    if output_dir:
        # Without the folder every download would fail and be silently skipped.
        os.makedirs(output_dir, exist_ok=True)

    search_query = scholarly.search_pubs(query)
    papers = []
    counter = 0  # number of PDFs saved so far
    consecutive_errors = 0

    while counter < limit:
        try:
            time.sleep(delay)  # Introduce delay between search requests
            paper = next(search_query)
            consecutive_errors = 0

            bib = paper.get("bib", {})
            paper_info = {
                "title": bib.get("title"),
                "abstract": bib.get("abstract"),
                "year": bib.get("pub_year"),
                "url": paper.get("eprint_url", ""),
                "author_id": paper.get("author_id", []),
                "query": query,
                "file_name": "",
                "file_path": None
            }

            # Download the file if a link is available
            url = paper_info['url']
            if url:
                try:
                    downloaded = _download_pdf(url, output_dir, timeout)
                    if downloaded is None:
                        print(f"non-PDF content: {url}")
                    else:
                        unique_filename, file_path = downloaded
                        print(f"Downloaded: {file_path}")
                        paper_info['file_name'] = unique_filename
                        paper_info['file_path'] = file_path
                        counter += 1
                except Exception as e:
                    # A failed download must not discard the paper's metadata.
                    print(f"Error occurred while processing {url}: {e}")
            papers.append(paper_info)
        except StopIteration:
            # No more search results.
            break
        except Exception as e:
            print(f"Error processing paper : {e}")
            consecutive_errors += 1
            if consecutive_errors >= max_consecutive_errors:
                print(f"Stopping search for '{query}' after {consecutive_errors} consecutive errors")
                break
            continue

    return papers

In [9]:
# Resume support: when a saved results file exists, load it and search only
# for the models that do not yet appear in its 'query' column.
checkpoint_path = f'{DATA_FOLDER}detect_ml_models.parquet'
merge_with_previous = False

if not os.path.isfile(checkpoint_path):
    # First run: nothing to recover, every model still has to be searched.
    unprocessed_models = models
else:
    saved_data_df = pd.read_parquet(checkpoint_path)
    merge_with_previous = True

    processed_models = saved_data_df['query'].unique().tolist()
    unprocessed_models = [name for name in models if name not in processed_models]

In [10]:
# One list of paper records per model, in the order the models were searched.
search_results = []
for model in unprocessed_models:
    print(f"Searching for: {model}")
    search_results.append(
        search_and_download_papers(
            query=model,
            limit=5,
            output_dir=DETECT_ML_MODEL_FILES_FOLDER,
        )
    )

Searching for: AlexNet
Downloaded: ../../../detect_ml_model_files/d55f2f33-e0a2-43cd-999b-20f8b3cdf9f6.pdf
Downloaded: ../../../detect_ml_model_files/b5e4a3c0-105f-4e27-8206-93cfcdada0e5.pdf
Downloaded: ../../../detect_ml_model_files/e5286cab-0f74-4066-bbd7-90d2c09b682e.pdf
non-PDF content: https://ojs.aaai.org/index.php/AAAI/article/view/10171/10030
Downloaded: ../../../detect_ml_model_files/fdc50210-7899-488e-95f4-c0a7ff9ca2e5.pdf
non-PDF content: https://www.researchgate.net/profile/Ali-Almisreb/publication/327712055_Utilizing_AlexNet_Deep_Transfer_Learning_for_Ear_Recognition/links/5cc7ccd64585156cd7bbc0ab/Utilizing-AlexNet-Deep-Transfer-Learning-for-Ear-Recognition.pdf
non-PDF content: https://www.researchgate.net/profile/Asad-Ullah-69/publication/378148232_Comparative_Analysis_of_AlexNet_ResNet18_and_SqueezeNet_with_Diverse_Modification_and_Arduous_Implementation/links/65ca0fb734bbff5ba70b4124/Comparative-Analysis-of-AlexNet-ResNet18-and-SqueezeNet-with-Diverse-Modification-and-A

In [11]:
# Flatten the per-model lists into a single list of paper records.
# Equivalent one-liner, kept as a loop for readability:
# flattened_search_results = [paper for papers in search_results for paper in papers]
flattened_search_results = []
for papers in search_results:
    flattened_search_results.extend(papers)

In [12]:
# One row per paper record collected in this run; the dict keys become the columns.
data_df = pd.DataFrame(data=flattened_search_results)

In [13]:
# Show the records collected in this run as the cell output.
data_df

Unnamed: 0,title,abstract,year,url,author_id,query,file_name,file_path
0,The history began from alexnet: A comprehensiv...,Deep learning has demonstrated tremendous succ...,2018,https://saulius-grazulis.lt/~saulius/paskaitos...,"[JU07ZEsAAAAJ, HcOrvg8AAAAJ, , ]",AlexNet,d55f2f33-e0a2-43cd-999b-20f8b3cdf9f6.pdf,../../../detect_ml_model_files/d55f2f33-e0a2-4...
1,Visualizing and comparing AlexNet and VGG usin...,"In Figure 6, we compare the process carried by...",2016,https://icmlviz.github.io/icmlviz2016/assets/p...,"[x2qhqtwAAAAJ, g2gAY_0AAAAJ, iYMBoHwAAAAJ, DaK...",AlexNet,b5e4a3c0-105f-4e27-8206-93cfcdada0e5.pdf,../../../detect_ml_model_files/b5e4a3c0-105f-4...
2,Inceptiontime: Finding alexnet for time series...,This paper brings deep learning at the forefro...,2020,https://arxiv.org/pdf/1909.04939,"[oUrGNaoAAAAJ, Axtv4-kAAAAJ, QDtJYqkAAAAJ]",AlexNet,e5286cab-0f74-4066-bbd7-90d2c09b682e.pdf,../../../detect_ml_model_files/e5286cab-0f74-4...
3,Feature extraction and image retrieval based o...,Convolutional Neural Network is a hot research...,2016,,"[, ]",AlexNet,,
4,Modified Alexnet architecture for classificati...,"CNN has different architectures like Le-Net, A...",2019,,"[hD2nQ3MAAAAJ, Xu_v8kAAAAAJ]",AlexNet,,
...,...,...,...,...,...,...,...,...
531,Generated faces in the wild: Quantitative comp...,"popular systems including Stable Diffusion, Mi...",2022,https://arxiv.org/pdf/2210.00586,[7jTNT1IAAAAJ],Stable Diffusion,0874bbf1-55fa-4df9-8a07-c6c15ac91c84.pdf,../../../detect_ml_model_files/0874bbf1-55fa-4...
532,What the daam: Interpreting stable diffusion u...,We probe Stable Diffusion to provide insight i...,2022,https://aclanthology.org/2023.acl-long.310.pdf,"[3icAgxMAAAAJ, tQU-BXIAAAAJ, G-2DepMAAAAJ, eJ5...",Stable Diffusion,e1e814fc-7dea-43b0-8049-928fee635bd7.pdf,../../../detect_ml_model_files/e1e814fc-7dea-4...
533,Red-teaming the stable diffusion safety filter,We conclude that the Stable Diffusion safety f...,2022,https://arxiv.org/pdf/2210.04610,"[d_rilUYAAAAJ, 9e_KfoEAAAAJ, p_aH5fgAAAAJ, Au7...",Stable Diffusion,37f5b622-ff4e-4405-b63e-4d13c5738a88.pdf,../../../detect_ml_model_files/37f5b622-ff4e-4...
534,Stable video diffusion: Scaling latent video d...,We present Stable Video Diffusion — a latent v...,2023,https://arxiv.org/pdf/2311.15127,"[vud0t5YAAAAJ, EtPn_v4AAAAJ, 1bYESBYAAAAJ]",Stable Diffusion,214899da-c78e-42d2-9a7d-0368daa239d6.pdf,../../../detect_ml_model_files/214899da-c78e-4...


In [14]:
# Merge this run's records with the rows recovered from the previous run.
# The new frame is rebuilt from flattened_search_results instead of reusing
# data_df, so re-running this cell does not append the saved rows a second time.
if merge_with_previous:
    new_results_df = pd.DataFrame(flattened_search_results)
    if new_results_df.empty:
        # Nothing new was collected; keep the recovered rows unchanged.
        data_df = saved_data_df.reset_index(drop=True)
    else:
        data_df = pd.concat([saved_data_df, new_results_df], ignore_index=True)

In [15]:
# Show the table that will be saved (merged with the previous run's rows when one was recovered).
data_df

Unnamed: 0,title,abstract,year,url,author_id,query,file_name,file_path
0,Linear regression,Linear regression plays a fundamental role in ...,2012,https://www.cs.columbia.edu/~djhsu/coms4771-f2...,"[wcgZZP8AAAAJ, , El-UNYoAAAAJ]",Linear Regression,7cedd2b0-ac03-49fb-9ef1-398b1a5d7fe1.pdf,../../../detect_ml_model_files/7cedd2b0-ac03-4...
1,Introduction to linear regression analysis,Equation (1.2) is called a linear regression m...,2021,http://sutlib2.sut.ac.th/sut_contents/H133678.pdf,"[5PboKNAAAAAJ, , y2eb6cQAAAAJ]",Linear Regression,91374318-7135-4f51-a3bc-720ececd88de.pdf,../../../detect_ml_model_files/91374318-7135-4...
2,Linear regression,"linear regression, a very simple approach for ...",2023,https://datamineaz.org/readings/ISL_chp3.pdf,"[KUIjZqgAAAAJ, bHZf-c8AAAAJ, tQVe-fAAAAAJ, ZpG...",Linear Regression,6d3627b9-6b0b-423c-8898-3185837a7617.pdf,../../../detect_ml_model_files/6d3627b9-6b0b-4...
3,Linear regression,In linear regression the ordinary least square...,2012,,[vBWC0XIAAAAJ],Linear Regression,,
4,Applied linear regression,Applied linear regression Applied linear regre...,2005,https://www.stat.cmu.edu/~brian/valerie/617-20...,[wlu6jZQAAAAJ],Linear Regression,c7fed73c-8e13-4770-a589-67708aab0b7e.pdf,../../../detect_ml_model_files/c7fed73c-8e13-4...
...,...,...,...,...,...,...,...,...
1229,Generated faces in the wild: Quantitative comp...,"popular systems including Stable Diffusion, Mi...",2022,https://arxiv.org/pdf/2210.00586,[7jTNT1IAAAAJ],Stable Diffusion,0874bbf1-55fa-4df9-8a07-c6c15ac91c84.pdf,../../../detect_ml_model_files/0874bbf1-55fa-4...
1230,What the daam: Interpreting stable diffusion u...,We probe Stable Diffusion to provide insight i...,2022,https://aclanthology.org/2023.acl-long.310.pdf,"[3icAgxMAAAAJ, tQU-BXIAAAAJ, G-2DepMAAAAJ, eJ5...",Stable Diffusion,e1e814fc-7dea-43b0-8049-928fee635bd7.pdf,../../../detect_ml_model_files/e1e814fc-7dea-4...
1231,Red-teaming the stable diffusion safety filter,We conclude that the Stable Diffusion safety f...,2022,https://arxiv.org/pdf/2210.04610,"[d_rilUYAAAAJ, 9e_KfoEAAAAJ, p_aH5fgAAAAJ, Au7...",Stable Diffusion,37f5b622-ff4e-4405-b63e-4d13c5738a88.pdf,../../../detect_ml_model_files/37f5b622-ff4e-4...
1232,Stable video diffusion: Scaling latent video d...,We present Stable Video Diffusion — a latent v...,2023,https://arxiv.org/pdf/2311.15127,"[vud0t5YAAAAJ, EtPn_v4AAAAJ, 1bYESBYAAAAJ]",Stable Diffusion,214899da-c78e-42d2-9a7d-0368daa239d6.pdf,../../../detect_ml_model_files/214899da-c78e-4...


In [16]:
# Save progress: the next run reads this file to skip models already searched.
data_df.to_parquet(
    f'{DATA_FOLDER}detect_ml_models.parquet'
)