1. Find papers matching the ML search term
2. Download first five available files

In [1]:
import pandas as pd

#import spacy
#from spacy.matcher import PhraseMatcher

from scholarly import scholarly
import time

import requests
import mimetypes
import os

import uuid

#import PyPDF2

In [2]:
# Relative base folder (three directory levels up) that holds the model-categorisation spreadsheet.
ROOT_FOLDER = '../../../'

In [3]:
# Folder where the search-result table is stored as a parquet file.
DATA_FOLDER = '../../../data/'

In [4]:
# Folder the downloaded PDF files are written to (passed as output_dir to the search).
DETECT_ML_MODEL_FILES_FOLDER = '../../../detect_ml_model_files/'

In [5]:
# Load only the MODEL column of sheet 'Sheet1'; its values are used as search terms.
model_names_path = f'{ROOT_FOLDER}ML-Model-Categorization.ods'
model_df = pd.read_excel(
    model_names_path,
    sheet_name='Sheet1',
    usecols=['MODEL'],
)

In [6]:
# Preview the loaded model names.
model_df

Unnamed: 0,MODEL
0,Linear Regression
1,Polynomial Regression
2,Ridge Regression
3,Lasso Regression
4,Elastic Net Regression
...,...
97,Claude
98,LLaMA
99,Contrastive Language-Image Pretraining
100,DALL-E


In [10]:
def _is_pdf_content_type(content_type):
    """Return True when an HTTP ``Content-Type`` header value denotes a PDF."""
    # Servers may vary the case and append parameters ("; charset=binary"),
    # so compare case-insensitively and look up the bare media type.
    media_type = content_type.split(';', 1)[0].strip().lower()
    return (
        'application/pdf' in content_type.lower()
        or mimetypes.guess_extension(media_type) == '.pdf'
    )


def _download_pdf(url, output_dir, timeout):
    """Save ``url`` under a random ``.pdf`` name in ``output_dir`` if it serves a PDF.

    Returns ``(file_name, file_path)`` on success, or ``None`` when the
    response is not a PDF. Raises on HTTP error statuses and on network or
    file-system errors.
    """
    # The context manager releases the streamed connection on every path.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        if not _is_pdf_content_type(response.headers.get('Content-Type', '')):
            return None

        unique_filename = str(uuid.uuid4()) + '.pdf'
        file_path = os.path.join(output_dir, unique_filename)
        try:
            with open(file_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip keep-alive chunks
                        file.write(chunk)
        except BaseException:
            # Do not leave a truncated file behind when the transfer fails.
            if os.path.exists(file_path):
                os.remove(file_path)
            raise
    return unique_filename, file_path


def search_and_download_papers(query, limit, output_dir, delay=2, timeout=30,
                               max_consecutive_errors=5):
    """Search with ``scholarly`` for ``query`` and download the PDFs it links to.

    Results are taken one at a time from ``scholarly.search_pubs(query)``.
    Every result is recorded, whether or not a file could be saved. The loop
    ends once ``limit`` PDFs have been saved, the results are exhausted, or
    ``max_consecutive_errors`` results in a row could not be processed.

    Args:
        query: Search term passed to ``scholarly.search_pubs``.
        limit: Number of successfully saved PDFs after which to stop.
        output_dir: Directory the PDFs are written to; created if missing.
        delay: Seconds to sleep before fetching each search result.
        timeout: Timeout in seconds passed to ``requests.get``.
        max_consecutive_errors: Stop after this many back-to-back failures,
            so an iterator that keeps raising cannot loop forever.

    Returns:
        A list of dicts with the keys ``title``, ``abstract``, ``year``,
        ``url``, ``author_id``, ``query``, ``file_name`` and ``file_path``.
        ``file_name`` is ``""`` and ``file_path`` is ``None`` for results
        without a saved PDF.
    """
    search_query = scholarly.search_pubs(query)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    papers = []
    counter = 0
    consecutive_errors = 0

    while counter < limit:
        try:
            if delay > 0:
                time.sleep(delay)  # Introduce delay between search requests
            paper = next(search_query)
            bib = paper.get("bib", {})
            paper_info = {
                "title": bib.get("title"),
                "abstract": bib.get("abstract"),
                "year": bib.get("pub_year"),
                "url": paper.get("eprint_url", ""),
                "author_id": paper.get("author_id", []),
                "query": query,
                "file_name": "",
                "file_path": None
            }

            # Download the file if a link is available
            url = paper_info['url']
            if url:
                try:
                    saved = _download_pdf(url, output_dir, timeout)
                    if saved is None:
                        print(f"non-PDF content: {url}")
                    else:
                        unique_filename, file_path = saved
                        print(f"Downloaded: {file_path}")
                        paper_info['file_name'] = unique_filename
                        paper_info['file_path'] = file_path
                        counter += 1
                except Exception as e:
                    # A failed download still keeps the paper's metadata.
                    print(f"Error occurred while processing {url}: {e}")
            papers.append(paper_info)
            consecutive_errors = 0
        except StopIteration:
            break
        except Exception as e:
            print(f"Error processing paper : {e}")
            consecutive_errors += 1
            if consecutive_errors >= max_consecutive_errors:
                print(f"Giving up on '{query}' after {consecutive_errors} consecutive errors")
                break

    return papers

In [6]:
# Plain Python list of the model names; each one is used as a search query.
models = model_df['MODEL'].tolist()

In [9]:
# One list of paper records per model name. Results are appended as each
# search finishes, so whatever was collected survives if a later search raises.
search_results = []

for model in models:
    print(f"Searching for: {model}")
    papers = search_and_download_papers(
        query=model,
        limit=5,
        output_dir=DETECT_ML_MODEL_FILES_FOLDER,
    )
    search_results += [papers]

Searching for: Linear Regression
Downloaded: ../../../detect_ml_model_files/7cedd2b0-ac03-49fb-9ef1-398b1a5d7fe1.pdf
Downloaded: ../../../detect_ml_model_files/91374318-7135-4f51-a3bc-720ececd88de.pdf
Downloaded: ../../../detect_ml_model_files/6d3627b9-6b0b-423c-8898-3185837a7617.pdf
Downloaded: ../../../detect_ml_model_files/c7fed73c-8e13-4770-a589-67708aab0b7e.pdf
non-PDF content: https://www.sciencedirect.com/science/article/pii/S1877042813046429/pdf?md5=da62147e64e2f356bfe6696b4f8031c1&pid=1-s2.0-S1877042813046429-main.pdf&_valck=1
Downloaded: ../../../detect_ml_model_files/f8d408d7-9c86-4d72-b6af-6538ed46970f.pdf
Searching for: Polynomial Regression
non-PDF content: https://www.sciencedirect.com/science/article/pii/S1877705812046085/pdf?md5=e60c56375b4a69af5b4605c6ca1d0714&pid=1-s2.0-S1877705812046085-main.pdf&_valck=1
Downloaded: ../../../detect_ml_model_files/9a7070ea-d272-4c5a-9486-3b4520e752f5.pdf
Downloaded: ../../../detect_ml_model_files/f2bff7c1-7a16-4fe2-afee-bd4e033b49fc.

  m = re.search("cites=[\d+,]*", object["citedby_url"])


MaxTriesExceededException: Cannot Fetch from Google Scholar.

In [10]:
# Flatten the per-model lists into a single list of paper records.
# In a nested comprehension the outer loop is written first.
flattened_data = [paper for papers in search_results for paper in papers]

In [11]:
# One row per paper record; the columns are the keys of each record dict.
data_df = pd.DataFrame(flattened_data)

In [12]:
# Preview the collected paper records.
data_df

Unnamed: 0,title,abstract,year,url,author_id,query,file_name,file_path
0,Linear regression,Linear regression plays a fundamental role in ...,2012,https://www.cs.columbia.edu/~djhsu/coms4771-f2...,"[wcgZZP8AAAAJ, , El-UNYoAAAAJ]",Linear Regression,7cedd2b0-ac03-49fb-9ef1-398b1a5d7fe1.pdf,../../../detect_ml_model_files/7cedd2b0-ac03-4...
1,Introduction to linear regression analysis,Equation (1.2) is called a linear regression m...,2021,http://sutlib2.sut.ac.th/sut_contents/H133678.pdf,"[5PboKNAAAAAJ, , y2eb6cQAAAAJ]",Linear Regression,91374318-7135-4f51-a3bc-720ececd88de.pdf,../../../detect_ml_model_files/91374318-7135-4...
2,Linear regression,"linear regression, a very simple approach for ...",2023,https://datamineaz.org/readings/ISL_chp3.pdf,"[KUIjZqgAAAAJ, bHZf-c8AAAAJ, tQVe-fAAAAAJ, ZpG...",Linear Regression,6d3627b9-6b0b-423c-8898-3185837a7617.pdf,../../../detect_ml_model_files/6d3627b9-6b0b-4...
3,Linear regression,In linear regression the ordinary least square...,2012,,[vBWC0XIAAAAJ],Linear Regression,,
4,Applied linear regression,Applied linear regression Applied linear regre...,2005,https://www.stat.cmu.edu/~brian/valerie/617-20...,[wlu6jZQAAAAJ],Linear Regression,c7fed73c-8e13-4770-a589-67708aab0b7e.pdf,../../../detect_ml_model_files/c7fed73c-8e13-4...
...,...,...,...,...,...,...,...,...
693,A robust system for road sign detection and cl...,"In this paper, we are reporting a system for d...",2020,https://www.academia.edu/download/89076721/s00...,"[s4PPcq8AAAAJ, , DPK9m_YAAAAJ, eBF5ZcwAAAAJ]",LeNet,,
694,Rolling-element bearing fault diagnosis using ...,the improved 2D LeNet-5 network and improved 1...,2020,https://www.mdpi.com/1424-8220/20/6/1693,"[lGOm9iwNkE4C, , , ]",LeNet,,
695,An improved LeNet-deep neural network model fo...,in LeNet by the concatenated layers. We have a...,2021,https://ieeexplore.ieee.org/iel7/6287639/93127...,"[zesMrQgAAAAJ, i95DGLQAAAAJ, of-iYjsAAAAJ, QSN...",LeNet,,
696,Prognostic factors associated with development...,PURPOSE Acute pancreatitis is a potentially li...,2022,,"[, , , , RPQKmqgAAAAJ]",LeNet,,


In [22]:
# Persist the collected records so later cells can reload them instead of searching again.
data_df.to_parquet(f'{DATA_FOLDER}detect_ml_models.parquet')

In [7]:
# Reload the previously saved records (replaces any in-memory data_df).
data_df = pd.read_parquet(f'{DATA_FOLDER}detect_ml_models.parquet')

In [8]:
# Queries that already have at least one row in the saved results.
processed_models = data_df['query'].unique().tolist()

# Model names still to be searched, in their original order.
unprocessed_models = [name for name in models if name not in processed_models]

In [11]:
# Resume the search for the models that have no saved results yet.
# NOTE(review): no visible cell merges these new results into data_df -- confirm.
search_results = []
for model in unprocessed_models:
    print(f"Searching for: {model}")
    papers = search_and_download_papers(
        query=model,
        limit=5,
        output_dir=DETECT_ML_MODEL_FILES_FOLDER,
    )
    search_results += [papers]

Searching for: AlexNet
Downloaded: ../../../detect_ml_model_files/c6bf255b-a845-4b9c-b6c2-bb3d29cbc175.pdf
Downloaded: ../../../detect_ml_model_files/233078ca-a4c0-49a0-9cce-84fadcadee3e.pdf
Downloaded: ../../../detect_ml_model_files/f54bed3f-8e74-455e-a1c3-8bee73ab326d.pdf
non-PDF content: https://ojs.aaai.org/index.php/AAAI/article/view/10171/10030
Downloaded: ../../../detect_ml_model_files/87740124-3680-4c25-ae86-f7f9e86b5873.pdf
non-PDF content: https://www.researchgate.net/profile/Ali-Almisreb/publication/327712055_Utilizing_AlexNet_Deep_Transfer_Learning_for_Ear_Recognition/links/5cc7ccd64585156cd7bbc0ab/Utilizing-AlexNet-Deep-Transfer-Learning-for-Ear-Recognition.pdf
non-PDF content: https://www.researchgate.net/profile/Asad-Ullah-69/publication/378148232_Comparative_Analysis_of_AlexNet_ResNet18_and_SqueezeNet_with_Diverse_Modification_and_Arduous_Implementation/links/65ca0fb734bbff5ba70b4124/Comparative-Analysis-of-AlexNet-ResNet18-and-SqueezeNet-with-Diverse-Modification-and-A

  m = re.search("cites=[\d+,]*", object["citedby_url"])


MaxTriesExceededException: Cannot Fetch from Google Scholar.

Find terms in file to use as labels

In [15]:
def search_in_file(pdf_file_path):
    """Print the pages on which each ML_METHOD entity occurs in a PDF.

    The text of every page is run through the module-level ``nlp`` callable,
    and entities labelled ``"ML_METHOD"`` are grouped by their text. Page
    numbers are 1-indexed. Nothing is returned; one line is printed per term.

    Relies on ``PyPDF2`` and ``nlp`` being available in the notebook namespace.

    Args:
        pdf_file_path: Path of the PDF file to scan.
    """
    pages_by_term = {}

    with open(pdf_file_path, "rb") as handle:
        reader = PyPDF2.PdfReader(handle)
        for page_number, page in enumerate(reader.pages, start=1):
            page_text = page.extract_text()
            if not page_text:
                # Nothing extractable on this page.
                continue
            for entity in nlp(page_text).ents:
                if entity.label_ != "ML_METHOD":
                    continue
                pages_by_term.setdefault(entity.text, []).append(page_number)

    # Report each term once, with duplicate page numbers collapsed by set().
    for term, pages in pages_by_term.items():
        if pages:
            print(f"'{term}' found on page(s): {set(pages)}")

In [17]:
# Paths of the rows that actually have a downloaded file (rows without one are dropped).
has_file = data_df['file_path'].notna()
file_paths = data_df.loc[has_file, 'file_path'].tolist()

In [18]:
# Use the first available file path as the sample document for the cells below.
pdf_file_path = file_paths[0]

print(pdf_file_path)

../../detect_ml_model_files/18d9a07e-aaa5-4cbe-a094-ccc035a96392.pdf


In [19]:
# Extract the text of the first page only, as a small sample for the NLP cells below.
with open(pdf_file_path, "rb") as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    # Take just the first page; fall back to an empty string for a PDF without
    # pages so that `text` is always defined.
    first_page = next(iter(pdf_reader.pages), None)
    text = first_page.extract_text() if first_page is not None else ""

In [20]:
# Run the extracted page text through the NLP pipeline.
# NOTE(review): `nlp` is not defined in the visible cells -- presumably a spaCy
# pipeline that emits ML_METHOD entities; confirm and define it before this cell.
doc = nlp(text)

In [21]:
# List every entity found in the sample page, one per line.
for ent in doc.ents:
    print(ent)

Linear regression
linear regression


In [22]:
# Disabled fallback: adding a sentencizer and re-creating the Doc is presumably
# only needed when the pipeline does not set sentence boundaries -- TODO confirm.
#nlp.add_pipe("sentencizer", before="parser")

# Recreate the Doc object
#doc = nlp(text)

# Print each sentence found in the sample page.
for sentence in doc.sents:
    print(sentence.text)

Linear regression
Daniel Hsu (COMS 4771)
Maximum likelihood estimation
One of the simplest linear regression models is the following: (X1,Y1),..., (Xn,Yn),(X,Y)are iid random
pairs taking values in Rd×R, and
Y|X=x∼N(xTw,σ2),x∈Rd.

Here, the vector w∈Rdand scalarσ2>0are the parameters of the model.
(The marginal distribution of
Xis unspeciﬁed.)

Thelog-likelihood of(w,σ2)given (Xi,Yi)
= (xi,yi)fori= 1,...,nis
n/summationdisplay
i=1/braceleftBigg
ln1√
2πσ2−(yi−xT
iw)2
2σ2/bracerightBigg
+T,
whereTis some quantity that does not depend on (w,σ2).
Therefore, maximizing the log-likelihood over
w∈Rd(for anyσ2>0) is the same as minimizing
1
nn/summationdisplay
i=1(xT
iw−yi)2.

So, the maximum likelihood estimator (MLE) ofwin this model is
ˆw∈arg min
w∈Rd1
nn/summationdisplay
i=1(xT
iw−yi)2.

(It is not necessarily uniquely determined.)

Empirical risk minimization
LetPnbe the empirical distribution on(x1,y1),..., (xn,yn)∈Rd×R, i.e., the probability distribution over
Rd×Rwith probability mass f

In [23]:
from spacy import displacy

# Render the sample page inline with its entities highlighted, using one
# colour per entity label.
entity_colors = {
    "ORG": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",
    "ML_METHOD": "#f6c5be",
}
options = dict(colors=entity_colors)
displacy.render(doc, style="ent", options=options, jupyter=True)


Let's select only the sentences that contain an ML_METHOD entity


In [24]:
# Keep the text of every sentence that contains at least one ML_METHOD entity.
filtered_sentences = []
for sentence in doc.sents:
    if "ML_METHOD" in {ent.label_ for ent in sentence.ents}:
        filtered_sentences.append(sentence.text)

# Join the kept sentences and parse them again as a document of their own.
filtered_text = " ".join(filtered_sentences)
filtered_doc = nlp(filtered_text)

# Show what survived the filter.
print("Filtered Sentences:")
for sentence in filtered_doc.sents:
    print(sentence.text)

Filtered Sentences:
Linear regression
Daniel Hsu (COMS 4771)
Maximum likelihood estimation
One of the simplest linear regression models is the following: (X1,Y1),..., (Xn,Yn),(X,Y)are iid random
pairs taking values in Rd×R, and
Y|X=x∼N(xTw,σ2),x∈Rd.



In [25]:
# Render the filtered document inline, highlighting ML_METHOD entities with a gradient.
options = dict(colors={"ML_METHOD": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"})
displacy.render(filtered_doc, style="ent", options=options, jupyter=True)