1. Find papers matching the ML search term
2. Download first five available files

In [20]:
import pandas as pd

import spacy
from spacy.matcher import PhraseMatcher

from scholarly import scholarly

import requests
import mimetypes
import os

import uuid

import PyPDF2

In [3]:
# Base folder, relative to this notebook; the model spreadsheet is read from here.
ROOT_FOLDER = '../../'

In [4]:
# Folder holding data assets; the EntityRuler patterns are loaded from here.
DATA_FOLDER = '../../data/'

In [5]:
# Destination folder for the PDFs downloaded by the paper search below.
DETECT_ML_MODEL_FILES_FOLDER = '../../detect_ml_model_files/'

In [6]:
# Load only the MODEL column (the ML model names used as search terms) from the spreadsheet.
model_df = pd.read_excel(
    f'{ROOT_FOLDER}ML-Model-Categorization.ods',
    sheet_name='Sheet1',
    usecols=['MODEL'],
)

In [7]:
# Display the loaded model names (rich DataFrame repr).
model_df

Unnamed: 0,MODEL
0,Linear Regression
1,Polynomial Regression
2,Ridge Regression
3,Lasso Regression
4,Elastic Net Regression
...,...
97,Claude
98,LLaMA
99,Contrastive Language-Image Pretraining
100,DALL-E


In [39]:
# Alternative kept for reference: a blank English pipeline containing only the entity ruler.
# nlp = spacy.blank("en")
# ruler = nlp.add_pipe("entity_ruler")

# Use the small pretrained English pipeline and insert the entity ruler ahead of the
# "ner" component, so the rule-based patterns run before the statistical NER.
nlp = spacy.load("en_core_web_sm")
ruler = nlp.add_pipe("entity_ruler", before="ner")

# Load the EntityRuler patterns from a file
ruler.from_disk(f"{DATA_FOLDER}ml_entity_ruler_patterns")

<spacy.pipeline.entityruler.EntityRuler at 0x7bfe5605ecd0>

In [40]:
# Test the pipeline on several spellings of the same method name.
# In the recorded output, "S.V.M." is labelled ORG rather than ML_METHOD.
text = "Support Vector Machine, Support Vector Machines, SVM, and S.V.M. are popular machine learning methods."
doc = nlp(text)

# Print detected entities as "<text> <label>"
for ent in doc.ents:
    print(ent.text, ent.label_)


Support Vector Machine ML_METHOD
Support Vector Machines ML_METHOD
SVM ML_METHOD
S.V.M. ORG


In [10]:
def _download_pdf(url, output_dir, timeout):
    """Download `url` into `output_dir` if the server reports PDF content.

    Args:
        url: Address of the candidate file.
        output_dir: Existing directory that receives the file.
        timeout: Seconds passed to `requests.get` as its timeout.

    Returns:
        The generated file name (``<uuid4>.pdf``) on success, or ``None`` when the
        response is not a PDF.

    Raises:
        Exception: Any request, HTTP-status or file-system error; a partially
            written file is removed before the error propagates.
    """
    # The context manager releases the streamed connection on every path,
    # including the non-PDF early return.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail on HTTP errors instead of trusting the Content-Type of an error response.
        response.raise_for_status()
        content_type = response.headers.get('Content-Type', '')
        is_pdf = 'application/pdf' in content_type or mimetypes.guess_extension(content_type) == '.pdf'
        if not is_pdf:
            print(f"non-PDF content: {url}")
            return None

        unique_filename = str(uuid.uuid4()) + '.pdf'
        file_path = os.path.join(output_dir, unique_filename)
        try:
            with open(file_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
        except Exception:
            # Do not leave a truncated PDF behind for later parsing steps.
            if os.path.exists(file_path):
                os.remove(file_path)
            raise

    print(f"Downloaded: {file_path}")
    return unique_filename


def search_and_download_papers(query, limit, output_dir, timeout=30):
    """Search publications for `query` and download PDFs until `limit` files are saved.

    Every search hit is recorded, whether or not a file could be downloaded for it;
    only successful downloads count towards `limit`.

    Args:
        query: Search term passed to `scholarly.search_pubs`.
        limit: Number of successfully downloaded PDFs after which the search stops.
        output_dir: Directory for downloaded files; created if it does not exist.
        timeout: Seconds to wait on each HTTP request before giving up (default 30).

    Returns:
        A list of dicts with the keys ``title``, ``abstract``, ``year``, ``url``,
        ``author_id``, ``query``, ``file_name`` and ``file_path``. ``file_name`` is
        ``""`` and ``file_path`` is ``None`` for hits without a downloaded file.
    """
    os.makedirs(output_dir, exist_ok=True)

    search_query = scholarly.search_pubs(query)
    papers = []
    counter = 0
    # Guard against an iterator that keeps raising on every next() call, which
    # would otherwise loop forever because of the `continue` below.
    consecutive_errors = 0
    max_consecutive_errors = 3

    while counter < limit:
        try:
            paper = next(search_query)
            bib = paper.get("bib", {})
            paper_info = {
                "title": bib.get("title"),
                "abstract": bib.get("abstract"),
                "year": bib.get("pub_year"),
                "url": paper.get("eprint_url", ""),
                "author_id": paper.get("author_id", []),
                "query": query,
                "file_name": "",
                "file_path": None
            }
        except StopIteration:
            break
        except Exception as e:
            print(f"Error processing paper : {e}")
            consecutive_errors += 1
            if consecutive_errors >= max_consecutive_errors:
                break
            continue
        consecutive_errors = 0

        # Download the file if a link is available
        url = paper_info['url']
        if url:
            try:
                unique_filename = _download_pdf(url, output_dir, timeout)
                if unique_filename is not None:
                    paper_info['file_name'] = unique_filename
                    paper_info['file_path'] = os.path.join(output_dir, unique_filename)
                    counter += 1
            except Exception as e:
                # Best effort: a failed download must not abort the whole search.
                print(f"Error occurred while processing {url}: {e}")
        papers.append(paper_info)

    return papers

In [11]:
# Plain Python list of model names, used as search queries.
models = model_df['MODEL'].tolist()

In [12]:
# Collect one list of paper records per model name.
search_results = list()

for model in models:
    print(f"Searching for: {model}")    
    papers = search_and_download_papers(query=model, limit=5, output_dir=DETECT_ML_MODEL_FILES_FOLDER)
    search_results.append(papers)
    # NOTE: this unconditional break stops after the first model, so only
    # models[0] is searched; remove it to process every model.
    break

Searching for: Linear Regression
Downloaded: ../../detect_ml_model_files/055a4d9b-aa8e-472d-9c64-c161e1660251.pdf
Downloaded: ../../detect_ml_model_files/093ae391-bf6d-4f35-81e0-ce47557474b1.pdf
Downloaded: ../../detect_ml_model_files/4e69247a-1ca1-48f1-b9ec-97454a02a05c.pdf
Downloaded: ../../detect_ml_model_files/bc5bfac7-8bb1-4de5-91e5-25a0ffd62ecf.pdf
non-PDF content: https://www.sciencedirect.com/science/article/pii/S1877042813046429/pdf?md5=da62147e64e2f356bfe6696b4f8031c1&pid=1-s2.0-S1877042813046429-main.pdf&_valck=1
Downloaded: ../../detect_ml_model_files/07d733b9-322b-449d-9769-39794b54490b.pdf


In [13]:
# Flatten the per-query result lists into one list of paper records.
# The outer loop (search_results) comes first in the comprehension, the inner one second.
flattened_data = [paper for papers in search_results for paper in papers]
    

In [14]:
# One row per paper record; columns come from the keys of each record dict.
data_df = pd.DataFrame(flattened_data)

In [15]:
# Display the search results; rows with an empty file_name had no PDF downloaded.
data_df

Unnamed: 0,title,abstract,year,url,author_id,query,file_name,file_path
0,Linear regression,Linear regression plays a fundamental role in ...,2012,https://www.cs.columbia.edu/~djhsu/coms4771-f2...,"[wcgZZP8AAAAJ, , El-UNYoAAAAJ]",Linear Regression,055a4d9b-aa8e-472d-9c64-c161e1660251.pdf,../../detect_ml_model_files/055a4d9b-aa8e-472d...
1,Introduction to linear regression analysis,Equation (1.2) is called a linear regression m...,2021,http://sutlib2.sut.ac.th/sut_contents/H133678.pdf,"[5PboKNAAAAAJ, , y2eb6cQAAAAJ]",Linear Regression,093ae391-bf6d-4f35-81e0-ce47557474b1.pdf,../../detect_ml_model_files/093ae391-bf6d-4f35...
2,Linear regression,"linear regression, a very simple approach for ...",2023,https://datamineaz.org/readings/ISL_chp3.pdf,"[KUIjZqgAAAAJ, bHZf-c8AAAAJ, tQVe-fAAAAAJ, ZpG...",Linear Regression,4e69247a-1ca1-48f1-b9ec-97454a02a05c.pdf,../../detect_ml_model_files/4e69247a-1ca1-48f1...
3,Linear regression,In linear regression the ordinary least square...,2012,,[vBWC0XIAAAAJ],Linear Regression,,
4,Applied linear regression,Applied linear regression Applied linear regre...,2005,https://www.stat.cmu.edu/~brian/valerie/617-20...,[wlu6jZQAAAAJ],Linear Regression,bc5bfac7-8bb1-4de5-91e5-25a0ffd62ecf.pdf,../../detect_ml_model_files/bc5bfac7-8bb1-4de5...
5,Linear regression analysis,been a steady flow of books on regression rang...,2012,,"[, ]",Linear Regression,,
6,Linear regression,"Next, we run a series of linear regression and...",2020,,[xBAIQ40AAAAJ],Linear Regression,,
7,A study on multiple linear regression analysis,"and which were cleared for this purpose, the l...",2013,https://www.sciencedirect.com/science/article/...,"[MLRy-6cAAAAJ, -PuUCI0AAAAJ]",Linear Regression,,
8,A review on linear regression comprehensive in...,We discuss linear regression and polynomial re...,2020,https://jastt.org/index.php/jasttpath/article/...,"[G9U01kwAAAAJ, aBdgHxkAAAAJ]",Linear Regression,07d733b9-322b-449d-9769-39794b54490b.pdf,../../detect_ml_model_files/07d733b9-322b-449d...


Find terms in file to use as labels

In [None]:
def search_in_file(pdf_file_path):
    """Find ML_METHOD entities in a PDF and report the pages they occur on.

    Each page's extracted text is run through the notebook-level `nlp` pipeline;
    pages without extractable text are skipped.

    Args:
        pdf_file_path: Path to the PDF file to scan.

    Returns:
        A dict mapping each matched entity text to the list of 1-indexed page
        numbers on which it was found (one entry per occurrence, so a page can
        repeat). Empty when nothing matched.
    """
    pages_by_term = dict()

    # Extract text from PDF
    with open(pdf_file_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # start=1 so the stored page numbers are 1-indexed
        for page_num, page in enumerate(pdf_reader.pages, start=1):
            text = page.extract_text()
            if not text:
                continue

            # Use spaCy to process the text and keep only the ML_METHOD entities
            for ent in nlp(text).ents:
                if ent.label_ == "ML_METHOD":
                    pages_by_term.setdefault(ent.text, []).append(page_num)

    # Display search results; set() collapses repeated hits on the same page
    for term, pages in pages_by_term.items():
        print(f"'{term}' found on page(s): {set(pages)}")

    # Return the mapping so callers can use the terms as labels instead of
    # having to parse the printed output.
    return pages_by_term

In [17]:
# Paths of the PDFs that were actually downloaded (rows without a file are dropped).
data_df['file_path'].dropna().values.tolist()

['../../detect_ml_model_files/055a4d9b-aa8e-472d-9c64-c161e1660251.pdf',
 '../../detect_ml_model_files/093ae391-bf6d-4f35-81e0-ce47557474b1.pdf',
 '../../detect_ml_model_files/4e69247a-1ca1-48f1-b9ec-97454a02a05c.pdf',
 '../../detect_ml_model_files/bc5bfac7-8bb1-4de5-91e5-25a0ffd62ecf.pdf',
 '../../detect_ml_model_files/07d733b9-322b-449d-9769-39794b54490b.pdf']

In [18]:
# Take the first downloaded PDF from the results rather than hard-coding a path:
# file names are random UUIDs generated at download time, so a literal path
# from one run does not exist after the downloads are re-run.
pdf_file_path = data_df['file_path'].dropna().iloc[0]

In [42]:
# Read only the first page of the selected PDF for interactive exploration.
with open(pdf_file_path, "rb") as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    # next(..., None) covers a zero-page PDF; without it `text` would silently
    # keep whatever value an earlier cell assigned.
    first_page = next(iter(pdf_reader.pages), None)
    text = first_page.extract_text() if first_page is not None else ""

In [43]:
# Run the first page's text through the spaCy pipeline.
doc = nlp(text)

In [44]:
# List every entity found on the page, regardless of label.
for ent in doc.ents:
    print(ent)

Linear regression
Daniel Hsu
One
linear regression
X1,Y1
Rd×R
Xis
Thelog
Xi,Yi
xi
2πσ2−(yi−xT
σ2
w∈Rd(for anyσ2>0
1
MLE
ˆw∈arg
LetPnbe
1
i=11{(x
y)=(xi
yi)},(x
1
1
x1,y1
1


In [45]:
# Disabled experiment: add a rule-based sentencizer ahead of the parser.
#nlp.add_pipe("sentencizer", before="parser")

# Recreate the Doc object
#doc = nlp(text)

# Print the sentence segmentation of the current doc, one sentence per print call.
for sentence in doc.sents:
    print(sentence.text)

Linear regression
Daniel Hsu (COMS 4771)
Maximum likelihood estimation
One of the simplest linear regression models is the following: (X1,Y1),..., (Xn,Yn),(X,Y)are iid random
pairs taking values in Rd×R, and
Y|X=x∼N(xTw,σ2),x∈Rd.

Here, the vector w∈Rdand scalarσ2>0are the parameters of the model.
(The marginal distribution of
Xis unspeciﬁed.)

Thelog-likelihood of(w,σ2)given (Xi,Yi)
= (xi,yi)fori= 1,...,nis
n/summationdisplay
i=1/braceleftBigg
ln1√
2πσ2−(yi−xT
iw)2
2σ2/bracerightBigg
+T,
whereTis some quantity that does not depend on (w,σ2).
Therefore, maximizing the log-likelihood over
w∈Rd(for anyσ2>0) is the same as minimizing
1
nn/summationdisplay
i=1(xT
iw−yi)2.

So, the maximum likelihood estimator (MLE) ofwin this model is
ˆw∈arg min
w∈Rd1
nn/summationdisplay
i=1(xT
iw−yi)2.

(It is not necessarily uniquely determined.)

Empirical risk minimization
LetPnbe the empirical distribution on(x1,y1),..., (xn,yn)∈Rd×R, i.e., the probability distribution over
Rd×Rwith probability mass f

In [46]:
from spacy import displacy

# Render the entities in a Jupyter Notebook or as HTML
# displacy.render(doc, style="ent", jupyter=True)  # Use jupyter=True if in a notebook

# Custom highlight colours for the ORG and ML_METHOD labels.
options = {"colors": {"ORG": "linear-gradient(90deg, #aa9cfc, #fc9ce7)", "ML_METHOD": "#f6c5be"}}
displacy.render(doc, style="ent", options=options, jupyter=True)


Let's select only the sentences that contain an ML_METHOD entity


In [48]:
# Keep the text of every sentence that contains at least one ML_METHOD entity.
filtered_sentences = []
for sentence in doc.sents:
    labels_in_sentence = {ent.label_ for ent in sentence.ents}
    if "ML_METHOD" in labels_in_sentence:
        filtered_sentences.append(sentence.text)

# Join the kept sentences into a single text, separated by spaces.
filtered_text = " ".join(filtered_sentences)

# Re-run the pipeline on the reduced text to get a fresh Doc.
filtered_doc = nlp(filtered_text)

# Show the sentences as segmented in the new Doc.
print("Filtered Sentences:")
for sentence in filtered_doc.sents:
    print(sentence.text)

Filtered Sentences:
Linear regression
Daniel Hsu (COMS 4771)
Maximum likelihood estimation
One of the simplest linear regression models is the following: (X1,Y1),..., (Xn,Yn),(X,Y)are iid random
pairs taking values in Rd×R, and
Y|X=x∼N(xTw,σ2),x∈Rd.



In [50]:
# Default-colour rendering, kept for reference:
#displacy.render(filtered_doc, style="ent", jupyter=True)  # Use jupyter=True if in a notebook

# Render the filtered doc with a custom gradient for the ML_METHOD label.
options = {"colors": {"ML_METHOD": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}}
displacy.render(filtered_doc, style="ent", options=options, jupyter=True)

TODO:
    - Recreate the document by using the sentencizer

    ```
    # Add a sentencizer if needed
    nlp.add_pipe("sentencizer", before="parser")
    ```