In [2]:
import pandas as pd

In [4]:
ROOT_FOLDER = '../../'

In [29]:
DATA_FOLDER = '../../data/'

In [35]:
DOWNLOAD_FOLDER = '../../downloads/'

In [30]:
downloaded_files_df = pd.read_parquet(f'{DATA_FOLDER}downloaded_files_df.parquet')

In [31]:
downloaded_files_df

Unnamed: 0,title,file_name
0,LncMachine: a machine learning algorithm for l...,51807917-91f3-4b8f-8ad4-e1c5c923432e.pdf
1,DMFLDA: a deep learning framework for predicti...,92c4c16b-1cc5-49b0-8ff3-09182bfb02fc.pdf
2,Evaluation of deep learning in non-coding RNA ...,d320ce0f-7cd8-4afa-8ec7-baad93b09505.pdf


In [15]:
model_df = pd.read_excel(f'{ROOT_FOLDER}ML-Model-Categorization.ods', sheet_name='Sheet1', usecols=['MODEL'])

In [16]:
model_df

Unnamed: 0,MODEL
0,Linear Regression
1,Polynomial Regression
2,Ridge Regression
3,Lasso Regression
4,Elastic Net Regression
...,...
97,Claude
98,LLaMA
99,Contrastive Language-Image Pretraining
100,DALL-E


The model column has duplicate values since each model can be applied to multiple categories; we just need the unique list here

In [23]:
model_df.drop_duplicates(inplace=True)

In [24]:
model_df

Unnamed: 0,MODEL
0,Linear Regression
1,Polynomial Regression
2,Ridge Regression
3,Lasso Regression
4,Elastic Net Regression
...,...
96,ChatGPT
97,Claude
98,LLaMA
99,Contrastive Language-Image Pretraining


In [27]:
import spacy
from spacy.matcher import PhraseMatcher

# Load spaCy's language model
nlp = spacy.load("en_core_web_sm")

# Define a list of machine learning model names
model_names = model_df['MODEL'].values.tolist()

# Initialize PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

# Add model names as patterns
patterns = [nlp.make_doc(name) for name in model_names]
matcher.add("ML_MODELS", patterns)

# Process the academic text
text = """
In this paper, we compare the performance of Logistic Regression, Support Vector Machines, 
and Random Forests on several datasets. Furthermore, we explore the use of Artificial Neural Networks.
"""
doc = nlp(text)

# Find matches
matches = matcher(doc)

# Highlight matches in the text
for match_id, start, end in matches:
    span = doc[start:end]
    print(f"Matched model: {span.text}")


Matched model: Logistic Regression
Matched model: Support Vector Machines
Matched model: Random Forests
Matched model: Neural Networks


In [39]:
downloaded_files_df['pdf_file_path'] = DOWNLOAD_FOLDER + downloaded_files_df['file_name']

In [40]:
downloaded_files_df

Unnamed: 0,title,file_name,pdf_file_path
0,LncMachine: a machine learning algorithm for l...,51807917-91f3-4b8f-8ad4-e1c5c923432e.pdf,../../downloads/51807917-91f3-4b8f-8ad4-e1c5c9...
1,DMFLDA: a deep learning framework for predicti...,92c4c16b-1cc5-49b0-8ff3-09182bfb02fc.pdf,../../downloads/92c4c16b-1cc5-49b0-8ff3-09182b...
2,Evaluation of deep learning in non-coding RNA ...,d320ce0f-7cd8-4afa-8ec7-baad93b09505.pdf,../../downloads/d320ce0f-7cd8-4afa-8ec7-baad93...


In [33]:
import spacy
import PyPDF2

# Load spaCy language model
nlp = spacy.load("en_core_web_sm")

# List of terms to search
search_terms = model_df['MODEL'].values.tolist()

# Create a spaCy matcher for the search terms
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)
patterns = [nlp.make_doc(term) for term in search_terms]
matcher.add("SearchTerms", patterns)


In [46]:
def search_in_file(matcher, search_terms, pdf_file_path):
    # Dictionary to store results
    search_results = {term: [] for term in search_terms}
    
    # Extract text from PDF
    with open(pdf_file_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page_num, page in enumerate(pdf_reader.pages):
            # Extract text from the page
            text = page.extract_text()
            if text:
                # Use spaCy to process the text
                doc = nlp(text)
                
                # Search for terms using the matcher
                matches = matcher(doc)
                for match_id, start, end in matches:
                    term = doc[start:end].text
                    search_results[term].append(page_num + 1)  # Store page number (1-indexed)
    
    # Display search results
    for term, pages in search_results.items():
        if pages:
            print(f"'{term}' found on page(s): {pages}")
        #else:
        #    print(f"'{term}' not found in the document.")

In [43]:
paper_titles = downloaded_files_df['title'].values.tolist()
paper_pdf_file_paths = downloaded_files_df['pdf_file_path'].values.tolist()

In [47]:
for paper_title, pdf_file_path in zip(paper_titles, paper_pdf_file_paths):
    print('-'*80)
    print(f'processing: {paper_title}')
    print('-'*80)
    search_in_file(matcher=matcher, search_terms=search_terms, pdf_file_path=pdf_file_path)
    print('-'*80)
    

--------------------------------------------------------------------------------
processing: LncMachine: a machine learning algorithm for long noncoding RNA annotation in plants
--------------------------------------------------------------------------------
'Logistic Regression' found on page(s): [2, 6]
'Random Forests' found on page(s): [2, 2]
'Neural Networks' found on page(s): [7, 7, 9]
'AdaBoost' found on page(s): [3, 4]
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
processing: DMFLDA: a deep learning framework for predicting lncRNA–disease associations
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
processing: Evaluation of deep learning in non-coding RNA classification
---