In [1]:
import pandas as pd

In [2]:
# Repository root, relative to the notebook's working directory (two levels up).
ROOT_FOLDER = '../../'

In [3]:
# Folder holding the parquet input and the exported entity-ruler patterns.
DATA_FOLDER = '../../data/'

In [4]:
# NOTE(review): not referenced by any visible cell of this notebook; presumably
# the location of the downloaded PDFs - confirm or remove.
DOWNLOAD_FOLDER = '../../downloads/'

In [5]:
# Load the index of downloaded papers (columns: title, file_name).
downloaded_files_path = DATA_FOLDER + 'downloaded_files_df.parquet'
downloaded_files_df = pd.read_parquet(downloaded_files_path)

In [6]:
# Preview the loaded frame (rich display of the cell's last expression).
downloaded_files_df

Unnamed: 0,title,file_name
0,LncMachine: a machine learning algorithm for l...,51807917-91f3-4b8f-8ad4-e1c5c923432e.pdf
1,DMFLDA: a deep learning framework for predicti...,92c4c16b-1cc5-49b0-8ff3-09182bfb02fc.pdf
2,Evaluation of deep learning in non-coding RNA ...,d320ce0f-7cd8-4afa-8ec7-baad93b09505.pdf


In [7]:
# Read only the MODEL column from the categorisation spreadsheet.
# NOTE(review): reading .ods presumably needs the optional `odfpy` engine - confirm it is installed.
model_df = pd.read_excel(
    ROOT_FOLDER + 'ML-Model-Categorization.ods',
    sheet_name='Sheet1',
    usecols=['MODEL'],
)

In [8]:
# Preview the raw model list, before duplicates are removed.
model_df

Unnamed: 0,MODEL
0,Linear Regression
1,Polynomial Regression
2,Ridge Regression
3,Lasso Regression
4,Elastic Net Regression
...,...
97,Claude
98,LLaMA
99,Contrastive Language-Image Pretraining
100,DALL-E


The `MODEL` column contains duplicate values because a model can belong to several categories; only the unique model names are needed here.

In [9]:
# Keep one row per model name. The result is rebound instead of using
# inplace=True: in-place mutation brings no performance benefit and prevents
# method chaining. The original row labels are kept (no reset_index).
model_df = model_df.drop_duplicates()

In [10]:
# Preview the model list after duplicate removal.
model_df

Unnamed: 0,MODEL
0,Linear Regression
1,Polynomial Regression
2,Ridge Regression
3,Lasso Regression
4,Elastic Net Regression
...,...
96,ChatGPT
97,Claude
98,LLaMA
99,Contrastive Language-Image Pretraining


In [11]:
# Export the model names; index=False leaves the row index out of the CSV.
# NOTE(review): this writes to the current working directory, whereas the ruler
# export at the end of the notebook writes under DATA_FOLDER - confirm intended.
model_df.to_csv('models.csv', index=False)

In [12]:
# Patterns for the spaCy EntityRuler; every match is labelled ML_METHOD.
#
# Conventions:
#   * A list value is a token pattern (one dict per token); 'LOWER' compares
#     against the lower-cased token text, so these match case-insensitively.
#   * A str value is a phrase pattern. With the ruler's default settings it is
#     matched on the exact token text (case-sensitive); used for acronyms.
#   * 'pattern' must be a str or a list. EntityRuler.add_patterns silently
#     ignores any other type, so a bare dict is never registered.
#   * Hyphenated names are spelled as separate tokens because spaCy's default
#     English tokenizer splits hyphens between letters
#     ("k-nearest" -> "k", "-", "nearest"). The hyphen token is optional
#     ('OP': '?') so the spaced spelling ("k nearest") is recognised as well.
#   * REGEX values are anchored with ^...$; an unanchored regex matches
#     anywhere inside the token (e.g. 'trees?' would also hit "street").
#
# NOTE(review): some entries are prone to false positives and are kept only for
# backward compatibility - the acronyms 'AM', 'GB', 'MCM' and 'ICA', and the
# case-insensitive common words 'reinforce', 'inception', 'prophet', 'bagging',
# 'boosting', 'stacking', 'blending', 'claude' and 'llama'. Confirm or tighten.
patterns = [
    # Linear Regression
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'linear'}, {'LOWER': 'regression'}]},
    # Polynomial Regression
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'polynomial'}, {'LOWER': 'regression'}]},
    # Ridge Regression
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'ridge'}, {'LOWER': 'regression'}]},
    # Lasso Regression
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'lasso'}, {'LOWER': 'regression'}]},
    # Elastic Net Regression
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'elastic'}, {'LOWER': 'net'}, {'LOWER': 'regression'}]},
    # Bayesian Linear Regression
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'bayesian'}, {'LOWER': 'linear'}, {'LOWER': 'regression'}]},
    # Quantile Regression
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'quantile'}, {'LOWER': 'regression'}]},
    # Support Vector Regression
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'support'}, {'LOWER': 'vector'}, {'LOWER': 'regression'}]},
    # Logistic Regression
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'logistic'}, {'LOWER': 'regression'}]},
    # Support Vector Machines
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'support'}, {'LOWER': 'vector'}, {'LOWER': {'REGEX': '^machines?$'}}]},
    {'label': 'ML_METHOD', 'pattern': 'SVM'},
    # k-Nearest Neighbors
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'k'}, {'ORTH': '-', 'OP': '?'}, {'LOWER': 'nearest'}, {'LOWER': {'REGEX': '^neighbors?$'}}]},
    # The acronym forms were previously bare dicts and therefore never registered.
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'knn'}]},
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'k'}, {'ORTH': '-'}, {'LOWER': 'nn'}]},
    # Decision Trees
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'decision'}, {'LOWER': {'REGEX': '^trees?$'}}]},
    # Random Forests
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'random'}, {'LOWER': {'REGEX': '^forests?$'}}]},
    # Gradient Boosted Trees
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'gradient'}, {'LOWER': 'boosted'}, {'LOWER': {'REGEX': '^trees?$'}}]},
    {'label': 'ML_METHOD', 'pattern': 'GBT'},
    # XGBoost
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'xgboost'}]},
    # LightGBM
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'lightgbm'}]},
    # CatBoost
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'catboost'}]},
    # Naive Bayes
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'naive'}, {'LOWER': 'bayes'}]},
    # Neural Networks
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'neural'}, {'LOWER': {'REGEX': '^networks?$'}}]},
    # k-Means Clustering
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'k'}, {'ORTH': '-', 'OP': '?'}, {'LOWER': 'means'}, {'LOWER': 'clustering'}]},
    # Hierarchical Clustering
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'hierarchical'}, {'LOWER': 'clustering'}]},
    # Density-Based Spatial Clustering
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'density'}, {'ORTH': '-', 'OP': '?'}, {'LOWER': 'based'}, {'LOWER': 'spatial'}, {'LOWER': 'clustering'}]},
    # Mean Shift
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'mean'}, {'LOWER': 'shift'}]},
    # Gaussian Mixture Models
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'gaussian'}, {'LOWER': 'mixture'}, {'LOWER': {'REGEX': '^models?$'}}]},
    {'label': 'ML_METHOD', 'pattern': 'GMM'},
    # Spectral Clustering
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'spectral'}, {'LOWER': 'clustering'}]},
    # Principal Component Analysis
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'principal'}, {'LOWER': 'component'}, {'LOWER': 'analysis'}]},
    {'label': 'ML_METHOD', 'pattern': 'PCA'},
    # t-Distributed Stochastic Neighbor Embedding
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 't'}, {'ORTH': '-', 'OP': '?'}, {'LOWER': 'distributed'}, {'LOWER': 'stochastic'}, {'LOWER': 'neighbor'}, {'LOWER': 'embedding'}]},
    # Case-insensitive so that the usual spelling "t-SNE" matches, as well as "TSNE" / "T-SNE".
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'tsne'}]},
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 't'}, {'ORTH': '-'}, {'LOWER': 'sne'}]},
    # Uniform Manifold Approximation and Projection
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'uniform'}, {'LOWER': 'manifold'}, {'LOWER': 'approximation'}, {'LOWER': 'and'}, {'LOWER': {'REGEX': '^projections?$'}}]},
    {'label': 'ML_METHOD', 'pattern': 'UMAP'},
    # Independent Component Analysis
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'independent'}, {'LOWER': 'component'}, {'LOWER': 'analysis'}]},
    {'label': 'ML_METHOD', 'pattern': 'ICA'},
    # Autoencoders
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': {'REGEX': '^autoencoders?$'}}]},
    # Apriori Algorithm
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'apriori'}, {'LOWER': 'algorithm'}]},
    # Eclat Algorithm
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'eclat'}, {'LOWER': 'algorithm'}]},
    # FP-Growth Algorithm
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'fp'}, {'ORTH': '-', 'OP': '?'}, {'LOWER': 'growth'}, {'LOWER': 'algorithm'}]},
    # Self-Training
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'self'}, {'ORTH': '-', 'OP': '?'}, {'LOWER': 'training'}]},
    # Co-Training
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'co'}, {'ORTH': '-', 'OP': '?'}, {'LOWER': 'training'}]},
    # Tri-Training
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'tri'}, {'ORTH': '-', 'OP': '?'}, {'LOWER': 'training'}]},
    # Pseudo-Labeling
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'pseudo'}, {'ORTH': '-', 'OP': '?'}, {'LOWER': 'labeling'}]},
    # Graph-Based Learning Models
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'graph'}, {'ORTH': '-', 'OP': '?'}, {'LOWER': 'based'}, {'LOWER': 'learning'}, {'LOWER': {'REGEX': '^models?$'}}]},
    # Generative Models
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'generative'}, {'LOWER': {'REGEX': '^models?$'}}]},
    # Q-Learning
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'q'}, {'ORTH': '-', 'OP': '?'}, {'LOWER': 'learning'}]},
    # Deep Q-Networks
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'deep'}, {'LOWER': 'q'}, {'ORTH': '-', 'OP': '?'}, {'LOWER': {'REGEX': '^networks?$'}}]},
    {'label': 'ML_METHOD', 'pattern': 'DQN'},
    # Policy Gradient Methods
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'policy'}, {'LOWER': 'gradient'}, {'LOWER': {'REGEX': '^methods?$'}}]},
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'reinforce'}]},
    # Actor-Critic Models
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'actor'}, {'ORTH': '-', 'OP': '?'}, {'LOWER': 'critic'}, {'LOWER': {'REGEX': '^models?$'}}]},
    {'label': 'ML_METHOD', 'pattern': 'AM'},
    # Advantage Actor-Critic
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'advantage'}, {'LOWER': 'actor'}, {'ORTH': '-', 'OP': '?'}, {'LOWER': 'critic'}]},
    {'label': 'ML_METHOD', 'pattern': 'A2C'},
    # Proximal Policy Optimization
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'proximal'}, {'LOWER': 'policy'}, {'LOWER': {'REGEX': '^optimizations?$'}}]},
    # Trust Region Policy Optimization
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'trust'}, {'LOWER': 'region'}, {'LOWER': 'policy'}, {'LOWER': {'REGEX': '^optimizations?$'}}]},
    # Deep Deterministic Policy Gradient
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'deep'}, {'LOWER': 'deterministic'}, {'LOWER': 'policy'}, {'LOWER': 'gradient'}]},
    # Twin Delayed DDPG
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'twin'}, {'LOWER': 'delayed'}, {'LOWER': 'ddpg'}]},
    # Soft Actor-Critic
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'soft'}, {'LOWER': 'actor'}, {'ORTH': '-', 'OP': '?'}, {'LOWER': 'critic'}]},
    # Monte Carlo Methods
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'monte'}, {'LOWER': 'carlo'}, {'LOWER': {'REGEX': '^methods?$'}}]},
    {'label': 'ML_METHOD', 'pattern': 'MCM'},
    # Temporal-Difference Learning
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'temporal'}, {'ORTH': '-', 'OP': '?'}, {'LOWER': 'difference'}, {'LOWER': 'learning'}]},
    # Multi Layer Perceptron
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'multi'}, {'ORTH': '-', 'OP': '?'}, {'LOWER': 'layer'}, {'LOWER': 'perceptron'}]},
    # LeNet
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'lenet'}]},
    # AlexNet
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'alexnet'}]},
    # VGGNet
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'vggnet'}]},
    # ResNet
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'resnet'}]},
    # Inception
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'inception'}]},
    # EfficientNet
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'efficientnet'}]},
    # Recurrent Neural Networks
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'recurrent'}, {'LOWER': 'neural'}, {'LOWER': {'REGEX': '^networks?$'}}]},
    {'label': 'ML_METHOD', 'pattern': 'RNN'},
    {'label': 'ML_METHOD', 'pattern': 'RNNs'},
    # Long Short-Term Memory (the optional hyphen also covers "long short term memory")
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'long'}, {'LOWER': 'short'}, {'ORTH': '-', 'OP': '?'}, {'LOWER': 'term'}, {'LOWER': 'memory'}]},
    {'label': 'ML_METHOD', 'pattern': 'LSTM'},
    # Gated Recurrent Unit
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'gated'}, {'LOWER': 'recurrent'}, {'LOWER': {'REGEX': '^units?$'}}]},
    {'label': 'ML_METHOD', 'pattern': 'GRU'},
    # Bidirectional RNNs
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'bidirectional'}, {'LOWER': {'REGEX': '^rnns?$'}}]},
    # Transformer
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': {'REGEX': '^transformers?$'}}]},
    # Bidirectional Encoder Representations from Transformers
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'bidirectional'}, {'LOWER': 'encoder'}, {'LOWER': 'representations'}, {'LOWER': 'from'}, {'LOWER': 'transformers'}]},
    {'label': 'ML_METHOD', 'pattern': 'BERT'},
    # Generative Pretrained Transformers
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'generative'}, {'LOWER': 'pretrained'}, {'LOWER': {'REGEX': '^transformers?$'}}]},
    {'label': 'ML_METHOD', 'pattern': 'GPT'},
    # Text-to-Text Transfer Transformer
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'text'}, {'ORTH': '-', 'OP': '?'}, {'LOWER': 'to'}, {'ORTH': '-', 'OP': '?'}, {'LOWER': 'text'}, {'LOWER': 'transfer'}, {'LOWER': {'REGEX': '^transformers?$'}}]},
    # Vision Transformers
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'vision'}, {'LOWER': {'REGEX': '^transformers?$'}}]},
    # Swin Transformer
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'swin'}, {'LOWER': {'REGEX': '^transformers?$'}}]},
    # Generative Adversarial Networks
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'generative'}, {'LOWER': 'adversarial'}, {'LOWER': {'REGEX': '^networks?$'}}]},
    {'label': 'ML_METHOD', 'pattern': 'GAN'},
    # Variational Autoencoders
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'variational'}, {'LOWER': {'REGEX': '^autoencoders?$'}}]},
    # Normalizing Flows
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'normalizing'}, {'LOWER': {'REGEX': '^flows?$'}}]},
    # Diffusion Models
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'diffusion'}, {'LOWER': {'REGEX': '^models?$'}}]},
    # Stable Diffusion
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'stable'}, {'LOWER': 'diffusion'}]},
    # Denoising Diffusion Probabilistic Models
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'denoising'}, {'LOWER': 'diffusion'}, {'LOWER': 'probabilistic'}, {'LOWER': {'REGEX': '^models?$'}}]},
    # Bayesian Networks
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'bayesian'}, {'LOWER': {'REGEX': '^networks?$'}}]},
    # Markov Chains
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'markov'}, {'LOWER': {'REGEX': '^chains?$'}}]},
    # Hidden Markov Models
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'hidden'}, {'LOWER': 'markov'}, {'LOWER': {'REGEX': '^models?$'}}]},
    {'label': 'ML_METHOD', 'pattern': 'HMM'},
    {'label': 'ML_METHOD', 'pattern': 'HMMs'},
    # Kalman Filters
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'kalman'}, {'LOWER': {'REGEX': '^filters?$'}}]},
    # Conditional Random Fields
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'conditional'}, {'LOWER': 'random'}, {'LOWER': {'REGEX': '^fields?$'}}]},
    # Graph Neural Networks
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'graph'}, {'LOWER': 'neural'}, {'LOWER': {'REGEX': '^networks?$'}}]},
    {'label': 'ML_METHOD', 'pattern': 'GNN'},
    # Graph Convolutional Networks
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'graph'}, {'LOWER': 'convolutional'}, {'LOWER': {'REGEX': '^networks?$'}}]},
    {'label': 'ML_METHOD', 'pattern': 'GCN'},
    # Graph Attention Networks ('GAT'; 'GAN' is already used for Generative Adversarial Networks)
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'graph'}, {'LOWER': 'attention'}, {'LOWER': {'REGEX': '^networks?$'}}]},
    {'label': 'ML_METHOD', 'pattern': 'GAT'},
    # Autoregressive Integrated Moving Average
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'autoregressive'}, {'LOWER': 'integrated'}, {'LOWER': 'moving'}, {'LOWER': 'average'}]},
    {'label': 'ML_METHOD', 'pattern': 'ARIMA'},
    # Seasonal ARIMA
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'seasonal'}, {'LOWER': 'arima'}]},
    {'label': 'ML_METHOD', 'pattern': 'SARIMA'},
    # Vector Autoregression
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'vector'}, {'LOWER': 'autoregression'}]},
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'vector'}, {'LOWER': 'autoregressive'}]},
    # State-Space Models
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'state'}, {'ORTH': '-', 'OP': '?'}, {'LOWER': 'space'}, {'LOWER': {'REGEX': '^models?$'}}]},
    # Prophet
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'prophet'}]},
    # Bagging
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'bagging'}]},
    # Boosting
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'boosting'}]},
    # AdaBoost
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'adaboost'}]},
    # Gradient Boosting
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'gradient'}, {'LOWER': 'boosting'}]},
    {'label': 'ML_METHOD', 'pattern': 'GB'},
    # Stacking
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'stacking'}]},
    # Blending
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'blending'}]},
    # GPT-4 (left as one token: the hyphen is followed by a digit, so the
    # tokenizer's letter-hyphen-letter split should not apply - TODO confirm)
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'gpt-4'}]},
    # ChatGPT
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'chatgpt'}]},
    # Claude
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'claude'}]},
    # LLaMA
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'llama'}]},
    # Contrastive Language-Image Pretraining
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'contrastive'}, {'LOWER': 'language'}, {'ORTH': '-', 'OP': '?'}, {'LOWER': 'image'}, {'LOWER': 'pretraining'}]},
    {'label': 'ML_METHOD', 'pattern': 'CLIP'},
    # DALL-E
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'dall'}, {'ORTH': '-', 'OP': '?'}, {'LOWER': 'e'}]},
]

In [13]:
import spacy
from spacy.pipeline import EntityRuler

# Load the pretrained English pipeline; it must be installed beforehand
# (e.g. `python -m spacy download en_core_web_sm`).
nlp = spacy.load("en_core_web_sm")

# Add a rule-based EntityRuler and place it ahead of the statistical "ner"
# component. Per the spaCy docs, the NER step then keeps the spans the ruler
# has already set and only predicts entities around them.
ruler = nlp.add_pipe("entity_ruler", before="ner")

In [14]:
# Register the ML_METHOD patterns with the ruler.
# NOTE(review): add_patterns appears to skip entries whose 'pattern' is neither a
# str nor a list without raising - verify the registered count if in doubt.
ruler.add_patterns(patterns)

In [15]:
# Smoke test: three spellings the patterns cover (singular, plural, acronym)
# plus the dotted "S.V.M.", which no pattern covers.
text = "Support Vector Machine, Support Vector Machines, SVM, and S.V.M. are popular machine learning methods."
doc = nlp(text)

In [16]:
# Show each recognised entity span together with its label
for entity in doc.ents:
    print(entity.text, entity.label_)

Support Vector Machine ML_METHOD
Support Vector Machines ML_METHOD
SVM ML_METHOD
S.V.M. ORG


In [17]:
# Persist the EntityRuler under DATA_FOLDER so it can be reloaded later.
# NOTE(review): the path has no ".jsonl" suffix, so per the spaCy docs this
# presumably creates a directory (config + patterns) rather than a single
# JSONL file - confirm this is what the consumer expects.
ruler.to_disk(f'{DATA_FOLDER}ml_entity_ruler_patterns')