In this notebook:

1 - We create a list of patterns with rules to match specific machine learning methods

2 - These patterns are subsequently added to an EntityRuler object.

3 - After testing this object with the new patterns, the EntityRuler is saved to disk for re-use.

In [1]:
import spacy
from spacy.pipeline import EntityRuler

In [2]:
# Relative path of the data directory used for saved artifacts. Keep the trailing
# slash: the save cell appends the output name to this string directly.
DATA_FOLDER = '../../../data/'

In [3]:
# EntityRuler patterns for machine learning methods.
#
# Every entry has three keys:
#   'label'   - entity label assigned to a match (always 'ML_METHOD' here)
#   'pattern' - either a string (exact, case-sensitive phrase such as an acronym)
#               or a list of token specs (one dict per token)
#   'id'      - canonical identifier shared by all surface forms of one method
#
# REGEX values are anchored with ^...$ because spaCy applies them with search
# semantics; an unanchored 'models?' would also accept tokens such as 'remodels'.
patterns = [
    # LINEAR_REGRESSION
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'linear'}, {'LOWER': 'regression'}], 'id': 'LINEAR_REGRESSION'},
    # POLYNOMIAL_REGRESSION
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'polynomial'}, {'LOWER': 'regression'}], 'id': 'POLYNOMIAL_REGRESSION'},
    # RIDGE_REGRESSION
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'ridge'}, {'LOWER': 'regression'}], 'id': 'RIDGE_REGRESSION'},
    # LASSO_REGRESSION
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'lasso'}, {'LOWER': 'regression'}], 'id': 'LASSO_REGRESSION'},
    # ELASTIC_NET_REGRESSION
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'elastic'}, {'LOWER': 'net'}, {'LOWER': 'regression'}], 'id': 'ELASTIC_NET_REGRESSION'},
    # BAYESIAN_LINEAR_REGRESSION
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'bayesian'}, {'LOWER': 'linear'}, {'LOWER': 'regression'}], 'id': 'BAYESIAN_LINEAR_REGRESSION'},
    # QUANTILE_REGRESSION
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'quantile'}, {'LOWER': 'regression'}], 'id': 'QUANTILE_REGRESSION'},
    # SUPPORT_VECTOR_REGRESSION
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'support'}, {'LOWER': 'vector'}, {'LOWER': 'regression'}], 'id': 'SUPPORT_VECTOR_REGRESSION'},
    # LOGISTIC_REGRESSION
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'logistic'}, {'LOWER': 'regression'}], 'id': 'LOGISTIC_REGRESSION'},
    # SUPPORT_VECTOR_MACHINES
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'support'}, {'LOWER': 'vector'}, {'LOWER': {'REGEX': '^machines?$'}}], 'id': 'SUPPORT_VECTOR_MACHINES'},
    {'label': 'ML_METHOD', 'pattern': 'SVM', 'id': 'SUPPORT_VECTOR_MACHINES'},
    # K-NEAREST_NEIGHBORS
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'k-nearest'}, {'LOWER': {'REGEX': '^neighbors?$'}}], 'id': 'K-NEAREST_NEIGHBORS'},
    # Token patterns must be lists of token specs; a bare dict is not a valid pattern.
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'knn'}], 'id': 'K-NEAREST_NEIGHBORS'},
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'k-nn'}], 'id': 'K-NEAREST_NEIGHBORS'},
    # DECISION_TREES
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'decision'}, {'LOWER': {'REGEX': '^trees?$'}}], 'id': 'DECISION_TREES'},
    # RANDOM_FORESTS
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'random'}, {'LOWER': {'REGEX': '^forests?$'}}], 'id': 'RANDOM_FORESTS'},
    # GRADIENT_BOOSTED_TREES
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'gradient'}, {'LOWER': 'boosted'}, {'LOWER': {'REGEX': '^trees?$'}}], 'id': 'GRADIENT_BOOSTED_TREES'},
    {'label': 'ML_METHOD', 'pattern': 'GBT', 'id': 'GRADIENT_BOOSTED_TREES'},
    # XGBOOST
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'xgboost'}], 'id': 'XGBOOST'},
    # LIGHTGBM
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'lightgbm'}], 'id': 'LIGHTGBM'},
    # CATBOOST
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'catboost'}], 'id': 'CATBOOST'},
    # NAIVE_BAYES
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'naive'}, {'LOWER': 'bayes'}], 'id': 'NAIVE_BAYES'},
    # NEURAL_NETWORKS
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'neural'}, {'LOWER': 'networks'}], 'id': 'NEURAL_NETWORKS'},
    # K-MEANS_CLUSTERING
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'k-means'}, {'LOWER': 'clustering'}], 'id': 'K-MEANS_CLUSTERING'},
    # HIERARCHICAL_CLUSTERING
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'hierarchical'}, {'LOWER': 'clustering'}], 'id': 'HIERARCHICAL_CLUSTERING'},
    # DENSITY-BASED_SPATIAL_CLUSTERING
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'density-based'}, {'LOWER': 'spatial'}, {'LOWER': 'clustering'}], 'id': 'DENSITY-BASED_SPATIAL_CLUSTERING'},
    # MEAN_SHIFT
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'mean'}, {'LOWER': 'shift'}], 'id': 'MEAN_SHIFT'},
    # GAUSSIAN_MIXTURE_MODELS
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'gaussian'}, {'LOWER': 'mixture'}, {'LOWER': 'models'}], 'id': 'GAUSSIAN_MIXTURE_MODELS'},
    {'label': 'ML_METHOD', 'pattern': 'GMM', 'id': 'GAUSSIAN_MIXTURE_MODELS'},
    # SPECTRAL_CLUSTERING
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'spectral'}, {'LOWER': 'clustering'}], 'id': 'SPECTRAL_CLUSTERING'},
    # PRINCIPAL_COMPONENT_ANALYSIS
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'principal'}, {'LOWER': 'component'}, {'LOWER': 'analysis'}], 'id': 'PRINCIPAL_COMPONENT_ANALYSIS'},
    {'label': 'ML_METHOD', 'pattern': 'PCA', 'id': 'PRINCIPAL_COMPONENT_ANALYSIS'},
    # T-DISTRIBUTED_STOCHASTIC_NEIGHBOR_EMBEDDING
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 't-distributed'}, {'LOWER': 'stochastic'}, {'LOWER': 'neighbor'}, {'LOWER': 'embedding'}], 'id': 'T-DISTRIBUTED_STOCHASTIC_NEIGHBOR_EMBEDDING'},
    {'label': 'ML_METHOD', 'pattern': 'TSNE', 'id': 'T-DISTRIBUTED_STOCHASTIC_NEIGHBOR_EMBEDDING'},
    {'label': 'ML_METHOD', 'pattern': 'T-SNE', 'id': 'T-DISTRIBUTED_STOCHASTIC_NEIGHBOR_EMBEDDING'},
    # UNIFORM_MANIFOLD_APPROXIMATION_AND_PROJECTION
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'uniform'}, {'LOWER': 'manifold'}, {'LOWER': 'approximation'}, {'LOWER': 'and'}, {'LOWER': 'projection'}], 'id': 'UNIFORM_MANIFOLD_APPROXIMATION_AND_PROJECTION'},
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'uniform'}, {'LOWER': 'manifold'}, {'LOWER': 'approximation'}, {'LOWER': 'and'}, {'LOWER': 'projections'}], 'id': 'UNIFORM_MANIFOLD_APPROXIMATION_AND_PROJECTION'},
    {'label': 'ML_METHOD', 'pattern': 'UMAP', 'id': 'UNIFORM_MANIFOLD_APPROXIMATION_AND_PROJECTION'},
    # INDEPENDENT_COMPONENT_ANALYSIS
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'independent'}, {'LOWER': 'component'}, {'LOWER': 'analysis'}], 'id': 'INDEPENDENT_COMPONENT_ANALYSIS'},
    {'label': 'ML_METHOD', 'pattern': 'ICA', 'id': 'INDEPENDENT_COMPONENT_ANALYSIS'},
    # AUTOENCODERS
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'autoencoders'}], 'id': 'AUTOENCODERS'},
    # APRIORI_ALGORITHM
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'apriori'}, {'LOWER': 'algorithm'}], 'id': 'APRIORI_ALGORITHM'},
    # ECLAT_ALGORITHM
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'eclat'}, {'LOWER': 'algorithm'}], 'id': 'ECLAT_ALGORITHM'},
    # FP-GROWTH_ALGORITHM
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'fp-growth'}, {'LOWER': 'algorithm'}], 'id': 'FP-GROWTH_ALGORITHM'},
    # SELF-TRAINING
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'self-training'}], 'id': 'SELF-TRAINING'},
    # CO-TRAINING
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'co-training'}], 'id': 'CO-TRAINING'},
    # TRI-TRAINING
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'tri-training'}], 'id': 'TRI-TRAINING'},
    # PSEUDO-LABELING
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'pseudo-labeling'}], 'id': 'PSEUDO-LABELING'},
    # GRAPH-BASED_LEARNING_MODELS
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'graph-based'}, {'LOWER': 'learning'}, {'LOWER': {'REGEX': '^models?$'}}], 'id': 'GRAPH-BASED_LEARNING_MODELS'},
    # GENERATIVE_MODELS
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'generative'}, {'LOWER': {'REGEX': '^models?$'}}], 'id': 'GENERATIVE_MODELS'},
    # Q-LEARNING
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'q-learning'}], 'id': 'Q-LEARNING'},
    # DEEP_Q-NETWORKS
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'deep'}, {'LOWER': 'q-networks'}], 'id': 'DEEP_Q-NETWORKS'},
    {'label': 'ML_METHOD', 'pattern': 'DQN', 'id': 'DEEP_Q-NETWORKS'},
    # MONTE_CARLO_METHODS
    # "methods?" is the intended final token; "models?" is still accepted so that
    # text matched by the earlier rule keeps matching.
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'monte'}, {'LOWER': 'carlo'}, {'LOWER': {'REGEX': '^(?:methods?|models?)$'}}], 'id': 'MONTE_CARLO_METHODS'},
    {'label': 'ML_METHOD', 'pattern': 'MCM', 'id': 'MONTE_CARLO_METHODS'},
    # TEMPORAL-DIFFERENCE_LEARNING
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'temporal-difference'}, {'LOWER': 'learning'}], 'id': 'TEMPORAL-DIFFERENCE_LEARNING'},
    # REINFORCE
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'reinforce'}], 'id': 'REINFORCE'},
    # ACTOR-CRITIC_MODELS
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'actor-critic'}, {'LOWER': 'models'}], 'id': 'ACTOR-CRITIC_MODELS'},
    # ADVANTAGE_ACTOR-CRITIC
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'advantage'}, {'LOWER': 'actor-critic'}], 'id': 'ADVANTAGE_ACTOR-CRITIC'},
    {'label': 'ML_METHOD', 'pattern': 'A2C', 'id': 'ADVANTAGE_ACTOR-CRITIC'},
    # PROXIMAL_POLICY_OPTIMIZATION
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'proximal'}, {'LOWER': 'policy'}, {'LOWER': {'REGEX': '^optimizations?$'}}], 'id': 'PROXIMAL_POLICY_OPTIMIZATION'},
    {'label': 'ML_METHOD', 'pattern': 'PPO', 'id': 'PROXIMAL_POLICY_OPTIMIZATION'},
    # TRUST_REGION_POLICY_OPTIMIZATION
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'trust'}, {'LOWER': 'region'}, {'LOWER': 'policy'}, {'LOWER': {'REGEX': '^optimizations?$'}}], 'id': 'TRUST_REGION_POLICY_OPTIMIZATION'},
    {'label': 'ML_METHOD', 'pattern': 'TRPO', 'id': 'TRUST_REGION_POLICY_OPTIMIZATION'},
    # DEEP_DETERMINISTIC_POLICY_GRADIENT
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'deep'}, {'LOWER': 'deterministic'}, {'LOWER': 'policy'}, {'LOWER': 'gradient'}], 'id': 'DEEP_DETERMINISTIC_POLICY_GRADIENT'},
    {'label': 'ML_METHOD', 'pattern': 'DDPG', 'id': 'DEEP_DETERMINISTIC_POLICY_GRADIENT'},
    # DETERMINISTIC_POLICY_GRADIENT
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'deterministic'}, {'LOWER': 'policy'}, {'LOWER': 'gradient'}], 'id': 'DETERMINISTIC_POLICY_GRADIENT'},
    {'label': 'ML_METHOD', 'pattern': 'DPG', 'id': 'DETERMINISTIC_POLICY_GRADIENT'},
    # VANILLA_POLICY_GRADIENT
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'vanilla'}, {'LOWER': 'policy'}, {'LOWER': 'gradient'}], 'id': 'VANILLA_POLICY_GRADIENT'},
    {'label': 'ML_METHOD', 'pattern': 'VPG', 'id': 'VANILLA_POLICY_GRADIENT'},
    # TWIN_DELAYED_DDPG
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'twin'}, {'LOWER': 'delayed'}, {'LOWER': 'ddpg'}], 'id': 'TWIN_DELAYED_DDPG'},
    {'label': 'ML_METHOD', 'pattern': 'TD3', 'id': 'TWIN_DELAYED_DDPG'},
    # SOFT_ACTOR-CRITIC
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'soft'}, {'LOWER': 'actor-critic'}], 'id': 'SOFT_ACTOR-CRITIC'},
    {'label': 'ML_METHOD', 'pattern': 'SAC', 'id': 'SOFT_ACTOR-CRITIC'},
    # MULTI_LAYER_PERCEPTRON
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'multi'}, {'LOWER': 'layer'}, {'LOWER': 'perceptron'}], 'id': 'MULTI_LAYER_PERCEPTRON'},
    {'label': 'ML_METHOD', 'pattern': 'MLP', 'id': 'MULTI_LAYER_PERCEPTRON'},
    # LENET
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'lenet'}], 'id': 'LENET'},
    # ALEXNET
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'alexnet'}], 'id': 'ALEXNET'},
    # VGGNET
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'vggnet'}], 'id': 'VGGNET'},
    # RESNET
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'resnet'}], 'id': 'RESNET'},
    # INCEPTION
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'inception'}], 'id': 'INCEPTION'},
    # EFFICIENTNET
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'efficientnet'}], 'id': 'EFFICIENTNET'},
    # RECURRENT_NEURAL_NETWORKS
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'recurrent'}, {'LOWER': 'neural'}, {'LOWER': {'REGEX': '^networks?$'}}], 'id': 'RECURRENT_NEURAL_NETWORKS'},
    {'label': 'ML_METHOD', 'pattern': 'RNN', 'id': 'RECURRENT_NEURAL_NETWORKS'},
    {'label': 'ML_METHOD', 'pattern': 'RNNs', 'id': 'RECURRENT_NEURAL_NETWORKS'},
    # LONG_SHORT-TERM_MEMORY
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'long'}, {'LOWER': 'short-term'}, {'LOWER': 'memory'}], 'id': 'LONG_SHORT-TERM_MEMORY'},
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'long'}, {'LOWER': 'short'}, {'LOWER': 'term'}, {'LOWER': 'memory'}], 'id': 'LONG_SHORT-TERM_MEMORY'},
    {'label': 'ML_METHOD', 'pattern': 'LSTM', 'id': 'LONG_SHORT-TERM_MEMORY'},
    # GATED_RECURRENT_UNIT
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'gated'}, {'LOWER': 'recurrent'}, {'LOWER': 'unit'}], 'id': 'GATED_RECURRENT_UNIT'},
    {'label': 'ML_METHOD', 'pattern': 'GRU', 'id': 'GATED_RECURRENT_UNIT'},
    # BIDIRECTIONAL_GATED_RECURRENT_UNIT
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'bidirectional'}, {'LOWER': 'gated'}, {'LOWER': 'recurrent'}, {'LOWER': 'unit'}], 'id': 'BIDIRECTIONAL_GATED_RECURRENT_UNIT'},
    {'label': 'ML_METHOD', 'pattern': 'BGRU', 'id': 'BIDIRECTIONAL_GATED_RECURRENT_UNIT'},
    # BIDIRECTIONAL_RNNS
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'bidirectional'}, {'LOWER': 'rnns'}], 'id': 'BIDIRECTIONAL_RNNS'},
    # TRANSFORMER
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': {'REGEX': '^transformers?$'}}], 'id': 'TRANSFORMER'},
    # BIDIRECTIONAL_ENCODER_REPRESENTATIONS_FROM_TRANSFORMERS
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'bidirectional'}, {'LOWER': 'encoder'}, {'LOWER': 'representations'}, {'LOWER': 'from'}, {'LOWER': 'transformers'}], 'id': 'BIDIRECTIONAL_ENCODER_REPRESENTATIONS_FROM_TRANSFORMERS'},
    {'label': 'ML_METHOD', 'pattern': 'BERT', 'id': 'BIDIRECTIONAL_ENCODER_REPRESENTATIONS_FROM_TRANSFORMERS'},
    # GENERATIVE_PRETRAINED_TRANSFORMERS
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'generative'}, {'LOWER': 'pretrained'}, {'LOWER': {'REGEX': '^transformers?$'}}], 'id': 'GENERATIVE_PRETRAINED_TRANSFORMERS'},
    {'label': 'ML_METHOD', 'pattern': 'GPT', 'id': 'GENERATIVE_PRETRAINED_TRANSFORMERS'},
    # TEXT-TO-TEXT_TRANSFER_TRANSFORMER
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'text-to-text'}, {'LOWER': 'transfer'}, {'LOWER': {'REGEX': '^transformers?$'}}], 'id': 'TEXT-TO-TEXT_TRANSFER_TRANSFORMER'},
    # VISION_TRANSFORMERS
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'vision'}, {'LOWER': {'REGEX': '^transformers?$'}}], 'id': 'VISION_TRANSFORMERS'},
    # SWIN_TRANSFORMER
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'swin'}, {'LOWER': {'REGEX': '^transformers?$'}}], 'id': 'SWIN_TRANSFORMER'},
    # GENERATIVE_ADVERSARIAL_NETWORKS
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'generative'}, {'LOWER': 'adversarial'}, {'LOWER': {'REGEX': '^networks?$'}}], 'id': 'GENERATIVE_ADVERSARIAL_NETWORKS'},
    {'label': 'ML_METHOD', 'pattern': 'GAN', 'id': 'GENERATIVE_ADVERSARIAL_NETWORKS'},
    # VARIATIONAL_AUTOENCODERS
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'variational'}, {'LOWER': {'REGEX': '^autoencoders?$'}}], 'id': 'VARIATIONAL_AUTOENCODERS'},
    # NORMALIZING_FLOWS
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'normalizing'}, {'LOWER': {'REGEX': '^flows?$'}}], 'id': 'NORMALIZING_FLOWS'},
    # DIFFUSION_MODELS
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'diffusion'}, {'LOWER': {'REGEX': '^models?$'}}], 'id': 'DIFFUSION_MODELS'},
    # STABLE_DIFFUSION
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'stable'}, {'LOWER': 'diffusion'}], 'id': 'STABLE_DIFFUSION'},
    # DENOISING_DIFFUSION_PROBABILISTIC_MODELS
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'denoising'}, {'LOWER': 'diffusion'}, {'LOWER': 'probabilistic'}, {'LOWER': {'REGEX': '^models?$'}}], 'id': 'DENOISING_DIFFUSION_PROBABILISTIC_MODELS'},
    # BAYESIAN_NETWORKS
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'bayesian'}, {'LOWER': {'REGEX': '^networks?$'}}], 'id': 'BAYESIAN_NETWORKS'},
    # MARKOV_CHAINS
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'markov'}, {'LOWER': 'chains'}], 'id': 'MARKOV_CHAINS'},
    # HIDDEN_MARKOV_MODELS
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'hidden'}, {'LOWER': 'markov'}, {'LOWER': {'REGEX': '^models?$'}}], 'id': 'HIDDEN_MARKOV_MODELS'},
    {'label': 'ML_METHOD', 'pattern': 'HMM', 'id': 'HIDDEN_MARKOV_MODELS'},
    {'label': 'ML_METHOD', 'pattern': 'HMMs', 'id': 'HIDDEN_MARKOV_MODELS'},
    # KALMAN_FILTERS
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'kalman'}, {'LOWER': {'REGEX': '^filters?$'}}], 'id': 'KALMAN_FILTERS'},
    # CONDITIONAL_RANDOM_FIELDS
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'conditional'}, {'LOWER': 'random'}, {'LOWER': 'fields'}], 'id': 'CONDITIONAL_RANDOM_FIELDS'},
    # GRAPH_NEURAL_NETWORKS
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'graph'}, {'LOWER': 'neural'}, {'LOWER': {'REGEX': '^networks?$'}}], 'id': 'GRAPH_NEURAL_NETWORKS'},
    {'label': 'ML_METHOD', 'pattern': 'GNN', 'id': 'GRAPH_NEURAL_NETWORKS'},
    # GRAPH_CONVOLUTIONAL_NETWORKS
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'graph'}, {'LOWER': 'convolutional'}, {'LOWER': {'REGEX': '^networks?$'}}], 'id': 'GRAPH_CONVOLUTIONAL_NETWORKS'},
    {'label': 'ML_METHOD', 'pattern': 'GCN', 'id': 'GRAPH_CONVOLUTIONAL_NETWORKS'},
    # GRAPH_ATTENTION_NETWORKS
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'graph'}, {'LOWER': 'attention'}, {'LOWER': {'REGEX': '^networks?$'}}], 'id': 'GRAPH_ATTENTION_NETWORKS'},
    # 'GAT' rather than 'GAN': 'GAN' is already the acronym for generative adversarial networks.
    {'label': 'ML_METHOD', 'pattern': 'GAT', 'id': 'GRAPH_ATTENTION_NETWORKS'},
    # AUTOREGRESSIVE_INTEGRATED_MOVING_AVERAGE
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'autoregressive'}, {'LOWER': 'integrated'}, {'LOWER': 'moving'}, {'LOWER': 'average'}], 'id': 'AUTOREGRESSIVE_INTEGRATED_MOVING_AVERAGE'},
    {'label': 'ML_METHOD', 'pattern': 'ARIMA', 'id': 'AUTOREGRESSIVE_INTEGRATED_MOVING_AVERAGE'},
    # SEASONAL_ARIMA
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'seasonal'}, {'LOWER': 'arima'}], 'id': 'SEASONAL_ARIMA'},
    {'label': 'ML_METHOD', 'pattern': 'SARIMA', 'id': 'SEASONAL_ARIMA'},
    # VECTOR_AUTOREGRESSION
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'vector'}, {'LOWER': 'autoregression'}], 'id': 'VECTOR_AUTOREGRESSION'},
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'vector'}, {'LOWER': 'autoregressive'}], 'id': 'VECTOR_AUTOREGRESSION'},
    {'label': 'ML_METHOD', 'pattern': 'VAR', 'id': 'VECTOR_AUTOREGRESSION'},
    # STATE-SPACE_MODELS
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'state-space'}, {'LOWER': 'models'}], 'id': 'STATE-SPACE_MODELS'},
    # PROPHET
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'prophet'}], 'id': 'PROPHET'},
    # BAGGING
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'bagging'}], 'id': 'BAGGING'},
    # BOOSTING
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'boosting'}], 'id': 'BOOSTING'},
    # ADABOOST
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'adaboost'}], 'id': 'ADABOOST'},
    # GRADIENT_BOOSTING
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'gradient'}, {'LOWER': 'boosting'}], 'id': 'GRADIENT_BOOSTING'},
    # STACKING
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'stacking'}], 'id': 'STACKING'},
    # BLENDING
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'blending'}], 'id': 'BLENDING'},
    # GPT-4
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'gpt-4'}], 'id': 'GPT-4'},
    # CHATGPT
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'chatgpt'}], 'id': 'CHATGPT'},
    # CLAUDE
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'claude'}], 'id': 'CLAUDE'},
    # LLAMA
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'llama'}], 'id': 'LLAMA'},
    # CONTRASTIVE_LANGUAGE-IMAGE_PRETRAINING
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'contrastive'}, {'LOWER': 'language-image'}, {'LOWER': 'pretraining'}], 'id': 'CONTRASTIVE_LANGUAGE-IMAGE_PRETRAINING'},
    {'label': 'ML_METHOD', 'pattern': 'CLIP', 'id': 'CONTRASTIVE_LANGUAGE-IMAGE_PRETRAINING'},
    # DALL-E
    {'label': 'ML_METHOD', 'pattern': [{'LOWER': 'dall-e'}], 'id': 'DALL-E'},
]


def _split_hyphenated_tokens(token_specs):
    """Return a variant of a token pattern with hyphenated literals split apart.

    spaCy's default English tokenizer splits a hyphen that sits between letters
    into its own token ("k-means" -> "k", "-", "means"), so a single-token spec
    such as {'LOWER': 'k-means'} cannot match text tokenized that way. Each such
    spec is replaced by one {'LOWER': part} per word with {'ORTH': '-'} between.

    Only specs whose sole key is a string-valued 'LOWER' made of purely
    alphabetic parts are split; regex specs and literals such as 'gpt-4' are
    kept unchanged.

    Args:
        token_specs: list of token-spec dicts (one token pattern).

    Returns:
        A new list of token specs, or None when nothing needed splitting.
    """
    variant = []
    changed = False
    for spec in token_specs:
        literal = spec.get('LOWER')
        parts = literal.split('-') if isinstance(literal, str) else []
        if len(spec) == 1 and len(parts) > 1 and all(part.isalpha() for part in parts):
            for index, part in enumerate(parts):
                if index:
                    variant.append({'ORTH': '-'})
                variant.append({'LOWER': part})
            changed = True
        else:
            variant.append(spec)
    return variant if changed else None


# Add the split-hyphen variants alongside the originals. The originals are kept so
# that a tokenizer configured to keep hyphenated words whole still matches.
_hyphen_variants = []
for _entry in patterns:
    if isinstance(_entry['pattern'], list):
        _variant = _split_hyphenated_tokens(_entry['pattern'])
        if _variant is not None:
            _hyphen_variants.append({**_entry, 'pattern': _variant})
patterns.extend(_hyphen_variants)

In [4]:
# Load the pretrained small English pipeline that the ruler will be attached to
nlp = spacy.load("en_core_web_sm")

# Create an EntityRuler component inside the pipeline, positioned ahead of the
# "ner" component; add_pipe returns the component so patterns can be added to it
ruler = nlp.add_pipe("entity_ruler", before="ner")

In [5]:
# Register every ML_METHOD pattern from the list above with the ruler
ruler.add_patterns(patterns)

Test EntityRuler

In [6]:
# Sample sentence with four surface forms of one method: singular, plural, the
# plain acronym, and a dotted acronym. The SUPPORT_VECTOR_MACHINES patterns
# cover only the first three.
text = "Support Vector Machine, Support Vector Machines, SVM, and S.V.M. are popular machine learning methods."
# Run the pipeline (now including the ruler) on the sample
doc = nlp(text)

In [7]:
# Print detected entities
for ent in doc.ents:
    print(ent.text, ent.label_, ent.id_)

Support Vector Machine ML_METHOD SUPPORT_VECTOR_MACHINES
Support Vector Machines ML_METHOD SUPPORT_VECTOR_MACHINES
SVM ML_METHOD SUPPORT_VECTOR_MACHINES
S.V.M. ORG 


Save the EntityRuler to a file

In [8]:
# Serialize the configured ruler under the data folder so it can be reloaded later
ruler_output_path = DATA_FOLDER + 'ml_entity_ruler_patterns'
ruler.to_disk(ruler_output_path)