In [1]:
# Install Spacy the first time you run this notebook
!pip install -U pip setuptools wheel
!pip install -U spacy
!python -m spacy download en_core_web_sm

Collecting spacy
  Downloading spacy-3.8.2-cp312-cp312-win_amd64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.10-cp312-cp312-win_amd64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.8-cp312-cp312-win_amd64.whl.metadata (8.6 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp312-cp312-win_amd64.whl.metadata (2.2 kB)
Collecting thinc<8.4.0,>=8.3.0 (from spacy)
  Downloading thinc-8.3.2-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Downloading srsly-2.4.8-cp312-cp312-win_amd64.

In [11]:
from mtasklite import delayed_init

# Pipeline component names; the parser class below passes some of them to
# spacy.load(disable=...) so those components are not loaded.
SPACY_POS, SPACY_NER, SPACY_PARSER = 'tagger', 'ner', 'parser'

# Name of the spaCy pipeline package handed to spacy.load.
SPACY_MODEL = 'en_core_web_sm'

@delayed_init
class SpacyTextParser:
    """Callable tokenizer backed by a spaCy pipeline.

    Calling an instance with a string returns the list of token strings that
    the loaded pipeline produces for it. The class is wrapped with
    ``delayed_init`` from ``mtasklite``.
    """

    def __init__(self, model_name, disable_modules=(SPACY_NER, SPACY_PARSER)):
        """Load the spaCy pipeline used for tokenization.

        Args:
            model_name: Pipeline name forwarded to ``spacy.load``.
            disable_modules: A component name, or an iterable of component
                names, forwarded to ``spacy.load(disable=...)``. Defaults to
                the NER and parser components. The default is an immutable
                tuple so it cannot be shared and mutated between calls.
        """
        # It's important to import and load spacy here, inside the constructor,
        # rather than at module level.
        # NOTE(review): presumably because delayed_init postpones construction
        # to the worker process -- confirm against the mtasklite documentation.
        import spacy

        # A single name is passed through unchanged; any other iterable is
        # copied into a fresh list so the caller's object is never shared.
        if isinstance(disable_modules, str):
            disabled = disable_modules
        else:
            disabled = list(disable_modules)
        self._nlp = spacy.load(model_name, disable=disabled)

    def __call__(self, text):
        """Return the text of each token the pipeline finds in ``text``."""
        return [token.text for token in self._nlp(text)]

In [12]:
# Quick sanity check in the notebook process: build one parser and tokenize a
# short sentence. The bare last expression is what the cell displays.
parser = SpacyTextParser(SPACY_MODEL)
parser('This is a simple text!')

['This', 'is', 'a', 'simple', 'text', '!']

In [13]:
from mtasklite.processes import pqdm

# Sample sentences to tokenize in parallel.
input_arr = [
    'Accelerate is a library that enables the same PyTorch code to be run across any distributed configuration by adding just four lines of code!',
    'In short, training and inference at scale made simple, efficient and adaptable.',
    'Built on torch_xla and torch.distributed, Accelerate takes care of the heavy lifting, so you don’t have to write any custom code to adapt to these platforms.',
    'Convert existing codebases to utilize DeepSpeed, perform fully sharded data parallelism, and have automatic support for mixed-precision training!',
    'Welcome to the Accelerate tutorials!',
    'These introductory guides will help catch you up to speed on working with Accelerate.',
    'You’ll learn how to modify your code to have it work with the API seamlessly, how to launch your script properly, and more!',
    'These tutorials assume some basic knowledge of Python and familiarity with the PyTorch framework.',
]

# Number of workers: pqdm receives one list entry per worker, and every entry
# is built with the same model name.
N_JOBS = 4
result = pqdm(input_arr, [SpacyTextParser(SPACY_MODEL)] * N_JOBS)

# Materialize the results; the resulting list is the cell's displayed output.
list(result)

  0%|          | 0/8 [00:00<?, ?it/s]

[['Accelerate',
  'is',
  'a',
  'library',
  'that',
  'enables',
  'the',
  'same',
  'PyTorch',
  'code',
  'to',
  'be',
  'run',
  'across',
  'any',
  'distributed',
  'configuration',
  'by',
  'adding',
  'just',
  'four',
  'lines',
  'of',
  'code',
  '!'],
 ['In',
  'short',
  ',',
  'training',
  'and',
  'inference',
  'at',
  'scale',
  'made',
  'simple',
  ',',
  'efficient',
  'and',
  'adaptable',
  '.'],
 ['Built',
  'on',
  'torch_xla',
  'and',
  'torch.distributed',
  ',',
  'Accelerate',
  'takes',
  'care',
  'of',
  'the',
  'heavy',
  'lifting',
  ',',
  'so',
  'you',
  'do',
  'n’t',
  'have',
  'to',
  'write',
  'any',
  'custom',
  'code',
  'to',
  'adapt',
  'to',
  'these',
  'platforms',
  '.'],
 ['Convert',
  'existing',
  'codebases',
  'to',
  'utilize',
  'DeepSpeed',
  ',',
  'perform',
  'fully',
  'sharded',
  'data',
  'parallelism',
  ',',
  'and',
  'have',
  'automatic',
  'support',
  'for',
  'mixed',
  '-',
  'precision',
  'training',
 