# Imports

In [1]:
import inspect
import pathlib

In [2]:
from reader import HTMLPickledCorpusReader
from transformer import TextNormalizer, identity

# Constants

In [3]:
# Project root: `~/project`, with `~` expanded to the current user's home.
PROJECT_DIR = pathlib.Path('~/project').expanduser()
# `data` subdirectory of the project root.
DATA_DIR = PROJECT_DIR.joinpath('data')

In [4]:
# `sample` subdirectory of DATA_DIR; passed to the corpus reader and to the
# training helpers in the cells below.
CORPUS_ROOT = DATA_DIR.joinpath('sample')

# Classes

## HTMLPickledCorpusReader

In [5]:
# Build an HTMLPickledCorpusReader over CORPUS_ROOT; the reader is handed the
# path as a POSIX-style string rather than a pathlib.Path object.
corpus = HTMLPickledCorpusReader(CORPUS_ROOT.as_posix())
# Print the value returned by the reader's describes() method.
print(corpus.describes())

HTML corpus contains 2,538 files in 12 categories.
Structured as:
    43,922 paragraphs (17.306 mean paragraphs per file)
    74,899 sentences (1.705 mean sentences per paragraph).
Word count of 1,624,862 with a vocabulary of 58,748 (27.658 lexical diversity).
Corpus scan took 2.002 seconds.


## TextNormalizer

In [6]:
# Rebind `corpus` to a fresh reader over the same CORPUS_ROOT, and create a
# TextNormalizer constructed with no arguments.
corpus = HTMLPickledCorpusReader(CORPUS_ROOT.as_posix())
normalizer = TextNormalizer()

In [7]:
# Count the normalized documents one at a time instead of first collecting
# all of them into a list; only the count is displayed by this cell.
sum(1 for _doc in normalizer.fit_transform(corpus.docs()))

2538

# Python Multiprocessing

## Running Tasks in Parallel

In [8]:
from mp_train import sequential, parallel

In [9]:
# Run mp_train's `sequential` on the sample corpus and report the second
# element of its result as the total fit time in seconds.
print("beginning sequential tasks")
# The first element is not used. It is bound to a named throwaway rather than
# `_`, because assigning `_` in an IPython kernel stops IPython from updating
# `_` with the most recent cell output.
_unused, delta = sequential(CORPUS_ROOT.as_posix())
print("total sequential fit time: {:0.2f} seconds".format(delta))

beginning sequential tasks


MainProcess 2024-03-03 18:54:51 naive bayes training took 31.92 seconds with an average score of 0.459
MainProcess 2024-03-03 18:56:21 logistic regression training took 90.45 seconds with an average score of 0.570
MainProcess 2024-03-03 18:57:24 multilayer perceptron training took 63.13 seconds with an average score of 0.556


total sequential fit time: 185.51 seconds


In [10]:
# Run mp_train's `parallel` on the sample corpus and report the second
# element of its result as the total fit time in seconds.
print("beginning parallel tasks")
# The first element is not used. It is bound to a named throwaway rather than
# `_`, because assigning `_` in an IPython kernel stops IPython from updating
# `_` with the most recent cell output.
_unused, delta = parallel(CORPUS_ROOT.as_posix())
print("total parallel fit time: {:0.2f} seconds".format(delta))

beginning parallel tasks


fit_naive_bayes 2024-03-03 18:58:12 naive bayes training took 48.19 seconds with an average score of 0.459
fit_multilayer_perceptron 2024-03-03 18:58:53 multilayer perceptron training took 88.74 seconds with an average score of 0.572
fit_logistic_regression 2024-03-03 18:59:17 logistic regression training took 112.90 seconds with an average score of 0.570


total parallel fit time: 112.93 seconds


## Process Pools and Queues

In [11]:
from mcpi import mcpi_sequential, mcpi_parallel

In [12]:
# Argument handed to both mcpi_sequential and mcpi_parallel below (ten
# million); presumably the number of Monte Carlo samples -- confirm in mcpi.
N = 10 ** 7

In [13]:
# Call mcpi_sequential with N and unpack its two results: the value printed
# as pi and the value printed as the elapsed time in seconds.
pi, delta = mcpi_sequential(N)
print("sequential pi: {} in {:0.2f} seconds".format(pi, delta))

sequential pi: 3.1421316 in 1.56 seconds


In [14]:
# Call mcpi_parallel with the same N and unpack its two results: the value
# printed as pi and the value printed as the elapsed time in seconds.
pi, delta = mcpi_parallel(N)
print("parallel pi: {} in {:0.2f} seconds".format(pi, delta))

parallel pi: 3.14214 in 0.34 seconds


## Parallel Corpus Preprocessing

In [15]:
from preprocessor import Preprocessor

In [16]:
# Print the source code of the Preprocessor class as retrieved by
# inspect.getsource.
print(inspect.getsource(Preprocessor))

class Preprocessor(object):
    """
    The preprocessor wraps a corpus object (usually a `HTMLCorpusReader`)
    and manages the stateful tokenization and part of speech tagging into a
    directory that is stored in a format that can be read by the
    `HTMLPickledCorpusReader`. This format is more compact and necessarily
    removes a variety of fields from the document that are stored in the JSON
    representation dumped from the Mongo database. This format however is more
    easily accessed for common parsing activity.
    """

    def __init__(self, corpus, target=None, **kwargs):
        """
        The corpus is the `HTMLCorpusReader` to preprocess and pickle.
        The target is the directory on disk to output the pickled corpus to.
        """
        self.corpus = corpus
        self.target = target

    @property
    def target(self):
        return self._target

    @target.setter
    def target(self, path):
        if path is not None:
            # Normalize the path

# Cluster Computing with Spark

## Anatomy of a Spark Job

## Distributing the Corpus

## RDD Operations

## NLP with Spark

### From Scikit-Learn to MLlib

### Feature extraction

### Text clustering with MLlib

### Text classification with MLlib

### Local fit, global evaluation