# Imports

In [1]:
import pathlib

In [2]:
from reader import HTMLPickledCorpusReader
from transformer import TextNormalizer, identity

# Constants

In [3]:
# Project root: '~/project' with the leading '~' expanded to the user's home.
PROJECT_DIR = pathlib.Path('~/project').expanduser()
# All data sits in the 'data' directory directly beneath the project root.
DATA_DIR = PROJECT_DIR.joinpath('data')

In [4]:
# Directory handed to the corpus reader: the 'sample' folder under DATA_DIR.
CORPUS_ROOT = DATA_DIR.joinpath('sample')

# Classes

## HTMLPickledCorpusReader

In [5]:
# Open the sample corpus; the reader is given the root as a POSIX-style string.
corpus = HTMLPickledCorpusReader(CORPUS_ROOT.as_posix())
# describes() returns the text summary printed below (file, category,
# paragraph, sentence and word counts, plus how long the corpus scan took).
print(corpus.describes())

HTML corpus contains 2,538 files in 12 categories.
Structured as:
    43,922 paragraphs (17.306 mean paragraphs per file)
    74,899 sentences (1.705 mean sentences per paragraph).
Word count of 1,624,862 with a vocabulary of 58,748 (27.658 lexical diversity).
Corpus scan took 1.666 seconds.


## TextNormalizer

In [6]:
# Rebuild the reader so this section runs without relying on the cell above.
corpus = HTMLPickledCorpusReader(CORPUS_ROOT.as_posix())
# TextNormalizer constructed with its default arguments.
normalizer = TextNormalizer()

In [7]:
# Count the documents produced by normalizing the whole corpus. Counting item
# by item avoids building a throwaway list of every normalized document just
# to take its length; the displayed value is the same.
sum(1 for _doc in normalizer.fit_transform(corpus.docs()))

2538

# Python Multiprocessing

## Running Tasks in Parallel

In [8]:
from mp_train import sequential, parallel

In [9]:
print("beginning sequential tasks")
# sequential() returns a pair; only the second item (elapsed time, reported
# below in seconds) is used here. The first item is bound to a regular
# placeholder name instead of `_`, because assigning to `_` in a notebook
# overrides IPython's "last output" variable and stops it from updating.
_ignored, delta = sequential(CORPUS_ROOT.as_posix())
print("total sequential fit time: {:0.2f} seconds".format(delta))

beginning sequential tasks


MainProcess 2024-02-26 21:34:46 naive bayes training took 31.25 seconds with an average score of 0.459
MainProcess 2024-02-26 21:36:18 logistic regression training took 91.73 seconds with an average score of 0.570
MainProcess 2024-02-26 21:37:29 multilayer perceptron training took 71.02 seconds with an average score of 0.585


total sequential fit time: 194.01 seconds


In [10]:
print("beginning parallel tasks")
# parallel() returns a pair; only the second item (elapsed time, reported
# below in seconds) is used here. The first item is bound to a regular
# placeholder name instead of `_`, because assigning to `_` in a notebook
# overrides IPython's "last output" variable and stops it from updating.
_ignored, delta = parallel(CORPUS_ROOT.as_posix())
print("total parallel fit time: {:0.2f} seconds".format(delta))

beginning parallel tasks


fit_naive_bayes 2024-02-26 21:38:08 naive bayes training took 39.44 seconds with an average score of 0.459
fit_multilayer_perceptron 2024-02-26 21:38:46 multilayer perceptron training took 77.72 seconds with an average score of 0.559
fit_logistic_regression 2024-02-26 21:39:07 logistic regression training took 98.67 seconds with an average score of 0.570


total parallel fit time: 98.70 seconds


## Process Pools and Queues

## Parallel Corpus Preprocessing

# Cluster Computing with Spark

## Anatomy of a Spark Job

## Distributing the Corpus

## RDD Operations

## NLP with Spark

### From Scikit-Learn to MLlib

### Feature extraction

### Text clustering with MLlib

### Text classification with MLlib

### Local fit, global evaluation