# Imports

In [1]:
import inspect
import pathlib

In [2]:
from reader import HTMLPickledCorpusReader
from transformer import TextNormalizer

# Constants

In [3]:
# Project root, with the leading "~" resolved to the current user's home directory.
PROJECT_DIR = pathlib.Path('~/project').expanduser()
# Corpus directories used by this notebook are resolved relative to this folder.
DATA_DIR = PROJECT_DIR.joinpath('data')

In [4]:
# Corpus locations, both resolved beneath DATA_DIR.
SAMPLE_ROOT = DATA_DIR.joinpath('sample')
HOBBIES_ROOT = DATA_DIR.joinpath('hobbies')

# Classes

## HTMLPickledCorpusReader

In [5]:
# Open the sample corpus and print the reader's own summary report.
sample_root = SAMPLE_ROOT.as_posix()
corpus = HTMLPickledCorpusReader(sample_root)
summary = corpus.describes()
print(summary)

HTML corpus contains 2,538 files in 12 categories.
Structured as:
    43,922 paragraphs (17.306 mean paragraphs per file)
    74,899 sentences (1.705 mean sentences per paragraph).
Word count of 1,624,862 with a vocabulary of 58,748 (27.658 lexical diversity).
Corpus scan took 1.652 seconds.


## TextNormalizer

In [6]:
# A normalizer built with no arguments, plus a fresh reader over the sample corpus.
normalizer = TextNormalizer()
corpus = HTMLPickledCorpusReader(SAMPLE_ROOT.as_posix())

In [7]:
# Count the normalized documents one by one rather than collecting them in a list first.
sum(1 for _ in normalizer.fit_transform(corpus.docs()))

2538

# Python Multiprocessing

## Running Tasks in Parallel

In [8]:
from mp_train import sequential, parallel

In [9]:
# Run the fits through `sequential` and report the elapsed time it returns.
# The call yields a pair; only the second element (seconds) is used here.
# The first element is bound to a throwaway name other than `_`: assigning
# `_` in an IPython kernel stops the kernel from updating its "last output"
# shortcuts for the rest of the session.
print("beginning sequential tasks")
_unused, delta = sequential(SAMPLE_ROOT.as_posix())
print("total sequential fit time: {:0.2f} seconds".format(delta))

beginning sequential tasks


MainProcess 2024-03-03 18:54:51 naive bayes training took 31.92 seconds with an average score of 0.459
MainProcess 2024-03-03 18:56:21 logistic regression training took 90.45 seconds with an average score of 0.570
MainProcess 2024-03-03 18:57:24 multilayer perceptron training took 63.13 seconds with an average score of 0.556


total sequential fit time: 185.51 seconds


In [10]:
# Run the fits through `parallel` and report the elapsed time it returns.
# The call yields a pair; only the second element (seconds) is used here.
# The first element is bound to a throwaway name other than `_`: assigning
# `_` in an IPython kernel stops the kernel from updating its "last output"
# shortcuts for the rest of the session.
print("beginning parallel tasks")
_unused, delta = parallel(SAMPLE_ROOT.as_posix())
print("total parallel fit time: {:0.2f} seconds".format(delta))

beginning parallel tasks


fit_naive_bayes 2024-03-03 18:58:12 naive bayes training took 48.19 seconds with an average score of 0.459
fit_multilayer_perceptron 2024-03-03 18:58:53 multilayer perceptron training took 88.74 seconds with an average score of 0.572
fit_logistic_regression 2024-03-03 18:59:17 logistic regression training took 112.90 seconds with an average score of 0.570


total parallel fit time: 112.93 seconds


## Process Pools and Queues

In [11]:
from mcpi import mcpi_sequential, mcpi_parallel

In [12]:
# Size argument passed to both mcpi_* runs below
# (presumably the number of random samples -- confirm in mcpi.py).
N = 10 ** 7

In [13]:
# Sequential run: the call yields a pair that is unpacked into the estimate
# (`pi`) and the elapsed seconds (`delta`) reported by the print below.
pi, delta = mcpi_sequential(N)
print("sequential pi: {} in {:0.2f} seconds".format(pi, delta))

sequential pi: 3.1421316 in 1.56 seconds


In [14]:
# Parallel run with the same N; rebinds `pi` and `delta`, replacing the
# values left by the sequential cell.
pi, delta = mcpi_parallel(N)
print("parallel pi: {} in {:0.2f} seconds".format(pi, delta))

parallel pi: 3.14214 in 0.34 seconds


## Parallel Corpus Preprocessing

In [15]:
from preprocessor import Preprocessor

In [16]:
# Print the class's source text inline so it can be read without opening preprocessor.py.
print(inspect.getsource(Preprocessor))

class Preprocessor(object):
    """
    The preprocessor wraps a corpus object (usually a `HTMLCorpusReader`)
    and manages the stateful tokenization and part of speech tagging into a
    directory that is stored in a format that can be read by the
    `HTMLPickledCorpusReader`. This format is more compact and necessarily
    removes a variety of fields from the document that are stored in the JSON
    representation dumped from the Mongo database. This format however is more
    easily accessed for common parsing activity.
    """

    def __init__(self, corpus, target=None, **kwargs):
        """
        The corpus is the `HTMLCorpusReader` to preprocess and pickle.
        The target is the directory on disk to output the pickled corpus to.
        """
        self.corpus = corpus
        self.target = target

    @property
    def target(self):
        return self._target

    @target.setter
    def target(self, path):
        if path is not None:
            # Normalize the path

# Cluster Computing with Spark

## Anatomy of a Spark Job

In [8]:
from sc_template import confugure_spark

In [9]:
# Application name handed to confugure_spark in the next cell.
APP_NAME = 'My Spark Application'

In [10]:
# The helper returns two handles: `sc` is used below for wholeTextFiles, and
# both `sc` and `spark` are passed to load_corpus.
# NOTE(review): `confugure_spark` looks like a misspelling of `configure_spark`.
# The name must match what sc_template exports, so rename it there first.
sc, spark = confugure_spark(APP_NAME)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/22 00:01:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Distributing the Corpus

In [11]:
# Glob over every .txt file exactly one directory level below HOBBIES_ROOT.
hobbies_glob = (HOBBIES_ROOT / '*' / '*.txt').as_posix()
corpus = sc.wholeTextFiles(hobbies_glob)

## RDD Operations

In [12]:
# !/opt/spark/bin/spark-submit sc_bigramcount.py

In [13]:
from sc_bigramcount import count_labels, count_bigrams

In [14]:
# Label tally over the distributed corpus; the Label/Count table below is
# what this call produced in the recorded run.
count_labels(corpus)

[Stage 0:>                                                          (0 + 2) / 2]

Label      Count
-------  -------
books         72
cinema       100
gaming       128
sports       118
cooking       30


                                                                                

In [15]:
# Bigram count over the same corpus; the recorded run reported the number of
# unique bigrams followed by a single example Row.
count_bigrams(corpus)

                                                                                

unique bigrams: 138204


                                                                                

Row(_1=Row(_1='From', _2='to'), _2=1)


## NLP with Spark

### From Scikit-Learn to MLlib

### Feature extraction

In [16]:
# !/opt/spark/bin/spark-submit sc_vectorization.py

In [17]:
from sc_vectorization import load_corpus, make_vectorizer

In [18]:
# Load every .txt file one directory level below HOBBIES_ROOT, using the
# load_corpus helper imported from sc_vectorization.
hobbies_glob = (HOBBIES_ROOT / '*' / '*.txt').as_posix()
corpus = load_corpus(sc, spark, path=hobbies_glob)

In [19]:
# Build the vectorizer and fit it on the corpus in one step; `vectorizer`
# ends up bound to the object returned by fit().
vectorizer = make_vectorizer().fit(corpus)

                                                                                

In [20]:
# Apply the fitted vectorizer to the corpus. The result exposes .show(), used
# in the next cell (presumably a Spark DataFrame -- confirm).
vectors = vectorizer.transform(corpus)

In [21]:
# Preview five rows; the recorded output lists the columns label, text,
# tokens, filtered_tokens, frequency and tfidf.
vectors.show(5)

+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|label|                text|              tokens|     filtered_tokens|           frequency|               tfidf|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|books|\r\n\r\nFrom \n\n...|[, , , , from, , ...|[, , , , , , , , ...|(4096,[38,71,106,...|(4096,[38,71,106,...|
|books|The Lonely City b...|[the, lonely, cit...|[lonely, city bri...|(4096,[89,132,156...|(4096,[89,132,156...|
|books|\n\n\n\nRelated P...|[, , , , related,...|[, , , , related,...|(4096,[445,2545,3...|(4096,[445,2545,3...|
|books|The first story i...|[the, first, stor...|[first, story, sa...|(4096,[3,27,31,57...|(4096,[3,27,31,57...|
|books|by Sonny Liew\n\n...|[by, sonny, liew,...|[sonny, liew, , h...|(4096,[315,480,53...|(4096,[315,480,53...|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+

### Text clustering with MLlib

In [22]:
# !/opt/spark/bin/spark-submit sc_clustering.py

In [23]:
from sc_clustering import load_corpus, make_clusterer, evaluate_clustering

In [24]:
# Reload the hobbies corpus. `load_corpus` here is the one imported from
# sc_clustering, which replaces the earlier sc_vectorization import of the
# same name.
hobbies_glob = (HOBBIES_ROOT / '*' / '*.txt').as_posix()
corpus = load_corpus(sc, spark, path=hobbies_glob)

In [25]:
# Build the clusterer and fit it on the corpus in one step; `clusterer`
# ends up bound to the object returned by fit().
clusterer = make_clusterer().fit(corpus)

24/03/22 00:02:08 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
                                                                                

In [26]:
# Apply the fitted clusterer to the corpus; the result is passed to
# evaluate_clustering in the next cell.
predictions = clusterer.transform(corpus)

In [27]:
# Report on the fit. The recorded run printed a per-cluster table (size and
# terms), then the sum of squared distances to center and a silhouette score.
evaluate_clustering(clusterer, predictions)

  Cluster    Size  Terms
---------  ------  ----------------------------------------------------------------------------
        0       3  glass-cutting mvp friday's and ($3,000) sportsbooks storage
        1      18  the and to a of in
        2      34  warrior," teases more wear—it's blogcast@playstation.sony.com spotrac. loot,
        3      74  loot, that backing 12th warrior," more (58),
        4      73  and transformative backing glow, loot, was 12th
        5      91  loot, backing that 12th and record. glow,
        6     107  and record. 12th in mcilroy-6 countered. for
        7      34  record. countered. and statutes 12th territories? factory
        8      10  on-loan  territories? countered. hype statutes imperium,
        9       4  yung paused. lab’s 10-on-10 lines. saqlain splatoon
Sum of square distance to center: 2.687
Silhouette with squared euclidean distance: 0.264


### Text classification with MLlib

### Local fit, global evaluation