# Imports

In [1]:
import inspect
import pathlib

In [2]:
from reader import SqliteCorpusReader, PickledCorpusReader
from preprocessor import Preprocessor
from transformer import TextNormalizer, KeyphraseExtractor

# Constants

In [3]:
# Project root under the current user's home directory ('~' is expanded here),
# with the data directory nested directly beneath it.
PROJECT_DIR = pathlib.Path('~/project').expanduser()
DATA_DIR = PROJECT_DIR.joinpath('data')

In [4]:
# Corpus locations under DATA_DIR: the SQLite database handed to
# SqliteCorpusReader, and the root handed to PickledCorpusReader.
CORPUS_ROOT = DATA_DIR.joinpath('database.sqlite')
CORPUS_PICKLED_ROOT = DATA_DIR.joinpath('pitchfork_reviews')

# Classes

## SqliteCorpusReader

In [5]:
# Open the corpus at CORPUS_ROOT with the project's SqliteCorpusReader and
# print the summary text returned by its describes() method.
# The recorded output below reports a corpus scan of roughly 14.7 seconds.
corpus = SqliteCorpusReader(CORPUS_ROOT)
print(corpus.describes())

Corpus contains 18,389 files.
Structured as:
    42,032 paragraphs (2.286 mean paragraphs per file)
    433,821 sentences (10.321 mean sentences per paragraph).
Word count of 15,159,413 with a vocabulary of 153,836 (98.543 lexical diversity).
Corpus scan took 14.652 seconds.


## Preprocessor

In [6]:
# Run the preprocessing step only when CORPUS_PICKLED_ROOT does not exist yet,
# so a fresh checkout can Run All while later runs skip this step.
# Presumably Preprocessor writes the pickled corpus that the PickledCorpusReader
# cells read from CORPUS_PICKLED_ROOT — TODO confirm against preprocessor.py.
# NOTE(review): an existing but incomplete CORPUS_PICKLED_ROOT is treated as
# done; delete it to force a rebuild.
if not CORPUS_PICKLED_ROOT.exists():
    corpus = SqliteCorpusReader(CORPUS_ROOT)
    preprocessor = Preprocessor(corpus, CORPUS_PICKLED_ROOT.as_posix())
    # list() forces the work to happen here in case transform() is lazy
    # (e.g. a generator) — TODO confirm; harmless if it already returns a list.
    docs = list(preprocessor.transform())

## PickledCorpusReader

In [7]:
# Bind `corpus` to a PickledCorpusReader over CORPUS_PICKLED_ROOT (passed as a
# POSIX-style string) and print the summary text returned by describes().
corpus = PickledCorpusReader(CORPUS_PICKLED_ROOT.as_posix())
print(corpus.describes())

Corpus contains 18,389 files.
Structured as:
    42,032 paragraphs (2.286 mean paragraphs per file)
    433,821 sentences (10.321 mean sentences per paragraph).
Word count of 15,159,413 with a vocabulary of 153,836 (98.543 lexical diversity).
Corpus scan took 14.613 seconds.


## TextNormalizer

In [5]:
# Fresh reader over the pickled corpus plus a TextNormalizer constructed with
# no arguments; both names are consumed by the following cell.
corpus = PickledCorpusReader(CORPUS_PICKLED_ROOT.as_posix())
normalizer = TextNormalizer()

In [6]:
# Pass corpus.docs() through normalizer.fit_transform() and count the results
# by consuming the returned iterable one item at a time (nothing is retained).
# The count is the cell's displayed value.
sum(1 for _ in normalizer.fit_transform(corpus.docs()))

18389

## KeyphraseExtractor

In [5]:
# Fresh reader over the pickled corpus plus a KeyphraseExtractor constructed
# with no arguments; both names are consumed by the following cell.
corpus = PickledCorpusReader(CORPUS_PICKLED_ROOT.as_posix())
keyphrase_extractor = KeyphraseExtractor()

In [6]:
# Pass corpus.docs() through keyphrase_extractor.fit_transform() and count the
# results by consuming the returned iterable one item at a time.
# The count is the cell's displayed value.
sum(1 for _ in keyphrase_extractor.fit_transform(corpus.docs()))

18389

# Applied Neural Networks

# Neural Language Models

## Artificial Neural Networks

### Training a multilayer perceptron

In [7]:
from snark import fit_mlp_classifier, fit_mlp_regressor

In [8]:
# Call snark.fit_mlp_classifier on the pickled corpus path (POSIX-style string).
# The recorded output below shows ten per-slice scores, a CV score, the total
# fit time and the filename the model was saved to.
# Long-running: that recorded run took 22237.650 seconds (about 6.2 hours).
fit_mlp_classifier(CORPUS_PICKLED_ROOT.as_posix())

Score on slice #1: 0.677
Score on slice #2: 0.680
Score on slice #3: 0.686
Score on slice #4: 0.679
Score on slice #5: 0.682
Score on slice #6: 0.664
Score on slice #7: 0.670
Score on slice #8: 0.667
Score on slice #9: 0.670
Score on slice #10: 0.680
CV score: 0.676 ± 0.007
Total fit time: 22237.650 seconds
Model saved to mlp_classifier_1714948900.4946284.pkl


In [11]:
# Call snark.fit_mlp_regressor on the pickled corpus path (POSIX-style string).
# The recorded output below shows ten per-slice scores, a CV score, the total
# fit time and the filename the model was saved to.
# Long-running: that recorded run took 54632.035 seconds (about 15.2 hours).
fit_mlp_regressor(CORPUS_PICKLED_ROOT.as_posix())

Score on slice #1: 0.290
Score on slice #2: 0.354
Score on slice #3: 0.331
Score on slice #4: 0.209
Score on slice #5: 0.301
Score on slice #6: 0.332
Score on slice #7: 0.289
Score on slice #8: 0.317
Score on slice #9: 0.263
Score on slice #10: 0.319
CV score: 0.301 ± 0.039
Total fit time: 54632.035 seconds
Model saved to mlp_regressor_1715085786.9095948.pkl


## Deep Learning Architectures

### TensorFlow: A framework for deep learning

### Keras: An API for deep learning

In [5]:
from deep_snark import fit_dnn_classifier

In [6]:
# Call deep_snark.fit_dnn_classifier on the pickled corpus path.
# NOTE(review): the recorded output below lists epochs out of 200 and is cut
# off at "Epoch 78", so no final score from this run is captured here.
fit_dnn_classifier(CORPUS_PICKLED_ROOT.as_posix())

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

# Sentiment Analysis

## Deep Structure Analysis

### Predicting sentiment with a bag-of-keyphrases

In [7]:
from deep_snark import fit_lstm_classifier

In [8]:
# Call deep_snark.fit_lstm_classifier on the pickled corpus path.
# NOTE(review): the recorded output below shows four complete 20-epoch runs and
# a fifth cut off at epoch 7 (presumably one run per cross-validation split —
# confirm in deep_snark); no final score from this run is captured here.
fit_lstm_classifier(CORPUS_PICKLED_ROOT.as_posix())

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20

# The Future Is (Almost) Here