# Imports

In [1]:
import inspect
import pathlib

In [2]:
from reader import SqliteCorpusReader, PickledCorpusReader
from preprocessor import Preprocessor
from transformer import TextNormalizer

# Constants

In [3]:
# Project root; the leading "~" is expanded to the current user's home directory.
PROJECT_DIR = pathlib.Path('~/project').expanduser()
# Data folder located directly under the project root.
DATA_DIR = PROJECT_DIR.joinpath('data')

In [4]:
# Path of the SQLite database that is handed to SqliteCorpusReader.
CORPUS_ROOT = DATA_DIR.joinpath('database.sqlite')
# Path of the pickled corpus that is handed to PickledCorpusReader.
CORPUS_PICKLED_ROOT = DATA_DIR.joinpath('pitchfork_reviews')

# Classes

## SqliteCorpusReader

In [5]:
# Open a corpus reader over the SQLite database file.
corpus = SqliteCorpusReader(CORPUS_ROOT)
# Print the corpus summary; the recorded output reports file, paragraph,
# sentence and word counts together with the time the scan took.
print(corpus.describes())

Corpus contains 18,389 files.
Structured as:
    42,032 paragraphs (2.286 mean paragraphs per file)
    433,821 sentences (10.321 mean sentences per paragraph).
Word count of 15,159,413 with a vocabulary of 153,836 (98.543 lexical diversity).
Corpus scan took 14.652 seconds.


## Preprocessor

In [6]:
# Disabled on purpose: this cell is kept for reference only and does not run.
# NOTE(review): presumably a one-off step that builds the pickled corpus at
# CORPUS_PICKLED_ROOT from the SQLite corpus — confirm before re-enabling.
# corpus = SqliteCorpusReader(CORPUS_ROOT)
# preprocessor = Preprocessor(corpus, CORPUS_PICKLED_ROOT.as_posix())
# docs = preprocessor.transform()

## PickledCorpusReader

In [7]:
# Open a reader over the pickled corpus; the path is passed as a
# forward-slash string via as_posix() rather than as a Path object.
# Note: this rebinds `corpus`, replacing the SQLite-backed reader created earlier.
corpus = PickledCorpusReader(CORPUS_PICKLED_ROOT.as_posix())
# Print the same summary report for the pickled corpus.
print(corpus.describes())

Corpus contains 18,389 files.
Structured as:
    42,032 paragraphs (2.286 mean paragraphs per file)
    433,821 sentences (10.321 mean sentences per paragraph).
Word count of 15,159,413 with a vocabulary of 153,836 (98.543 lexical diversity).
Corpus scan took 14.613 seconds.


## TextNormalizer

In [8]:
# Create a fresh reader over the pickled corpus for this section.
# NOTE(review): an equivalent reader is already bound to `corpus` by the
# previous section — presumably re-created so this section runs on its own; confirm.
corpus = PickledCorpusReader(CORPUS_PICKLED_ROOT.as_posix())
# Normalizer built with its default settings (no arguments passed).
normalizer = TextNormalizer()

In [9]:
# Count the items produced by normalizing every document: the generator
# expression emits a 1 per item and sum() adds them, so no list is built.
# The recorded result (18389) equals the file count reported for the corpus.
sum(1 for _ in normalizer.fit_transform(corpus.docs()))

18389

# Applied Neural Networks

# Neural Language Models

## Artificial Neural Networks

### Training a multilayer perceptron

In [10]:
from snark import fit_mlp_classifier, fit_mlp_regressor

In [11]:
# Train the MLP classifier on the pickled corpus (path passed as a string).
# Long-running: the run recorded below took about 22,839 s (roughly 6.3 hours).
fit_mlp_classifier(CORPUS_PICKLED_ROOT.as_posix())

MainProcess 2024-04-21 16:58:49 mlp classifier training took 22839.06 seconds with an average score of 0.676


In [12]:
# Train the MLP regressor on the pickled corpus (path passed as a string).
# Long-running: the run recorded below took about 46,742 s (roughly 13 hours).
fit_mlp_regressor(CORPUS_PICKLED_ROOT.as_posix())

MainProcess 2024-04-22 05:57:51 mlp regressor training took 46741.54 seconds with an average score of 0.300


## Deep Learning Architectures

### TensorFlow: A framework for deep learning

### Keras: An API for deep learning

# Sentiment Analysis

## Deep Structure Analysis

### Predicting sentiment with a bag-of-keyphrases

# The Future Is (Almost) Here