<a href="https://colab.research.google.com/github/edponce/DoyleInvestigators2/blob/main/Doyle_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install AuthorDetect from GitHub repository

In [None]:
!pip install git+https://github.com/edponce/DoyleInvestigators2

# Set up NLTK

In [54]:
import nltk

# Fetch the NLTK resources this notebook relies on. Per the captured output,
# packages that are already present are reported as up-to-date.

# Stopword lists
nltk.download('stopwords')
# Sentencizer
nltk.download('punkt')
# Tagger
nltk.download('averaged_perceptron_tagger')
# Universal POS tags
nltk.download('universal_tagset')
# WordNet data (presumably needed by Tokenizer(lemmatizer='wordnet') used below)
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Reproducibility Settings

In [55]:
#PYTHONHASHSEED = 0
# Single seed value reused across the notebook: it is placed in the word2vec
# parameters ('seed'), passed to split_combine_data(seed=...), and used as
# 'random_state' for both train_test_split and MLPClassifier.
# None means no fixed seed is supplied to any of them; set an integer here to
# pin those seeds.
global_seed = None

# Imports and mount Google Drive

In [94]:
from authordetect import (
    Author, Tokenizer, EmbeddingModel, Classifier,
    save_pickle, load_pickle, save_json, load_json, np_avg, np_sum,
)
import numpy
import random
import itertools
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score, precision_recall_fscore_support
from typing import Any, Dict, Union, Iterable

In [57]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Author Vector Embedding Model

## Writer2Vec

### Method 1: Load existing model

In [None]:
# Build the embedding from a file on Google Drive. This is the same path the
# "Save model" cell writes to, so it reloads a previously saved embedding.
embedding_infile = '/content/gdrive/My Drive/Doyle Investigators 2/w2v/doyle_50dim_350part.bin'
embedding = EmbeddingModel(embedding_infile)

### Method 2: Train new model

In [58]:
# No pre-built embedding is supplied via writer2vec_params['embedding'].
# NOTE: running this cell after the "Method 1" cell discards the loaded
# embedding, because both cells assign the same `embedding` variable.
embedding = None

### Embedding parameters

In [60]:
# Keyword arguments unpacked into Author.writer2vec(**writer2vec_params).
# The 'embedding' entry captures the value of `embedding` at the time this
# cell runs; a later cell overwrites it with main_author.embedding.
writer2vec_params = {
    # Tokenizer
    'tokenizer': Tokenizer(lemmatizer='wordnet'),

    # Document partitioning
    'part_size': 350,  # int, None=for standalone documents
    # 'remain_factor': 350/350,  # float [0,1], default=1

    # word2vec - Parameters passed directly to gensim.models.Word2Vec
    # https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec
    'size': 50,
    'window': 12,
    'min_count': 1,
    # 'sg': 0,
    # 'hs': 0,
    'negative': 200,
    'alpha': 0.05,
    # 'min_alpha': 0.0007,
    'seed': global_seed,
    'sample': 6e-5,
    'iter': 5,

    # word2vec embedding
    'embedding': embedding,
    
    # doc2vec
    'stopwords': Tokenizer.STOPWORDS,  # iterable[str], None
    'func': np_avg,  # callable
    'use_norm': True,  # bool
    'missing_value': 0,  # int
}

### Train model

In [61]:
# Corpus used to build the main author and its embedding (the Doyle_10 file).
corpus_file = '/content/gdrive/My Drive/Doyle Investigators 2/raw/Doyle_10.txt'

# Create the author from the file path and run the writer2vec step with the
# parameters defined above. Afterwards main_author.embedding is available;
# presumably it is trained here when 'embedding' is None -- TODO confirm.
main_author = Author(corpus_file)
main_author.writer2vec(**writer2vec_params)

# Summary statistics of the corpus and the resulting embedding
print('Corpus characters:', len(main_author.text))
print('Corpus sentences:', len(main_author.sentences))
print('Corpus words:', len(main_author.words))
print('Corpus vocabulary:', len(main_author.parsed_text.vocabulary))
print('Embedding vocabulary:', len(main_author.embedding.vocabulary))
print('Embedding matrix:', main_author.embedding.vectors.shape)

Author input will be loaded from a file
Corpus characters: 149008
Corpus sentences: 1856
Corpus words: 28077
Corpus vocabulary: 3110
Embedding vocabulary: 3110
Embedding matrix: (3110, 50)


### Save model

In [62]:
# Destination for the embedding. Assign a falsy value (e.g. '' or None) to
# skip saving; the guard below only saves when the path is truthy.
embedding_outfile = '/content/gdrive/My Drive/Doyle Investigators 2/w2v/doyle_50dim_350part.bin'
if embedding_outfile:
    main_author.embedding.save(embedding_outfile)

## Example: Word vector embeddings

### Method 1: AuthorDetect API

In [63]:
# NOTE(review): bare expression that is not the last line of the cell, so its
# value is neither displayed nor stored.
main_author.embedding.vectors
# Look up the vector for a single word by indexing the embedding object
vec = main_author.embedding['holmes']
print(vec)

[ 0.46549737  0.60374576  0.8470459  -0.19474304 -1.3384175  -0.81365055
  0.28888994  0.62765944  0.2118628  -1.1505791   0.35912797  0.29321107
  0.04541311 -0.01090856  1.0617627  -0.713252   -0.482456    0.86590815
 -0.6698707   0.51217306 -1.8313785   0.15041183  1.4251413   1.2170594
  1.1220206   0.92609    -0.699016    0.7248917  -1.3157141  -1.4904219
  0.45005497  1.2250013  -0.55145645  1.5065546   1.0859183   0.7529688
  0.49549443 -0.35432148 -1.1073892  -0.14995696 -0.13276358 -0.44677708
  0.34673393 -1.3918135  -1.155032   -0.332819   -0.62239987 -0.14582188
  0.51104206  0.87963104]


### Method 2: Gensim API

In [64]:
# Same lookup through the model object exposed by the embedding; the printed
# vector matches the one from Method 1.
w2v_model = main_author.embedding.model
vec = w2v_model.wv['holmes']
print(vec)

[ 0.46549737  0.60374576  0.8470459  -0.19474304 -1.3384175  -0.81365055
  0.28888994  0.62765944  0.2118628  -1.1505791   0.35912797  0.29321107
  0.04541311 -0.01090856  1.0617627  -0.713252   -0.482456    0.86590815
 -0.6698707   0.51217306 -1.8313785   0.15041183  1.4251413   1.2170594
  1.1220206   0.92609    -0.699016    0.7248917  -1.3157141  -1.4904219
  0.45005497  1.2250013  -0.55145645  1.5065546   1.0859183   0.7529688
  0.49549443 -0.35432148 -1.1073892  -0.14995696 -0.13276358 -0.44677708
  0.34673393 -1.3918135  -1.155032   -0.332819   -0.62239987 -0.14582188
  0.51104206  0.87963104]


# Writer2Vec - Document Vector Embedding Models

In [65]:
def writer2vec(data: Union[str, Iterable[str]], labels: Union[Any, Iterable[Any]], **kwargs):
    """Create one Author per corpus and run its writer2vec() step.

    A lone string for ``data`` is treated as a single corpus and ``labels``
    as its single label. Otherwise ``data`` and ``labels`` are paired
    positionally with zip(), so surplus items on the longer side are ignored.

    Args:
        data: One corpus (str) or an iterable of corpora; each item is passed
            unchanged as the first argument of Author.
        labels: The label of the single corpus, or one label per corpus.
        **kwargs: Forwarded unchanged to Author.writer2vec().

    Returns:
        list: The Author objects, in input order.
    """
    if isinstance(data, str):
        corpora, targets = [data], [labels]
    else:
        corpora, targets = data, labels

    def _embed(corpus, label):
        # Construct the author, then run its embedding step before collecting it
        writer = Author(corpus, label)
        writer.writer2vec(**kwargs)
        return writer

    return [_embed(corpus, label) for corpus, label in zip(corpora, targets)]


def split_combine_data(data, labels, pos_frac=0.5, neg_frac=None, *, seed=None):
    """Split a nested dataset of the form [[pos], [neg], [neg], ...],
    using the fractions provided.

    The positive data is first and is used completely with the pos_frac
    stating the fraction it represents. Partial negative datasets are
    generated via random sampling. Data is returned in the same format.

    Args:
        data: Nested sequence; data[0] is the positive dataset, every
            following entry is one negative dataset.
        labels: Nested sequence parallel to ``data`` (same lengths per entry).
        pos_frac: Fraction of the combined dataset that the positive data
            represents. The implied total size is len(data[0]) / pos_frac.
        neg_frac: Optional upper bound on the fraction taken from each
            negative dataset. The default share is (1 - pos_frac) split
            evenly across the negative datasets.
        seed: Passed to random.seed(); note this reseeds the module-level
            ``random`` generator.

    Returns:
        tuple: (combined_data, combined_labels) in the same nested layout.
        When no negative datasets are given, only the positive entry is
        returned.
    """
    random.seed(seed)

    # Only the positive dataset was supplied: there is nothing to sample, and
    # the per-negative share below would divide by zero.
    if len(data) == 1:
        return [data[0]], [labels[0]]

    total_size = len(data[0]) / pos_frac

    # Calculate negative fraction, uniformly for each negative dataset
    _neg_frac = (1 - pos_frac) / (len(data) - 1)
    if neg_frac is not None:
        _neg_frac = min(_neg_frac, neg_frac)
    neg_frac = _neg_frac

    # Randomly select negative data
    combined_data = [data[0]]
    combined_labels = [labels[0]]
    neg_size = int(total_size * neg_frac)
    for neg_data, neg_labels in zip(data[1:], labels[1:]):
        # A negative dataset smaller than the requested size is used entirely
        _neg_size = min(neg_size, len(neg_data))
        # Sample indices once so data and labels stay aligned
        idxs = random.sample(range(len(neg_data)), _neg_size)
        combined_data.append([neg_data[i] for i in idxs])
        combined_labels.append([neg_labels[i] for i in idxs])

    return combined_data, combined_labels


def flatten(data):
    """Concatenate the items of every inner iterable into one numpy array.

    Only one level of nesting is removed: each element of ``data`` is iterated
    and its items are collected in order before being converted with
    numpy.array().
    """
    merged = []
    for chunk in data:
        merged.extend(chunk)
    return numpy.array(merged)

## Train model using 90% from 10/90 split of original data

In [66]:
# Positive author first
train_data = [
    '/content/gdrive/My Drive/Doyle Investigators 2/raw/Doyle_90.txt',
    '/content/gdrive/My Drive/Doyle Investigators 2/raw/Rinehart_90.txt',
    '/content/gdrive/My Drive/Doyle Investigators 2/raw/Christie_90.txt',
]
train_labels = [1, 0, 0]

In [67]:
# Reuse the main author's embedding for every training author
writer2vec_params['embedding'] = main_author.embedding

# Document vectors and labels
train_authors = writer2vec(train_data, train_labels, **writer2vec_params)

# Extract vectors and labels
# (the author's label is repeated once per entry of author.dv)
vectors = []
labels = []
for author in train_authors:
    vectors.append(author.dv)
    labels.append([author.label] * len(author.dv))

# Fraction select (50% of 90% for positive, 25% of 90% for each negative)
vectors, labels = split_combine_data(vectors, labels, seed=global_seed)

# Flatten data
# NOTE: train_labels is rebound here from the per-file list defined earlier to
# a flat per-vector array, so re-running this cell alone reuses the flat array.
train_vectors = flatten(vectors)
train_labels = flatten(labels)

Author input will be loaded from a file
Author input will be loaded from a file
Author input will be loaded from a file


# MLP Classifier

In [70]:
def train_classifier(vectors: Iterable[Iterable[float]], labels: Iterable[int], train_test_params: Dict[str, Any] = None, **kwargs) -> 'MLPClassifier':
    """Fit an MLPClassifier on the training portion of a stratified split.

    Args:
        vectors: Feature vectors, converted with numpy.array().
        labels: One label per vector, converted with numpy.array(); also used
            as the ``stratify`` argument of the split.
        train_test_params: Extra keyword arguments for train_test_split;
            None means no extra arguments.
        **kwargs: Keyword arguments for the MLPClassifier constructor.

    Returns:
        The fitted MLPClassifier.
    """
    split_kwargs = {} if train_test_params is None else train_test_params

    features = numpy.array(vectors)
    targets = numpy.array(labels)

    # The held-out portion of the split is not used by this function; only the
    # training portion is fed to fit().
    fit_features, _, fit_targets, _ = train_test_split(
        features, targets,
        stratify=targets,
        **split_kwargs,
    )

    classifier = MLPClassifier(**kwargs)
    classifier.fit(fit_features, fit_targets)
    return classifier

## Classifier parameters

In [69]:
# Train/test data split - Parameters passed directly to sklearn.model_selection.train_test_split
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html#sklearn.model_selection.train_test_split
# NOTE: train_classifier() fits on the training portion only; the portion
# selected by 'test_size' is not evaluated there.
train_test_params = {
    'test_size': 0.1,  # train_size=1-test_size
    'random_state': global_seed,
}

# Classifier - Parameters passed directly to sklearn.neural_network.MLPClassifier
# https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier
mlp_params = {
    'hidden_layer_sizes': (50,50,50),
    # 'activation': 'relu',
    'solver': 'adam',
    'alpha': 1e-4,
    'random_state': global_seed,
    # 'learning_rate': 'constant',  # only used when solver='sgd'
    'max_iter': 2000,
    # 'shuffle': True,  # only used when solver='sgd' or 'adam'
    # 'warm_start': False,
    # 'momentum': 0.9,  # only used when solver='sgd'
    # 'max_fun': 15000,  # only used when solver='lbfgs'
}

## Train model

### Method 1: Load existing model

In [None]:
# Load a previously saved classifier (same path the "Save model" cell writes).
# NOTE(review): load_pickle presumably unpickles the file -- only load files
# from a trusted source.
mlp_infile = '/content/gdrive/My Drive/Doyle Investigators 2/mlp/doyle_50dim_350part.pkl'
mlp = load_pickle(mlp_infile)
# Sanity check: shapes of the first two weight matrices of the loaded model
print(mlp.coefs_[0].shape)
print(mlp.coefs_[1].shape)

### Method 2: Train new model

In [71]:
# Fit a new classifier on the flattened training vectors; this rebinds `mlp`,
# replacing any model loaded by the "Method 1" cell.
mlp = train_classifier(train_vectors, train_labels, train_test_params, **mlp_params)

## Save model

In [72]:
# Destination for the classifier. Assign a falsy value (e.g. '' or None) to
# skip saving; the guard below only saves when the path is truthy.
mlp_outfile = '/content/gdrive/My Drive/Doyle Investigators 2/mlp/doyle_50dim_350part.pkl'
if mlp_outfile:
    save_pickle(mlp, mlp_outfile)

# MLP Predict

In [103]:
def predict(mlp: 'MLPClassifier', vectors: Iterable[Iterable[float]], true_labels: Iterable[Any] = None):
    """Predict labels and class probabilities, optionally with metrics.

    Args:
        mlp: Fitted classifier providing predict(), predict_proba(), score()
            and the classes_ attribute.
        vectors: Feature vectors to classify.
        true_labels: Ground-truth labels for ``vectors``. When None, no
            metrics are computed.

    Returns:
        tuple: (predict_labels, classes, probabilities, metrics) where
        ``metrics`` is None without ``true_labels``; otherwise a dict with
        keys 'f1', 'precision', 'recall', 'fbeta' and 'score'. 'precision',
        'recall' and 'fbeta' are whatever precision_recall_fscore_support
        returns when called without an ``average`` argument (per the
        scikit-learn docs, one value per class). 'score' is the value of
        mlp.score(vectors, true_labels).
    """
    predict_labels = mlp.predict(vectors)
    probabilities = mlp.predict_proba(vectors)

    metrics = None
    if true_labels is not None:
        # Previously computed but discarded; now reported alongside the others
        score = mlp.score(vectors, true_labels)
        f1 = f1_score(true_labels, predict_labels, zero_division=1)
        # The fourth return value (support) is not reported
        precision, recall, fbeta, _support = precision_recall_fscore_support(
            true_labels, predict_labels, zero_division=1
        )
        metrics = {
            'f1': f1,
            'precision': precision,
            'recall': recall,
            'fbeta': fbeta,
            'score': score,
        }

    return predict_labels, mlp.classes_, probabilities, metrics


def get_perturbed_data(docfile: str, label_map: dict = None):
    """Read documents and labels from a JSON file of records.

    Each record returned by load_json(docfile) must provide 'text' and
    'label' entries.

    Args:
        docfile: Path handed to load_json().
        label_map: Optional mapping from lower-cased label to the value to
            return. When falsy (None or empty), the raw labels are returned.
            A label missing from a non-empty map raises KeyError.

    Returns:
        tuple: (docs, labels) as two parallel lists in file order.
    """
    texts, targets = [], []
    for record in load_json(docfile):
        raw_label = record['label']
        if label_map:
            targets.append(label_map[raw_label.lower()])
        else:
            targets.append(raw_label)
        texts.append(record['text'])
    return texts, targets

## Method 1: Predict using 10% from 10/90 split of original data

In [114]:
# Held-out files (the *_10 counterparts of the *_90 training files).
# The negative authors appear in a different order than in train_data; both
# carry label 0, so the labels below still line up.
test_data = [
    '/content/gdrive/My Drive/Doyle Investigators 2/raw/Doyle_10.txt',
    '/content/gdrive/My Drive/Doyle Investigators 2/raw/Christie_10.txt',
    '/content/gdrive/My Drive/Doyle Investigators 2/raw/Rinehart_10.txt',
]

# One label per file, in the same order as test_data: 1 = positive, 0 = negative
test_labels = [1, 0, 0]

## Method 2: Predict using perturbed test data

In [123]:
# Maps the lower-cased author name stored in the JSON records to the binary
# label used by the classifier (1 = positive author, 0 = negative).
LABEL_MAP = {
    'doyle': 1,
    'christie': 0,
    'rinehart': 0,
}

# NOTE: only the last assignment takes effect; the synonym file on the first
# line is immediately overridden by the numericals file. Comment out or
# reorder these lines to pick a different perturbed dataset.
test_file = '/content/gdrive/My Drive/Doyle Investigators 2/perturbed/perturbed_synonym_doyle_350.json'
test_file = '/content/gdrive/My Drive/Doyle Investigators 2/perturbed/perturbed_numericals_doyle_350.json'

# Rebinds test_data/test_labels, replacing the values set by the Method 1 cell
test_data, test_labels = get_perturbed_data(test_file, LABEL_MAP)

## Predict

In [124]:
# Document vectors and labels
test_authors = writer2vec(test_data, test_labels, **writer2vec_params)

# Extract vectors and labels
vectors = []
labels = []
for author in test_authors:
    vectors.append(author.dv)
    labels.append([author.label] * len(author.dv))

# Flatten data
test_vectors = flatten(vectors)
test_labels = flatten(labels)

Author input was provided as raw text
Author input was provided as raw text
Author input was provided as raw text
Author input was provided as raw text
Author input was provided as raw text
Author input was provided as raw text
Author input was provided as raw text
Author input was provided as raw text
Author input was provided as raw text
Author input was provided as raw text
Author input was provided as raw text
Author input was provided as raw text
Author input was provided as raw text
Author input was provided as raw text
Author input was provided as raw text
Author input was provided as raw text
Author input was provided as raw text
Author input was provided as raw text
Author input was provided as raw text
Author input was provided as raw text
Author input was provided as raw text
Author input was provided as raw text
Author input was provided as raw text
Author input was provided as raw text
Author input was provided as raw text
Author input was provided as raw text
Author input

In [125]:
# Classify the test vectors; passing test_labels makes predict() also return
# a metrics dict (it would be None without true labels).
predict_labels, classes, probabilities, metrics = predict(mlp, test_vectors, test_labels)
print('True:', test_labels)
print('Predict:', predict_labels)
# print('Classes:', classes)
# print('Probabilities:', probabilities)
print(metrics)

True: [0 0 1 1 0 1 1 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 1 1 1 0 0 0 1 1 0 0 0 0 0 0 0
 0 1 1 1 1 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 1 0 1 0
 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 1 0
 0 0 1 1 0 1 0 1 0 0 1 1 0 1 0 1 1 0 0 0 0 0 0 0 1 1 0 1 0 1 1 1 1 0 0 1 1
 0 1 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 1 1 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0
 1 0 0 0 0 1 1 1 1 1 0 1 0 0 0 1 1 1 0 1 1 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
Predict: [0 0 1 1 0 1 1 1 1 0 1 1 0 0 1 0 1 0 0 1 1 0 0 1 1 1 0 0 1 1 0 0 0 0 0 0 1
 0 1 1 1 1 0 1 0 1 1 1 0 1 1 1 0 0 0 1 0 0 0 0 1 0 1 0 1 1 1 1 1 0 1 1 1 0
 0 0 0 0 1 1 1 0 0 1 1 1 1 1 1 1 0 0 0 1 0 0 1 0 1 1 1 0 0 1 0 1 1 0 1 1 0
 0 0 1 1 0 1 0 1 1 1 1 1 0 1 0 1 1 0 0 1 0 0 0 0 1 1 0 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 0 0 1 1 0 1 0 1 1 1 1 1 0 1 0 1 1 1 1 1 0 1 0 0 0 1 1 1 1
 1 0 0 1 0 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 0 1 1 0 1 0 1 1 0 0 1 1 1 1 1
 1 1 1 0 0 1 1 0 1 1 1 0 0 1 1 0 0 0 0 