In [None]:
#@title << Setup Google Colab by running this cell {display-mode: "form"}
# Everything below only runs inside Google Colab (detected through the
# 'google.colab' module); on any other environment the cell is a no-op.
import sys
if 'google.colab' in sys.modules:
    # Clone the project's GitHub repository into the Colab working directory
    !git clone https://github.com/pxydi/Authorship-analysis.git
        
    # Copy files required to run the code (currently disabled)
    #!cp -r "text/data" "text/tools.py" .
    
    # Install the packages listed in the repository's Colab requirements file
    !pip install -r "Authorship-analysis/colab-requirements.txt"
    
    # Restart the runtime by sending signal 9 (SIGKILL) to the current kernel
    # process; presumably needed so the freshly installed packages are picked
    # up. Cells run before this point lose their state.
    import os
    os.kill(os.getpid(), 9)

# Authorship verification

## Text preprocessing

Data source: https://archive.ics.uci.edu/ml/datasets/Victorian+Era+Authorship+Attribution

***Documentation***
  
**Gdelt dataset**  

To decrease the bias and create a reliable authorship attribution dataset the following criteria have been chosen to filter out authors in `Gdelt` database: 
* English language writing authors
* Authors that have enough books available (at least 5)
* 19th century authors. 

With these criteria 50 authors have been selected and their books were queried through Big Query Gdelt database. 

The next task has been cleaning the dataset due to OCR reading problems in the original raw form. To achieve that, 
* firstly, all books have been scanned through to get the overall number of unique words and each word's frequency. 
* While scanning the texts, the first 500 words and the last 500 words have been removed to take out specific features such as the name of the author, the name of the book and other word specific features that could make the classification task easier. 

After this step, we have chosen top 10,000 words that occurred in the whole 50 authors text data corpus. The words that are not in top 10,000 words were removed while keeping the rest of the sentence structure intact. Afterwards, the words are represented with numbers from 1 to 10,000 reverse ordered according to their frequencies. 

The entire book is split into text fragments with 1000 words each. We separately maintained author and book identification number for each one of them in different arrays. 

Text segments with less than 1000 words were filled with zeros to keep them in the dataset as well. 1000 words make approximately 2 pages of writing, which is long enough to extract a variety of features from the document. The reason why we have represented top 10,000 words with numbers is to keep the anonymity of texts and allow researchers to run feature extraction techniques faster. Dealing with large amounts of text data can be more challenging than numerical data for some feature extraction techniques.

In [None]:
# Load libraries
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os ,re, random
import seaborn as sns

from collections import defaultdict
from math import ceil

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [None]:
from google.colab import drive

drive.mount('/content/gdrive')

In [None]:
# Load train data

df = pd.read_csv('/content/gdrive/My Drive/data/Gungor_2018_VictorianAuthorAttribution_data-train.csv',encoding = "ISO-8859-1")
print(df.shape)

In [None]:
# Print a few samples

pd.set_option('display.max_colwidth',500)
df.sample(5)

A few observations (train data):
* labeled text data
* lowercase
* no punctuation marks
* stopwords aren't removed
* a few non-ASCII characters.

All texts have a length of 1'000 (words).

## Preliminary EDA

In [None]:
# Nbr of samples per author ID

# Count once and reuse the result. Series.value_counts() replaces the
# top-level pd.value_counts(), which is deprecated in recent pandas releases.
author_counts = df['author'].value_counts()

plt.figure(figsize=(15,7))
sns.barplot(x=author_counts.index, y=author_counts.values, color='b');
plt.xlabel('Author ID number',fontsize=12)
plt.ylabel('Number of samples',fontsize=12);
plt.title('Number of samples per author',fontsize=14,fontweight='bold');

In [None]:
print('There are {} unique author ID numbers.'.format(df['author'].nunique()))

I can see a potential issue here: author ID numbers span from 1 to 50; whereas, there are only 45 unique ID numbers. For the machine learning part, the target labels need to be encoded with values between 0 and n_classes-1. [LabelEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html) can be used to normalize labels.

In [None]:
# Show an example

from sklearn.preprocessing import LabelEncoder

# Create toy labels

y = [0, 2, 4]
label_encoder = LabelEncoder()
label_encoder.fit(y)

# Transform labels
label_encoder.transform(y)

### Stopwords

In [None]:
# Show NLTK's stopwords list
from nltk.corpus import stopwords          
stopwords_english = stopwords.words('english') 
print(stopwords_english)

Let's investigate what type of stopwords we have in the texts.

In [None]:
# Sample a document randomly
rdn_idx = random.randint(0,len(df)-1)
sample = df.iloc[rdn_idx,0]

# Count how often each NLTK stopword occurs in the sampled document.
# Membership is tested against a set (the NLTK stopword list is a plain list).
stopwords_set = set(stopwords_english)
stopwords_dict = defaultdict(int)

for w in sample.split():
    if w in stopwords_set:
        stopwords_dict[w] += 1

# Occurrences of a few contractions and of the negation words.
# "hadn't" replaces the truncated key "hadn'", which is not a stopword and
# therefore could never be counted. .get() is used so that looking a word up
# does not insert a zero entry into the defaultdict.
for word in ("you're", "don't", "hadn't", "not", "nor", "no"):
    print(stopwords_dict.get(word, 0))

So, it seems that I don't need to expand contractions. I could customize the stopwords list so that negation words are not removed, but I'm not sure how important negation is for authorship detection. Anyway, I will use a customized stopwords list. 

In [None]:
# Customize nltk stopwords: negation words are kept in the texts,
# so they are taken out of the stopword set
not_stopwords = {'no', 'nor', 'not'}
custom_stopwords = set(stopwords_english) - not_stopwords

## Text cleaning

The text is clean... I will only:
* remove non-ASCII characters
* apply stemming (to reduce the size of the vocabulary : this may be an issue given the size of the corpus)
* remove stopwords
* and tokenize (split on the whitespace - no need for something more elaborate). 

In [None]:
# Show a sample

pd.set_option('display.max_colwidth',None)
df.sample()

The text is clean... I will only:
* remove non-ASCII characters
* apply stemming
* remove stopwords
* and tokenize (split on the whitespace - no need for something more elaborate). 

In [None]:
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

In [None]:
# Define process_text function

def process_text(text):
    """Clean one document and return its list of stemmed tokens.

    Steps, in order: lowercase and strip the text, drop every non-ASCII
    character, split on whitespace, discard stopwords (``custom_stopwords``)
    and single-character tokens, and stem what is left with ``stemmer``.

    Parameters
    ----------
    text : str
        Raw document.

    Returns
    -------
    list of str
        Stemmed tokens in their original order.
    """
    lowered = text.lower().strip()

    # Encoding to ASCII with errors='ignore' drops every character whose
    # code point is >= 128
    ascii_only = str(lowered).encode('ascii', 'ignore').decode('ascii')

    clean_tokens = []
    for tok in ascii_only.split():
        # Skip 1-char tokens and stopwords; everything else is stemmed
        if len(tok) <= 1 or tok in custom_stopwords:
            continue
        clean_tokens.append(stemmer.stem(tok))

    return clean_tokens

In [None]:
# Sample a document randomly

rdn_idx = random.randint(0,len(df)-1)
sample = df.iloc[rdn_idx,0]

print('Before cleaning: \t{}\n'.format(sample[0:500]))
print('After cleaning: \t{}\n'.format(process_text(sample)[0:50]))

In [None]:
%%time
# Apply process_text function to the entire dataset

df['process_text'] = df['text'].apply(lambda x: process_text(x))

# Wall time: 6min 13s

In [None]:
# Re-order columns (for convenience)

df = df[['text','process_text','author']].copy()

In [None]:
# Show a few samples

pd.set_option('display.max_colwidth',500)
df.sample(3)

In [None]:
# Count number of words in cleaned texts

# sns.distplot is deprecated; histplot with kde=True and stat='density' is the
# documented replacement (normalised histogram + KDE curve). The y-axis shows
# a density, so it is labelled accordingly.
sns.histplot(df['process_text'].apply(len), kde=True, stat='density')
plt.xlabel('Number of words in samples')
plt.ylabel('Density');

I'm curious why some texts have less than 300 words after cleaning.

In [None]:
df.loc[(df['process_text'].apply(lambda x:len(x)) <  300)].sample(3)

It looks like there is an issue with the encoding of a few texts. I don't know how to fix this... So, for the moment, I will remove texts with 350 words or fewer after cleaning. 

In [None]:
print(df.shape)
# Keep only documents with more than 350 tokens after cleaning,
# then renumber the rows from 0
n_tokens = df['process_text'].apply(len)
df = df.loc[n_tokens > 350].reset_index(drop=True)
print(df.shape)

In [None]:
# Count number of words in cleaned texts (again)

# sns.distplot is deprecated; histplot with kde=True and stat='density' is the
# documented replacement (normalised histogram + KDE curve). The y-axis shows
# a density, so it is labelled accordingly.
sns.histplot(df['process_text'].apply(len), kde=True, stat='density')
plt.xlabel('Number of words in samples')
plt.ylabel('Density');

In [None]:
df.sample()

In [None]:
# Export clean data

#df.to_csv('data/gdelt_train_clean.csv',index=False, encoding='utf-8')

## Text vectorization

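- Build the vocabulary (ideally considering only words that appear in at least 2 documents; note that the code below does not apply this filter yet and keeps every word seen in the training data)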
- Assign a unique integer to each word of the vocabulary.
- Create vector representations of documents. For example:

$$\text{'I came to Bern by train'}$$
$$\text{[700, 680, 320, 230, 120, 55]}$$

- Texts aren't all the same length. However, the input to a neural network needs to be of fixed size. Therefore, we need to decide what the max length of these vectors should be, and then pad shorter texts or truncate longer ones.

Let's have a look at the length of the processed texts once more.

In [None]:
# sns.distplot is deprecated; histplot with kde=True and stat='density' is the
# documented replacement (normalised histogram + KDE curve)
sns.histplot(df['process_text'].apply(len), kde=True, stat='density');

Using: max_length = 500 seems a good starting point.

In [None]:
# X array: one list of cleaned tokens per document
X = df['process_text']

# y labels: author IDs re-encoded as integers by LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['author'])
print(y)

In [None]:
### Train-test split

from sklearn.model_selection import train_test_split

# Fixed seed so that the split (and everything derived from it: vocabulary,
# tensors, metrics) is identical on every Restart & Run All
SPLIT_SEED = 42

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=SPLIT_SEED)

# Work with the underlying arrays of token lists from here on;
# y_train / y_test are used exactly as returned by the split
X_train = X_train.values
X_test  = X_test.values

print('Train data: {} {}'.format(X_train.shape,y_train.shape))
print('Test data: {} {}'.format(X_test.shape,y_test.shape))

### Build the vocabulary

Now build the vocabulary using the training data and map a unique integer to each word in the vocabulary (we are not counting frequencies!).

- Map each word in each text to an integer (an "index"). 
- The following code does this for you, but please read it and understand what it's doing.
- Note that you will build the vocabulary based on the training data. 
- To do so, you will assign an index to every word by iterating over your training set.

The vocabulary will also include some special tokens
- `__PAD__`: padding
- `__</e>__`: end of line
- `__UNK__`: a token representing any word that is not in the vocabulary.

In [None]:
# The special tokens take the first three IDs:
# padding, end of line and unknown word
Vocab = {'__PAD__': 0, '__</e>__': 1, '__UNK__': 2}

# Every token not seen before gets the next free integer ID
# (setdefault leaves already-known tokens untouched)
for tokens in X_train:
    for word in tokens:
        Vocab.setdefault(word, len(Vocab))

print("Total words in vocab are",len(Vocab))

In [None]:
### Convert texts to tensors

def doc2tensor(list_tokens, vocab, max_length = 500):
    
    """ Converts a list of tokens into a fixed-length list of integers
    
    Parameters
    ----------
    
    list_tokens : list of strings e.g. ['i','came','to','bern','by','train'] 
        List of tokens to convert to tensor 
        
    vocab : dict
        Dictionary mapping each word of the vocabulary to a unique integer.
        Must contain the key '__UNK__' (ID used for out-of-vocabulary tokens).
        If it contains '__PAD__', that ID is used for padding; otherwise 0.
        
    max_length : integer, default = 500
        Output length of document tensor
        
    Output
    ------
    
    doc_tensor: list of integers of length max_length
        Longer documents are truncated to their first max_length tokens,
        shorter ones are right-padded with the padding ID.
    
    Example
    -------
    ['i','came','to','bern','by','train'] with max_length = 8
        -> [700, 680, 320, 230, 120, 55, 0, 0]
    
    """
    
    unk_ID = vocab['__UNK__']
    # Read the padding ID from the vocabulary instead of hard-coding 0, so the
    # function stays correct if the special tokens are ever renumbered
    pad_ID = vocab.get('__PAD__', 0)
    
    # Map every token to its ID; unknown tokens fall back to unk_ID
    doc_tensor = [vocab.get(tok, unk_ID) for tok in list_tokens]

    # Truncation (no-op when the document is short enough) ...
    doc_tensor = doc_tensor[0:max_length]
    # ... then padding (adds nothing when the document is already max_length long)
    doc_tensor = doc_tensor + [pad_ID]*(max_length - len(doc_tensor))

    return doc_tensor

In [None]:
# Show on a random sample 
    
rdn_idx = random.randint(0,len(X_train)-1)
sample = X_train[rdn_idx]

print('Before: \t{}\n'.format(sample[0:50]))
print('After: \t{}\n'.format(doc2tensor(sample,Vocab)[0:50]))

In [None]:
# Convert to tensor all dataset
# (each document becomes one fixed-length row of word IDs)

# train data
X_train_tensor = np.array([doc2tensor(tokens, Vocab) for tokens in X_train])

# test data
X_test_tensor = np.array([doc2tensor(tokens, Vocab) for tokens in X_test])

In [None]:
# Show a few samples

X_test_tensor[0:3]

### Batch generators

In [None]:
### Batch generators

# ================================== #
# Batch generator with shuffling
# (for training data)
# ================================== #

def train_gen(X, y, batch_size):
    """Endlessly yield shuffled (X, y) mini-batches for training.

    The sample order is re-drawn at the start of every pass over the data,
    so consecutive epochs see the batches in a different order. (Shuffling
    only once before the loop would replay the same batches every epoch.)

    Parameters
    ----------
    X, y : array-like supporting integer-array indexing (e.g. numpy arrays)
        Samples and labels, aligned on their first axis.
    batch_size : int
        Number of samples per batch; the last batch of a pass may be smaller.

    Yields
    ------
    tuple
        (X[batch_idx], y[batch_idx]) for successive slices of the shuffled
        index.

    Raises
    ------
    ValueError
        If there are no samples or batch_size < 1 (either would otherwise
        make the generator loop forever without yielding).
    """
    n_samples = len(y)
    if n_samples == 0:
        raise ValueError('train_gen needs at least one sample')
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    while True:  # run forever, so elements can be generated indefinitely
        # New random order of the indexes 0, 1, ..., n-1 for this epoch
        shuffled_idx = np.random.permutation(n_samples)

        # start: 0, b, 2b, 3b, ... where b is the batch size
        for start in range(0, n_samples, batch_size):
            batch_idx = shuffled_idx[start:start + batch_size]

            yield X[batch_idx], y[batch_idx]

# ================================== #
# Batch generator without shuffling
# (for val/test data)
# ================================== #

def test_gen(X, y , batch_size):
    """Endlessly yield (X, y) mini-batches in their original order.

    No shuffling is done: each pass walks the data from the first sample to
    the last in steps of batch_size (the final batch of a pass may be
    smaller), then starts again from the beginning.

    Parameters
    ----------
    X, y : array-like supporting integer-array indexing (e.g. numpy arrays)
        Samples and labels, aligned on their first axis.
    batch_size : int
        Number of samples per batch.

    Yields
    ------
    tuple
        (X[batch_idx], y[batch_idx]) for consecutive index ranges.
    """
    n_samples = len(y)

    while True:  # cycle through the data indefinitely
        # start: 0, b, 2b, 3b, ... where b is the batch size
        for start in range(0, n_samples, batch_size):
            stop = min(start + batch_size, n_samples)
            batch_idx = np.arange(start, stop)

            yield X[batch_idx], y[batch_idx]

In [None]:
# Test train_gen

next(train_gen(X_train_tensor,y_train,4))

In [None]:
# Test test_gen

next(test_gen(X_test_tensor,y_test,4))

### Neural network

In [None]:
# Load dependencies

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten

from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import EarlyStopping
callbacks = [EarlyStopping(monitor='val_loss',patience=6)]

# Metrics
from sklearn.metrics import f1_score,accuracy_score, precision_score, recall_score

In [None]:
len(X_train_tensor[0])

In [None]:
# Set parameters

maxlen = len(X_train_tensor[0])           # Sequence (document) length
embedding_dims = 100                      # Embedding size
voc_size = len(Vocab)                     # Size of the vocabulary

batch_size = 512
epochs = 300

n_classes = len(np.unique(y_train))

print('Max sequence lenght: {}'.format(maxlen))
print('Embedding: {}'.format(embedding_dims))
print('Vocabulary size: {}'.format(voc_size))
print('Batch size: {}'.format(batch_size))
print('Number of classes: {}'.format(n_classes))

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# Pass everything by keyword: recent scikit-learn releases no longer accept
# positional arguments here. One weight is returned per entry of `classes`
# (the sorted unique encoded labels).
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=np.unique(y_train),
                                     y=y_train)
pd.DataFrame({'label': label_encoder.classes_, 'weight': class_weights}).sort_values(by='weight').head()

### Model 1

<img src="model0_loss_curves.png" style="width:700px; height:280px;">

In [None]:
from tensorflow.keras import backend as K
from tensorflow.keras.models import load_model

K.clear_session() # Create new graph to avoid clutter

# The bare name `keras` is not imported in the visible cells, so
# keras.models.load_model raised a NameError; use the explicit import instead.
# NOTE(review): the saved model was presumably trained on an earlier
# train/test split and vocabulary - confirm they match the ones built above.
trained_model_0 = load_model('model.h5') # Load model

# New predictions
# Sequential.predict_classes was removed in TensorFlow 2.6. For a multi-class
# model, the predicted label is the argmax over the per-class output scores.
y_train_pred_0 = np.argmax(trained_model_0.predict(X_train_tensor), axis=-1)
y_test_pred_0  = np.argmax(trained_model_0.predict(X_test_tensor), axis=-1)

In [None]:
print('Train data')
print('-'*10)
print('F1 score: {}'.format(np.round(f1_score(y_train,y_train_pred_0,average='micro'),3)))
print('Accuracy: {}'.format(np.round(accuracy_score(y_train,y_train_pred_0),3)))

print()
print('Test data')
print('-'*10)
print('F1 score: {}'.format(np.round(f1_score(y_test,y_test_pred_0,average='micro'),3)))
print('Accuracy: {}'.format(np.round(accuracy_score(y_test,y_test_pred_0),3)))

In [None]:
from sklearn.metrics import classification_report

## TRAIN DATA

print(classification_report(y_train, y_train_pred_0))

In [None]:
## TEST DATA

print(classification_report(y_test, y_test_pred_0))

In [None]:
pd.set_option("display.max_columns", 60)

plt.figure(figsize=(14,10))
plt.title('Training data')
sns.heatmap(pd.crosstab(y_train,y_train_pred_0),vmax=10,cmap="Blues");
plt.ylabel('True labels')
plt.xlabel('Predicted labels');

In [None]:
pd.set_option("display.max_columns", 60)

plt.figure(figsize=(14,10))
plt.title('Test data')
sns.heatmap(pd.crosstab(y_test,y_test_pred_0),vmax=14,cmap="Blues");
plt.ylabel('True labels')
plt.xlabel('Predicted labels');

### Model 2

Now I will add one dense hidden layer (with 64 neurons). In retrospect, using only 64 neurons may be too few, given the number of output neurons (45).


<img src="model1_loss_curves.png" style="width:700px; height:280px;">

In [None]:
from tensorflow.keras import backend as K
from tensorflow.keras.models import load_model

K.clear_session() # Create new graph to avoid clutter

# The bare name `keras` is not imported in the visible cells, so
# keras.models.load_model raised a NameError; use the explicit import instead.
# NOTE(review): the saved model was presumably trained on an earlier
# train/test split and vocabulary - confirm they match the ones built above.
trained_model_1 = load_model('model1.h5') # Load model

# New predictions
# Sequential.predict_classes was removed in TensorFlow 2.6. For a multi-class
# model, the predicted label is the argmax over the per-class output scores.
y_train_pred = np.argmax(trained_model_1.predict(X_train_tensor), axis=-1)
y_test_pred  = np.argmax(trained_model_1.predict(X_test_tensor), axis=-1)

In [None]:
print('Train data')
print('-'*10)
print('F1 score: {}'.format(np.round(f1_score(y_train,y_train_pred,average='micro'),3)))
print('Accuracy: {}'.format(np.round(accuracy_score(y_train,y_train_pred),3)))

print()
print('Test data')
print('-'*10)
print('F1 score: {}'.format(np.round(f1_score(y_test,y_test_pred,average='micro'),3)))
print('Accuracy: {}'.format(np.round(accuracy_score(y_test,y_test_pred),3)))

In [None]:
from sklearn.metrics import classification_report

## TRAIN DATA

print(classification_report(y_train, y_train_pred))

In [None]:
## TEST DATA

print(classification_report(y_test, y_test_pred))

In [None]:
pd.set_option("display.max_columns", 60)

plt.figure(figsize=(14,10))
plt.title('Training data')
sns.heatmap(pd.crosstab(y_train,y_train_pred),vmax=10,cmap="Blues");
plt.ylabel('True labels')
plt.xlabel('Predicted labels');

In [None]:
pd.set_option("display.max_columns", 60)

plt.figure(figsize=(14,10))
plt.title('Test data')
sns.heatmap(pd.crosstab(y_test,y_test_pred),vmax=14,cmap="Blues");
plt.ylabel('True labels')
plt.xlabel('Predicted labels');

### AutoKeras

Source : https://autokeras.com/