In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Python Stuff
import numpy as np
import pandas as pd
import zipfile
import os
import gc
import sys
import string
from collections import defaultdict, Counter
import urllib.request
import os.path

# Visualization Stuff
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_colwidth', 240)

# Statistics Stuff
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
import scipy.stats as stats

# NLP Stuff
if 'transformers' not in sys.modules:
  !pip install transformers
import transformers
from transformers import DistilBertTokenizer, DistilBertModel
if 'nltk' not in sys.modules:
  !pip install nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Neural Networks Stuff
import torch
from torch import nn, optim
from torch.utils import data

# Detect the hosting environment: Colab is recognised by the already-loaded
# `google.colab` module, Kaggle by the working-directory path.
is_colab = 'google.colab' in sys.modules
if is_colab:
    from google.colab import drive

is_kaggle = 'kaggle' in os.getcwd()

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device  # NOTE: a bare expression that is not the last line of a cell is not displayed
if torch.cuda.is_available():
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

# Spooky Authors Distribution Analysis

<a id="toc"></a>
## Table of Contents
1. [Introduction](#introduction)
1. [Data Preparation](#preparation)
1. [Data Exploration and Visualization](#exploration)
1. [Embeddings](#embeddings)
1. [Classification](#classification)
1. [Text Generation](#creating_poe)
1. [Conclusions](#conclusions)

<a id="introduction"></a>

## Introduction

This notebook contains a statistical analysis of excerpts from horror stories by Edgar Allan Poe, Mary Shelley, and HP Lovecraft.

The notebook is available on [Kaggle](https://www.kaggle.com/odedgolden/spooky-authors-analysis/)

<table><tr><td><img src='https://upload.wikimedia.org/wikipedia/commons/9/97/Edgar_Allan_Poe%2C_circa_1849%2C_restored%2C_squared_off.jpg' width="200"></td>
    <td><img src='https://upload.wikimedia.org/wikipedia/commons/6/65/RothwellMaryShelley.jpg' width="230"></td>
    <td><img src='https://upload.wikimedia.org/wikipedia/commons/1/10/H._P._Lovecraft%2C_June_1934.jpg' width="230"></td>
    </tr></table>

Before we start with statistics, let's get a better context for our authors.

### Edgar Allan Poe:
Edgar Allan Poe (January 19, 1809 – October 7, 1849) was an American writer, poet, editor, and literary critic. Poe is best known for his poetry and short stories, particularly his tales of mystery and the macabre. He is widely regarded as a central figure of Romanticism in the United States and of American literature as a whole, and he was one of the country's earliest practitioners of the short story. He is also generally considered the inventor of the detective fiction genre and is further credited with contributing to the emerging genre of science fiction. Poe was the first well-known American writer to earn a living through writing alone, resulting in a financially difficult life and career. (Wikipedia)

### Mary Shelley:
Mary Wollstonecraft Shelley (30 August 1797 – 1 February 1851) was an English novelist who wrote the Gothic novel Frankenstein; or, The Modern Prometheus (1818). She also edited and promoted the works of her husband, the Romantic poet and philosopher Percy Bysshe Shelley. Her father was the political philosopher William Godwin and her mother was the philosopher and feminist Mary Wollstonecraft. (Wikipedia)

###  H.P. Lovecraft:
Howard Phillips Lovecraft (August 20, 1890 – March 15, 1937) was an American writer of weird fiction and horror fiction, who is known for his creation of what became the Cthulhu Mythos.
Born in Providence, Rhode Island, Lovecraft spent most of his life in New England. He was born into affluence, but the family's wealth dissipated soon after the death of his grandfather. In 1913, he wrote a critical letter to a pulp magazine that ultimately led to his involvement in pulp fiction. During the interwar period, he wrote and published stories that focused on his interpretation of humanity's place in the universe. In his view, humanity was an unimportant part of an uncaring cosmos that could be swept away at any moment. These stories also included fantastic elements that represented the perceived fragility of anthropocentrism. (Wikipedia)

### Hypothesis:


I assume that the three authors have different "styles", and therefore expect the following:
1. Statistically significant differences between the authors. 
2. Good prediction for classification task. 
3. Somewhat entertaining yet sensible simulated excerpts from each author.

<a id="preparation"></a>
## Data Preparation

> “Creative minds are uneven, and the best of fabrics have their dull spots.” - H.P. Lovecraft

First, let's take a look at the data - we have three dataframes - train, test and the sample submission.

Download the data if needed:

In [None]:
# Locate (or fetch) the competition CSVs and set DIR_NAME for the current environment.
if is_colab:
    # Colab: the data folder lives on the mounted Google Drive.
    drive.mount('/content/drive')
    DIR_NAME = '/content/drive/My Drive/Colab Notebooks/Spooky/data/'
elif is_kaggle:
    # Kaggle: every entry of the input folder is opened as a zip archive and
    # unpacked into the working directory.
    DIR_NAME = "/kaggle/input/spooky-author-identification/"
    zips = os.listdir(DIR_NAME)
    for name in zips:
        with zipfile.ZipFile(DIR_NAME + name, 'r') as zip_ref:
            zip_ref.extractall(".")
else:
    fnames = [r'./train.csv', r'./test.csv', r'./sample_submission.csv']
    # NOTE(review): this is a Google Drive *folder* share link and the same URL is
    # used for all three files; urlretrieve presumably stores the folder's web page
    # rather than a CSV -- confirm and replace with one direct link per file.
    url = 'https://drive.google.com/drive/folders/1tP8T8_-6Xy5BgQa3q2g7reFHBv4Uer_a?usp=sharing'

    # Check every file on its own (previously only the first one was tested) and
    # report inside the loop, so `fname` is always bound when it is printed.
    for fname in fnames:
        if not os.path.exists(fname):
            urllib.request.urlretrieve(url, fname)
        print(fname, 'exists:', os.path.exists(fname))
    DIR_NAME = './'

Read the data to dataframes:

In [None]:
# Load the three competition tables. Only the Colab run reads from DIR_NAME;
# every other environment reads the CSVs from the working directory.
csv_dir = DIR_NAME if is_colab else './'
train = pd.read_csv(csv_dir + 'train.csv')
test = pd.read_csv(csv_dir + 'test.csv')
sample = pd.read_csv(csv_dir + 'sample_submission.csv')
train.head(3)

### Initiate BERT Model

The BERT model was proposed in [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. It’s a bidirectional transformer pre-trained using a combination of masked language modeling objective and next sentence prediction on a large corpus comprising the Toronto Book Corpus and Wikipedia.

It will be very helpful to use the pretrained BERT model, in order to get the tokenizer and the word embeddings.

In [None]:
# One checkpoint name is used for both the tokenizer and the model.
PRE_TRAINED_MODEL_NAME = 'distilbert-base-cased'
tokenizer = DistilBertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
bert_model = DistilBertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Input embedding layer of the pretrained model (token id -> vector).
bert_embeddings = bert_model.get_input_embeddings()

Let's look at an example of the tokenizing:

In [None]:
# Demonstrate the three tokenizer views of one sentence: token strings,
# their vocabulary ids, and the output of encode().
sample_sentence = "Creative minds are uneven, and the best of fabrics have their dull spots."
tokens = tokenizer.tokenize(sample_sentence)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
encoded = tokenizer.encode(sample_sentence)
print(tokens)
print(token_ids)
print(encoded)

I now want to create some metadata, specifically - the word count and stopword count for each excerpt.

I will do it by applying the BERT tokenizer.

In [None]:
# Per-excerpt metadata computed on tokenizer output, so the "word" counts below
# are counts of tokenizer tokens rather than of whitespace-separated words.
train['word_ids'] = train['text'].apply(lambda x: tokenizer.convert_tokens_to_ids(tokenizer.tokenize(x)))
train['word_count'] = train['word_ids'].apply(len)

# Ids of the NLTK English stopwords. A stopword that is missing from the
# vocabulary is converted to the unknown-token id; that id is removed so that
# unknown tokens inside the excerpts are not counted as stopwords.
# (The misspelled variable name is kept in case later cells reference it.)
eng_stopword_toekns = {tokenizer.convert_tokens_to_ids(str(x)) for x in stopwords.words("english")}
eng_stopword_toekns.discard(tokenizer.unk_token_id)
train['stopword_count'] = train['word_ids'].apply(lambda x: sum(token in eng_stopword_toekns for token in x))

train.head(1)

I will first calculate word counts for each of the authors, in order to understand their words distributions.

I suspect that every author may use a slightly different vocabulary.

In [None]:
def count_words(df, author_key):
    """Count whitespace-separated tokens across all excerpts of one author.

    Args:
        df: DataFrame with an 'author' column and a 'text' column.
        author_key: value of 'author' selecting the rows to count.

    Returns:
        Series indexed by token holding its number of occurrences, as produced
        by ``value_counts`` (most frequent first).
    """
    author_rows = df['author'] == author_key
    # One column per token position; shorter excerpts are padded with missing values.
    tokens_wide = df[author_rows]['text'].str.split(expand=True)
    # stack() flattens the table and drops the padding before counting.
    return tokens_wide.stack().value_counts()

In [None]:
# Token frequency table for each author; the printed size is the number of
# distinct whitespace-separated tokens that author uses.
word_count_EAP = count_words(train, 'EAP')
word_count_HPL = count_words(train, 'HPL')
word_count_MWS = count_words(train, 'MWS')
print(f'word_count_EAP size: {len(word_count_EAP)}')
print(f'word_count_HPL size: {len(word_count_HPL)}')
print(f'word_count_MWS size: {len(word_count_MWS)}')

After counting the words for each author, I now want to remove all the stopwords, which are not distinctive in terms of vocabulary.

In [None]:
# Labels to discard: the NLTK English stopwords plus each single ASCII punctuation character.
english_stopwords = stopwords.words("english") + list(string.punctuation)

# drop() removes exact label matches only; errors='ignore' skips labels that a
# given author's table does not contain.
word_count_EAP = word_count_EAP.drop(labels=english_stopwords, errors='ignore')
word_count_HPL = word_count_HPL.drop(labels=english_stopwords, errors='ignore')
word_count_MWS = word_count_MWS.drop(labels=english_stopwords, errors='ignore')

However, I am only interested in the differences, so I will also drop all the common words (words that appear in the top COMMON_WORDS_POOL list of each and every author).

In [None]:
PCA_ON_N_WORDS = 1000    # top words per author used to fit the PCA
COMMON_WORDS_POOL = 400  # top words per author inspected when looking for shared words
PLOT_M_WORDS = 100       # top words per author shown in the scatter plot

# Words present in the top-COMMON_WORDS_POOL lists of all three authors.
common_words = pd.Series(list(set(word_count_EAP[:COMMON_WORDS_POOL].index) & set(word_count_HPL[:COMMON_WORDS_POOL].index)  & set(word_count_MWS[:COMMON_WORDS_POOL].index)))

# Every common word exists in each table by construction, so a plain drop() is enough.
word_count_EAP = word_count_EAP.drop(common_words)
word_count_HPL = word_count_HPL.drop(common_words)
word_count_MWS = word_count_MWS.drop(common_words)

Next, let's take the top PCA_ON_N_WORDS words from each author, in order to perform PCA (Principal Component Analysis).

Using the PCA, I will be able to reduce the embedding dimension from 768 to 2, and therefore plot the most common words.

Hopefully, we will be able to derive some insights from that.

We will start by creating a word count dataframe.

In [None]:
# For each author build two single-column ('count') frames from the head of the
# frequency table: a larger one for fitting the PCA and a smaller one for plotting.
cols = ['count']
word_count_EAP_top_n_for_pca = pd.DataFrame(word_count_EAP[:PCA_ON_N_WORDS], columns = cols)
word_count_EAP_top_n_for_plot = pd.DataFrame(word_count_EAP[:PLOT_M_WORDS], columns = cols)

# Tag the rows with their author so the frames can be concatenated below.
word_count_EAP_top_n_for_pca['author'] = 'EAP'
word_count_EAP_top_n_for_plot['author'] = 'EAP'


word_count_HPL_top_n_for_pca = pd.DataFrame(word_count_HPL[:PCA_ON_N_WORDS], columns = cols)
word_count_HPL_top_n_for_plot = pd.DataFrame(word_count_HPL[:PLOT_M_WORDS], columns = cols)

word_count_HPL_top_n_for_pca['author'] = 'HPL'
word_count_HPL_top_n_for_plot['author'] = 'HPL'

word_count_MWS_top_n_for_pca = pd.DataFrame(word_count_MWS[:PCA_ON_N_WORDS], columns = cols)
word_count_MWS_top_n_for_plot = pd.DataFrame(word_count_MWS[:PLOT_M_WORDS], columns = cols)

word_count_MWS_top_n_for_pca['author'] = 'MWS'
word_count_MWS_top_n_for_plot['author'] = 'MWS'

# Stack the three authors; reset_index() turns the word labels into a regular
# column, and the columns are then renamed to word / count / author.
df_for_pca = pd.concat([word_count_EAP_top_n_for_pca, word_count_HPL_top_n_for_pca, word_count_MWS_top_n_for_pca])
df_for_pca = df_for_pca.reset_index()
df_for_pca.columns = ['word', 'count', 'author']

df_for_plot = pd.concat([word_count_EAP_top_n_for_plot, word_count_HPL_top_n_for_plot, word_count_MWS_top_n_for_plot])
df_for_plot = df_for_plot.reset_index()
df_for_plot.columns = ['word', 'count', 'author']
df_for_plot.head(2)

<a id="embeddings"></a>

## Embeddings
> "Words have no power to impress the mind without the exquisite horror of their reality." - Edgar Allan Poe.

We will use the BERT pre-trained embeddings which map each word id to its (1,768) vector.

Let's define some helper functions in order to extract the word id for each token, and the embedding vector for each word id:

In [None]:
def word_to_index(word):
    """Return the tokenizer vocabulary id for `word`, looked up as one token.

    The string is passed to ``convert_tokens_to_ids`` unchanged; it is not
    split into sub-word pieces first.
    """
    return tokenizer.convert_tokens_to_ids(word)

print(word_to_index('Hello'))

def indices_to_vec(word_ids):
    """Embed a sequence of vocabulary ids with the pretrained input embeddings.

    Args:
        word_ids: sequence of integer token ids.

    Returns:
        numpy array with one embedding row per id.
    """
    # Build the index tensor directly as int64; going through a float32
    # torch.Tensor first cannot represent large ids exactly.
    ids = torch.as_tensor(word_ids, dtype=torch.long)
    # Lookup only -- no autograd graph is needed.
    with torch.no_grad():
        embeded_tokens = bert_embeddings(ids)
    return embeded_tokens.detach().numpy()

print(indices_to_vec([word_to_index('Hello'), word_to_index('Jacob')]).shape)

def index_to_vec(word_id):
    """Embed `word_id` and return the result with an extra leading axis.

    Args:
        word_id: a single integer id, or an array of ids.

    Returns:
        numpy array of embeddings; a scalar id yields one row, an array of N
        ids yields a leading axis of length 1 followed by N rows.
    """
    # Add the leading axis with numpy and create the int64 tensor in one step,
    # avoiding the float32 round trip and the list-of-arrays construction path.
    ids = torch.as_tensor(np.asarray(word_id)[None], dtype=torch.long)
    with torch.no_grad():
        embeded_token = bert_embeddings(ids)
    return embeded_token.detach().numpy()
vec = index_to_vec(word_to_index('Hello'))
print(vec.shape)

Now let's apply the functions and fit the PCA for our data

In [None]:
# Look up a vocabulary id for every selected word (each word is treated as one token).
df_for_plot['word_id'] = df_for_plot['word'].apply(word_to_index)
df_for_pca['word_id'] = df_for_pca['word'].apply(word_to_index)
# Embed all PCA words in a single call; index_to_vec adds a leading axis,
# which squeeze() removes again before fitting.
vectors = index_to_vec(df_for_pca['word_id'].to_numpy())
pca = PCA(n_components=2)
pca.fit(vectors.squeeze())
# Per-row embedding, consumed later by the 2-D projection step.
df_for_pca['word_vec'] = df_for_pca['word_id'].apply(index_to_vec)

vectors.shape

We can now reduce the dimension to 2:

In [None]:
def vec_to_2dim(word_vec):
    """Project an embedding with the fitted PCA and return its (x, y) pair.

    Only the first row of the transformed output is used.
    """
    projected = pca.transform(word_vec)
    first_row = projected[0]
    return first_row[0], first_row[1]
x, y = vec_to_2dim(vec)
print(x, y)

def tuple_x(xy):
    """Return the first element of the pair `xy`."""
    pair = tuple(xy)
    return pair[0]
print(tuple_x((1,2)))

def tuple_y(xy):
    """Return the second element of the pair `xy`."""
    pair = tuple(xy)
    return pair[1]
print(tuple_y((1,2)))

In [None]:
# Project every embedding to 2-D, then split the (x, y) pair into two columns.
df_for_pca['word_xy'] = df_for_pca['word_vec'].apply(vec_to_2dim)
df_for_pca['word_x'] = df_for_pca['word_xy'].apply(tuple_x)
df_for_pca['word_y'] = df_for_pca['word_xy'].apply(tuple_y)

In [None]:
# Attach the 2-D PCA coordinates to the words selected for plotting.
# DataFrame.join(other, on=...) matches the caller's column against the *index*
# of `other`. df_for_pca has a plain positional index, so joining on 'word_id'
# paired each word with whichever row happened to sit at position word_id.
# Indexing the PCA table by word_id (one row per id, since the coordinates
# depend only on the id) makes the lookup match on the id itself.
pca_by_word_id = df_for_pca.drop_duplicates(subset='word_id').set_index('word_id')
merged_df = df_for_plot.join(pca_by_word_id, on='word_id', lsuffix='_l', rsuffix='_r')
# Keep only the points left of x = 0.6 for the plot.
df_cleaned = merged_df[merged_df['word_x'] < 0.6 ]

df_cleaned.columns

In [None]:
df_cleaned['count_l'].min()

<a id="exploration"></a>

## Exploration and Visualization

> "When falsehood can look so like the truth, who can assure themselves of certain happiness?" - Mary Shelley.

Before we even start, let's make sure the dataset is kind of even:

In [None]:
# Number of excerpts per author. The column is passed by keyword: a positional
# Series is interpreted as `data` (not `x`) by current seaborn releases.
sns.countplot(x='author', data=train)
plt.xlabel('Authors');

I guess we can work with that. That's pretty balanced.

I also want to check if the word counts are close enough to normal distribution.

In [None]:
# Distribution of token counts: full range on the left, excerpts with fewer
# than 150 tokens on the right.
# NOTE(review): distplot is deprecated in recent seaborn releases (histplot /
# displot replace it) -- confirm against the installed version.
f, axes = plt.subplots(1, 2, figsize=(40,10))

sns.distplot(ax=axes[0], a=train['word_count'])
sns.distplot(ax=axes[1], a=train['word_count'][train['word_count'] < 150]);

In [None]:
# The null hypothesis of normaltest is that the sample comes from a normal
# distribution, so a small p-value is evidence *against* normality.
stats.normaltest(train['word_count'])

In [None]:
# Distribution of stopword counts: full range on the left, excerpts with fewer
# than 50 stopwords on the right.
# NOTE(review): distplot is deprecated in recent seaborn releases (histplot /
# displot replace it) -- confirm against the installed version.
f, axes = plt.subplots(1, 2, figsize=(40,10))

sns.distplot(ax=axes[0], a=train['stopword_count'])
sns.distplot(ax=axes[1], a=train['stopword_count'][train['stopword_count'] < 50]);

In [None]:
# The null hypothesis of normaltest is that the sample comes from a normal
# distribution, so a small p-value is evidence *against* normality.
stats.normaltest(train['stopword_count'])

Seems like it is, and that's great since we are going to need this assumption for later ANOVA tests.

Looking at the lengths of the excerpts, can we find significant differences between the different authors?

Let's first plot it:

In [None]:
f, axes = plt.subplots(1, 2, figsize=(20,10))

# One panel per cut-off on the token count. Labels are set through each panel's
# own Axes object: plt.xlabel / plt.ylabel / plt.title only touch the current
# axes, which left the first panel without its labels and title.
for ax, max_words in zip(axes, (60, 90)):
    sns.violinplot(x='author', y='word_count', data=train[train['word_count'] < max_words], ax=ax)
    ax.set_xlabel('Author Name', fontsize=12)
    ax.set_ylabel('Number of words in text', fontsize=12)
    ax.set_title("Number of words by author", fontsize=15)


While it is not very clear from the plot, we can perform a one-way ANOVA (Analysis of Variance) in order to test for significant differences between the authors.

### ANOVA - Analysis of Variance

Since "author" is a categorical variable, and we want to check the difference between the three authors, the appropriate test will be ANOVA.
If there is any significant effect, we will perform three T tests.


In [None]:
def calculate_one_way_anova(column_name):
    """Run a one-way ANOVA of `column_name` across the three authors.

    Reads the module-level `train` frame and returns the result of
    ``scipy.stats.f_oneway`` for the EAP, HPL and MWS groups (in that order).
    """
    per_author = [
        train[column_name][train['author'] == author_key]
        for author_key in ('EAP', 'HPL', 'MWS')
    ]
    return stats.f_oneway(*per_author)

In [None]:
one_way = calculate_one_way_anova('word_count')
print(f'The F score when comaring all the authors: {one_way.statistic}, which reflect pvalue of: {one_way.pvalue}')

We got a very low pvalue, which means that we can conclude that there is indeed significant difference between the authors.

However, we don't really know where is this difference coming from.

In order to do that, we will use T tests for comparing each of the pairs.

In [None]:
# Pairwise independent-samples t-tests on the token counts, one per author pair.
# NOTE(review): three tests are run without a multiple-comparison correction --
# confirm whether one is wanted.
t_EAP_HPL = stats.ttest_ind(train['word_count'][train['author'] == 'EAP'], train['word_count'][train['author'] == 'HPL'])
t_MWS_HPL = stats.ttest_ind(train['word_count'][train['author'] == 'MWS'], train['word_count'][train['author'] == 'HPL'])
t_MWS_EAP = stats.ttest_ind(train['word_count'][train['author'] == 'MWS'], train['word_count'][train['author'] == 'EAP'])

print(f'The T score when comaring EAP and HPL: {t_EAP_HPL.statistic} which reflect pvalue of: {t_EAP_HPL.pvalue}')
print(f'The T score when comaring MWS and HPL: {t_MWS_HPL.statistic} which reflect pvalue of: {t_MWS_HPL.pvalue}')
print(f'The T score when comaring MWS and EAP: {t_MWS_EAP.statistic} which reflect pvalue of: {t_MWS_EAP.pvalue}')

Well, we can see that each of the pairs proved to be statistically different with very high probability.

Now let's perform the same process for stopwords:

In [None]:
f, axes = plt.subplots(1, 2, figsize=(20,10))

# One panel per cut-off on the stopword count. Labels are set through each
# panel's own Axes object: plt.xlabel / plt.ylabel / plt.title only touch the
# current axes, which left the first panel without its labels and title.
for ax, max_stopwords in zip(axes, (30, 60)):
    sns.violinplot(x='author', y='stopword_count', data=train[train['stopword_count'] < max_stopwords], ax=ax)
    ax.set_xlabel('Author Name', fontsize=12)
    ax.set_ylabel('Number of stop words in text', fontsize=12)
    ax.set_title("Number of stop words by author", fontsize=15)

In [None]:
one_way = calculate_one_way_anova('stopword_count')
print(f'The F score when comaring all the authors: {one_way.statistic}, which reflect pvalue of: {one_way.pvalue}')

In [None]:
# Pairwise independent-samples t-tests on the stopword counts, one per author pair.
# NOTE(review): three tests are run without a multiple-comparison correction --
# confirm whether one is wanted.
t_EAP_HPL = stats.ttest_ind(train['stopword_count'][train['author'] == 'EAP'], train['stopword_count'][train['author'] == 'HPL'])
t_MWS_HPL = stats.ttest_ind(train['stopword_count'][train['author'] == 'MWS'], train['stopword_count'][train['author'] == 'HPL'])
t_MWS_EAP = stats.ttest_ind(train['stopword_count'][train['author'] == 'MWS'], train['stopword_count'][train['author'] == 'EAP'])

print(f'The T score when comaring EAP and HPL: {t_EAP_HPL.statistic} which reflect pvalue of: {t_EAP_HPL.pvalue}')
print(f'The T score when comaring MWS and HPL: {t_MWS_HPL.statistic} which reflect pvalue of: {t_MWS_HPL.pvalue}')
print(f'The T score when comaring MWS and EAP: {t_MWS_EAP.statistic} which reflect pvalue of: {t_MWS_EAP.pvalue}')

Again, we got very similar results, with significant differences between all of the authors.

Now, I want to use the computed PCA values of the words from the word counts, in order to see if there is any visible trend.

In [None]:
# Scatter of the plotted words in PCA space: colour encodes the author and
# marker size encodes the word's count.
p = sns.relplot(x="word_x", y="word_y", hue="author_l", size="count_l",
             alpha=.5, palette="muted", sizes=(70,450),
            height=20, aspect=1, data=df_cleaned)
ax = p.axes[0,0]

# Write each word slightly to the right of its marker.
for idx, row in df_cleaned.iterrows():
     ax.text(row['word_x']+ 0.001, row['word_y'], row['word_l'], horizontalalignment='left', size='large', color='black')

Unfortunately, I can't see any obvious trend in this plot, but at least we tried.

We can also get a sense of the most unique used words by each author: 

In [None]:
# NOTE: sns.set changes the global plotting style for all later figures too.
sns.set(style="white", context="talk", font_scale = 4)

# Set up the matplotlib figure: one row per author
f, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(90, 60), sharex=False)

# Edgar Allan Poe: 20 most frequent remaining words
x1 = word_count_EAP[:20].index
y1 = word_count_EAP[:20]
sns.barplot(x=x1, y=y1, ax=ax1)
ax1.axhline(0, color="k", clip_on=False)
ax1.set_ylabel("Edgar Allan Poe",fontsize=80)

# Mary Shelley: 20 most frequent remaining words
x2 = word_count_MWS[:20].index
y2 = word_count_MWS[:20]
sns.barplot(x=x2, y=y2, ax=ax2)
ax2.axhline(0, color="k", clip_on=False)
ax2.set_ylabel("Mary Shelley",fontsize=80)

# H.P. Lovecraft: 20 most frequent remaining words (the only panel with an explicit palette)
x3 = word_count_HPL[:20].index
y3 = word_count_HPL[:20]
sns.barplot(x=x3, y=y3, palette="deep", ax=ax3)
ax3.axhline(0, color="k", clip_on=False)
ax3.set_ylabel("H.P. Lovecraft",fontsize=80)

# Finalize the plot: remove spines and y ticks, then tighten the layout
sns.despine(bottom=True)
plt.setp(f.axes, yticks=[])
plt.tight_layout(h_pad=5)

<a id="classification"></a>

## Classification

First let's prepare the labels for our train set.

In [None]:
def author_to_label(author):
    """Map an author code ('EAP', 'HPL' or 'MWS') to its integer class label.

    Raises:
        KeyError: if `author` is not one of the three codes.
    """
    return {'EAP': 0, 'HPL': 1, 'MWS': 2}[author]

def label_to_author(label):
    """Map a class label back to its author code.

    `label` is converted with ``int()`` before being used as an index into
    ('EAP', 'HPL', 'MWS').
    """
    position = int(label)
    return ['EAP', 'HPL', 'MWS'][position]

In [None]:
# Integer class label plus one indicator column per author value.
train['label'] = train['author'].apply(author_to_label)
dummies = pd.get_dummies(train['author'])
train = pd.concat([train,dummies], axis=1)
train.head(1)

### Torch Classes

We will need two datasets for the train set and the test set:

In [None]:
class TrainDataSet(data.Dataset):
    """Labelled excerpts, encoded on access with the supplied tokenizer.

    Args:
        excerpts: indexable collection of excerpt texts.
        labels: indexable collection of integer class labels, aligned with `excerpts`.
        tokenizer: object exposing ``encode_plus``.
        max_len: value passed to the tokenizer as ``max_length``.
    """

    def __init__(self, excerpts, labels, tokenizer, max_len):
        self.excerpts, self.labels = excerpts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.excerpts)

    def __getitem__(self, item):
        """Return the text, flattened input ids / attention mask, and target of one excerpt."""
        text = str(self.excerpts[item])

        # NOTE(review): `pad_to_max_length` is reported as deprecated by newer
        # transformers releases -- confirm against the installed version.
        encode_options = dict(
            max_length=self.max_len,
            add_special_tokens=True,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(text, **encode_options)

        sample_dict = {'excerpt_text': text}
        # The tokenizer output carries a leading axis; flatten() yields 1-D tensors.
        sample_dict['input_ids'] = encoded['input_ids'].flatten()
        sample_dict['attention_mask'] = encoded['attention_mask'].flatten()
        sample_dict['targets'] = torch.tensor(self.labels[item], dtype=torch.long)
        return sample_dict

In [None]:
class TestDataSet(data.Dataset):
    """Unlabelled excerpts with their ids, encoded on access with the supplied tokenizer.

    Args:
        ids: indexable collection of excerpt identifiers, aligned with `excerpts`.
        excerpts: indexable collection of excerpt texts.
        tokenizer: object exposing ``encode_plus``.
        max_len: value passed to the tokenizer as ``max_length``.
    """

    def __init__(self, ids, excerpts, tokenizer, max_len):
        self.ids, self.excerpts = ids, excerpts
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.excerpts)

    def __getitem__(self, item):
        """Return the id, text and flattened input ids / attention mask of one excerpt."""
        text = str(self.excerpts[item])
        identifier = str(self.ids[item])

        # NOTE(review): `pad_to_max_length` is reported as deprecated by newer
        # transformers releases -- confirm against the installed version.
        encode_options = dict(
            max_length=self.max_len,
            add_special_tokens=True,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(text, **encode_options)

        sample_dict = {'excerpt_id': identifier, 'excerpt_text': text}
        # The tokenizer output carries a leading axis; flatten() yields 1-D tensors.
        sample_dict['input_ids'] = encoded['input_ids'].flatten()
        sample_dict['attention_mask'] = encoded['attention_mask'].flatten()
        return sample_dict

And dataloaders for them:

In [None]:
def create_train_data_loader(df, tokenizer, max_len, batch_size, shuffle=True, num_workers=4):
    """Build a DataLoader over the labelled excerpts in `df`.

    Args:
        df: DataFrame with 'text' and 'label' columns.
        tokenizer: tokenizer handed to TrainDataSet.
        max_len: maximum encoded length handed to TrainDataSet.
        batch_size: number of excerpts per batch.
        shuffle: whether the loader reshuffles the data (default True, as before).
        num_workers: number of loader worker processes (default 4, as before).

    Returns:
        torch DataLoader yielding the dictionaries produced by TrainDataSet.
    """
    # The original lines ended with a stray comma, which wrapped each array in a
    # 1-tuple and made the size print below always report 1.
    excerpts = df['text'].to_numpy()
    labels = df['label'].to_numpy()
    print(f'Excerpts size: {len(excerpts)}')
    dataset = TrainDataSet(excerpts=excerpts, labels=labels, tokenizer=tokenizer, max_len=max_len)
    print(f'Dataset size: {len(dataset)}')
    return data.DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=shuffle)

def create_test_data_loader(df, tokenizer, max_len, batch_size, num_workers=4):
    """Build an unshuffled DataLoader over the unlabelled excerpts in `df`.

    Args:
        df: DataFrame with 'id' and 'text' columns.
        tokenizer: tokenizer handed to TestDataSet.
        max_len: maximum encoded length handed to TestDataSet.
        batch_size: number of excerpts per batch.
        num_workers: number of loader worker processes (default 4, as before).

    Returns:
        torch DataLoader yielding the dictionaries produced by TestDataSet.
    """
    # The original lines ended with a stray comma, which wrapped each array in a
    # 1-tuple and made the size print below always report 1.
    excerpts = df['text'].to_numpy()
    ids = df['id'].to_numpy()
    print(f'Excerpts size: {len(excerpts)}')
    dataset = TestDataSet(ids=ids, excerpts=excerpts, tokenizer=tokenizer, max_len=max_len)
    print(f'Dataset size: {len(dataset)}')
    return data.DataLoader(dataset, batch_size=batch_size, num_workers=num_workers)

And now split the train data for train and validation sets.

In [None]:
# Hold out 20% of the labelled data for validation.
# NOTE(review): no random_state is passed, so the split differs between runs.
train_set, val_set = train_test_split(train, test_size=0.2)

We will pad the excerpts up to MAX_LEN, or trim them if needed.

In [None]:
BATCH_SIZE = 16
MAX_LEN = 160

In [None]:
train_data_loader = create_train_data_loader(train_set, tokenizer, max_len=MAX_LEN, batch_size=BATCH_SIZE)
val_data_loader = create_train_data_loader(val_set, tokenizer, max_len=MAX_LEN, batch_size=BATCH_SIZE)

In [None]:
# Pull one batch to inspect its shape.
# NOTE: this rebinds `sample`, which until now held the sample_submission DataFrame.
sample = next(iter(train_data_loader))
print(sample['input_ids'].shape)

The classifier is an nn.Module which adds a linear classification layer on top of the BERT model outputs.

I use DistilBERT in order to get a smaller model.

In [None]:
class DistilBertAuthorClassifier(nn.Module):
    """DistilBERT encoder followed by dropout and a linear classification head.

    Args:
        num_labels: number of output classes (default 3, one per author).
        pretrained_model_name: checkpoint loaded into the encoder.
        dropout: dropout probability applied before the linear head.
    """

    def __init__(self, num_labels=3, pretrained_model_name='distilbert-base-cased', dropout=0.3):
        super().__init__()
        self.num_labels = num_labels

        # Kept as an attribute for callers; forward() returns raw logits and
        # does not apply it.
        self.softmax = nn.Softmax(dim=1)
        self.distilbert = DistilBertModel.from_pretrained(pretrained_model_name)
        # The head size follows num_labels instead of a second hard-coded 3.
        self.classifier = nn.Linear(self.distilbert.config.dim, self.num_labels)
        self.dropout = nn.Dropout(dropout)

        nn.init.xavier_normal_(self.classifier.weight)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class scores (logits), one row per input sequence."""
        distilbert_output = self.distilbert(input_ids=input_ids,
                                            attention_mask=attention_mask)
        hidden_state = distilbert_output[0]
        # Use the hidden state at the first sequence position as the summary
        # of the whole excerpt.
        pooled_output = hidden_state[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

In [None]:
gc.collect()
model = DistilBertAuthorClassifier()
model = model.to(device)

In [None]:
# Sanity check: push the sampled batch through the classifier.
input_ids = sample['input_ids'].to(device)
attention_mask = sample['attention_mask'].to(device)

print(input_ids.shape)
print(attention_mask.shape)
# The model returns logits, so `prob` holds the largest logit of each row
# (not a probability) and `pred` the index of that logit.
prob, pred = torch.max(model(input_ids=input_ids, attention_mask=attention_mask),dim=1)
print(prob)
print(pred)
print(sample['targets'])

In [None]:
model.distilbert.config

### Training

I trained the model on a Google Colab GPU with the following code, using the cross-entropy loss function and the AdamW optimizer:

In [None]:
EPOCHES = 3  # number of passes over the training set
# The optimizer, scheduler and loss are only created when a GPU is present.
if torch.cuda.is_available():
    optimizer = transformers.AdamW(model.parameters(), lr=2e-5, correct_bias=False)
    # The scheduler is stepped once per batch, hence batches * epochs.
    total_steps = len(train_data_loader) * EPOCHES

    # Linear decay of the learning rate over total_steps, with no warm-up steps.
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
def train_epoch(model, data_loader, loss_fn, optimizer, scheduler, n_examples):
    """Train `model` for one pass over `data_loader`.

    Args:
        model: classifier called with `input_ids` and `attention_mask`.
        data_loader: yields batches with 'input_ids', 'attention_mask' and 'targets'.
        loss_fn: loss applied to (model outputs, targets).
        optimizer: stepped once per batch.
        scheduler: learning-rate scheduler, stepped once per batch.
        n_examples: denominator used for the accuracy.

    Returns:
        Tuple (accuracy, mean batch loss); the accuracy is a double tensor.
    """
    model = model.train()

    batch_losses = []
    n_correct = 0

    for batch in data_loader:
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        labels = batch['targets'].to(device)

        logits = model(input_ids=ids, attention_mask=mask)
        # Predicted class = position of the largest output in each row.
        predicted = torch.max(logits, dim=1)[1]
        loss = loss_fn(logits, labels)

        n_correct += torch.sum(predicted == labels)
        batch_losses.append(loss.item())

        # Backpropagate, update the weights and learning rate, then clear the
        # gradients for the next batch.
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    accuracy = n_correct.double() / n_examples
    return accuracy, np.mean(batch_losses)

In [None]:
def eval_model(model, data_loader, loss_fn, n_examples):
    """Evaluate `model` on `data_loader` without updating any weights.

    Args:
        model: classifier called with `input_ids` and `attention_mask`.
        data_loader: yields batches with 'input_ids', 'attention_mask' and 'targets'.
        loss_fn: loss applied to (model outputs, targets).
        n_examples: denominator used for the accuracy.

    Returns:
        Tuple (accuracy, mean batch loss); the accuracy is a double tensor.
    """
    model = model.eval()

    batch_losses = []
    n_correct = 0

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch in data_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            labels = batch['targets'].to(device)

            logits = model(input_ids=ids, attention_mask=mask)
            # Predicted class = position of the largest output in each row.
            predicted = torch.max(logits, dim=1)[1]
            loss = loss_fn(logits, labels)

            n_correct += torch.sum(predicted == labels)
            batch_losses.append(loss.item())

    accuracy = n_correct.double() / n_examples
    return accuracy, np.mean(batch_losses)

In [None]:
def predict_authors(model, data_loader, submission_df):
    """Print the raw model outputs for every batch of `data_loader`.

    `submission_df` is accepted but not used, and nothing is returned.
    """
    model = model.eval()
    with torch.no_grad():
        for batch in data_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            logits = model(input_ids=ids, attention_mask=mask)
            print(logits)
            

In [None]:
# Re-create the optimizer, scheduler and loss for the Colab GPU training run
# (same settings as the earlier setup cell).
if is_colab and torch.cuda.is_available():
    optimizer = transformers.AdamW(model.parameters(), lr=2e-5, correct_bias=False)
    # The scheduler is stepped once per batch, hence batches * epochs.
    total_steps = len(train_data_loader) * EPOCHES

    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.4)
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )
    loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
filename = 'finalized_model.pt'

# Training only runs on Colab with a GPU.
if is_colab and torch.cuda.is_available():
    model = model.to(device)

    # Per-epoch metrics, stored as plain Python numbers so they can be plotted.
    history = defaultdict(list)
    # Best validation accuracy seen so far; -inf guarantees that the first
    # epoch always produces a checkpoint.
    best_accuracy = float('-inf')

    for epoch in range(EPOCHES):
        print(f'Epoch {epoch + 1}/{EPOCHES}')
        print('-'*10)

        train_acc, train_loss = train_epoch(
            model,
            train_data_loader,
            loss_fn,
            optimizer,
            scheduler,
            len(train_set)
        )
        print(f'Train loss: {train_loss}, accuracy: {train_acc}')

        val_acc, val_loss = eval_model(
            model,
            val_data_loader,
            loss_fn,
            len(val_set)
        )
        print(f'Validation loss: {val_loss}, accuracy: {val_acc}')

        # `history` and `best_accuracy` were previously created but never
        # updated; record the metrics of this epoch.
        history['train_acc'].append(train_acc.item())
        history['train_loss'].append(float(train_loss))
        history['val_acc'].append(val_acc.item())
        history['val_loss'].append(float(val_loss))

        # Keep the weights of the epoch with the best validation accuracy
        # rather than whatever the last epoch produced.
        if val_acc.item() > best_accuracy:
            best_accuracy = val_acc.item()
            torch.save(model.state_dict(), DIR_NAME+filename)

In [None]:
# Outside Colab, load the fine-tuned weights instead of training.
# NOTE(review): the condition is also true for a local (non-Kaggle) run, where
# this Kaggle input path presumably does not exist -- confirm.
if not is_colab:
    model.load_state_dict(torch.load('/kaggle/input/spookydistilbert/finalized_model.pt', map_location=device))
    model.eval()

In [None]:
# Build the test loader and run one batch through the model as a smoke test.
test_data_loader = create_test_data_loader(test, tokenizer, max_len=MAX_LEN, batch_size=BATCH_SIZE)
sample = next(iter(test_data_loader))

input_ids = sample['input_ids'].to(device)
attention_mask = sample['attention_mask'].to(device)

print(input_ids.shape)
print(attention_mask.shape)
# The model returns logits, so `prob` holds the largest logit of each row
# (not a probability) and `pred` the index of that logit.
prob, pred = torch.max(model(input_ids=input_ids, attention_mask=attention_mask),dim=1)
print(prob)
print(pred)

In [None]:
# plt.plot(history['train_acc'], label='train accuracy')
# plt.plot(history['val_acc'], label='validation accuracy')

# plt.title('Training history')
# plt.ylabel('Accuracy')
# plt.xlabel('Epoch')
# plt.legend()
# plt.ylim([0, 1]);

## Create Submission for Comparison

In order to be able to compare the results of the model I created a submission for the kaggle contest:

In [None]:
def test_model(model, data_loader, results_df):
    """Fill *results_df* with the model's class probabilities.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``;
            its output is passed through a softmax over dim 1.
        data_loader: Iterable of batches, each a mapping with the keys
            ``'excerpt_id'``, ``'input_ids'`` and ``'attention_mask'``.
        results_df: DataFrame with an ``'id'`` column and the probability
            columns ``'EAP'``, ``'HPL'`` and ``'MWS'``. It is modified in
            place.

    Returns:
        The same *results_df* object. Rows whose ``'id'`` was seen in
        *data_loader* carry the predicted probabilities; other rows are
        left untouched.
    """
    model = model.eval()
    prob_columns = ['EAP', 'HPL', 'MWS']
    predictions = {}

    with torch.no_grad():
        for batch in data_loader:
            excerpt_ids = batch['excerpt_id']
            # Presumably a list of id strings; unwrap a tensor of ids so the
            # values can be used as dictionary keys.
            if torch.is_tensor(excerpt_ids):
                excerpt_ids = excerpt_ids.tolist()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # One device-to-host transfer per batch instead of one per row.
            probs = nn.functional.softmax(outputs, dim=1).cpu().tolist()
            predictions.update(zip(excerpt_ids, probs))

    # Write every prediction in a single assignment rather than scanning the
    # whole DataFrame once per excerpt.
    ids = results_df['id']
    matched = ids.isin(list(predictions))
    if matched.any():
        results_df.loc[matched, prob_columns] = np.asarray(
            [predictions[excerpt_id] for excerpt_id in ids[matched]]
        )
    return results_df

In [None]:
# The sample submission serves as the template that test_model() fills in.
sample_submission_path = './sample_submission.csv'
results_df = pd.read_csv(sample_submission_path)
results_df.head()

In [None]:
# Replace the template values with the model's predictions and preview them.
results_df = test_model(
    model=model,
    data_loader=test_data_loader,
    results_df=results_df,
)
results_df.head()

In [None]:
# Write the filled-in predictions to submission.csv, leaving out the DataFrame index.
results_df.to_csv("submission.csv", index=False)

I ended up with a loss of 0.42742, which is pretty mediocre, but still not bad for so little effort, relying mostly on pretrained models.

<a id="creating_poe"></a>


## Text Generation:

> “I wish I could write as mysterious as a cat.” - Edgar Allan Poe

Ultimately, we would like to "draw" a sentence from each author's word distribution.
We therefore need a probabilistic model that captures some aspects of these distributions.

<table><tr><td><img src='https://vignette.wikia.nocookie.net/altered-carbon/images/7/71/Poe.jpg' width="300"> <figcaption>Poe, Owner of the Raven - AI hotel, Altered Carbon</figcaption></td>
    </tr></table>

### N-gram Language Model

A simple approach for sentence generation is N-gram models.

With N-gram models, we estimate the probability of each word given its prior context (the sequence of words preceding it).

$$
P(red|Roses,are)
$$
An N-gram model uses only N−1 words of prior context.
* unigram: $$ P(red) $$
* bigram: $$ P(red|are) $$
* trigram: $$ P(red|Roses,are) $$

The bigram approximation: $$ P(w_1^n) \approx \prod_{k=1}^n P(w_k|w_{k-1}) $$
The N-gram approximation: $$ P(w_1^n) \approx \prod_{k=1}^n P(w_k|w_{k-N+1}^{k-1}) $$

We can estimate the conditional probabilities from raw text based on the relative frequency of word sequences.
For N-gram: $$ P(w_n|w_{n-N+1}^{n-1}) = \frac{C(w^{n-1}_{n-N+1}w_n)}{C(w_{n-N+1}^{n-1})} $$

However, since the word-wise N-gram approach demands more data than we have, we will use a character-wise N-gram model, which rests on the same theoretical idea.

In [None]:
class LM:
    """Character-level N-gram language model.

    For every context of ``n - 1`` consecutive characters seen in the
    training text, the model counts how often each character follows it.
    Text is generated by repeatedly sampling the next character in
    proportion to those counts.
    """

    def __init__(self, n):
        """Create an untrained model.

        Args:
            n: Order of the model; each character is predicted from the
                ``n - 1`` characters before it.

        Raises:
            ValueError: If *n* is smaller than 1.
        """
        if n < 1:
            raise ValueError(f"n must be at least 1, got {n}")
        self.n_gram = n
        # Created here so that generate() on an untrained model returns the
        # seed text instead of raising AttributeError.
        self.n_counts = defaultdict(Counter)

    def train(self, text):
        """Count next-character frequencies for every context in *text*.

        Counts from a previous call are discarded.

        Args:
            text: Training string. If it is shorter than ``n`` characters
                the model ends up empty.
        """
        context_len = self.n_gram - 1
        self.n_counts = defaultdict(Counter)
        for start in range(len(text) - context_len):
            context = text[start:start + context_len]
            next_char = text[start + context_len]
            self.n_counts[context][next_char] += 1

    def generate(self, init_text, n):
        """Extend *init_text* one sampled character at a time.

        Args:
            init_text: Seed string that the result starts with.
            n: Target length of the returned string, seed included.

        Returns:
            The generated string. It is shorter than *n* when the current
            context never occurred in the training text, because
            generation stops at that point.
        """
        context_len = self.n_gram - 1
        text = init_text
        while len(text) < n:
            # Use an explicit start index: for a unigram model the context
            # is empty, but ``text[-0:]`` would be the whole string.
            context = text[max(len(text) - context_len, 0):]
            # Membership test first, so the defaultdict does not grow.
            if context not in self.n_counts:
                break
            counter = self.n_counts[context]
            chars = list(counter)
            counts = np.fromiter(counter.values(), dtype=float)
            cumulative_probs = np.cumsum(counts / counts.sum())
            p = np.random.rand()
            # First index whose cumulative probability reaches p.
            idx = int(np.searchsorted(cumulative_probs, p, side='left'))
            # Rounding can leave the final cumulative value just below 1;
            # clamp so that a character is always appended.
            text += chars[min(idx, len(chars) - 1)]
        return text

In [None]:
def get_trained_lm(n, excerpts):
    """Build an ``LM`` of order *n* trained on the given excerpts.

    The excerpts are joined with single spaces into one training string.

    Args:
        n: Order passed to ``LM``.
        excerpts: Iterable of strings.

    Returns:
        The trained ``LM`` instance.
    """
    corpus = ' '.join(excerpts)
    model = LM(n)
    model.train(corpus)
    return model

Let's train an N-gram language model for each of the authors:

In [None]:
n = 7
# One language model per author, each trained only on that author's excerpts.
lm_EAP, lm_HPL, lm_MWS = (
    get_trained_lm(n, train.loc[train['author'] == author_code, 'text'])
    for author_code in ('EAP', 'HPL', 'MWS')
)
print(f"lm_EAP size: {len(lm_EAP.n_counts)}, lm_HPL size: {len(lm_HPL.n_counts)}, lm_MWS size: {len(lm_MWS.n_counts)}")

Now let's see the different sentences generated from the same seed text by the different language models:

In [None]:
# Every model starts from the same seed and extends it to `l` characters.
seed_string = "Dark night"
l = 120
models = dict(Poe=lm_EAP, Lovecraft=lm_HPL, Shelley=lm_MWS)
for model_name, author_lm in models.items():
    generated = author_lm.generate(seed_string, l)
    print(f"\"{generated}...\", The bot {model_name}, 2020\n")

As expected, the result is more entertaining than informative, but it sure helps to grasp what a simple model such as the N-gram may achieve.

<a id="conclusions"></a>


## Conclusions

> “Never Explain Anything”  - H.P. Lovecraft


I started with three assumptions:
1. Statistically significant differences between the authors. 
2. Good prediction for classification task. 
3. Somewhat entertaining yet sensible simulated excerpts from each author.

My conclusions are therefore the following:
1. There are indeed some significant differences between the authors, as we saw in the ANOVA and T tests.
2. We achieved reasonable predictions on the classification task, although we definitely had an overfitting problem. I assume that with ten times the amount of data we would have achieved better results.
3. It's actually quite surprising that a very simple language model can produce interesting generated excerpts.

