This notebook is dedicated for the classification between Nietzsche's early and late work. The cutoff is set to be 1879, when Nietzsche had to resign from his university post for his illness.

In [3]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertConfig
from transformers import AdamW, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
import lucem_illud 
import time
import datetime

%matplotlib inline

import os
os.environ['KERAS_BACKEND'] = 'tensorflow'
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding

In [18]:
gpu = torch.cuda.is_available()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if gpu:
    n_gpu = torch.cuda.device_count()
    torch.cuda.get_device_name(0)

In [4]:
# Helper functions from week 8 homework
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    
    return str(datetime.timedelta(seconds=elapsed_rounded))

# Import the Nietzsche dataset

In [5]:
# label = 1 means that the text is from Nietzsche's work after 1879, 
# the year where his health conditions force him to resign his professorship at Basel.
ndf = pd.read_csv('data/nietzsche_gutenberg.csv')
ndf.sample(5, random_state=20210310)

Unnamed: 0,text,book,year,after_1879,elisabeth_published
5622,Of course I do not mean that the thought in an...,early_greek,1871,0,0
32772,With the storm that is called spirit did I blo...,zarathustra,1883,1,0
23097,The world of energy does not therefore reach a...,twilight,1888,1,0
33837,"Curiously do they exert themselves, like an el...",zarathustra,1883,1,0
4502,"Their pastime, as cruel as it is lamentable, i...",dawn,1881,1,0


In [7]:
# pull out a holdout set from data
main_set, holdout_set = train_test_split(ndf, test_size=0.2, random_state=20210310)

main_set.to_csv('data/nietzsche_gutenberg_main.csv', index=False)
holdout_set.to_csv('data/nietzsche_gutenberg_holdout.csv', index=False)

## Prepping main set for training

In [9]:
sentences = main_set.text.values
sentences = ["[CLS] " + sentence + " [SEP]" for sentence in sentences]
labels = main_set.after_1879.values
MAX_LEN = 128

# Tokenizing 
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print ("Tokenize the first sentence:")
print (tokenized_texts[0])

# Word to idx
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

attention_masks = []
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)
    
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, 
                                                            random_state=2020, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2020, test_size=0.1)

Tokenize the first sentence:
['[CLS]', 'and', 'it', 'is', 'perhaps', 'not', 'only', 'the', 'agree', '##able', 'and', 'friendly', 'pictures', 'that', 'he', 'realises', 'in', 'himself', 'with', 'such', 'perfect', 'understanding', ':', 'the', 'earnest', ',', 'the', 'troubled', ',', 'the', 'dr', '##ear', '##y', ',', 'the', 'gloom', '##y', ',', 'the', 'sudden', 'checks', ',', 'the', 'tricks', 'of', 'fortune', ',', 'the', 'uneasy', 'present', '##ime', '##nts', ',', 'in', 'short', ',', 'the', 'whole', 'divine', 'comedy', 'of', 'life', ',', 'and', 'the', 'inferno', ',', 'also', 'pass', 'before', 'him', ',', 'not', 'merely', 'like', 'pictures', 'on', 'the', 'wall', '##for', 'he', 'too', 'lives', 'and', 'suffers', 'in', 'these', 'scenes', ',', 'and', 'yet', 'not', 'without', 'that', 'fleeting', 'sensation', 'of', 'appearance', '.', '[SEP]']


In [68]:
# Prepping test set
sentences_holdout = holdout_set.text.values
sentences_holdout = ["[CLS] " + sentence + " [SEP]" for sentence in sentences]
labels_holdout = holdout_set.after_1879.values
MAX_LEN = 128

# Tokenizing 
tokenizer_holdout = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenized_texts_holdout = [tokenizer.tokenize(sent) for sent in sentences]

# Word to idx
input_ids_holdout = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids_holdout = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

# LSTM model

In [10]:
# LSTM model parameters
vocab_in_size = tokenizer.vocab_size
embedding_dim = 32
unit = 100
no_labels = len(np.unique(train_labels))
batch_size = 32

In [11]:
# LSTM model, with one hidden layer
model_lstm = Sequential()
model_lstm.add(Embedding(vocab_in_size, embedding_dim, input_length=MAX_LEN))
model_lstm.add(LSTM(unit))
model_lstm.add(Dense(no_labels, activation='softmax'))
model_lstm.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_lstm.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 128, 32)           976704    
_________________________________________________________________
lstm (LSTM)                  (None, 100)               53200     
_________________________________________________________________
dense (Dense)                (None, 2)                 202       
Total params: 1,030,106
Trainable params: 1,030,106
Non-trainable params: 0
_________________________________________________________________


In [12]:
nietzsche_lstm = model_lstm.fit(train_inputs, train_labels, 
                              epochs=10,batch_size=batch_size)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [76]:
def predict_label(logits):
    m = torch.nn.Sigmoid()
    labels = []
    for pair in logits:
        l = 1
        if pair[0] > pair[1]:
            l = 0
        labels.append(l)
    
    return np.array(labels)      

In [79]:
# Testing on holdout set
predicted_logits = model_lstm.predict(input_ids_holdout)
predictions = predict_label(predicted_logits)
accuracy = (predictions == labels_holdout).sum() / len(labels_holdout)
print(f'The holdout set accuracy for LSTM model is {accuracy}')

The holdout set accuracy for LSTM model is 0.8233318732661702


# BERT model

In [14]:
# Please refer to the following link for the Colab notebook, or to file 'clf/Classification_DeepLearning_colab.ipynb'
# https://colab.research.google.com/drive/1AAJxWCLKu6bv8TQWN3X3bn1UJbBXVmUb?usp=sharing
model = BertForSequenceClassification.from_pretrained("clf/model_nietzsche_clf", num_labels=2)
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [16]:
# Prepping the holdout set for BERT model input
sentences = holdout_set.text.values
labels = holdout_set.after_1879.values

input_ids = []

for sent in sentences:
    encoded_sent = tokenizer.encode(
                        sent,                      
                        add_special_tokens = True, 
                   )
    
    input_ids.append(encoded_sent)

input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, 
                          dtype="long", truncating="post", padding="post")

attention_masks = []

for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask) 

prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
prediction_labels = torch.tensor(labels)

batch_size = 32  

prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

In [20]:
# Predicting the holdout set
print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))

model.eval()

predictions , true_labels = [], []

for batch in prediction_dataloader:
    batch = tuple(t.to(device) for t in batch)

    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask)

    logits = outputs[0]

    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    predictions.append(logits)
    true_labels.append(label_ids)

print('    DONE.')

Predicting labels for 6,849 test sentences...
    DONE.


In [53]:
# calculating accuracy on holdout set
accu = 0
total = 0
for i in range(len(true_labels)):
    total += len(true_labels[i])
    accu += (predict_label(predictions[1]) == true_labels[1]).sum()

print(f'The holdout accuracy is {round(accu/total, 3)}')

The holdout accuracy is 0.879


# Text Generation with BERT

In [55]:
# Please refer to colab notebook above

# The following code is for prepping dataset, then the ready input is put into colab file
# train_text, test_text = train_test_split(ndf['text'], test_size=0.2, random_state=20210310)
#train_text.to_frame().to_csv(r'train_text_nietzsche', header=None, index=None, sep=' ', mode='a')
#test_text.to_frame().to_csv(r'test_text_nietzsche', header=None, index=None, sep=' ', mode='a')

from transformers import AutoModelWithLMHead, AutoTokenizer

# loading trained model
tokenizer_nietzsche = AutoTokenizer.from_pretrained("clf/output_nietzsche")
model_nietzsche = AutoModelWithLMHead.from_pretrained("clf/output_nietzsche")

Then we start to generate sentences:

In [56]:
sequence = "Life is nothing but"

input = tokenizer_nietzsche.encode(sequence, return_tensors="pt")
generated = model_nietzsche.generate(input, max_length=50, bos_token_id=1, pad_token_id=1, eos_token_ids=1)

resulting_string = tokenizer_nietzsche.decode(generated.tolist()[0])
print(resulting_string)

Life is nothing but a _dramatic_ phenomenon, and the _dramatic_ phenomenon is the _dramatic_ phenomenon."
"The _dramatic_ phenomenon is the _dramatic_ phenomenon, and the _


In [57]:
sequence = "One should live like"

input = tokenizer_nietzsche.encode(sequence, return_tensors="pt")
generated = model_nietzsche.generate(input, max_length=50, bos_token_id=1, pad_token_id=1, eos_token_ids=1)

resulting_string = tokenizer_nietzsche.decode(generated.tolist()[0])
print(resulting_string)

One should live like a man, and not like a woman."
"The _décadence_ of the _décadence_ is the _décadence_ of the _décadence_


In [58]:
sequence = "My dear friend"

input = tokenizer_nietzsche.encode(sequence, return_tensors="pt")
generated = model_nietzsche.generate(input, max_length=50, bos_token_id=1, pad_token_id=1, eos_token_ids=1)

resulting_string = tokenizer_nietzsche.decode(generated.tolist()[0])
print(resulting_string)

My dear friend, I am not a man of the future, but a man of the past."
"The _décadence_ of the _décadence_ is the _décadence_ of the


In [59]:
sequence = "If you gaze for long into an abyss"

input = tokenizer_nietzsche.encode(sequence, return_tensors="pt")
generated = model_nietzsche.generate(input, max_length=50, bos_token_id=1, pad_token_id=1, eos_token_ids=1)

resulting_string = tokenizer_nietzsche.decode(generated.tolist()[0])
print(resulting_string)

If you gaze for long into an abyss, you will find that it is a labyrinth, and that it is a labyrinth of labyrinths."
"The _décadence_ of the _décadence_ is the _


In [62]:
sequence = "Women are"

input = tokenizer_nietzsche.encode(sequence, return_tensors="pt")
generated = model_nietzsche.generate(input, max_length=50, bos_token_id=1, pad_token_id=1, eos_token_ids=1)

resulting_string = tokenizer_nietzsche.decode(generated.tolist()[0])
print(resulting_string)

Women are the most dangerous, the most dangerous, the most dangerous, the most dangerous, the most dangerous, the most dangerous, the most dangerous, the most dangerous, the most dangerous, the most dangerous, the most dangerous, the most dangerous,


In [67]:
sequence = "Men are"

input = tokenizer_nietzsche.encode(sequence, return_tensors="pt")
generated = model_nietzsche.generate(input, max_length=50, bos_token_id=1, pad_token_id=1, eos_token_ids=1)

resulting_string = tokenizer_nietzsche.decode(generated.tolist()[0])
print(resulting_string)

Men are the most dangerous creatures in the world, and the most dangerous creatures in the world are the most dangerous creatures in the world."
"The _great_ and the _greatest_ are the _greatest_ and the greatest_."
