In [1]:
!pip install transformers
!pip install torch



In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
import torch
import transformers as ppb
import warnings
import collections
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
warnings.filterwarnings('ignore')

In [3]:
#Read in dataframes, classify one real dataset vs all fantasy datasets
df_bird = pd.read_csv("bird_history.csv")
df_NYT = pd.read_csv("current_history_NYT.csv")
df_dorothy = pd.read_csv("dorothy.csv")
df_arthur = pd.read_csv("arthur.csv")
df_wonder = pd.read_csv("bookofwonder.csv")
df_irish = pd.read_csv("irishfairy.csv")
df_iceandfire = pd.read_csv("iceandfire.csv")

In [4]:
#Since the lines in the realistic dataset may contain footnote numbers and formatting,
#code removes formatting, but not numbers since numbers may be important to history
#Referenced for formatting: https://stackoverflow.com/questions/13682044/remove-unwanted-parts-from-strings-in-a-column
df_NYT["Sentences"] = df_NYT["Sentences"].str.replace("*", "")

In [5]:
#Modifiable lists of which realistic and fantasy dataframes to consider when randomly choosing next sentences
real_list = [df_bird, df_NYT]
fantasy_list = [df_dorothy, df_arthur, df_wonder, df_irish, df_iceandfire]

#List of datasets to pass to getTrain and use in NSP fine tuning
#Authorship labels assigned in order corresponding to this full list of datasets
df_list = [df_bird, df_NYT, df_dorothy, df_arthur, df_wonder, df_irish, df_iceandfire]


In [6]:
#Called in getTrain. 
#Contains code that fills in half correct next sentences and half random sentences from opposite genre
def fillNSP(fill_list, ref_df, opposite_df):
  half = int(len(fill_list) / 2)
  sequence_list = []
  #Code to get indices from https://www.geeksforgeeks.org/how-to-get-rows-index-names-in-pandas-dataframe/
  index_list = fill_list.index.values.tolist()
  #Fill in accurate next lines
  for j in range(half):
    current_sentence = fill_list.iloc[j].strip()
    
    #Check index first to preempt edge case where trying to access next sentence at end of df
    index = index_list[j]
    if(index + 1 < len(ref_df)):
      next_sentence = ref_df["Sentences"][index + 1].strip()
    else:
      #If at end of df, just step back 1 and use current sentence as next sentence
      next_sentence = fill_list.iloc[j].strip()
      current_sentence = ref_df["Sentences"][index - 1].strip()
    #Add formatting for first sentence
    sequence = "[CLS] " + current_sentence + " [SEP] " + next_sentence + " [SEP]"
    sequence_list.append(sequence)
  
  #Fill in random next_lines from the opposite genre
  for j in range(half, len(fill_list)):
    #Get index of df first since np.random.choice can't choose a random dataframe directly
    random_df_index = np.random.choice(range(len(opposite_df)), 1)
    random_df_index = random_df_index[0]
    random_df = opposite_df[random_df_index]
    random_index = np.random.choice(range(len(random_df)), 1)
    random_index = random_index[0]
    next_sentence = random_df["Sentences"][random_index].strip()
    
    current_sentence = fill_list.iloc[j].strip()
    #Add formatting for first sentence
    sequence = "[CLS] " + current_sentence + " [SEP] " + next_sentence + " [SEP]"
    sequence_list.append(sequence)
  return sequence_list

def getFeatures(df_list, real_list, fantasy_list, max_size):
  #df_list is a full list of dataframes to get examples from
  #real_list is a list of realistic dataframes to use
  #fantasy_list is a list of fantasy dataframes to use
  #max_size is the maximum number of examples to grab from any given dataset
  #Returns a list of lists of next sentence prediction formatted examples, to be split into train/test hidden states outputs after being passed through the model.
  #Return will be ordered in same order as the input. Authorship labels not added in this function


  #Shuffling dataframe references https://stackoverflow.com/questions/29576430/shuffle-dataframe-rows

  #Split off the testing examples after shuffling
  shuffled = []
  for i in range(len(df_list)):
    shuffled.append(shuffle(df_list[i]))

  sample_list = []

  for df in shuffled:
    #Get size of split using max train and test_size
    #If dataset is big enough, use max_size examples, else just split whole dataset
    if(max_size < len(df)):
      #Just grab max_size first examples since the dataframes are shuffled
      df = df[:max_size]

    #Append tuples to fill in subsequent sentences
    sample_list.append((df["Sentences"], df["Label"]))
  
  #Define lists of lists to fill and return
  nsp_inputs = []

  #Fill train sequences
  for i in range(len(sample_list)):
    ref_df = df_list[i]
    X = sample_list[i][0]
    y = sample_list[i][1]

    #Since all dataframes have same label of realistic vs fantasy,
    #use first index label to get whether currently working on a fantasy dataframe
    fantasy = y.iloc[0]
    if(fantasy == 1):
      opposite_df = real_list
    else:
      opposite_df = fantasy_list
    
    sequence_list = fillNSP(X, ref_df, opposite_df)
    nsp_inputs.append(sequence_list)
  return nsp_inputs

def assignLabels(nsp_inputs, used_labels, mixed_label=7):
  #nsp_inputs is a list of lists of inputs divided by dataframe returned by getFeatures()
  #used_labels is the corresponding authorship labels used in getFeatures, since getFeatures can use a subset of the data
  #mixed_label is an index corresponding to an extra class beyond the datasets. It represents data that comes from two different authors
  #The mixed label is used since we are using NSP examples as inputs into the authorship classifier, and it can be changed if the test requires it.
  #Returns three values:
  # 1. A list of all of the examples from all of the dataframes concatenated together
  # 2. A list of nsp_labels where 0 indicates that the second sentence follows and 1 indicates the second sentence is random
  # 3. A list of authorship labels corresponding to each example

  nsp_examples = []
  nsp_labels = []
  author_labels = []

  for i in range(len(nsp_inputs)):
    examples = nsp_inputs[i]
    author = used_labels[i]
    half = float(len(examples) / 2)
    for j in range(len(examples)):
      nsp_examples.append(examples[j])
      if(j < half):
        nsp_labels.append(0)
        author_labels.append(author)
      else:
        nsp_labels.append(1)
        author_labels.append(mixed_label)
  return nsp_examples, nsp_labels, author_labels


In [7]:
#Use functions defined in previous block to get features, then assign labels and format features

#Returns a list of lists of examples drawn from the dataframes listed in the first argument
nsp_inputs = getFeatures(df_list, real_list, fantasy_list, 2000)

#Manually create a list of the authorship labels corresponding to the list passed in above
#For example, if df_arthur wasn't used in the first argument, then 3 should be excluded from this list
used_labels = [0, 1, 2, 3, 4, 5, 6]

#Assign authorship and NSP labels here
nsp_examples, nsp_labels, author_labels = assignLabels(nsp_inputs, used_labels)


In [8]:
#Perform Tokenization
#Code modified from example given in https://huggingface.co/transformers/model_doc/bert.html
from transformers import BertTokenizer, BertForNextSentencePrediction,BertConfig

config = BertConfig.from_pretrained('bert-base-uncased',output_hidden_states=True, output_attentions=True)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased',config=config)

#Encode all examples in place
for i in range(len(nsp_examples)):
  nsp_examples[i] = tokenizer.encode(nsp_examples[i])

#After encoding, need to do padding and attention mask before creating the tensor since example len needs to match
#Code to do padding and create attention mask came from the below link:
#https://colab.research.google.com/github/jalammar/jalammar.github.io/blob/master/notebooks/bert/A_Visual_Notebook_to_Using_BERT_for_the_First_Time.ipynb#scrollTo=izA3-6kffbdT
max_val = 0
for example in nsp_examples:
  if(len(example) > max_val):
    max_val = len(example)


padded = np.array([i + [0]*(max_val-len(i)) for i in nsp_examples])
attention_mask = np.where(padded != 0, 1, 0)

#Create tensors to pass into model
input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)
nsp_labels = torch.tensor(nsp_labels)

In [9]:
print(max_val)

283


In [10]:
#Code to do remove grad variable restriction from below link:
#https://colab.research.google.com/github/jalammar/jalammar.github.io/blob/master/notebooks/bert/A_Visual_Notebook_to_Using_BERT_for_the_First_Time.ipynb#scrollTo=izA3-6kffbdT
with torch.no_grad():
  outputs = model(input_ids, attention_mask=attention_mask, next_sentence_label=nsp_labels)

#Get BERT loss on the NSP task
loss = outputs[0]
hidden_states = outputs[2]
embedding_output = hidden_states[0]
print(len(embedding_output))
features= embedding_output.numpy()
print(np.shape(features))

#Referenced to fix an error with model inputs:
#https://stackoverflow.com/questions/58682026/failed-to-find-data-adapter-that-can-handle-input-class-numpy-ndarray-cl
labels=np.asarray(author_labels)

10977
(10977, 283, 768)


In [None]:
print(loss)

In [12]:
#Referenced during creation of the model: https://keras.io/layers/recurrent/

#Do 0.25 test split
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

data_dim = 768
timesteps = max_val
a = tf.keras.Sequential()
a.add(layers.LSTM(32, return_sequences=True,
               input_shape=(timesteps, data_dim))) 
a.add(layers.Flatten())
a.add(layers.Dense(8, activation='softmax'))
a.compile(loss='sparse_categorical_crossentropy',
            optimizer='rmsprop',
              metrics=['accuracy'])

a.fit(train_features,train_labels,epochs=10)
a.evaluate(test_features, test_labels)

Train on 8232 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[1.0555759338299, 0.81857926]

In [14]:
print(len(test_features))

2745
