In [0]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl (563kB)
[K     |████████████████████████████████| 573kB 3.1MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/99/50/93509f906a40bffd7d175f97fd75ea328ad9bd91f48f59c4bd084c94a25e/sacremoses-0.0.41.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 8.3MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/98/2c/8df20f3ac6c22ac224fff307ebc102818206c53fc454ecd37d8ac2060df5/sentencepiece-0.1.86-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 17.3MB/s 
Collecting tokenizers==0.5.2
[?25l  Downloading https://files.pythonhosted.org/packages/d1/3f/73c881ea4723e43c1e9acf317cf407fab3a278daab3a69c98dcac511c04f/tokenizers-0.5.2-cp36-cp36m-manylinux1_x86_64.whl (3.7MB)
[K     |█████

In [0]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [0]:
#Read in dataframes, classify one real dataset vs all fantasy datasets
df_real = pd.read_csv("current_history_NYT.csv")
df_dorothy = pd.read_csv("dorothy.csv")
df_arthur = pd.read_csv("arthur.csv")
df_wonder = pd.read_csv("bookofwonder.csv")
df_irish = pd.read_csv("irishfairy.csv")
df_iceandfire = pd.read_csv("iceandfire.csv")

In [0]:
#Since the lines in the realistic dataset may contain footnote numbers and formatting,
#code removes formatting, but not numbers since numbers may be important to history
#Referenced for formatting: https://stackoverflow.com/questions/13682044/remove-unwanted-parts-from-strings-in-a-column
df_real["Sentences"] = df_real["Sentences"].str.replace("*", "")

In [0]:
#For BERT features, limit datasets with more than 1200 lines to 1200.
#This is done to avoid exceeding the RAM provided by free Colab
df_real = df_real[:1200]
df_dorothy = df_dorothy[:1200]
df_arthur = df_arthur[:1200]
df_wonder = df_wonder[:1200]
df_irish = df_irish[:1200]
df_iceandfire = df_iceandfire[:1200]

In [0]:
from sklearn.utils import shuffle

#Create batch dataframes that store combined realistic and fantasy data
dorothy_batch = df_real.append(df_dorothy, ignore_index=True)
arthur_batch = df_real.append(df_arthur, ignore_index=True)
wonder_batch = df_real.append(df_wonder, ignore_index=True)
irish_batch = df_real.append(df_irish, ignore_index=True)
iceandfire_batch = df_real.append(df_iceandfire, ignore_index=True)

In [0]:
#Place all batch columns into variables

dorothy_sentences = dorothy_batch["Sentences"]
dorothy_labels = dorothy_batch["Label"]

arthur_sentences = arthur_batch["Sentences"]
arthur_labels = arthur_batch["Label"]

wonder_sentences = wonder_batch["Sentences"]
wonder_labels = wonder_batch["Label"]

irish_sentences = irish_batch["Sentences"]
irish_labels = irish_batch["Label"]

iceandfire_sentences = iceandfire_batch["Sentences"]
iceandfire_labels = iceandfire_batch["Label"]

In [0]:
#Code from this point downward is a modified version of base code for a BERT classifier from below link:
#https://colab.research.google.com/github/jalammar/jalammar.github.io/blob/master/notebooks/bert/A_Visual_Notebook_to_Using_BERT_for_the_First_Time.ipynb#scrollTo=izA3-6kffbdT

model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [0]:
#Create tokenized inputs for BERT
tokenized = iceandfire_sentences.apply((lambda x: tokenizer.encode(x, add_special_tokens=False)))

In [0]:
#Pad all sentences to greatest length because BERT needs all inputs to be the same length
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [0]:
#Create attention mask on padded that tells BERT to avoid calculating attention on padding
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(2400, 180)

In [0]:
#Place variables in tensors since the library is a pytorch base
input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

#torch.no_grad disables autograd on the last_hidden_states variable
with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [0]:
#Get only the [CLS] token feature
features = last_hidden_states[0][:,0,:].numpy()

In [0]:
#Assign labels variable based on current dataset
labels = iceandfire_labels

In [0]:
#Split using .25 test split
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

In [0]:
#Fit logistic regression model with 100 epochs
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
#Get test accuracy
lr_clf.score(test_features, test_labels)

0.9416666666666667