### Connect to Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
%cd "/content/drive/My Drive/StoryCompletion/src"

/content/drive/My Drive/StoryCompletion/src


### Read Model and Generate Text

In [0]:
import json
import os
import numpy as np
import tensorflow as tf

import model, sample, encoder

In [0]:
def get_output(
    model_name='117M',
    seed=None,
    nsamples=1,
    batch_size=1,
    length=None,
    temperature=1,
    top_k=0,
    input_text=""
):
    """
    Interactively run the model
    :model_name=117M : String, which model to use
    :seed=None : Integer seed for random number generators, fix seed to reproduce
     results
    :nsamples=1 : Number of samples to return total
    :batch_size=1 : Number of batches (only affects speed/memory).  Must divide nsamples.
    :length=None : Number of tokens in generated text, if None (default), is
     determined by model hyperparameters
    :temperature=1 : Float value controlling randomness in boltzmann
     distribution. Lower temperature results in less random completions. As the
     temperature approaches zero, the model will become deterministic and
     repetitive. Higher temperature results in more random completions.
    :top_k=0 : Integer value controlling diversity. 1 means only 1 word is
     considered for each step (token), resulting in deterministic completions,
     while 40 means 40 words are considered at each step. 0 (default) is a
     special setting meaning no restrictions. 40 generally is a good value.
    """
    if batch_size is None:
        batch_size = 1
    assert nsamples % batch_size == 0
    
    if len(input_text) == 0:
        raise ValueError("Please input a valid sequence!")

    enc = encoder.get_encoder(model_name)
    hparams = model.default_hparams()
    with open(os.path.join('models', model_name, 'hparams.json')) as f:
        hparams.override_from_dict(json.load(f))

    if length is None:
        length = hparams.n_ctx // 2
    elif length > hparams.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx)

    with tf.Session(graph=tf.Graph()) as sess:
        context = tf.placeholder(tf.int32, [batch_size, None])
        np.random.seed(seed)
        tf.set_random_seed(seed)
        output = sample.sample_sequence(
            hparams=hparams, length=length,
            context=context,
            batch_size=batch_size,
            temperature=temperature, top_k=top_k
        )

        saver = tf.train.Saver()
        ckpt = tf.train.latest_checkpoint(os.path.join('models', model_name))
        saver.restore(sess, ckpt)

        while True:
            context_tokens = enc.encode(input_text)
            generated = 0
            for _ in range(nsamples // batch_size):
                out = sess.run(output, feed_dict={
                    context: [context_tokens for _ in range(batch_size)]
                })[:, len(context_tokens):]
                for i in range(batch_size):
                    generated += 1
                    text = enc.decode(out[i])
#                     print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
#                     print(text)
                    return text
#             print("=" * 80)

In [0]:
get_output(model_name="GOT", input_text="Arya was walking towards the woods")

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use tf.random.categorical instead.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from models/GOT/model-1092


' near the gate then turned and turned and turned in the trees, pushing her foot away from the sword.\n\nI should have peered at him a third time. She has enough balls to make the stands but I never knew. He had a lord’s will—she had full approval from his mother, but Arya had only half so much his . . .\n\nWhen she saw his sword, how did she lift her gaze? She landed in a fistful of trees, startled to see the smell of hoarfrost dying at once and her fingers discolored with soft. This is real bread, Arys frowned at her. The bench was there to keep her milling, the youngest of his brothers catching her chuckling. Thankfully, they tend her cowling in this lumbering field.\n\nA dead branch yawned before her. Arya leaned out and pressed it in with her finger as if it were wings. Swiftly she rose, face first, carried it to the wood, then slid the blade down inside and walked back to the stream. Behind she heard a faint beeping on the bridge. Their feet were tangled in the trunk when they fi

In [0]:
text = get_output(model_name="GOT", input_text="Arya was walking towards the woods.", length=50)

INFO:tensorflow:Restoring parameters from models/GOT/model-1092


In [0]:
text

' When she looked, she saw Marsh near her, behind some wood. Benjen told her about them. They were:\n\nOld Nan told them his father was Nurse, his uncle Jasper. His son Wicar said he was Ser Nurse.'

### Load validation data for GOT

In [0]:
validation_sample = open("evaluation/got_validation.txt").read()

In [0]:
validation_sample = validation_sample.split("\n")

In [0]:
import pandas as pd
import random

In [0]:
got_df = pd.DataFrame(columns=["input", "continuation", "label"])

random_indices = []
df_idx = 0

while True:
    if got_df.shape[0] > 200:
        break
    index = random.randint(0, len(validation_sample) - 2)
    if index in random_indices:
        continue
    random_indices.append(index)
    row1 = {
        "input": validation_sample[index],
        "continuation": validation_sample[index + 1],
        "label": 0
    }
    got_df.loc[df_idx] = row1
    df_idx += 1
    row2 = {
        "input": validation_sample[index],
        "continuation": get_output(model_name="GOT", 
                                input_text = validation_sample[index],
                                length = 50),
        "label": 1
    }
    got_df.loc[df_idx] = row2
    df_idx += 1

In [0]:
got_df["continuation"] = got_df["continuation"].str.replace("\n", "")

In [0]:
got_df.to_csv("got_validation.csv", index=False)

### LR-GOT

In [0]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

In [41]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [0]:
tfidfconverter = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))  

In [0]:
def clean_doc(document, stemmer):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(document))

   # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)

    # Converting to Lowercase
    document = document.lower()

    # Lemmatization
    document = document.split()

    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)

    return document

In [0]:
stemmer = WordNetLemmatizer()
got_df['clean_output'] = got_df['continuation'].apply(lambda doc: clean_doc(doc, stemmer))

In [0]:
got_df.head()

Unnamed: 0,input,continuation,label,clean_output
0,From a sword through his belly if you have you...,"Stannis pressed his lips together. ""Serve me ...",0,stannis pressed his lip together serve me well...
1,From a sword through his belly if you have you...,Neither were eating well enough unless he joi...,1,neither were eating well enough unless he join...
2,""" — you will avenge my death, and seat my daug...","Ser Justin put one hand on his sword hilt. ""O...",0,ser justin put one hand on his sword hilt on m...
3,""" — you will avenge my death, and seat my daug...",“Your lady mother has squandered the last thre...,1,your lady mother ha squandered the last three ...
4,"""They would.""","""We might question them more sharply... """,0,we might question them more sharply


In [0]:
X = tfidfconverter.fit_transform(list(got_df["clean_output"])).toarray()

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, got_df['label'].astype(int))

In [0]:
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)




LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [0]:
predictions = lr_clf.predict(X_test)

In [0]:
print("Accuracy", accuracy_score(y_test, predictions))
print("F1", f1_score(y_test, predictions))

Accuracy 0.49019607843137253
F1 0.5517241379310346


### LR- ROC Stories

In [0]:
roc = open("evaluation/ShortStoriesTest.txt").read()

In [0]:
roc = roc.split("\n")

In [0]:
roc_df = pd.DataFrame(columns=["input", "continuation", "label"])

random_indices = []
df_idx = 0

while True:
    if roc_df.shape[0] > 200:
        break
    index = random.randint(0, len(roc) - 2)
    if index in random_indices:
        continue
    random_indices.append(index)
    input_text = roc[index].split('. ')[0] + "."
    output_text = '. '.join(roc[index].split('. ')[1:])
    row1 = {
        "input": input_text,
        "continuation": output_text,
        "label": 0
    }
    roc_df.loc[df_idx] = row1
    df_idx += 1
    row2 = {
        "input": input_text,
        "continuation": get_output(model_name="ROC", 
                                input_text = input_text,
                                length = 50).split('\n')[0],
        "label": 1
    }
    roc_df.loc[df_idx] = row2
    df_idx += 1

In [0]:
roc_df.to_csv("evaluation/roc_test.csv", index=False)

In [0]:
roc_df['clean_output'] = roc_df['continuation'].apply(lambda doc: clean_doc(doc, stemmer))

In [0]:
X = tfidfconverter.fit_transform(list(roc_df["clean_output"])).toarray()

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, roc_df['label'].astype(int))

In [0]:
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [0]:
predictions = lr_clf.predict(X_test)

In [0]:
print("Accuracy", accuracy_score(y_test, predictions))
print("F1", f1_score(y_test, predictions))

Accuracy 0.49019607843137253
F1 0.4999999999999999


### LR - Essays

In [0]:
essays = pd.read_csv("evaluation/EssaysTest.tsv", sep="\t", encoding="latin1")

In [0]:
essays = essays[(~essays.essay.str.contains("Dear")) & (~essays.essay.str.contains("@"))].copy()

In [0]:
essays["essay"] = essays["essay"].astype(str)
essays["sentences"] = essays["essay"].str.split("\. ")

In [0]:
essays["sentences"].iloc[0]

['My opinion of the effect computers have on people is that people arent safe',
 'A lot of teens and kids on on their computers and go to yahoo + or my space to talk to their friends',
 'but sometimes teen talk to people they don\x92t know that can hurt them and tell them something that makes them uncomfortible',
 'I like computers really I do and I know you like it too but sometimes you have to be careful when you surfing the web ',
 'Another effect computers have on people is that people arnt healthy',
 'Kids and teens even abults dont go outside to exercise or get fresh air because they are too busy at their house playing on their computer',
 'That why alot of kids and adults gain alot of weigh because they dont stop messing with their computer',
 'you would not want to get fat because your not takeing care of your body because you are playing on the computer',
 'My third reason computer have effect on people is that they arent learning',
 'Kids are just play mini games on their com

In [0]:
essay_sentences = []
for essay in essays["sentences"]:
    essay_sentences.extend(essay)

In [0]:
len(essay_sentences)

9668

In [0]:
essay_sentences_clean = [essay + "." for essay in essay_sentences if len(essay) > 0 and essay[-1] != "."]

In [0]:
essay_sentences_clean[:5]

['My opinion of the effect computers have on people is that people arent safe.',
 'A lot of teens and kids on on their computers and go to yahoo + or my space to talk to their friends.',
 'but sometimes teen talk to people they don\x92t know that can hurt them and tell them something that makes them uncomfortible.',
 'I like computers really I do and I know you like it too but sometimes you have to be careful when you surfing the web .',
 'Another effect computers have on people is that people arnt healthy.']

In [0]:
len(essay_sentences_clean)

8322

In [0]:
essay_df = pd.DataFrame(columns=["input", "continuation", "label"])

random_indices = []
df_idx = 0

while True:
    if essay_df.shape[0] > 200:
        break
    index = random.randint(0, len(essay_sentences_clean) - 2)
    if index in random_indices:
        continue
    random_indices.append(index)
    input_text = essay_sentences_clean[index]
    row1 = {
        "input": input_text,
        "continuation": essay_sentences_clean[index + 1],
        "label": 0
    }
    essay_df.loc[df_idx] = row1
    df_idx += 1
    row2 = {
        "input": input_text,
        "continuation": get_output(model_name="Essay", 
                                input_text = input_text,
                                length = 50),
        "label": 1
    }
    essay_df.loc[df_idx] = row2
    df_idx += 1

In [0]:
essay_df["continuation"] = essay_df['continuation'].str.replace("\n", "")

In [0]:
essay_df.to_csv("evaluation/essay_test.csv", index=False)

In [0]:
essay_df.head()

Unnamed: 0,input,continuation,label
0,The obsticles that the builders had to face we...,If the wind blew the blimp to much it could ca...,0
1,The obsticles that the builders had to face we...,Most floors in single story buildings began wi...,1
2,A lot of teens and kids on on their computers ...,but sometimes teen talk to people they dont k...,0
3,A lot of teens and kids on on their computers ...,At first you shouldn't build a desktop applica...,1
4,Let's plant it right now. This shows that eve...,This means that no matter how beat-down or dis...,0


In [0]:
stemmer = WordNetLemmatizer()

In [0]:
essay_df['clean_output'] = essay_df['continuation'].apply(lambda doc: clean_doc(doc, stemmer))

In [0]:
essay_df.head()

Unnamed: 0,input,continuation,label,clean_output
0,The obsticles that the builders had to face we...,If the wind blew the blimp to much it could ca...,0,if the wind blew the blimp to much it could ca...
1,The obsticles that the builders had to face we...,Most floors in single story buildings began wi...,1,most floor in single story building began with...
2,A lot of teens and kids on on their computers ...,but sometimes teen talk to people they dont k...,0,but sometimes teen talk to people they don t k...
3,A lot of teens and kids on on their computers ...,At first you shouldn't build a desktop applica...,1,at first you shouldn t build a desktop applica...
4,Let's plant it right now. This shows that eve...,This means that no matter how beat-down or dis...,0,this mean that no matter how beat down or disa...


In [0]:
X = tfidfconverter.fit_transform(list(essay_df["clean_output"])).toarray()

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, essay_df['label'].astype(int))

In [0]:
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [0]:
predictions = lr_clf.predict(X_test)

In [0]:
print("Accuracy", accuracy_score(y_test, predictions))
print("F1", f1_score(y_test, predictions))

Accuracy 0.6666666666666666
F1 0.6382978723404256


### Calculating Perplexity

##### Essay

In [0]:
from collections import defaultdict

In [0]:
essay_text = essays["essay"].str.cat(sep = " ")

In [0]:
essay_tokens = nltk.word_tokenize(essay_text)

In [0]:
def unigram(tokens):    
    model = defaultdict(lambda: 0.01)
    for f in tokens:
        try:
            model[f] += 1
        except KeyError:
            model [f] = 1
            continue
    N = float(sum(model.values()))
    for word in model:
        model[word] = model[word]/N
    return model

In [0]:
essay_unigram_model = unigram(essay_tokens)

In [0]:
def calc_perplexity(testset, model):
    perplexities = []
    for i, sentence in enumerate(testset):
        sentence = sentence.split()
        perplexity = 1
        N = 0
        for word in sentence:
            N += 1
            perplexity = perplexity * (1/model[word])
        try:
            perplexity = pow(perplexity, 1/float(N)) 
        except:
            continue
        perplexities.append(perplexity)
    return perplexities

In [0]:
testset_sentences = list(essay_df[essay_df["label"] == 1]["continuation"].str.split("\. "))

In [0]:
testset = []
for l in testset_sentences:
    testset.extend(l)

In [0]:
essay_perplexities = calc_perplexity(testset, essay_unigram_model)

In [0]:
import numpy as np

In [0]:
np.prod(np.reciprocal(essay_perplexities))

0.0

In [0]:
essay_perplexity

1.0013860976076292

In [0]:
pow(2, -1 * np.sum(np.log2(essay_perplexities)) / len(essay_unigram_model))

0.8791857019977695

In [0]:
pow(2, -1 * np.sum(np.log2(essay_perplexities)) / len(essay_unigram_model))

0.8791857019977695

In [0]:
np.mean(essay_perplexities)

606.3497527577097

##### ROC

In [0]:
corpus = " ".join(open("evaluation/ShortStoriesTest.txt").read().split("\n"))

In [0]:
roc_tokens = nltk.word_tokenize(corpus)

In [0]:
roc_unigram_model = unigram(roc_tokens)

In [0]:
roc_df = pd.read_csv("evaluation/roc_test.csv")

In [0]:
roc_sentences = list(roc_df[roc_df.label == 1]["continuation"].dropna().str.split("\. "))

In [0]:
testset = []

In [0]:
for sentence in roc_sentences:
#     print(sentence)
    testset.extend(sentence)

In [0]:
np.mean(calc_perplexity(testset, roc_unigram_model))

807.0305980382706

##### GOT

In [0]:
corpus = " ".join(open("evaluation/got_validation.txt").read().split("\n"))

In [0]:
got_tokens = nltk.word_tokenize(corpus)

In [0]:
got_unigram_model = unigram(got_tokens)

In [0]:
got_df = pd.read_csv("evaluation/got_validation.csv")

In [0]:
got_sentences = list(got_df[got_df.label == 1]["continuation"].dropna().str.split("\. "))

In [0]:
testset = []
for sentence in got_sentences:
    testset.extend(sentence)

In [0]:
np.mean(calc_perplexity(testset, got_unigram_model))

312.3814644394952

### Cosine Similarity

In [0]:
import pandas as pd
import numpy as np

In [31]:
from spacy.lang.en.stop_words import STOP_WORDS
import gensim, re
import numpy as np
from scipy.spatial.distance import cosine



In [0]:
w2v = gensim.models.KeyedVectors.load_word2vec_format("models/GoogleNews-vectors-negative300-SLIM.bin", binary=True)

In [0]:
def clean(text):
    text = text.lower()
    text = text.replace("\n", " ")
    text = re.sub(r"[\W\d]", " ", text)
    text = re.sub(r"\s+", " ", text)
    for word in STOP_WORDS:
        text = text.replace(" " + word + " ", " ")
    return text

In [0]:
def getAverageVector(text, w2v):
    words = clean(text).split()
    vectors = []
    for word in words:
        try:
            vectors.append(w2v[word])
        except:
            continue
    return np.mean(vectors, axis=0)

In [0]:
def getSimilarity(actual, generated, w2v=w2v):
    v1 = getAverageVector(actual, w2v)
    v2 = getAverageVector(generated, w2v)
    return 1 - cosine(v1, v2)

##### ROC

In [0]:
df_roc = pd.read_csv("evaluation/roc_test.csv")

In [0]:
df_roc.head()

Unnamed: 0,input,continuation,label
0,Our manager introduced us to a new employee.,"Since she was attractive, all of the guys were...",0
1,Our manager introduced us to a new employee.,The new employee was nice and young. I though...,1
2,Carl was tasked with finding his family's Chri...,He was fifteen and he felt very manly and impo...,0
3,Carl was tasked with finding his family's Chri...,"He searched all over town for his dad. There,...",1
4,Bob was always upset about his looks.,So he saved up money to get plastic surgery. H...,0


In [0]:
comparative_df = pd.concat([df_roc[df_roc.label == 0].set_index('input'), df_roc[df_roc.label == 1].set_index('input')], axis = 1).reset_index()

In [0]:
comparative_df.columns = ["input", "actual", "label0", "generated", "label1"]

In [0]:
comparative_df = comparative_df[["input", "actual", "generated"]].dropna().copy()

In [0]:
comparative_df["cosine_similarity"] = np.vectorize(getSimilarity)(comparative_df["actual"], comparative_df["generated"])

In [0]:
comparative_df.head()

Unnamed: 0,input,actual,generated,cosine_similarity
0,Our manager introduced us to a new employee.,"Since she was attractive, all of the guys were...",The new employee was nice and young. I though...,0.649152
1,Carl was tasked with finding his family's Chri...,He was fifteen and he felt very manly and impo...,"He searched all over town for his dad. There,...",0.724642
2,Bob was always upset about his looks.,So he saved up money to get plastic surgery. H...,"When she nurses the Metro, he would always co...",0.302044
3,The man was not honest or ethical.,He would get people to invest in his business ...,He was just an evil man. A fact he cared very...,0.488777
4,Randy was a guard in a prison.,"One day, he became distracted watching The Sim...",He watched as his fellow guards acted strange...,0.6227


In [0]:
comparative_df["cosine_similarity"].mean()

0.5393762010700849

##### Essays

In [0]:
df_essay = pd.read_csv("evaluation/essay_test.csv")

In [0]:
df_essay.head()

Unnamed: 0,input,continuation,label
0,The obsticles that the builders had to face we...,If the wind blew the blimp to much it could ca...,0
1,The obsticles that the builders had to face we...,Most floors in single story buildings began wi...,1
2,A lot of teens and kids on on their computers ...,but sometimes teen talk to people they dont k...,0
3,A lot of teens and kids on on their computers ...,At first you shouldn't build a desktop applica...,1
4,Let's plant it right now. This shows that eve...,This means that no matter how beat-down or dis...,0


In [0]:
comparative_df = pd.concat([df_essay[df_essay.label == 0].set_index('input'), df_essay[df_essay.label == 1].set_index('input')], axis = 1).reset_index()

In [0]:
comparative_df.columns = ["input", "actual", "label0", "generated", "label1"]

In [0]:
comparative_df = comparative_df[["input", "actual", "generated"]].dropna().copy()

In [0]:
comparative_df["cosine_similarity"] = np.vectorize(getSimilarity)(comparative_df["actual"], comparative_df["generated"])

In [0]:
comparative_df.head()

Unnamed: 0,input,actual,generated,cosine_similarity
0,The obsticles that the builders had to face we...,If the wind blew the blimp to much it could ca...,Most floors in single story buildings began wi...,0.545437
1,A lot of teens and kids on on their computers ...,but sometimes teen talk to people they dont k...,At first you shouldn't build a desktop applica...,0.616358
2,Let's plant it right now. This shows that eve...,This means that no matter how beat-down or dis...,There are several reasons this could be.First...,0.714871
3,"This show that she is resilight, because she d...",The failing of the last test did not ultimetel...,"In fairness, she certainly has her raison d'et...",0.502626
4,A Welchs Grape juice factory made him despair...,There are many features in the setting of Do ...,It took weeks of research and countless meetin...,0.343065


In [0]:
comparative_df["cosine_similarity"].mean()

0.47609058863455705

##### GOT

In [0]:
import glob, re, gensim

In [0]:
book_filenames = sorted(glob.glob("../got/*.txt"))

In [0]:
corpus = ""
for file in book_filenames:
    with open(file) as fp:
        corpus += fp.read() + " "

In [0]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [0]:
raw_sentences = tokenizer.tokenize(corpus)

In [0]:
def sentence_to_wordlist(raw):
    clean = re.sub("[^a-zA-Z]"," ", raw)
    words = clean.split()
    return words

In [0]:
sentences = []
for raw_sentence in raw_sentences:
    if len(raw_sentence) > 0:
        sentences.append(sentence_to_wordlist(raw_sentence))

In [0]:
got_w2v = gensim.models.Word2Vec(sentences)

In [0]:
df_got = pd.read_csv("evaluation/got_validation.csv")

In [0]:
df_got.head()

Unnamed: 0,input,continuation,label
0,From a sword through his belly if you have you...,"Stannis pressed his lips together. ""Serve me ...",0
1,From a sword through his belly if you have you...,Neither were eating well enough unless he joi...,1
2,""" — you will avenge my death, and seat my daug...","Ser Justin put one hand on his sword hilt. ""O...",0
3,""" — you will avenge my death, and seat my daug...",“Your lady mother has squandered the last thre...,1
4,"""They would.""","""We might question them more sharply... """,0


In [0]:
comparative_df = pd.concat([df_got[df_got.label == 0].set_index('input'), df_got[df_got.label == 1].set_index('input')], axis = 1).reset_index()

In [0]:
comparative_df.columns = ["input", "actual", "label0", "generated", "label1"]

In [0]:
comparative_df = comparative_df[["input", "actual", "generated"]].dropna().copy()

In [0]:
comparative_df["cosine_similarity"] = np.vectorize(getSimilarity)(comparative_df["actual"], comparative_df["generated"], got_w2v)

  
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [0]:
comparative_df.head()

Unnamed: 0,input,actual,generated,baseline,cosine_similarity
0,From a sword through his belly if you have you...,"Stannis pressed his lips together. ""Serve me ...",Neither were eating well enough unless he joi...,Tyrion walked into the tavern.,0.415075
1,""" — you will avenge my death, and seat my daug...","Ser Justin put one hand on his sword hilt. ""O...",“Your lady mother has squandered the last thre...,Tyrion walked into the tavern.,0.666389
2,"""They would.""","""We might question them more sharply... """,Ser Jorah Mormont grew bored. “Give me you thi...,Tyrion walked into the tavern.,0.62601
3,"""It was not a compliment."" Stannis gave Theon...","She stood. ""The Braavosi ransomed my seven of...",The sunlight stained their green tunic over th...,Tyrion walked into the tavern.,0.323279
4,The memory left Theon writhing in his chains. ...,"Then other bird said, ""Theon,"" clear as day, a...","�Whitewash, tree, tree, tree.”“A broth you won...",Tyrion walked into the tavern.,0.443098


In [0]:
comparative_df['cosine_similarity'].mean()

0.47870406252567216

### First Baseline

In [0]:
baseline = "The quick brown fox jumped over the lazy dog"

#### GOT

In [23]:
got_df.head()

Unnamed: 0,input,continuation,label,pred
0,From a sword through his belly if you have you...,"Stannis pressed his lips together. ""Serve me ...",0,1
1,From a sword through his belly if you have you...,Neither were eating well enough unless he joi...,1,1
2,""" — you will avenge my death, and seat my daug...","Ser Justin put one hand on his sword hilt. ""O...",0,1
3,""" — you will avenge my death, and seat my daug...",“Your lady mother has squandered the last thre...,1,1
4,"""They would.""","""We might question them more sharply... """,0,1


In [0]:
comparative_df = pd.concat([got_df[got_df.label == 0].set_index('input'), got_df[got_df.label == 1].set_index('input')], axis = 1).reset_index()

In [0]:
comparative_df.columns = ["input", "actual", "label0", "pred0", "generated", "label1", "pred1"]

In [0]:
comparative_df = comparative_df[["input", "actual", "generated"]].dropna().copy()

In [0]:
comparative_df["baseline"] = baseline

##### Cosine Similarity

In [51]:
comparative_df["baseline_cosine_similarity"] = np.vectorize(getSimilarity)(comparative_df["actual"], comparative_df["baseline"], got_w2v)

  
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [52]:
comparative_df["baseline_cosine_similarity"].mean()

0.27118041500392737

##### Perplexity

In [68]:
calc_perplexity([baseline], got_unigram_model)

[308.10781495358776]

##### Classification

In [0]:
import random

In [0]:
got_df["pred"] = [random.randint(0, 1) for i in got_df.index]

In [74]:
print("Accuracy: ", accuracy_score(got_df["label"], got_df["pred"]))

Accuracy:  0.5148514851485149


In [75]:
print("F1: ", f1_score(got_df["label"], got_df["pred"]))

F1:  0.5148514851485149


#### ROC

In [77]:
roc_df.head()

Unnamed: 0,input,continuation,label
0,Our manager introduced us to a new employee.,"Since she was attractive, all of the guys were...",0
1,Our manager introduced us to a new employee.,The new employee was nice and young. I though...,1
2,Carl was tasked with finding his family's Chri...,He was fifteen and he felt very manly and impo...,0
3,Carl was tasked with finding his family's Chri...,"He searched all over town for his dad. There,...",1
4,Bob was always upset about his looks.,So he saved up money to get plastic surgery. H...,0


In [0]:
comparative_df = pd.concat([roc_df[roc_df.label == 0].set_index('input'), roc_df[roc_df.label == 1].set_index('input')], axis = 1).reset_index()

In [0]:
comparative_df.columns = ["input", "actual", "label0", "generated", "label1"]

In [0]:
comparative_df = comparative_df[["input", "actual", "generated"]].dropna().copy()

In [0]:
comparative_df["baseline"] = baseline

##### Cosine Similarity

In [0]:
comparative_df["baseline_cosine_similarity"] = np.vectorize(getSimilarity)(comparative_df["actual"], comparative_df["baseline"])

In [85]:
comparative_df["baseline_cosine_similarity"].mean()

0.4847811259785477

##### Perplexity

In [86]:
calc_perplexity([baseline], roc_unigram_model)

[2835.1219901488835]

##### Classification

In [0]:
roc_df["pred"] = [random.randint(0, 1) for i in roc_df.index]

In [88]:
print("Accuracy: ", accuracy_score(roc_df["label"], roc_df["pred"]))

Accuracy:  0.4306930693069307


In [89]:
print("F1: ", f1_score(roc_df["label"], roc_df["pred"]))

F1:  0.42786069651741293


#### Essay

In [90]:
essay_df.head()

Unnamed: 0,input,continuation,label,pred
0,The obsticles that the builders had to face we...,If the wind blew the blimp to much it could ca...,0,1
1,The obsticles that the builders had to face we...,Most floors in single story buildings began wi...,1,1
2,A lot of teens and kids on on their computers ...,but sometimes teen talk to people they dont k...,0,1
3,A lot of teens and kids on on their computers ...,At first you shouldn't build a desktop applica...,1,1
4,Let's plant it right now. This shows that eve...,This means that no matter how beat-down or dis...,0,1


In [0]:
comparative_df = pd.concat([essay_df[essay_df.label == 0].set_index('input'), essay_df[essay_df.label == 1].set_index('input')], axis = 1).reset_index()

In [0]:
comparative_df.columns = ["input", "actual", "label0", "pred0", "generated", "label1", "pred1"]

In [0]:
comparative_df = comparative_df[["input", "actual", "generated"]].dropna().copy()

In [0]:
comparative_df["baseline"] = baseline

##### Cosine Similarity

In [0]:
comparative_df["baseline_cosine_similarity"] = np.vectorize(getSimilarity)(comparative_df["actual"], comparative_df["baseline"])

In [96]:
comparative_df["baseline_cosine_similarity"].mean()

0.3989821539656951

##### Perplexity

In [97]:
calc_perplexity([baseline], essay_unigram_model)

[2605.53710835149]

##### Classification

In [0]:
essay_df["pred"] = [random.randint(0, 1) for i in essay_df.index]

In [99]:
print("Accuracy: ", accuracy_score(essay_df["label"], essay_df["pred"]))

Accuracy:  0.4900990099009901


In [100]:
print("F1: ", f1_score(essay_df["label"], essay_df["pred"]))

F1:  0.5024154589371981


### Second Baseline

In [0]:
got_baseline = "Neither were eating well enough unless he joined in their merry custom."
roc_baseline = "The new employee was nice and young."
essay_baseline = "Most floors in single story buildings began with a series of blocks bolted to the floor."

#### Baseline Cosine Similarity

##### GOT

In [0]:
df_got.head()

Unnamed: 0,input,continuation,label
0,From a sword through his belly if you have you...,"Stannis pressed his lips together. ""Serve me ...",0
1,From a sword through his belly if you have you...,Neither were eating well enough unless he joi...,1
2,""" — you will avenge my death, and seat my daug...","Ser Justin put one hand on his sword hilt. ""O...",0
3,""" — you will avenge my death, and seat my daug...",“Your lady mother has squandered the last thre...,1
4,"""They would.""","""We might question them more sharply... """,0


In [0]:
comparative_df = pd.concat([df_got[df_got.label == 0].set_index('input'), df_got[df_got.label == 1].set_index('input')], axis = 1).reset_index()

In [0]:
comparative_df.columns = ["input", "actual", "label0", "generated", "label1"]

In [0]:
comparative_df = comparative_df[["input", "actual", "generated"]].dropna().copy()

In [0]:
comparative_df["baseline"] = got_baseline

In [0]:
comparative_df["baseline_cosine_similarity"] = np.vectorize(getSimilarity)(comparative_df["actual"], comparative_df["baseline"], got_w2v)

In [0]:
comparative_df['baseline_cosine_similarity'].mean()

0.3343758965381468

##### ROC

In [0]:
df_roc.head()

Unnamed: 0,input,continuation,label
0,Our manager introduced us to a new employee.,"Since she was attractive, all of the guys were...",0
1,Our manager introduced us to a new employee.,The new employee was nice and young. I though...,1
2,Carl was tasked with finding his family's Chri...,He was fifteen and he felt very manly and impo...,0
3,Carl was tasked with finding his family's Chri...,"He searched all over town for his dad. There,...",1
4,Bob was always upset about his looks.,So he saved up money to get plastic surgery. H...,0


In [0]:
comparative_df = pd.concat([df_roc[df_roc.label == 0].set_index('input'), df_roc[df_roc.label == 1].set_index('input')], axis = 1).reset_index()

In [0]:
comparative_df.columns = ["input", "actual", "label0", "generated", "label1"]

In [0]:
comparative_df = comparative_df[["input", "actual", "generated"]].dropna().copy()

In [0]:
comparative_df["baseline"] = roc_baseline

In [0]:
comparative_df["baseline_cosine_similarity"] = np.vectorize(getSimilarity)(comparative_df["actual"], comparative_df["baseline"])

In [0]:
comparative_df["baseline_cosine_similarity"].mean()

0.4653388394080863

##### Essays

In [0]:
df_essay.head()

Unnamed: 0,input,continuation,label
0,The obsticles that the builders had to face we...,If the wind blew the blimp to much it could ca...,0
1,The obsticles that the builders had to face we...,Most floors in single story buildings began wi...,1
2,A lot of teens and kids on on their computers ...,but sometimes teen talk to people they dont k...,0
3,A lot of teens and kids on on their computers ...,At first you shouldn't build a desktop applica...,1
4,Let's plant it right now. This shows that eve...,This means that no matter how beat-down or dis...,0


In [0]:
comparative_df = pd.concat([df_essay[df_essay.label == 0].set_index('input'), df_essay[df_essay.label == 1].set_index('input')], axis = 1).reset_index()

In [0]:
comparative_df.columns = ["input", "actual", "label0", "generated", "label1"]

In [0]:
comparative_df = comparative_df[["input", "actual", "generated"]].dropna().copy()

In [0]:
comparative_df["baseline"] = essay_baseline

In [0]:
comparative_df["baseline_cosine_similarity"] = np.vectorize(getSimilarity)(comparative_df["actual"], comparative_df["baseline"])

In [0]:
comparative_df["baseline_cosine_similarity"].mean()

0.3526170787834885

#### Baseline Perplexity

##### GOT

In [0]:
calc_perplexity([got_baseline], got_unigram_model)

[378.3495565381178]

##### Essay

In [0]:
calc_perplexity([essay_baseline], essay_unigram_model)

[1389.9412225225656]

##### ROC

In [0]:
calc_perplexity([roc_baseline], roc_unigram_model)

[376.9086160395709]

#### Baseline Classification

In [0]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score

##### GOT

In [0]:
got_df = pd.read_csv("evaluation/got_validation.csv")

In [0]:
# Predict all as machine generated
got_df["pred"] = 1

In [11]:
print("Accuracy: ", accuracy_score(got_df["label"], got_df["pred"]))

Accuracy:  0.5


In [12]:
print("F1: ", f1_score(got_df["label"], got_df["pred"]))

F1:  0.6666666666666666


##### ROC

In [0]:
roc_df = pd.read_csv("evaluation/roc_test.csv")

In [0]:
roc_df["pred"] = 1

In [15]:
print("Accuracy: ", accuracy_score(roc_df["label"], roc_df["pred"]))

Accuracy:  0.5


In [16]:
print("F1: ", f1_score(roc_df["label"], roc_df["pred"]))

F1:  0.6666666666666666


##### Essay

In [0]:
essay_df = pd.read_csv("evaluation/essay_test.csv")

In [0]:
essay_df["pred"] = 1

In [19]:
print("Accuracy: ", accuracy_score(essay_df["label"], essay_df["pred"]))

Accuracy:  0.5


In [20]:
print("F1: ", f1_score(essay_df["label"], essay_df["pred"]))

F1:  0.6666666666666666
