### Connect to Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
%cd "/content/drive/My Drive/StoryCompletion/src"

/content/drive/My Drive/StoryCompletion/src


### Read Model and Generate Text

In [0]:
import json
import os
import numpy as np
import tensorflow as tf

import model, sample, encoder

In [0]:
def get_output(
    model_name='345M',
    seed=None,
    nsamples=1,
    batch_size=1,
    length=None,
    temperature=1,
    top_k=0,
    top_p=1,
    models_dir='models',
    raw_text=""
):
    """
    Interactively run the model
    :model_name=124M : String, which model to use
    :seed=None : Integer seed for random number generators, fix seed to reproduce
     results
    :nsamples=1 : Number of samples to return total
    :batch_size=1 : Number of batches (only affects speed/memory).  Must divide nsamples.
    :length=None : Number of tokens in generated text, if None (default), is
     determined by model hyperparameters
    :temperature=1 : Float value controlling randomness in boltzmann
     distribution. Lower temperature results in less random completions. As the
     temperature approaches zero, the model will become deterministic and
     repetitive. Higher temperature results in more random completions.
    :top_k=0 : Integer value controlling diversity. 1 means only 1 word is
     considered for each step (token), resulting in deterministic completions,
     while 40 means 40 words are considered at each step. 0 (default) is a
     special setting meaning no restrictions. 40 generally is a good value.
     :models_dir : path to parent folder containing model subfolders
     (i.e. contains the <model_name> folder)
    """
    models_dir = os.path.expanduser(os.path.expandvars(models_dir))
    if batch_size is None:
        batch_size = 1
    assert nsamples % batch_size == 0

    enc = encoder.get_encoder(model_name, models_dir)
    hparams = model.default_hparams()
    with open(os.path.join(models_dir, model_name, 'hparams.json')) as f:
        hparams.override_from_dict(json.load(f))

    if length is None:
        length = hparams.n_ctx // 2
    elif length > hparams.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx)

    with tf.Session(graph=tf.Graph()) as sess:
        context = tf.placeholder(tf.int32, [batch_size, None])
        np.random.seed(seed)
        tf.set_random_seed(seed)
        output = sample.sample_sequence(
            hparams=hparams, length=length,
            context=context,
            batch_size=batch_size,
            temperature=temperature, top_k=top_k, top_p=top_p
        )

        saver = tf.train.Saver()
        ckpt = tf.train.latest_checkpoint(os.path.join(models_dir, model_name))
        saver.restore(sess, ckpt)

        
        context_tokens = enc.encode(raw_text)
        generated = 0
        for _ in range(nsamples // batch_size):
            out = sess.run(output, feed_dict={
                context: [context_tokens for _ in range(batch_size)]
            })[:, len(context_tokens):]
            for i in range(batch_size):
                generated += 1
                text = enc.decode(out[i])
        return text

In [None]:
get_output(model_name="run1", raw_text="Ron was talking on the phone")

### LR- ROC Stories

In [0]:
roc = open("evaluation/ShortStoriesTest.txt").read()

In [0]:
roc = roc.split("\n")

In [0]:
roc_df = pd.DataFrame(columns=["input", "continuation", "label"])

random_indices = []
df_idx = 0

while True:
    if roc_df.shape[0] > 200:
        break
    index = random.randint(0, len(roc) - 2)
    if index in random_indices:
        continue
    random_indices.append(index)
    input_text = roc[index].split('. ')[0] + "."
    output_text = '. '.join(roc[index].split('. ')[1:])
    row1 = {
        "input": input_text,
        "continuation": output_text,
        "label": 0
    }
    roc_df.loc[df_idx] = row1
    df_idx += 1
    row2 = {
        "input": input_text,
        "continuation": get_output(model_name="ROC", 
                                input_text = input_text,
                                length = 50).split('\n')[0],
        "label": 1
    }
    roc_df.loc[df_idx] = row2
    df_idx += 1

In [0]:
roc_df.to_csv("evaluation/roc_test.csv", index=False)

In [0]:
roc_df['clean_output'] = roc_df['continuation'].apply(lambda doc: clean_doc(doc, stemmer))

In [0]:
X = tfidfconverter.fit_transform(list(roc_df["clean_output"])).toarray()

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, roc_df['label'].astype(int))

In [0]:
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [0]:
predictions = lr_clf.predict(X_test)

In [0]:
print("Accuracy", accuracy_score(y_test, predictions))
print("F1", f1_score(y_test, predictions))

Accuracy 0.49019607843137253
F1 0.4999999999999999


### Calculating Perplexity

In [0]:
corpus = " ".join(open("evaluation/ShortStoriesTest.txt").read().split("\n"))

In [0]:
roc_tokens = nltk.word_tokenize(corpus)

In [0]:
roc_unigram_model = unigram(roc_tokens)

In [0]:
roc_df = pd.read_csv("evaluation/roc_test.csv")

In [0]:
roc_sentences = list(roc_df[roc_df.label == 1]["continuation"].dropna().str.split("\. "))

In [0]:
testset = []

In [0]:
for sentence in roc_sentences:
#     print(sentence)
    testset.extend(sentence)

In [0]:
np.mean(calc_perplexity(testset, roc_unigram_model))

807.0305980382706

### Cosine Similarity

In [0]:
import pandas as pd
import numpy as np

In [31]:
from spacy.lang.en.stop_words import STOP_WORDS
import gensim, re
import numpy as np
from scipy.spatial.distance import cosine



In [0]:
w2v = gensim.models.KeyedVectors.load_word2vec_format("models/GoogleNews-vectors-negative300-SLIM.bin", binary=True)

In [0]:
def clean(text):
    text = text.lower()
    text = text.replace("\n", " ")
    text = re.sub(r"[\W\d]", " ", text)
    text = re.sub(r"\s+", " ", text)
    for word in STOP_WORDS:
        text = text.replace(" " + word + " ", " ")
    return text

In [0]:
def getAverageVector(text, w2v):
    words = clean(text).split()
    vectors = []
    for word in words:
        try:
            vectors.append(w2v[word])
        except:
            continue
    return np.mean(vectors, axis=0)

In [0]:
def getSimilarity(actual, generated, w2v=w2v):
    v1 = getAverageVector(actual, w2v)
    v2 = getAverageVector(generated, w2v)
    return 1 - cosine(v1, v2)

In [0]:
df_roc = pd.read_csv("evaluation/roc_test.csv")

In [0]:
df_roc.head()

Unnamed: 0,input,continuation,label
0,Our manager introduced us to a new employee.,"Since she was attractive, all of the guys were...",0
1,Our manager introduced us to a new employee.,The new employee was nice and young. I though...,1
2,Carl was tasked with finding his family's Chri...,He was fifteen and he felt very manly and impo...,0
3,Carl was tasked with finding his family's Chri...,"He searched all over town for his dad. There,...",1
4,Bob was always upset about his looks.,So he saved up money to get plastic surgery. H...,0


In [0]:
comparative_df = pd.concat([df_roc[df_roc.label == 0].set_index('input'), df_roc[df_roc.label == 1].set_index('input')], axis = 1).reset_index()

In [0]:
comparative_df.columns = ["input", "actual", "label0", "generated", "label1"]

In [0]:
comparative_df = comparative_df[["input", "actual", "generated"]].dropna().copy()

In [0]:
comparative_df["cosine_similarity"] = np.vectorize(getSimilarity)(comparative_df["actual"], comparative_df["generated"])

In [0]:
comparative_df.head()

Unnamed: 0,input,actual,generated,cosine_similarity
0,Our manager introduced us to a new employee.,"Since she was attractive, all of the guys were...",The new employee was nice and young. I though...,0.649152
1,Carl was tasked with finding his family's Chri...,He was fifteen and he felt very manly and impo...,"He searched all over town for his dad. There,...",0.724642
2,Bob was always upset about his looks.,So he saved up money to get plastic surgery. H...,"When she nurses the Metro, he would always co...",0.302044
3,The man was not honest or ethical.,He would get people to invest in his business ...,He was just an evil man. A fact he cared very...,0.488777
4,Randy was a guard in a prison.,"One day, he became distracted watching The Sim...",He watched as his fellow guards acted strange...,0.6227


In [0]:
comparative_df["cosine_similarity"].mean()

0.5393762010700849

### First Baseline

In [0]:
baseline = "The quick brown fox jumped over the lazy dog"

In [77]:
roc_df.head()

Unnamed: 0,input,continuation,label
0,Our manager introduced us to a new employee.,"Since she was attractive, all of the guys were...",0
1,Our manager introduced us to a new employee.,The new employee was nice and young. I though...,1
2,Carl was tasked with finding his family's Chri...,He was fifteen and he felt very manly and impo...,0
3,Carl was tasked with finding his family's Chri...,"He searched all over town for his dad. There,...",1
4,Bob was always upset about his looks.,So he saved up money to get plastic surgery. H...,0


In [0]:
comparative_df = pd.concat([roc_df[roc_df.label == 0].set_index('input'), roc_df[roc_df.label == 1].set_index('input')], axis = 1).reset_index()

In [0]:
comparative_df.columns = ["input", "actual", "label0", "generated", "label1"]

In [0]:
comparative_df = comparative_df[["input", "actual", "generated"]].dropna().copy()

In [0]:
comparative_df["baseline"] = baseline

##### Cosine Similarity

In [0]:
comparative_df["baseline_cosine_similarity"] = np.vectorize(getSimilarity)(comparative_df["actual"], comparative_df["baseline"])

In [85]:
comparative_df["baseline_cosine_similarity"].mean()

0.4847811259785477

##### Perplexity

In [86]:
calc_perplexity([baseline], roc_unigram_model)

[2835.1219901488835]

##### Classification

In [0]:
roc_df["pred"] = [random.randint(0, 1) for i in roc_df.index]

In [88]:
print("Accuracy: ", accuracy_score(roc_df["label"], roc_df["pred"]))

Accuracy:  0.4306930693069307


In [89]:
print("F1: ", f1_score(roc_df["label"], roc_df["pred"]))

F1:  0.42786069651741293


### Second Baseline

In [0]:
roc_baseline = "The new employee was nice and young."

#### Baseline Cosine Similarity

In [0]:
df_roc.head()

Unnamed: 0,input,continuation,label
0,Our manager introduced us to a new employee.,"Since she was attractive, all of the guys were...",0
1,Our manager introduced us to a new employee.,The new employee was nice and young. I though...,1
2,Carl was tasked with finding his family's Chri...,He was fifteen and he felt very manly and impo...,0
3,Carl was tasked with finding his family's Chri...,"He searched all over town for his dad. There,...",1
4,Bob was always upset about his looks.,So he saved up money to get plastic surgery. H...,0


In [0]:
comparative_df = pd.concat([df_roc[df_roc.label == 0].set_index('input'), df_roc[df_roc.label == 1].set_index('input')], axis = 1).reset_index()

In [0]:
comparative_df.columns = ["input", "actual", "label0", "generated", "label1"]

In [0]:
comparative_df = comparative_df[["input", "actual", "generated"]].dropna().copy()

In [0]:
comparative_df["baseline"] = roc_baseline

In [0]:
comparative_df["baseline_cosine_similarity"] = np.vectorize(getSimilarity)(comparative_df["actual"], comparative_df["baseline"])

In [0]:
comparative_df["baseline_cosine_similarity"].mean()

0.4653388394080863

#### Baseline Perplexity

In [0]:
calc_perplexity([roc_baseline], roc_unigram_model)

[376.9086160395709]

#### Baseline Classification

In [0]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score

In [0]:
roc_df = pd.read_csv("evaluation/roc_test.csv")

In [0]:
roc_df["pred"] = 1

In [15]:
print("Accuracy: ", accuracy_score(roc_df["label"], roc_df["pred"]))

Accuracy:  0.5


In [16]:
print("F1: ", f1_score(roc_df["label"], roc_df["pred"]))

F1:  0.6666666666666666
