In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt

import re

In [2]:
poetry_df = pd.read_csv("../data/poetry/kaggle_poem_dataset.csv").drop(['Unnamed: 0'],axis=1)
poetry_df

Unnamed: 0,Author,Title,Poetry Foundation ID,Content
0,Wendy Videlock,!,55489,"Dear Writers, I’m compiling the first in what ..."
1,Hailey Leithauser,0,41729,"Philosophic\nin its complex, ovoid emptiness,\..."
2,Jody Gladding,1-800-FEAR,57135,We'd like to talk with you about fear t...
3,Joseph Brodsky,1 January 1965,56736,The Wise Men will unlearn your name.\nAbove yo...
4,Ted Berrigan,3 Pages,51624,For Jack Collom\n10 Things I do Every Day\n\np...
...,...,...,...,...
15647,Hannah Gamble,Your Invitation to a Modest Breakfast,56059,"It’s too cold to smoke outside, but if you com..."
15648,Eleni Sikelianos,Your Kingdom\n \n \n \n Launch Audio in a N...,145220,if you like let the body feel\nall its own evo...
15649,Susan Elizabeth Howe,“Your Luck Is About To Change”,41696,(A fortune cookie)\nOminous inscrutable Chines...
15650,Andrew Shields,Your Mileage May Vary,90177,1\nOur last night in the house was not our las...


In [3]:
from nltk.tokenize import WordPunctTokenizer
import nltk

In [4]:
def tokenize(poem):
    return WordPunctTokenizer().tokenize(re.sub(r"\n", r" NN ", poem))

poetry_df["Tokenized"] = poetry_df["Content"].apply(lambda row: tokenize(row))

In [5]:
def count_token(tokenized, token_to_count):
    return sum([token==token_to_count for token in tokenized])

def num_lines(tokenized):
    return (count_token(tokenized, "NN") + 1)

def num_words(tokenized):
    filtered = [token for token in tokenized if (token.isalnum() and token!="NN")]
    return len(filtered)

def num_punctuation(tokenized):
    filtered = [token for token in tokenized if (not token.isalnum() and token!="NN")]
    return len(filtered)

# (num_words(nltk_tokens), len(nltk_tokens), num_punctuation(nltk_tokens))

In [6]:
poetry_df["Num_lines"] = poetry_df["Tokenized"].apply(lambda row: num_lines(row), convert_dtype=False)
poetry_df["Num_words"] = poetry_df["Tokenized"].apply(lambda row: num_words(row), convert_dtype=False)
poetry_df["Num_punctuation"] = poetry_df["Tokenized"].apply(lambda row: num_punctuation(row), convert_dtype=False)

In [7]:
samples = poetry_df[poetry_df["Num_lines"] == 14]["Tokenized"]

In [8]:
def tokenize(content):
    return ' '.join(content.split('\n'))

def content2output_vec(content, num_lines):
    # Output is vector 
    # [5, 4, 1, 10, ... , 16], len = num_lines

    return list(map(len, [i.split() for i in content.split("\n")]))

labels = np.stack(poetry_df[poetry_df["Num_lines"]==14].apply(lambda s: content2output_vec(s["Content"], s["Num_lines"]), axis=1).values)/1.0
samples = [tokenize(i) for i in poetry_df[poetry_df["Num_lines"] == 14]["Content"].values]


In [9]:
samples[100]

'Extreme exertion isolates a person from help, discovered Atlas. Once a certain shoulder-to-burden ratio collapses, there is so little others can do: they can’t lend a hand with Brazil and not stand on Peru.'

In [10]:
import tensorflow as tf

In [11]:
from tensorflow.keras.layers import TextVectorization

In [12]:
train_samples = samples[:800]
train_labels = labels[:800]

test_samples = samples[800:]
test_labels = labels[800:]

next time:
- look at other NLP/deep learning preprocessing pipelines/models
- santiy check with existing poetry generator -- if we feed our corpus of poems into an existing model, does it produce good poem-y poems? is there issues with newlines/punctuation?
- check how glove pretrained model deals with punctuation --> how to encode
- look into how many words in our vocabulary are unknown by the glove pretrained model
- create embedding matrix using pretrained glove model (dealing with unknown/infrequent words?)
- still use word indexing (like we did with previous model); embedding matrix maps the word index to glove word vector
- use keras embedding layer to turn vector of word indices into a matrix and create new model using convolution etc

In [13]:
train_dataset = tf.data.Dataset.from_tensor_slices((np.array(train_samples), np.array(train_labels)))

2022-03-10 22:02:22.235602: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [14]:
import re

# find max number of words in a poem
tokenized = [re.findall(r"[\w']+|[.,!?;-]", s) for s in samples]
input_size = max(list(map(len, tokenized)))
input_size

331

In [15]:
import tensorflow_text as tf_text   

def pad_punc_lower(strs):
    strs = tf.strings.lower(strs)
    strs = tf.strings.regex_replace(strs, '([.,!?();—:\'’‘”/"&“])', ' \1 ')
    strs = tf.strings.regex_replace(strs, '\s{2,}', ' ')
    return strs

vectorizer = TextVectorization(
    output_sequence_length=input_size,
    standardize=pad_punc_lower
)
text_ds = tf.data.Dataset.from_tensor_slices(samples).batch(128)
vectorizer.adapt(text_ds)

In [16]:
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

In [17]:
glove_fname = "../glove.6B.100d.txt"

embeddings_index = {}
with open(glove_fname, 'r') as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("found %d word vectors" % len(embeddings_index))

found 400000 word vectors


In [18]:
num_tokens = len(voc) + 2
embedding_dim = 100
hits = 0
misses = 0

# prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
#         print(word)
        misses += 1
print("converted %d words (%d misses)" % (hits, misses))

converted 10952 words (1570 misses)


In [19]:
from tensorflow.keras.layers import Embedding
from keras.initializers import Constant

embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=Constant(embedding_matrix),
    trainable=False
)

In [20]:
from tensorflow.keras import layers
import keras


model = tf.keras.Sequential()
model.add(keras.Input(shape=(331), dtype="int64"))
model.add(embedding_layer)
model.add(layers.Conv1D(128, 5, activation="relu"))
model.add(layers.MaxPooling1D(5))
model.add(layers.Conv1D(128, 5, activation="relu"))
model.add(layers.MaxPooling1D(5))
model.add(layers.Conv1D(128, 5, activation="relu"))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(128, activation="relu"))
model.add(layers.Dense(14, activation="relu"))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 331, 100)          1252400   
                                                                 
 conv1d (Conv1D)             (None, 327, 128)          64128     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 65, 128)          0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 61, 128)           82048     
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 12, 128)          0         
 1D)                                                             
                                                                 
 conv1d_2 (Conv1D)           (None, 8, 128)            8

In [21]:
def tokens2input_vec(tokens):
    return vectorizer(np.array([[s] for s in tokens])).numpy()

X_train = tokens2input_vec(train_samples)
X_val = vectorizer(np.array([[s] for s in test_samples])).numpy()

Y_train = np.array(train_labels)
Y_val = np.array(test_labels)

In [38]:
def poem_loss(y_true, y_pred):  
#     tf.print("true", type(y_true))
#     tf.print("pred", type(y_pred))
    mse = tf.reduce_mean(tf.math.square(y_pred - y_true), axis=-1)
    wsl = (tf.math.reduce_sum(y_true, 1) - tf.math.reduce_sum(y_pred, 1))**2
    asdf = tf.math.reduce_sum(y_pred, 1)
    
    return mse + wsl

model.compile(
    loss="mse", optimizer="adam", metrics=["acc"]
)
model.fit(X_train, Y_train, batch_size=20, epochs=100, validation_data=(X_val, Y_val))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100


Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x7f94269ca940>

In [39]:
model.evaluate(X_val, Y_val)



[3.9057087898254395, 0.05050504952669144]

In [25]:
test_poem = poetry_df[poetry_df["Num_lines"] == 14].iloc[10]

In [26]:
tokens = tokenize(test_poem["Content"])
input_vec = vectorizer(tokens)
output_vec = content2output_vec(test_poem["Content"], 14)
input_vec in X_train

True

In [27]:
model.predict(np.array([input_vec]))

array([[ 7.8897214,  8.807972 ,  9.261697 ,  7.320697 ,  6.6976423,
         8.729455 ,  7.1269984,  4.8953433,  5.5677967,  7.316001 ,
         6.3076706,  7.7160506,  0.       , 10.626973 ]], dtype=float32)

In [28]:
output_vec

[8, 9, 9, 7, 6, 9, 7, 5, 5, 7, 6, 8, 10, 10]

In [None]:
X_train.shape

In [40]:
news = """So, for now, many governments are more urgently focused on reducing near term energy shocks, 
aiming to boost global oil production to replace the millions of barrels per day that Russia has historically 
exported but which is now being shunned by Western nations. The two goals aren’t necessarily at odds, officials 
in the United States and Europe say. Yet some fear that countries could become so consumed by the immediate 
energy crisis that they neglect longer term policies to cut reliance on fossil fuels — a myopia 
that could set the world up for more oil and gas shocks in the future as well as a dangerously overheated planet.""".replace('\n', '')

In [41]:
input_vec = np.array([vectorizer(news)])

In [42]:
pred = model.predict(input_vec)
pred

array([[5.3594093, 6.985545 , 6.8548045, 6.4940505, 7.777894 , 6.937112 ,
        6.7589064, 6.7502375, 5.866041 , 5.93838  , 7.7057476, 8.508035 ,
        5.830035 , 7.255543 ]], dtype=float32)

In [43]:
from math import ceil

def poemize_prediction(content, pred):
    content = content.split(' ')
    s = 0
    for i in pred[0]:
        s += i+1
        content.insert(int(ceil(s)), '\n')
    content = ' '.join(content)
    return '\n'.join([i.strip() for i in content.split('\n')])

def poemize_prediction2(content, pred):
    content = content.split(' ')
    print(content)

In [44]:
print(poemize_prediction(news, pred))

So, for now, many governments are more
urgently focused on reducing near term energy
shocks, aiming to boost global oil production
to replace the millions of barrels
per day that Russia has historically exported but
which is now being shunned by Western
nations. The two goals aren’t necessarily at
odds, officials in the United States
and Europe say. Yet some fear
that countries could become so consumed
by the immediate energy crisis that they neglect
longer term policies to cut reliance on fossil
fuels — a myopia that could
set the world up for more oil and
gas shocks in the future as well as a dangerously overheated planet.


In [270]:
news = "Bank of France intervened to buy small amounts of dollars and sell yen in Paris today to stabilize exchange rates agreed at last month's meeting of Finance Ministers of the Group of Five and Canada. Dealers say recent central bank intervention in foreign exchange markets appeared to be a limited reaction to temporary pressures rather than a major defence operation."
input_vec = np.array([vectorizer(news)])

pred = model.predict(input_vec)
print(poemize_prediction(news, pred))

Bank of France intervened to buy
small amounts of dollars
and sell yen in
Paris today to

stabilize exchange
rates agreed at last
month's meeting of
Finance Ministers of the Group
of Five and Canada. Dealers
say recent central bank
intervention in foreign exchange
markets appeared to be
a limited
reaction to temporary pressures rather than a major defence operation.


In [215]:
X_test.shape

NameError: name 'X_test' is not defined

**Input:** <br>
So, for now, many governments are more urgently focused on reducing near term energy shocks, 
aiming to boost global oil production to replace the millions of barrels per day that Russia has historically 
exported but which is now being shunned by Western nations. The two goals aren’t necessarily at odds, officials 
in the United States and Europe say. Yet some fear that countries could become so consumed by the immediate 
energy crisis that they neglect longer term policies to cut reliance on fossil fuels — a myopia 
that could set the world up for more oil and gas shocks in the future as well as a dangerously overheated planet.<br>
<br>
**Model 2 output:** <br>
So, for now, many governments are more<br>
urgently focused on reducing near term energy<br>
shocks, aiming to boost global oil production<br>
to replace the millions of barrels<br>
per day that Russia has historically exported but<br>
which is now being shunned by Western<br>
nations. The two goals aren’t necessarily at<br>
odds, officials in the United States<br>
and Europe say. Yet some fear<br>
that countries could become so consumed<br>
by the immediate energy crisis that they neglect<br>
longer term policies to cut reliance on fossil<br>
fuels — a myopia that could<br>
set the world up for more oil and<br>
gas shocks in the future as well as a dangerously overheated planet.<br>