In [22]:
import numpy as np
from numpy import array
import pandas as pd
import random
from random import randint
from pickle import dump, load
from sklearn.model_selection import train_test_split
import sys
import string
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, LSTM, Embedding
from keras.utils import np_utils, to_categorical
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import textstat
import nltk
from itertools import compress
from sklearn.metrics.pairwise import cosine_similarity


# i'm not even using these anymore
nltk.download('stopwords')

random.seed(952)


def clean_text(input):
    # tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input)
    
    # remove punctuation
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    
    # remove non alphabetic 
    tokens = [word for word in tokens if word.isalpha()]
  
    # make lower case
    tokens = [word.lower() for word in tokens]
    
    # remove tokens of length 1
    tokens_len = [len(i) > 1 for i in tokens]
    tokens_filter = list(compress(tokens, tokens_len))
    tokens = tokens_filter
    
    return tokens


def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text


# save tokens to file, one sequence per line
def save_doc(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()


def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text


# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    result = list()
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncate sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict probabilities for each word
        yhat = model.predict_classes(encoded, verbose=0)
        # map predicted word index to word
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
        # append to input
        in_text += ' ' + out_word
        result.append(out_word)
        
    return ' '.join(result)

[nltk_data] Downloading package stopwords to /home/peter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
# load the model
model = load_model('../results/ikea_word_model.h5')

# load the tokenizer
tokenizer = load(open('../results/word_tokenizer.pkl', 'rb'))

# load the testing data
in_filename = '../results/ikea_word_test_sequences.txt'
test_sequences = load_doc(in_filename)
test_lines = test_sequences.split('\n')

# move on to processing the test set into the right shape
# i've split train/test by objects, not sequences
# make the testing data the right shape to test with
ikea_test = pd.read_csv('../results/ikea_word_test.csv')

test_desc_single = ' '.join(ikea_test.description)

test_tokens = clean_text(test_desc_single)

In [24]:
print('Total Tokens: %d' % len(test_tokens))
print('Unique Tokens: %d' % len(set(test_tokens)))

print('Total Sequences: %d' % len(test_lines))

Total Tokens: 15279
Unique Tokens: 1718
Total Sequences: 15228


In [25]:
# explore other similarity metrics
ikea_test = pd.read_csv('../results/ikea_word_test.csv')
seq_length = len(test_lines[0].split()) - 1

test_long = []
for item in ikea_test.description:
    if len(item.split()) > 50:
        test_long.append(item)


opens = []
refs = []
gens = []
rans = []
distances = []
for tl in test_long:
    exam = clean_text(tl)

    opener = exam[:50]  # seed text
    opens.append(' '.join(opener))
    
    closer = exam[50:]  # reference continue
    refs.append(' '.join(closer))
    
    # print(' '.join(closer))
    
    #  generated text
    res = generate_seq(model, 
                       tokenizer, 
                       seq_length, 
                       ' '.join(opener), 
                       len(closer))
    gens.append(res)
    
    # random text
    rand_tokens = test_tokens
    random.shuffle(rand_tokens)
    rand_out = ' '.join(rand_tokens[:len(closer)])
    rans.append(rand_out)
    
    # to liked format
    ref = tokenizer.texts_to_matrix([' '.join(closer)], mode='tfidf')[0]
    gen = tokenizer.texts_to_matrix([res], mode='tfidf')[0]
    ran = tokenizer.texts_to_matrix([rand_out], mode='tfidf')[0]
    
    ref_a = ref.reshape(1, len(ref))
    gen_a = gen.reshape(1, len(gen))
    ran_a = ran.reshape(1, len(ran))
    
    ref2gen = cosine_similarity(ref_a, gen_a)[0][0]
    
    ref2ran = cosine_similarity(ref_a, ran_a)[0][0]
    
    # how much closer is gen to ref than ran is to ref
    distances.append(ref2gen - ref2ran)

In [26]:
output = pd.DataFrame(list(zip(opens, refs, gens, rans, distances)), 
                      columns = ['start', 
                                 'reference', 
                                 'generated', 
                                 'random', 
                                 'distance'])
output.to_csv('../results/text_comparison.csv')

In [27]:
output.head()

Unnamed: 0,start,reference,generated,random
0,gives directional light that is good for focus...,spaces,spaces,can
1,solid pine with crafted details and brushed su...,together to save space the surface is durable ...,together to save space easy to keep clean just...,and it in beautifully in you trellis free the ...
2,you can choose to assemble this coffee table a...,wipe clean with damp cloth,clean you can easily adapt,which open as the charge
3,the trellis makes it easy to decorate your wal...,enjoy the natural expression of the wood the f...,enjoy the natural expression of the wood the f...,pot to piece filling kitchen shopping hide she...
4,you sit comfortably thanks to the pocket sprin...,designed to be comfortable for you to lean aga...,designed to be comfortable for you to lean aga...,be and needs limited hide different the keep f...
