In [0]:
from google.colab import drive
drive.mount('/content/drive')

import nltk
nltk.download('stopwords')

import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Found GPU at: /device:GPU:0


In [0]:
!cp drive/My\ Drive/ikea.csv .

In [0]:
!cp drive/My\ Drive/ikea_2.csv .
# !cp ikea_2.csv drive/My\ Drive/ikea_2.csv

In [None]:
import numpy as np
from numpy import array
import pandas as pd
import random
from random import randint
from pickle import dump, load
from sklearn.model_selection import train_test_split
import sys
import string
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, LSTM, Embedding
from keras.utils import np_utils, to_categorical
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


random.seed(952)


def clean_text(input):
  # tokenizer
  tokenizer = RegexpTokenizer(r'\w+')
  tokens = tokenizer.tokenize(input)
  
  # remove punctuation
  table = str.maketrans('', '', string.punctuation)
  tokens = [w.translate(table) for w in tokens]

  # remove non alphabetic 
  tokens = [word for word in tokens if word.isalpha()]
  
	# make lower case
  tokens = [word.lower() for word in tokens]
  return tokens


def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text


# save tokens to file, one sequence per line
def save_doc(lines, filename):
	data = '\n'.join(lines)
	file = open(filename, 'w')
	file.write(data)
	file.close()

  
# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text


# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
	result = list()
	in_text = seed_text
	# generate a fixed number of words
	for _ in range(n_words):
		# encode the text as integer
		encoded = tokenizer.texts_to_sequences([in_text])[0]
		# truncate sequences to a fixed length
		encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
		# predict probabilities for each word
		yhat = model.predict_classes(encoded, verbose=0)
		# map predicted word index to word
		out_word = ''
		for word, index in tokenizer.word_index.items():
			if index == yhat:
				out_word = word
				break
		# append to input
		in_text += ' ' + out_word
		result.append(out_word)
	return ' '.join(result)

In [0]:
ikea_items = pd.read_csv('ikea_2.csv')

# some items do not have descriptions from the specific box
ikea_items = ikea_items.dropna()

# some descriptions are identical
desc_uni = ikea_items.drop_duplicates(subset='description')

# average description length for future generation
# desc_avg = round(sum( map(len, desc_uni) ) / len(desc_uni))
# desc_std = map(len, desc_uni).std()

# split train and test
desc_train, desc_test = train_test_split(desc_uni, test_size=0.2)
pd.DataFrame(desc_train).to_csv('ikea_word_train.csv')
pd.DataFrame(desc_test).to_csv('ikea_word_test.csv')

# make one corpus
desc_single = ' '.join(desc_train.description)

In [0]:
tokens = clean_text(desc_single)

print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

# make sequences of words from the full corpus
length = 50 + 1
sequences = list()
for i in range(length, len(tokens)):
	# select sequence of tokens
	seq = tokens[i-length:i]
	# convert into a line
	line = ' '.join(seq)
	# store
	sequences.append(line)
  
#print('Total Sequences: %d' % len(sequences))

out_filename = 'ikea_word_train_sequences.txt'
save_doc(sequences, out_filename)

Total Tokens: 62705
Unique Tokens: 2860


In [0]:
!cp ikea_word_test.csv drive/My\ Drive/.
!cp ikea_word_train.csv drive/My\ Drive/.

!cp ikea_word_train_sequences.txt drive/My\ Drive/.

# !cp drive/My\ Drive/ikea_word_train_sequences.txt .

In [0]:
# load
in_filename = 'ikea_word_train_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')

print('total sequences: %d' % len(lines))
# code as integers

# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)
dump(tokenizer, open('word_tokenizer.pkl', 'wb'))

# vocabulary size
vocab_size = len(tokenizer.word_index) + 1

# separate into input and output
sequences = array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]

# define model
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=seq_length))
model.add(LSTM(100))#, return_sequences=True))
# You must set return_sequences=True when stacking LSTM layers so that the 
# second LSTM layer has a three-dimensional sequence input
# model.add(Dropout(0.1))
# model.add(LSTM(100))
# model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
# print(model.summary())

# compile model
model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])

# helpful checkpoints
filepath = "word_model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(filepath, 
                             monitor='loss',  # 'accuracy'
                             verbose=1, 
                             save_best_only=True, 
                             mode='min')
desired_callbacks = [checkpoint]

total sequences: 62654


In [0]:
# fit model
model.fit(X, y, epochs=100, batch_size=256, callbacks=desired_callbacks)

Epoch 1/100

Epoch 00001: loss improved from inf to 6.29491, saving model to word_model_weights_saved.hdf5
Epoch 2/100

Epoch 00002: loss improved from 6.29491 to 5.88350, saving model to word_model_weights_saved.hdf5
Epoch 3/100

Epoch 00003: loss improved from 5.88350 to 5.59549, saving model to word_model_weights_saved.hdf5
Epoch 4/100

Epoch 00004: loss improved from 5.59549 to 5.24618, saving model to word_model_weights_saved.hdf5
Epoch 5/100

Epoch 00005: loss improved from 5.24618 to 4.93466, saving model to word_model_weights_saved.hdf5
Epoch 6/100

Epoch 00006: loss improved from 4.93466 to 4.67890, saving model to word_model_weights_saved.hdf5
Epoch 7/100

Epoch 00007: loss improved from 4.67890 to 4.45807, saving model to word_model_weights_saved.hdf5
Epoch 8/100

Epoch 00008: loss improved from 4.45807 to 4.24551, saving model to word_model_weights_saved.hdf5
Epoch 9/100

Epoch 00009: loss improved from 4.24551 to 4.03662, saving model to word_model_weights_saved.hdf5
Epoch

<keras.callbacks.History at 0x7fee104f9438>

In [0]:
# save the model
model.save('ikea_word_model.h5')
# save the tokenizer
dump(tokenizer, open('word_tokenizer.pkl', 'wb'))

In [0]:
!cp ikea_word_model.h5 drive/My\ Drive/.
!cp word_tokenizer.pkl drive/My\ Drive/.
!cp word_model_weights_saved.hdf5 drive/My\ Drive/.

In [0]:
## load cleaned text sequences
in_filename = 'ikea_word_train_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')

seq_length = len(lines[0].split()) - 1

# load the model
model = load_model('ikea_word_model.h5')

# load the tokenizer
tokenizer = load(open('word_tokenizer.pkl', 'rb'))

# demo it works on the training dataset
# select a seed text
seed_text = lines[randint(0,len(lines))]
print(seed_text + '\n')

# generate new text
generated = generate_seq(model, tokenizer, seq_length, seed_text, 50)
print(generated)

# move on to processing the test set into the right shape
# i've split train/test by objects, not sequences
# make the testing data the right shape to test with
ikea_test = pd.read_csv('ikea_word_test.csv')

test_desc_single = ' '.join(ikea_test.description)

test_tokens = clean_text(test_desc_single)

print('Total Tokens: %d' % len(test_tokens))
print('Unique Tokens: %d' % len(set(test_tokens)))

# make sequences of words from the full corpus
length = 50 + 1
test_sequences = list()
for i in range(length, len(test_tokens)):
	# select sequence of tokens
	seq = test_tokens[i-length:i]
	# convert into a line
	line = ' '.join(seq)
	# store
	test_sequences.append(line)
  
print('Total Sequences: %d' % len(test_sequences))

out_filename = 'ikea_word_test_sequences.txt'
save_doc(test_sequences, out_filename)

are adjustable so you can customize your storage as needed stationary shelf for high stability adjustable feet for stability on uneven floors the door s integrated dampers allow it to close slowly silently and softly hinges with snap on function are easy to fit without screws built in cable management for

collecting cables and cords out of sight but close at hand when you need them smooth running drawers with pull out stop you can easily customize the size of the drawer by moving the divider you can easily see and reach your things because the drawers pull out fully drawers
Total Tokens: 15007
Unique Tokens: 1812
Total Sequences: 14956


In [0]:
!cp ikea_word_test_sequences.txt drive/My\ Drive/.

In [0]:
# load
in_filename = 'ikea_word_test_sequences.txt'
doc = load_doc(in_filename)
test_lines = doc.split('\n')

In [0]:
seq_length = len(test_lines[0].split()) - 1

out = []
for ii in range(0, len(test_lines) - 1):
  test_x = ' '.join(test_lines[ii].split()[:-1])
  test_y = test_lines[ii].split()[-1]
  # print(test_x)
  # print(test_y)

  res = generate_seq(model, tokenizer, seq_length, test_x, 1)
  out.append(res == test_y)

  
acc_my = sum(out) / len(out)
print(acc_my)
# X, y = test_sequences[:,:-1], test_sequences[:,-1]

dump(acc_my, open('word_test_accuracy.pkl', 'wb'))
# generate_seq(model, tokenizer, seq_length, test_lines[0], 1)

0.6974924774322969


In [0]:
!cp word_test_accuracy.pkl drive/My\ Drive/.