In [18]:
# Mount Google Drive at /content/drive so that later cells can copy the
# dataset in and the trained artifacts out via drive/My Drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Fetch the NLTK stop-word corpus used later when cleaning the descriptions.
import nltk
nltk.download('stopwords')

# Fail fast when the runtime has no GPU attached: the first GPU is expected
# to be reported as '/device:GPU:0'; any other name aborts the notebook.
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

In [0]:
# Copy the raw dataset from the mounted Drive folder into the working directory.
!cp drive/My\ Drive/ikea.csv .

In [0]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import sys
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

# Fixed seed for the train/test shuffle so that the split written to disk
# (and therefore the training corpus) is the same on every run.
SPLIT_SEED = 42

ikea_items = pd.read_csv('ikea.csv')

# Some items do not have descriptions from the specific box.
# NOTE(review): dropna() removes rows with a missing value in ANY column,
# not only `description` -- confirm that this is intended.
ikea_items = ikea_items.dropna()

# Some descriptions are identical; keep only the first occurrence of each.
desc_uni = ikea_items.drop_duplicates(subset='description')

# TODO: description-length statistics (to choose a generation length) are
# not computed yet; desc_uni.description.str.len() gives per-row lengths.

# Split train and test (80/20) and persist both parts.
# train_test_split already returns DataFrames, so they are written directly.
desc_train, desc_test = train_test_split(
    desc_uni, test_size=0.2, random_state=SPLIT_SEED
)
desc_train.to_csv('ikea_train.csv')
desc_test.to_csv('ikea_test.csv')

# Make one corpus: all training descriptions joined by single spaces.
desc_single = ' '.join(desc_train.description)

def tokenize_words(input):
    """Lowercase ``input``, split it into word tokens and drop English stop words.

    Args:
        input: The text to clean (a single string).

    Returns:
        The remaining tokens joined by single spaces, in their original order.
    """
    # Build the stop-word collection once. Looking it up inside the filter
    # would call stopwords.words('english') for every token, and membership
    # in the returned list is a linear scan; a set makes each check O(1).
    stop_words = set(stopwords.words('english'))

    # Lowercase everything to standardize it, then split on runs of word
    # characters (punctuation and whitespace are discarded).
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input.lower())

    # Keep only the tokens that are not stop words.
    return " ".join(token for token in tokens if token not in stop_words)
  
# Cleaned training corpus: lowercase, stop words removed, tokens joined by spaces.
processed_inputs = tokenize_words(desc_single)

In [0]:
# Back up the train/test split files to the mounted Drive folder.
!cp ikea_test.csv drive/My\ Drive/.
!cp ikea_train.csv drive/My\ Drive/.

In [62]:
# Vocabulary: every distinct character of the cleaned corpus, in sorted
# order, and the lookup from each character to its position in that order.
chars = sorted(set(processed_inputs))
char_to_num = {char: position for position, char in enumerate(chars)}

input_len, vocab_len = len(processed_inputs), len(chars)
print ("Total number of characters:", input_len)
print ("Total vocab:", vocab_len)

Total number of characters: 342238
Total vocab: 48


In [63]:
# Build the supervised dataset for the network. Each sample is a window of
# `seq_length` consecutive characters (encoded as integers) and its label is
# the single character that immediately follows that window.
seq_length = 100

# Encode the whole corpus once; every window is then a slice of this list.
encoded = [char_to_num[char] for char in processed_inputs]

# A window may start anywhere that still leaves one character after it
# to serve as the label.
window_starts = range(input_len - seq_length)

# Inputs: the seq_length characters starting at each position.
x_data = [encoded[begin:begin + seq_length] for begin in window_starts]

# Targets: the character right after each window.
y_data = [encoded[begin + seq_length] for begin in window_starts]

n_patterns = len(x_data)
print ("Total Patterns:", n_patterns)

Total Patterns: 342138


In [0]:
# Shape the integer windows as (samples, time steps, 1 feature), the layout
# given to the LSTM's input_shape, and scale the codes into [0, 1).
X = np.reshape(x_data, (n_patterns, seq_length, 1))
X = X/float(vocab_len)

# One-hot encode the targets. The width is pinned to the vocabulary size:
# without num_classes it is inferred from the largest label present, which
# would silently shrink the output layer if the highest-indexed character
# never occurs as a target.
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [75]:
# Inspect the integer-encoded target characters.
# NOTE(review): this displays the entire list (one entry per pattern);
# consider showing a short slice such as y_data[:20] instead.
y_data

[11,
 12,
 22,
 15,
 0,
 29,
 18,
 15,
 22,
 32,
 15,
 29,
 0,
 11,
 28,
 28,
 11,
 24,
 17,
 15,
 14,
 0,
 11,
 13,
 13,
 25,
 28,
 14,
 19,
 24,
 17,
 0,
 24,
 15,
 15,
 14,
 29,
 0,
 29,
 31,
 28,
 16,
 11,
 13,
 15,
 0,
 23,
 11,
 14,
 15,
 0,
 24,
 11,
 30,
 31,
 28,
 11,
 22,
 0,
 33,
 25,
 25,
 14,
 0,
 32,
 15,
 24,
 15,
 15,
 28,
 0,
 12,
 25,
 25,
 21,
 13,
 11,
 29,
 15,
 0,
 18,
 15,
 19,
 17,
 18,
 30,
 0,
 15,
 34,
 30,
 15,
 24,
 29,
 19,
 25,
 24,
 0,
 31,
 24,
 19,
 30,
 0,
 11,
 22,
 22,
 25,
 33,
 19,
 24,
 17,
 0,
 23,
 11,
 21,
 15,
 0,
 33,
 11,
 22,
 22,
 0,
 11,
 28,
 15,
 11,
 0,
 21,
 15,
 15,
 26,
 0,
 33,
 28,
 19,
 30,
 19,
 24,
 17,
 0,
 13,
 11,
 22,
 13,
 31,
 22,
 11,
 30,
 19,
 25,
 24,
 29,
 0,
 24,
 15,
 11,
 30,
 0,
 12,
 22,
 11,
 24,
 21,
 0,
 26,
 11,
 17,
 15,
 29,
 0,
 22,
 25,
 25,
 29,
 15,
 0,
 22,
 15,
 11,
 16,
 0,
 22,
 19,
 24,
 15,
 29,
 0,
 12,
 25,
 34,
 15,
 29,
 0,
 19,
 24,
 13,
 22,
 31,
 14,
 15,
 14,
 0,
 26,
 31,
 30,
 0,
 12,


In [0]:
# Character-level language model: three stacked LSTM layers, each followed
# by 20% dropout, ending in a softmax over the one-hot target classes.
# The first two LSTMs return full sequences so the next LSTM receives one
# vector per time step; the last LSTM returns only its final state.
model = Sequential([
    LSTM(256, input_shape=X.shape[1:], return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

model.compile(loss='categorical_crossentropy', optimizer='adam')

In [0]:
# Save the model to `filepath` after an epoch only when the monitored
# training loss has reached a new minimum.
filepath = "model_weights_saved.hdf5"
checkpoint_options = dict(
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
checkpoint = ModelCheckpoint(filepath, **checkpoint_options)
desired_callbacks = [checkpoint]

In [67]:
# Train for 20 epochs in batches of 256 samples; the callbacks in
# desired_callbacks run during training.
# NOTE(review): no validation data is passed, so only the training loss is
# tracked -- confirm that this is intended.
model.fit(X, y, epochs=20, batch_size=256, callbacks=desired_callbacks)

Epoch 1/20

Epoch 00001: loss improved from inf to 2.88779, saving model to model_weights_saved.hdf5
Epoch 2/20

Epoch 00002: loss improved from 2.88779 to 2.49840, saving model to model_weights_saved.hdf5
Epoch 3/20

Epoch 00003: loss improved from 2.49840 to 2.24287, saving model to model_weights_saved.hdf5
Epoch 4/20

Epoch 00004: loss improved from 2.24287 to 2.06901, saving model to model_weights_saved.hdf5
Epoch 5/20

Epoch 00005: loss improved from 2.06901 to 1.94871, saving model to model_weights_saved.hdf5
Epoch 6/20

Epoch 00006: loss improved from 1.94871 to 1.86029, saving model to model_weights_saved.hdf5
Epoch 7/20

Epoch 00007: loss improved from 1.86029 to 1.79318, saving model to model_weights_saved.hdf5
Epoch 8/20

Epoch 00008: loss improved from 1.79318 to 1.73272, saving model to model_weights_saved.hdf5
Epoch 9/20

Epoch 00009: loss improved from 1.73272 to 1.68716, saving model to model_weights_saved.hdf5
Epoch 10/20

Epoch 00010: loss improved from 1.68716 to 1.6

<keras.callbacks.History at 0x7f318df7a668>

In [0]:
# Back up the checkpoint file to the mounted Drive folder.
!cp model_weights_saved.hdf5 drive/My\ Drive/.

In [0]:
import pickle

# Serialize the in-memory `model` object to model.pkl using the highest
# pickle protocol available.
# NOTE(review): pickling a Keras model may not be supported by the installed
# Keras version; model.save() / load_model() is the documented way to
# persist a model -- confirm this cell actually succeeds.
# NOTE(review): only unpickle this file from a trusted location, since
# loading a pickle can execute arbitrary code.
with open('model.pkl', 'wb') as output:
    pickle.dump(model, output, pickle.HIGHEST_PROTOCOL)

In [0]:
# Back up the pickled model to the mounted Drive folder.
!cp model.pkl drive/My\ Drive/.

In [84]:
# Generate text with the model that is currently in memory.
# To start from the saved checkpoint instead, load the weights file with
# model.load_weights(...) and recompile with the same loss and optimizer
# before running this cell.

# The characters were converted to numbers earlier, so we need the inverse
# mapping to turn the model's predicted indices back into characters.
num_to_char = dict((i, c) for i, c in enumerate(chars))

# Pick a random training window as the seed. The upper bound of
# np.random.randint is exclusive, so len(x_data) makes every window eligible.
start = np.random.randint(0, len(x_data))

# Work on a copy: the loop below appends to `pattern`, and appending to the
# list stored in x_data would corrupt that training sample.
pattern = list(x_data[start])
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

# Repeatedly predict the next character from the current window, emit it,
# and slide the window forward by one so it always holds the latest
# len(pattern) characters (seed plus generated text).
for i in range(100):
    # Same preprocessing as the training inputs: (1, time steps, 1), scaled.
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)

    # Greedy decoding: always take the most probable character.
    index = int(np.argmax(prediction))
    result = num_to_char[index]

    sys.stdout.write(result)

    pattern.append(index)
    pattern = pattern[1:]

Random Seed:
" health fabric tightlywoven makes bedding extra durable easily change bedroom look since duvet cover  "
easy keep clean since removable machine washable soft comfortable semovable machine washable soft co