In [1]:
import numpy as np
from numpy import array
import pandas as pd
import random
from random import randint
from pickle import dump, load
from sklearn.model_selection import train_test_split
import sys
import string
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, LSTM, Embedding
from keras.utils import np_utils, to_categorical
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import textstat
import nltk
from itertools import compress, cycle, islice
from sklearn.metrics.pairwise import cosine_similarity

# NLTK resources fetched at start-up. nltk.word_tokenize / nltk.pos_tag are
# called further down in this notebook and presumably rely on 'punkt' and
# 'averaged_perceptron_tagger'; 'stopwords' pairs with the stopwords import
# above -- TODO confirm whether that one is still needed.
for resource in ('stopwords', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# Seed Python's `random` module only; no other RNG is seeded here.
random.seed(952)


def clean_text(input):
    """Split raw text into a list of cleaned, lower-case word tokens.

    The text is tokenized on runs of word characters; each token then has
    every character in ``string.punctuation`` stripped, is discarded unless
    it is purely alphabetic, is lower-cased, and is finally discarded if it
    is a single character long.

    Parameters
    ----------
    input : str
        Text to clean.

    Returns
    -------
    list of str
        Surviving tokens, in their original order.
    """
    strip_punct = str.maketrans('', '', string.punctuation)

    kept = []
    for raw in RegexpTokenizer(r'\w+').tokenize(input):
        word = raw.translate(strip_punct)

        # Alphabetic check happens on the stripped, not-yet-lowered token.
        if not word.isalpha():
            continue

        # Lower-case first, then measure: the length test applies to the
        # lowered form.
        word = word.lower()
        if len(word) > 1:
            kept.append(word)

    return kept


def save_doc(lines, filename, encoding=None):
    """Save sequences to a text file, one sequence per line.

    Parameters
    ----------
    lines : iterable of str
        Sequences to write. They are joined with a newline character, so no
        trailing newline is written after the last one.
    filename : str or path-like
        Destination file; it is opened in write mode, replacing any
        existing content.
    encoding : str, optional
        Text encoding forwarded to ``open``. ``None`` (the default) keeps
        the platform default, as before.
    """
    data = '\n'.join(lines)
    # The context manager closes the handle even when write() raises.
    with open(filename, 'w', encoding=encoding) as file:
        file.write(data)


def load_doc(filename, encoding=None):
    """Read a whole text file and return its content as one string.

    Parameters
    ----------
    filename : str or path-like
        File to read; opened read-only in text mode.
    encoding : str, optional
        Text encoding forwarded to ``open``. ``None`` (the default) keeps
        the platform default, as before.

    Returns
    -------
    str
        The full content of the file.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()


def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    """Generate a sequence of words from a language model.

    Starting from ``seed_text``, the running text is repeatedly encoded,
    cut/padded to ``seq_length``, and fed to the model; each predicted word
    is appended to the running text before the next prediction.

    Parameters
    ----------
    model
        Object exposing ``predict_classes(encoded, verbose=0)``.
        NOTE(review): ``predict_classes`` is not available on newer
        Keras/TensorFlow releases -- confirm the pinned version.
    tokenizer
        Object exposing ``texts_to_sequences`` and a ``word_index`` mapping
        of word -> integer index.
    seq_length : int
        Length passed as ``maxlen`` to ``pad_sequences``.
    seed_text : str
        Text the generation starts from.
    n_words : int
        Number of words to generate.

    Returns
    -------
    str
        The generated words joined by single spaces (the seed text is not
        included). A predicted index with no entry in ``word_index``
        contributes an empty string.
    """
    # Invert the vocabulary once instead of scanning it for every generated
    # word. setdefault keeps the first word seen for an index, which is what
    # a first-match scan over word_index would return.
    index_to_word = {}
    for word, index in tokenizer.word_index.items():
        index_to_word.setdefault(index, word)

    result = []
    in_text = seed_text

    # generate a fixed number of words
    for _ in range(n_words):
        # encode the running text as integers
        encoded = tokenizer.texts_to_sequences([in_text])[0]

        # fix the length; over-long input is truncated from the front
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')

        # predicted class index for the next word
        yhat = model.predict_classes(encoded, verbose=0)

        # .item() unwraps a one-element result into a plain Python scalar
        out_word = index_to_word.get(np.asarray(yhat).item(), '')

        # append to input so the next step sees the new word
        in_text += ' ' + out_word
        result.append(out_word)

    return ' '.join(result)


def detect_labels(path):
    """Detect labels in a LOCAL image file.

    The file at ``path`` is read as bytes, wrapped in a ``vision`` image
    object, and sent to ``ImageAnnotatorClient.label_detection``.

    NOTE(review): ``vision`` is not imported in the visible part of this
    notebook; presumably it is the Google Cloud Vision client library and
    must be imported before this function is called -- confirm.

    Parameters
    ----------
    path : str or path-like
        Location of the image file on disk.

    Returns
    -------
    list of str
        The ``description`` of every returned label annotation; any other
        fields on the annotations are discarded.
    """
    client = vision.ImageAnnotatorClient()

    # Builtin open() is the same function as io.open in Python 3, and `io`
    # is never imported here, so use the builtin.
    with open(path, 'rb') as image_file:
        content = image_file.read()

    image = vision.types.Image(content=content)

    response = client.label_detection(image=image)

    # list of labels (ignoring uncertainty)
    return [annotation.description for annotation in response.label_annotations]

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
[nltk_data] Downloading package stopwords to /home/peter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/peter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/peter/nltk_data...
[nltk_data]

In [23]:
# Load the IKEA catalogue and take its free-text description column.
ikea_cat = pd.read_csv('../results/ikea_2.csv')
ikdes = ikea_cat.description

# Keywords to search for, lower-cased so they can be compared with the
# lower-case tokens that clean_text returns.
tester = [label.lower() for label in ['Shelving', 'Brown', 'Bookcase', 'Shelf', 'Hutch']]

# One boolean per description: True when at least one cleaned token of the
# description is one of the keywords.
ikea_words = [
    any(token in tester for token in clean_text(str(ikdes[row])))
    for row in range(len(ikdes))
]

# Show only the descriptions that matched.
list(compress(ikdes, ikea_words))

['The ÄPPLARÖ/KLASEN storage cabinet provides an extra storage area which can be moved easily. Also works perfectly next to the ÄPPLARÖ/KLASEN grill as a place to put serving plates and barbecue accessories. The stainless steel shelf has a durable surface that’s easy to keep clean. For added durability and so you can enjoy the natural expression of the wood, the furniture has been pre-treated with several layers of semi-transparent wood stain. ',
 'Solid wood is a durable natural material. A coffee table with drop leaves is easy to make larger or smaller according to your different needs. Pull-out stop ensures that the drawer cannot be pulled out too far accidently. Practical storage space underneath the table top. Separate shelf for magazines, etc. helps you keep your things organized and the table top clear. ',
 'Solid wood is a durable natural material. Separate shelf for magazines, etc. helps you keep your things organized and the table top clear. ',
 'You can place the shelf at th

In [7]:
def replace_nouns(text, replace):
    """Replace every noun in ``text`` with words taken from ``replace``.

    The text is tokenized with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``; every token whose tag starts with ``'N'`` is swapped
    for the next word of ``replace``, cycling back to the start of
    ``replace`` when it runs out.

    Parameters
    ----------
    text : str
        Text whose nouns should be replaced.
    replace : iterable of str
        Replacement words. These are now actually used; previously the
        argument was ignored in favour of a notebook-level global.

    Returns
    -------
    str
        The tokens joined by single spaces, so spacing around punctuation
        may differ from ``text``. If ``replace`` is empty the tokens are
        returned unchanged.
    """
    tokenized = nltk.word_tokenize(text)
    tagged = nltk.pos_tag(tokenized)

    # Materialise once so any iterable is accepted and emptiness can be
    # checked; cycling an empty pool would yield nothing to substitute.
    pool = list(replace)
    if not pool:
        return ' '.join(tokenized)

    substitutes = cycle(pool)
    return ' '.join(
        next(substitutes) if tag.startswith('N') else token
        for token, (_, tag) in zip(tokenized, tagged)
    )

In [15]:
# Tag the keyword list and flag the entries whose tag starts with 'N'.
out = nltk.pos_tag(tester)
tt = [pair[1][0] == 'N' for pair in out]

# Keep only the flagged keywords.
print(list(compress(tester, tt)))

['bookcase', 'shelf', 'hutch']
