In [4]:
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from keras.layers import Flatten
from keras.layers import MaxPooling1D
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from nltk.corpus import stopwords

### Download stopwords corpus from NLTK

In [3]:
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download()

### loading preprocessed pre-trained google news vectors

assigning index to each categories and reverse the keys and item values

In [5]:
MAX_NB_WORDS = 200000
MAX_SEQUENCE_LENGTH = 30
EMBEDDING_DIM = 300

EMBEDDING_FILE = "GoogleNews-vectors-negative300.bin"
category_index = {"clothing":0, "camera":1, "home-appliances":2}
category_reverse_index = dict((y,x) for (x,y) in category_index.items())
STOPWORDS = set(stopwords.words("english"))

In [18]:
print(category_index)
print(category_reverse_index)
print(STOPWORDS)

{'clothing': 0, 'camera': 1, 'home-appliances': 2}
{0: 'clothing', 1: 'camera', 2: 'home-appliances'}
{'what', 'themselves', 'now', 'having', 'couldn', 'wasn', "she's", 'through', 'we', 'because', 'it', 'a', 'of', 'nor', 'same', 'ma', 'into', 'on', 'd', 'hadn', 'the', "that'll", 'y', "hadn't", 'm', 'did', 'don', 'hers', "weren't", 'between', "mustn't", 'any', "mightn't", 'wouldn', 'that', 'do', 'those', 'over', 'aren', 'their', 'you', 'whom', 'both', 'in', 'shan', 'our', 'ain', 'herself', 'there', 'i', 'who', 'will', 'she', "it's", 'have', 'which', "you're", 'they', 'ours', 't', "shouldn't", 'were', 'theirs', 'an', 'being', 'about', "you'll", 'some', "needn't", "couldn't", 'won', 's', 'few', 'such', 'yours', 'itself', 'mightn', 'her', 'been', 'above', 'does', 'below', 'has', "don't", 'himself', 'when', 'myself', 'its', 'more', 'as', "haven't", 'shouldn', 'me', 'was', 'my', 'then', 'll', 'and', 'after', 'to', 'too', 'up', 'but', 'he', 'had', 'haven', "won't", 'am', 'weren', 'why', 'is',

### Loading clothing, cameras, home_appliance csv file

In [19]:
clothing = pd.read_csv("product-titles-cnn-data/clothing.tsv", sep='\t')
cameras = pd.read_csv("product-titles-cnn-data/cameras.tsv", sep='\t')
home_appliances = pd.read_csv("product-titles-cnn-data/home.tsv", sep='\t')

datasets = [clothing, cameras, home_appliances]

print("Make sure there are no null values in the datasets")
for data in datasets:
    print("Has null values: ", data.isnull().values.any())

Make sure there are no null values in the datasets
Has null values:  False
Has null values:  False
Has null values:  False


In [20]:
clothing.iloc[100].title

"1410 Casual Short Sleeve Graphic Print Women's Maroon, Grey Top"

In [27]:
cameras.iloc[100].title

'autosity detective survilliance black hd camera button spy product camcorder(black)'

In [28]:
home_appliances.iloc[100].title

'connect z bt-m51n corded landline phone(black & white)'

### Remove stopwords and apply on all dataset

In [21]:
def preprocess(text):
    text= text.strip().lower().split()
    text = filter(lambda word: word not in STOPWORDS, text)
    return " ".join(text)
    
for dataset in datasets:
    dataset['title'] = dataset['title'].apply(preprocess)

In [29]:
all_texts = clothing['title'] + cameras['title'] + home_appliances['title']
all_texts = all_texts.drop_duplicates(keep=False)

In [30]:
all_texts[100]

"1410 casual short sleeve graphic print women's maroon, grey topautosity detective survilliance black hd camera button spy product camcorder(black)connect z bt-m51n corded landline phone(black & white)"

### Tokenize all datasets into pre-set max_nb_words (20000)

In [31]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(all_texts)

In [32]:
all_texts[100]

"1410 casual short sleeve graphic print women's maroon, grey topautosity detective survilliance black hd camera button spy product camcorder(black)connect z bt-m51n corded landline phone(black & white)"

### convert texts into sequence

In [34]:
clothing_sequences = tokenizer.texts_to_sequences(clothing['title'])
electronics_sequences = tokenizer.texts_to_sequences(cameras['title'])
home_appliances_sequences = tokenizer.texts_to_sequences(home_appliances['title'])

In [36]:
print(clothing.iloc[100].title)
print(len(clothing_sequences[100]))
print(clothing_sequences[100])

1410 casual short sleeve graphic print women's maroon, grey top
10
[72, 10, 135, 31, 268, 64, 3, 149, 67, 494]


### make each row the same length

In [38]:
clothing_data = pad_sequences(clothing_sequences, maxlen=MAX_SEQUENCE_LENGTH)
electronics_data = pad_sequences(electronics_sequences, maxlen=MAX_SEQUENCE_LENGTH)
home_appliances_data = pad_sequences(home_appliances_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [42]:
print(clothing.iloc[100].title)
print(len(clothing_data[100]))
print(clothing_data[100])
print('-'*100)
print(clothing.iloc[150].title)
print(len(clothing_data[150]))
print(clothing_data[150])

1410 casual short sleeve graphic print women's maroon, grey top
30
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0  72  10 135  31 268  64   3 149  67 494]
----------------------------------------------------------------------------------------------------
1410 women's a-line pink, white dress
30
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0   72    3   84   93   30
    4 2671]


### vetically stack up all data from three datasets of three categories

In [43]:
## The category (y-axis or label) is converted to convnet’s understandable format by using the 
## keras.util method to_categorical.
print("clothing: \t\t", to_categorical(category_index["clothing"], 3))
print("camera: \t\t", to_categorical(category_index["camera"], 3))
print("home appliances: \t", to_categorical(category_index["home-appliances"], 3))

clothing: 		 [1. 0. 0.]
camera: 		 [0. 1. 0.]
home appliances: 	 [0. 0. 1.]


In [48]:
data = np.vstack((clothing_data, electronics_data, home_appliances_data)) #vertically conbine all data
category = pd.concat([clothing['category'], cameras['category'], home_appliances['category']]).values
category = to_categorical(category)
print("combined data shape: ", data.shape)
print("combined category/label shape: ", category.shape)

combined data shape:  (405493, 30)
combined category/label shape:  (405493, 3)


### split training set and validation set

In [51]:
VALIDATION_SPLIT = 0.3
indices = np.arange(data.shape[0]) # get sequence of row index
np.random.shuffle(indices) # shuffle the row indexes
data = data[indices] # shuffle data/product-titles/x-axis
category = category[indices] # shuffle labels/category/y-axis
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = category[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = category[-nb_validation_samples:]

### load pretrained vectors from google news

In [52]:
## Word2Vec brings in semantic similarity info which can be leveraged by the convnets. 
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)
print('Found %s word vectors of word2vec' % len(word2vec.vocab))

Found 3000000 word vectors of word2vec


In [53]:
print("Odd word out:", word2vec.doesnt_match("banana apple grapes carrot".split()))
print("-"*10)
print("Cosine similarity between TV and HBO:", word2vec.similarity("tv", "hbo"))
print("-"*10)
print("Cosine similarity between TV and carrot:", word2vec.similarity("tv", "carrot"))
print("-"*10)
print("Most similar words to Computers:", ", ".join(map(lambda x: x[0], word2vec.most_similar("computers"))))
print("-"*10)

Odd word out: carrot
----------
Cosine similarity between TV and HBO: 0.6130649
----------
Cosine similarity between TV and carrot: 0.055450663
----------
Most similar words to Computers: computer, laptops, PCs, laptop_computers, desktop_computers, Computers, laptop, notebook_computers, Dell_OptiPlex_desktop, automated_seismographs
----------


### Checking number of words in data not found in pre-trained google news
The null word embeddings indicate the number of words not found in our pre-trained vectors (In this case Google News). This could possibly be unique words for brands in this context.

In [54]:
from keras.layers import Embedding

word_index = tokenizer.word_index
nb_words = min(MAX_NB_WORDS, len(word_index))+1

embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if word in word2vec.vocab:
        embedding_matrix[i] = word2vec.word_vec(word)

print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Null word embeddings: 1473


In [80]:
embedding_layer = Embedding(embedding_matrix.shape[0], # or len(word_index) + 1
                            embedding_matrix.shape[1], # or EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [81]:
embedding_layer #3D tensor with shape: (batch_size, sequence_length, output_dim).

<keras.layers.embeddings.Embedding at 0x23aa646a0>

## Model

In [115]:
from keras.models import Sequential
from keras.layers import Conv1D, GlobalMaxPooling1D, Flatten
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation

## model1
model = Sequential()
model.add(embedding_layer)
model.add(Dropout(0.1))
model.add(Conv1D(300, 3, padding='valid',activation='relu',strides=2))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.1))
model.add(Dense(150,activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(3,activation='softmax'))

model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['acc'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 30, 300)           817200    
_________________________________________________________________
dropout_42 (Dropout)         (None, 30, 300)           0         
_________________________________________________________________
conv1d_36 (Conv1D)           (None, 14, 300)           270300    
_________________________________________________________________
global_max_pooling1d_10 (Glo (None, 300)               0         
_________________________________________________________________
dropout_43 (Dropout)         (None, 300)               0         
_________________________________________________________________
dense_23 (Dense)             (None, 150)               45150     
_________________________________________________________________
dropout_44 (Dropout)         (None, 150)               0         
__________

In [116]:
model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=2, batch_size=128)
score = model.evaluate(x_val, y_val, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Train on 283846 samples, validate on 121647 samples
Epoch 1/2
Epoch 2/2
Test loss: 0.001613334781730237
Test accuracy: 0.9997698258074593


In [120]:
example_product = "Drawcord on elastic waist; mesh insert below back waist"
example_product = preprocess(example_product)
example_sequence = tokenizer.texts_to_sequences([example_product])
example_padded_sequence = pad_sequences(example_sequence, maxlen=MAX_SEQUENCE_LENGTH)

print("-"*10)
print("Predicted category: ", category_reverse_index[model.predict_classes(example_padded_sequence, verbose=0)[0]])
print("-"*10)
probabilities = model.predict(example_padded_sequence, verbose=1)
probabilities = probabilities[0]
print("Clothing Probability: ",probabilities[category_index["clothing"]] )
print("Camera Probability: ",probabilities[category_index["camera"]] )
print("home appliances probability: ",probabilities[category_index["home-appliances"]] )

----------
Predicted category:  clothing
----------
Clothing Probability:  1.0
Camera Probability:  3.271633e-19
home appliances probability:  3.826175e-11
