In [1]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential 
from keras.layers import Input
from keras.layers import Dense, GlobalAveragePooling1D, Activation
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

Using TensorFlow backend.


In [2]:
%%time
df = pd.read_csv('/content/drive/Shared drives/DSO 560 NLP Project/train.csv')

CPU times: user 153 ms, sys: 48.1 ms, total: 201 ms
Wall time: 223 ms


In [0]:
df = df[df['attribute_name'] == 'style']

In [0]:
df['attribute_value'] = df['attribute_value'].apply(lambda x: 'businesscasual' if x == 'business casual' else x)

In [5]:
df.head()

Unnamed: 0,product_id,brand,product_full_name,description,brand_category,attribute_name,attribute_value
0,01DPGV4YRP3Z8J85DASGZ1Y99W,frame,les second medium noir,"minimal , modern styling meet refined luxury l...",accessory,style,casual
5,01DPH1DEN9G2WM7WAMJMD0A9W4,j crew,tie waist shirtdress stripe,take classic button silhouette turn ultra flat...,dressesandjumpsuits,style,casual
12,01E2KYW52BAG606GQ7A9H5R0KD,alo,interval microfleece pullover hoodie,articulate seam extra wide rib hem create shap...,unknown,style,casual
13,01DT513RRYT3SKH6X25G5VCH6B,chlo,leather ankle boot,heel measure approximately 55 mm 2 inch 30 mm ...,shoe boots ankle,style,androgynous
20,01E2KM0KW6NB1JKMZVRXR6H8G2,alo,stadium quarter zip hoodie,supersoft hoodie design elastic hem cuff perfe...,unknown,style,casual


In [0]:
df['description'] = df['description'].apply(str)

In [7]:
df['attribute_value'].value_counts()

casual            3067
modern            2372
classic           1923
businesscasual    1492
androgynous       1080
edgy               919
romantic           765
boho               587
glam               582
athleisure         401
retro              295
Name: attribute_value, dtype: int64

In [0]:
y = df['attribute_value']

In [0]:
y = pd.get_dummies(y)
label_list = y.columns.tolist()
y = y.values

In [0]:
def create_tokenizer(lines):
    tokenizer = Tokenizer(num_words=2000)
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [0]:
def max_length(lines):
	return max([len(s.split()) for s in lines])

In [0]:
def encode_text(tokenizer, lines, length):
	# integer encode
	encoded = tokenizer.texts_to_sequences(lines)
	# pad encoded sequences
	padded = pad_sequences(encoded, maxlen=length, padding='post')
	return padded

In [0]:
tokenizer = create_tokenizer(df['description'].values)
length = max_length(df['description'].values)
vocab_size = len(tokenizer.word_index) + 1

In [14]:
length

148

In [15]:
num_classes = y.shape[1]
num_classes

11

In [0]:
X = encode_text(tokenizer, df['description'].values, length)

In [17]:
X.shape

(13483, 148)

In [18]:
y.shape

(13483, 11)

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 42)

In [20]:
X_train.shape

(12134, 148)

In [0]:
def define_model(length, vocab_size):

    inputs = Input(shape=(length, ))
    embedding1 = Embedding(vocab_size, 100)(inputs)
    conv1 = Conv1D(filters=32, kernel_size=4, activation='relu')(embedding1)
    pool1 = MaxPooling1D(pool_size=2)(conv1)
    drop1 = Dropout(0.5)(pool1)

    lstm1 = LSTM(256, return_sequences=True, dropout=0.2, recurrent_dropout=0.15)(embedding1)
    lstm2 = LSTM(128, return_sequences=False, dropout=0.2, recurrent_dropout=0.15)(lstm1)
    drop2 = Dropout(0.5)(lstm2)
    

    #flat1 = Flatten()(drop1)

    dense1 = Dense(32, activation='relu')(drop2)
    outputs = Dense(num_classes, activation='sigmoid')(dense1)

    model = Model(inputs=inputs, outputs=outputs)

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [22]:
model = define_model(length, vocab_size)
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 148)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 148, 100)          500600    
_________________________________________________________________
lstm_1 (LSTM)                (None, 148, 256)          365568    
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               197120    
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                4128      
_________________________________________________________________
dense_2 (Dense)              (None, 11)                363 

In [0]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=128, epochs=10)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 12134 samples, validate on 1349 samples
Epoch 1/10
Epoch 2/10
 1280/12134 [==>...........................] - ETA: 42s - loss: 0.2897 - accuracy: 0.9091

In [0]:
model.evaluate(X_test, y_test)

In [0]:
model.predict(X)[0]