In [1]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential 
from keras.layers import Input
from keras.layers import Dense, GlobalAveragePooling1D, Activation
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.merge import concatenate
from keras.layers.convolutional import MaxPooling1D
from gensim.models.phrases import Phraser, Phrases
from keras import layers, backend
from sklearn.feature_extraction.text import TfidfVectorizer

Using TensorFlow backend.


In [2]:
%%time
df = pd.read_csv('/content/drive/Shared drives/DSO 560 NLP Project/train.csv')

CPU times: user 177 ms, sys: 55 ms, total: 232 ms
Wall time: 2.29 s


In [0]:
# Keep only the rows describing the 'style' attribute. The explicit copy makes
# `df` an independent frame, so the column assignments performed on it later
# do not write into a slice of the loaded table (avoids SettingWithCopyWarning).
df = df.loc[df['attribute_name'] == 'style'].copy()

In [0]:
# Collapse the two-word label into a single token; every other value is kept as is.
df['attribute_value'] = df['attribute_value'].mask(
    df['attribute_value'] == 'business casual', 'businesscasual'
)

In [5]:
# Peek at the first five filtered rows.
df.head(n=5)

Unnamed: 0,product_id,brand,product_full_name,description,brand_category,attribute_name,attribute_value
0,01DPGV4YRP3Z8J85DASGZ1Y99W,frame,les second medium noir,"minimal , modern styling meet refined luxury l...",accessory,style,casual
5,01DPH1DEN9G2WM7WAMJMD0A9W4,j crew,tie waist shirtdress stripe,take classic button silhouette turn ultra flat...,dressesandjumpsuits,style,casual
12,01E2KYW52BAG606GQ7A9H5R0KD,alo,interval microfleece pullover hoodie,articulate seam extra wide rib hem create shap...,unknown,style,casual
13,01DT513RRYT3SKH6X25G5VCH6B,chlo,leather ankle boot,heel measure approximately 55 mm 2 inch 30 mm ...,shoe boots ankle,style,androgynous
20,01E2KM0KW6NB1JKMZVRXR6H8G2,alo,stadium quarter zip hoodie,supersoft hoodie design elastic hem cuff perfe...,unknown,style,casual


In [0]:
# Join the four descriptive fields into one space-separated document per row.
# Missing fields are replaced by '' first: with plain `+` concatenation a
# single NaN turns the whole row into NaN, discarding the fields that are present.
df['text'] = (
    df[['brand', 'product_full_name', 'description', 'brand_category']]
    .fillna('')
    .astype(str)
    .agg(' '.join, axis=1)
)

In [0]:
# Force every entry of the text column to be a Python str.
df['text'] = df['text'].map(str)

In [8]:
# Class balance of the style labels, most frequent first.
df['attribute_value'].value_counts(ascending=False)

casual            3067
modern            2372
classic           1923
businesscasual    1492
androgynous       1080
edgy               919
romantic           765
boho               587
glam               582
athleisure         401
retro              295
Name: attribute_value, dtype: int64

In [0]:
# Target column: one style label per row.
y = df.loc[:, 'attribute_value']

In [0]:
# One-hot encode the style labels.
# The encoding is derived from `df` rather than from `y`, so this cell can be
# re-executed: previously `y` was rebound from a Series to a 2-D ndarray, and a
# second run would hand that array back to get_dummies.
# An explicit float dtype keeps the targets numeric regardless of the pandas
# default dummy dtype.
label_dummies = pd.get_dummies(df['attribute_value'], dtype='float32')
label_list = label_dummies.columns.tolist()  # column order == class index order
y = label_dummies.values

In [11]:
# Number of style classes = width of the one-hot label matrix.
num_classes = y.shape[-1]
num_classes

11

In [0]:
# TF-IDF vectorizer over unigrams and bigrams, limited to 500 features and to
# terms appearing in at least 5 documents.
tokenizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    max_features=500,
)

In [0]:
# Learn the vocabulary on the full corpus and produce the TF-IDF feature matrix.
X = tokenizer.fit_transform(df['text'].to_numpy())

In [0]:
def create_tokenizer(lines, n, max_features=500, min_df=5):
    """Fit a TF-IDF vectorizer restricted to n-grams of exactly order ``n``.

    Args:
        lines: Iterable of raw text documents to fit on.
        n: N-gram order; used as both bounds of ``ngram_range``.
        max_features: Cap on the vocabulary size. Defaults to 500, the value
            that used to be hard-coded here.
        min_df: Minimum document frequency passed to the vectorizer.
            Defaults to 5, the value that used to be hard-coded here.

    Returns:
        The fitted ``TfidfVectorizer``.
    """
    vectorizer = TfidfVectorizer(
        max_features=max_features,
        min_df=min_df,
        ngram_range=(n, n),
    )
    vectorizer.fit(lines)
    return vectorizer

In [0]:
def max_length(lines):
    """Return the largest whitespace-separated token count among ``lines``.

    Args:
        lines: Iterable of strings.

    Returns:
        The maximum of ``len(s.split())`` over all strings, or 0 when
        ``lines`` is empty (a bare ``max`` would raise ``ValueError`` there).
    """
    return max((len(s.split()) for s in lines), default=0)

In [0]:
def encode_text(tokenizer, lines, length):
    """Vectorize ``lines`` with a fitted vectorizer and pad each row to ``length``.

    Args:
        tokenizer: Fitted vectorizer exposing ``transform`` (a TfidfVectorizer
            in this notebook).
        lines: Iterable of raw text documents.
        length: Target row length handed to ``pad_sequences`` as ``maxlen``.

    Returns:
        A 2-D float32 array with one row per document. Rows shorter than
        ``length`` are zero-padded at the end; longer rows are truncated
        using ``pad_sequences``' default truncation side.
    """
    # toarray() yields a plain 2-D ndarray. todense() returns np.matrix, whose
    # rows stay 2-D when iterated and confuse pad_sequences' shape handling.
    encoded = tokenizer.transform(lines).toarray()
    # pad_sequences defaults to int32, which would truncate the fractional
    # TF-IDF weights to 0; request a float dtype explicitly.
    padded = pad_sequences(encoded, maxlen=length, padding='post', dtype='float32')
    return padded

In [15]:
# Sanity check: number of rows whose style label is missing.
df['attribute_value'].isnull().sum()

0

In [0]:
def transfrom_text(n):
    """Encode the notebook-level ``df['text']`` corpus with an ``n``-gram vectorizer.

    Args:
        n: N-gram order forwarded to ``create_tokenizer``.

    Returns:
        A tuple ``(encoded, vocab_size, length)``: the output of
        ``encode_text``, the fixed vocabulary size (500 + 1), and the value of
        ``max_length`` over the corpus.
    """
    corpus = df['text'].values
    fitted = create_tokenizer(corpus, n)
    longest = max_length(corpus)
    size_of_vocab = 500 + 1
    encoded = encode_text(fitted, corpus, longest)
    return encoded, size_of_vocab, longest

In [0]:
# Unigram encoding of the corpus, with its vocabulary size and padded length.
(X_1, vocab_size1, length1) = transfrom_text(n=1)

In [0]:
# Bigram encoding of the corpus, with its vocabulary size and padded length.
(X_2, vocab_size2, length2) = transfrom_text(n=2)

In [0]:
# Hold out 10% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [0]:
def define_model(length, vocab_size, n):
    """Build and compile the stacked-LSTM style classifier.

    Reads the notebook-level ``num_classes`` for the width of the output layer.

    Args:
        length: Number of input positions per sample. It now sets the input
            width, which used to be hard-coded to 500.
        vocab_size: Input dimension of the Embedding layer.
        n: Unused; kept so existing three-argument calls keep working.

    Returns:
        A compiled Keras ``Model`` that outputs ``num_classes`` class
        probabilities per sample.

    NOTE(review): Embedding looks up integer ids in ``[0, vocab_size)``.
    Elsewhere in this notebook the model is fit on a TF-IDF matrix, whose
    fractional weights are not ids. Confirm the intended input encoding.
    """
    inputs = Input(shape=(length,))
    embedded = Embedding(vocab_size, 100)(inputs)

    # Two stacked LSTMs: the first emits the full sequence, the second
    # reduces it to a single vector per sample.
    lstm_seq = LSTM(256, return_sequences=True, dropout=0.2, recurrent_dropout=0.15)(embedded)
    lstm_vec = LSTM(128, return_sequences=False, dropout=0.2, recurrent_dropout=0.15)(lstm_seq)
    regularized = Dropout(0.5)(lstm_vec)

    hidden = Dense(32, activation='relu')(regularized)
    # The labels are mutually exclusive one-hot rows, so use softmax with
    # categorical cross-entropy. Sigmoid with binary cross-entropy scores each
    # class independently, and its 'accuracy' metric is inflated by the many
    # zero entries in each label row.
    outputs = Dense(num_classes, activation='softmax')(hidden)

    model = Model(inputs=[inputs], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [40]:
# Instantiate the classifier for 500-wide inputs and print its layer summary.
model = define_model(length=500, vocab_size=500, n=2)
model.summary()

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         (None, 500)               0         
_________________________________________________________________
embedding_8 (Embedding)      (None, 500, 100)          50000     
_________________________________________________________________
lstm_13 (LSTM)               (None, 500, 256)          365568    
_________________________________________________________________
lstm_14 (LSTM)               (None, 128)               197120    
_________________________________________________________________
dropout_14 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 32)                4128      
_________________________________________________________________
dense_10 (Dense)             (None, 11)                363 

In [0]:
# Train for 10 epochs, reserving 20% of the training data for validation.
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=128,
    validation_split=0.2,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 9707 samples, validate on 2427 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10

In [0]:
# Loss and metric values on the held-out test split.
model.evaluate(x=X_test, y=y_test)

In [0]:
# Class probabilities for the first sample only. Slicing before predict avoids
# running inference over the whole feature matrix just to read one row.
model.predict(X[:1])[0]