## Kaggle Submission notebook
### Author Robert Allan
#### Student number 20905488

Data paths: the CSV text data (train.csv and test.csv) are unzipped into the same directory as the notebook. Likewise, the images are unzipped into the .\suffled-images\shuffled-images\ path (as in the zip file). The notebook was run on a Windows Anaconda installation with a 1660 Ti GPU (6 GB VRAM) and 32 GB of RAM.

In [38]:
import nltk
from nltk.corpus import stopwords
import re
import string
import scipy
import sklearn
import numpy as np
import pandas as pd

In [40]:
import tensorflow as tf
import keras
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold

In [42]:
import pathlib

In [43]:
df = pd.read_csv ('train.csv')

In [44]:
df.head()

Unnamed: 0,id,category,gender,baseColour,season,usage,noisyTextDescription
0,36274,Scarves,Women,Grey,Summer,Casual,Femella Women Ankle-Length Grey AQ-S800WD-1EVD...
1,15129,Flip Flops,Unisex,Green,Summer,Casual,Converse Unisex Casual Skirts Slipper
2,58976,Topwear,Women,Red,Summer,Ethnic,Velia Women Acetone Kurta
3,32922,Sandal,Men,Brown,Summer,Casual,Enroute Men Leather Brown Sandals
4,29561,Topwear,Women,Pink,Fall,Ethnic,Aneri Exclusive Anu Pink Inspirartion


In [45]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aetiu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

#### Text Cleaning
Take the text from the noisy text description column, clean it and split it into words. We first convert everything to lower case, then remove punctuation, digits and stopwords.

In [46]:
text = df['noisyTextDescription']

In [47]:
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def process_text(text):
    """Clean a raw text description.

    Input:
        text: a string containing a text
    Output:
        text_clean: a single string holding the remaining words, lower-cased
            and separated by one space (not a list)

    """
    # turn it to lower case
    text = text.lower()

    # replace apostrophes with a space (rather than deleting them together
    # with the other punctuation below) so the two halves stay separate words
    text = text.replace("'", ' ')

    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # remove digits (raw string: '\d' is an invalid escape in a plain literal)
    text = re.sub(r'\d', '', text)

    stopwords_english = _english_stopwords()

    # split on single spaces; drop stopwords and the empty tokens produced by
    # consecutive spaces, so the joined result never contains double spaces
    kept_words = [word for word in text.split(' ')
                  if word and word not in stopwords_english]

    # strip any other leading/trailing whitespace left in the edge tokens
    return ' '.join(kept_words).strip()

In [48]:
df['clean_text'] = df['noisyTextDescription'].apply(lambda x: process_text(str(x)))

In [49]:
df['clean_text'].head()

0    femella women anklelength grey aqswdevdfad
1         converse unisex casual skirts slipper
2                     velia women acetone kurta
3             enroute men leather brown sandals
4         aneri exclusive anu pink inspirartion
Name: clean_text, dtype: object

Encode labels as one hot encoded vectors (as we're using a neural network)

In [50]:
# Target labels: the product category of every training row.
y = df["category"]

# One-hot encode the labels; the fitted encoder stays available as `enc`.
enc = OneHotEncoder(handle_unknown='ignore')
y_oneHot = enc.fit_transform(np.array(y).reshape(-1, 1)).toarray()


In [51]:
y_oneHot.shape[1]

27

In [52]:
tf.test.gpu_device_name()

'/device:GPU:0'

In [53]:
# import nltk
# nltk.download('punkt')

### Turn text into TF IDF Vectors

In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer

## TF-IDF vectorization: top 1000 terms only, to save on resources

In [55]:
# df['clean_text']
def createTFIDF(arrayClean, max_features=1000):
    """Fit a TF-IDF vectorizer on the cleaned text and return dense features.

    Input:
        arrayClean: iterable of cleaned text strings, one per document
        max_features: maximum vocabulary size passed to the vectorizer
            (defaults to 1000 to save on resources)
    Output:
        (features, vectorizer): a dense 2-D array with one row per document,
        and the fitted TfidfVectorizer so the same vocabulary and weights can
        be applied to other data with ``vectorizer.transform``.
    """
    tf_idf_vec = TfidfVectorizer(use_idf=True,
                                 smooth_idf=False,
                                 max_features=max_features,
                                 ngram_range=(1, 1),  # unigrams only; (2, 2) would use only bigrams
                                 stop_words='english')
    # Fit and transform in one pass. The dense array is returned directly:
    # the previous DataFrame round-trip only existed to call `.values` and
    # relied on `get_feature_names()`, which newer scikit-learn releases removed.
    tf_idf_data = tf_idf_vec.fit_transform(arrayClean)
    return tf_idf_data.toarray(), tf_idf_vec

In [56]:
tf_idf_dataframe, fittedTFIDFVectorizer  = createTFIDF(df['clean_text'])

#tf_idf_data =  fittedTFIDFVectorizer.transform(arrayClean)

In [57]:
tf_idf_dataframe.shape

(21627, 1000)

In [58]:
def get_model_name(k):
    """Return the model file name for identifier ``k``, e.g. ``model_1.h5``."""
    return f"model_{k!s}.h5"

In [61]:
# Refit the label encoder on the category column. fit() returns the encoder
# itself, so `yenc` and `enc` refer to the same fitted object.
yenc = enc.fit(np.array(df['category']).reshape(-1, 1))
# Dense one-hot label matrix, one row per training example.
y_onehot = yenc.transform(np.array(df['category']).reshape(-1, 1)).toarray()

In [62]:
y_onehot.shape

(21627, 27)

In [63]:
# Each categorical column gets its own encoder. `enc` was fitted on the
# category labels, so reusing it here (with handle_unknown='ignore') would not
# encode these columns: values it has never seen come out as all-zero rows.
gender_oneHot = OneHotEncoder(handle_unknown='ignore').fit_transform(
    np.array(df["gender"]).reshape(-1, 1)).toarray()
baseCol_oneHot = OneHotEncoder(handle_unknown='ignore').fit_transform(
    np.array(df["baseColour"]).reshape(-1, 1)).toarray()
season_oneHot = OneHotEncoder(handle_unknown='ignore').fit_transform(
    np.array(df["season"]).reshape(-1, 1)).toarray()
usage_oneHot = OneHotEncoder(handle_unknown='ignore').fit_transform(
    np.array(df["usage"]).reshape(-1, 1)).toarray()


In [64]:
y = df['category']

## VGG and image processing

In [65]:
import glob
from keras.applications import VGG16

Import images and associate with observation (via the id column)

In [66]:
# Resolve the image file of every training row from its id (first glob match).
train_images_paths = [
    glob.glob('.\\suffled-images/*/{}.jpg'.format(ident))[0]
    for ident in df['id']
]

In [67]:
train_images_paths[1:3]

['.\\suffled-images\\shuffled-images\\15129.jpg',
 '.\\suffled-images\\shuffled-images\\58976.jpg']

In [68]:
import os

In [69]:
def readImage(file):
    """Read the file at path ``file`` and decode it as a single-channel JPEG tensor."""
    raw_bytes = tf.io.read_file(file)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=1)
    return decoded


In [70]:
def GetImageDS(df, target_size=(60, 60)):
    """Load the image belonging to every row of ``df`` into one array.

    Input:
        df: table with an 'id' column; the image of each row is the first
            file matching '<id>.jpg' one directory level below the
            suffled-images folder
        target_size: size every image is resized to on load; defaults to
            (60, 60) (224 x 224 was also tried but is much heavier)
    Output:
        arrayImageX: array stacking the loaded RGB images, in the same order
            as df['id']; an empty array when df has no rows

    Raises IndexError when an id has no matching image file.
    """
    # Resolve every path first so a missing image fails before any loading starts.
    image_paths = [
        glob.glob('.\\suffled-images/*/{}.jpg'.format(ident))[0]
        for ident in df['id']
    ]

    # The list is built unconditionally, so an empty id column yields an
    # empty array instead of an unbound-variable error.
    data = [
        np.array(tf.keras.preprocessing.image.load_img(
            path, color_mode='rgb', target_size=target_size))
        for path in image_paths
    ]

    arrayImageX = np.array(data)
    return arrayImageX

In [71]:
imageX = GetImageDS(df)
y_image = df['category']

In [72]:
y_image

0           Scarves
1        Flip Flops
2           Topwear
3            Sandal
4           Topwear
            ...    
21622     Innerwear
21623         Belts
21624         Shoes
21625       Watches
21626       Topwear
Name: category, Length: 21627, dtype: object

In [73]:
tf_idf_dataframe.shape

(21627, 1000)

# FINAL MODEL

The final model combines two neural networks. The image branch is a VGG16 initialised from ImageNet weights. The text branch is a simple dense network, 1024 wide and 1 layer deep, that processes the TF-IDF features together with the one-hot encoded categorical columns (gender, base colour, season and usage). The outputs of the VGG and text branches are then concatenated and passed through another dense network, 256 wide and 1 layer deep, after which a softmax layer predicts the final categories (as a vector).

In [121]:
# Cross-validation splitter, label encoder and fold counter for the training loop.
kfold = KFold(n_splits=5, random_state=6, shuffle=True)
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(np.array(y_image).reshape(-1, 1))
fold_var = 1

# One encoder per categorical column; each fitted encoder is kept under its own
# name so the same encoding can be applied to other data later.
genderenc = OneHotEncoder(handle_unknown='ignore')
gender_oneHot = genderenc.fit_transform(np.array(df["gender"]).reshape(-1, 1)).toarray()

baseColourenc = OneHotEncoder(handle_unknown='ignore')
baseCol_oneHot = baseColourenc.fit_transform(np.array(df["baseColour"]).reshape(-1, 1)).toarray()

seasonenc = OneHotEncoder(handle_unknown='ignore')
season_oneHot = seasonenc.fit_transform(np.array(df["season"]).reshape(-1, 1)).toarray()

usageenc = OneHotEncoder(handle_unknown='ignore')
usage_oneHot = usageenc.fit_transform(np.array(df["usage"]).reshape(-1, 1)).toarray()

# Input of the text branch: TF-IDF features followed by the one-hot columns.
tfidfX = np.hstack((np.array(tf_idf_dataframe), gender_oneHot, baseCol_oneHot,
                    season_oneHot, usage_oneHot))



def create_new_modelVGGX_TFIDF(num_features=None, num_classes=27, image_shape=(60, 60, 3)):
    """Build the combined image + text classification model.

    Input:
        num_features: width of the text/categorical feature vector; when None
            it is taken from the module-level ``tfidfX`` matrix
        num_classes: number of output categories (27 by default)
        image_shape: shape of one input image (60 x 60 RGB by default)
    Output:
        an uncompiled Keras model taking ``[images, text_features]`` as input
        and returning a softmax vector over the categories
    """
    if num_features is None:
        num_features = tfidfX.shape[1]

    # Image branch: VGG16 convolutional base initialised from ImageNet weights.
    vgg_model = VGG16(weights='imagenet', include_top=False, input_shape=image_shape)
    # No block is frozen: every VGG layer is left trainable and fine-tuned.
    for layer in vgg_model.layers:
        layer.trainable = True
    image_features = tf.keras.layers.Flatten()(vgg_model.output)

    # Text branch: a single 1024-wide dense layer over the TF-IDF and one-hot features.
    text_input = tf.keras.Input(shape=(num_features,))
    text_features = tf.keras.layers.Dense(1024, activation="relu")(text_input)

    combined = keras.layers.concatenate([image_features, text_features])

    # Both branches are already flat here; this Flatten is kept only so the
    # layer stack stays the same as in previously trained models.
    z = tf.keras.layers.Flatten()(combined)
    z = tf.keras.layers.Dense(256, activation='relu')(z)
    z = tf.keras.layers.Dense(num_classes, activation='softmax')(z)  # Softmax for multiclass
    transfer_model = tf.keras.Model(inputs=[vgg_model.input, text_input], outputs=z)

    return transfer_model

# K-fold cross-validation: train a fresh model on every fold and checkpoint the
# epoch with the best validation accuracy to VGGModelX3_model_<fold>.h5.
# KFold only needs the number of samples, so a single array is passed to split().
for train, test in kfold.split(imageX):
    trainImageX, valImageX = imageX[train], imageX[test]
    traintfidfX, valtfidfX = tfidfX[train], tfidfX[test]
    # split() yields row positions, so index the label Series positionally
    # rather than by index label.
    trainY, valY = y_image.iloc[train], y_image.iloc[test]
    trainY_oneHot = enc.transform(np.array(trainY).reshape(-1, 1)).toarray()
    valY_oneHot = enc.transform(np.array(valY).reshape(-1, 1)).toarray()

    model2 = create_new_modelVGGX_TFIDF()
    # `learning_rate` replaces the deprecated `lr` keyword.
    model2.compile(loss='categorical_crossentropy',
                   optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
                   metrics=['accuracy'])

    # Stop once validation accuracy has not improved for 6 epochs; the
    # checkpoint callback keeps only the best epoch of the fold on disk.
    earlystop = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=6)
    checkpoint = tf.keras.callbacks.ModelCheckpoint("VGGModelX3_"+get_model_name(fold_var),
                                                    monitor='val_accuracy', verbose=1,
                                                    save_best_only=True, mode='max')
    callbacks_list = [checkpoint, earlystop]

    history = model2.fit(x=[trainImageX, traintfidfX], y=trainY_oneHot,
                         epochs=50, batch_size=25,
                         callbacks=callbacks_list,
                         validation_data=([valImageX, valtfidfX], valY_oneHot))

    fold_var += 1

Epoch 1/50
Epoch 00001: val_accuracy improved from -inf to 0.89413, saving model to VGGModelX3_model_1.h5
Epoch 2/50
Epoch 00002: val_accuracy improved from 0.89413 to 0.93504, saving model to VGGModelX3_model_1.h5
Epoch 3/50
Epoch 00003: val_accuracy improved from 0.93504 to 0.94475, saving model to VGGModelX3_model_1.h5
Epoch 4/50
Epoch 00004: val_accuracy did not improve from 0.94475
Epoch 5/50
Epoch 00005: val_accuracy improved from 0.94475 to 0.95446, saving model to VGGModelX3_model_1.h5
Epoch 6/50
Epoch 00006: val_accuracy improved from 0.95446 to 0.95885, saving model to VGGModelX3_model_1.h5
Epoch 7/50
Epoch 00007: val_accuracy did not improve from 0.95885
Epoch 8/50
Epoch 00008: val_accuracy improved from 0.95885 to 0.96301, saving model to VGGModelX3_model_1.h5
Epoch 9/50
Epoch 00009: val_accuracy improved from 0.96301 to 0.96486, saving model to VGGModelX3_model_1.h5
Epoch 10/50
Epoch 00010: val_accuracy improved from 0.96486 to 0.96556, saving model to VGGModelX3_model_1.h

Review the K folds, looking for the best validation accuracy; in this case it is VGGModelX3_model_4.h5.


In [122]:
fittedTFIDFVectorizer

TfidfVectorizer(max_features=1000, smooth_idf=False, stop_words='english')

###### Import test.csv and apply the same data processing, so that the best model can be used to produce our prediction CSV

In [74]:
dfTest = pd.read_csv ('test.csv')
# dfTest = pd.read_csv ('train.csv')
dfTest.head()

Unnamed: 0,id,gender,baseColour,season,usage,noisyTextDescription
0,26266,Men,Black,Summer,Casual,Chromozome Men Black Fashion Vest
1,22134,Women,Green,Summer,Casual,Elle Women Green Color Clash Top
2,28358,Women,Black,Winter,Casual,Baggit Women Chotu Mayur Black Palms
3,15554,Men,Black,Fall,Casual,Greensboro Colors Of Ap Men Latnam Black Casua...
4,53408,Women,Purple,Summer,Casual,Fish White Smooth daddy (SH100)


In [75]:
#Test data transformation

#clean text
dfTest['clean'] = dfTest['noisyTextDescription'].apply(lambda x: process_text(str(x)))

# Apply the vectorizer fitted on the training text (transform only, no refit)
# so the test features use the same vocabulary and weights. The dense array is
# used directly; the previous DataFrame wrapper needed get_feature_names(),
# which newer scikit-learn releases removed.
tf_idf_Testdata = fittedTFIDFVectorizer.transform(dfTest['clean']).toarray()
print (tf_idf_Testdata.shape)

# Encode the categorical columns with the encoders fitted on the training data.
gender_oneHotTest = genderenc.transform(np.array(dfTest["gender"]).reshape(-1, 1)).toarray()
baseCol_oneHotTest = baseColourenc.transform(np.array(dfTest["baseColour"]).reshape(-1, 1)).toarray()
season_oneHotTest = seasonenc.transform(np.array(dfTest["season"]).reshape(-1, 1)).toarray()
usage_oneHotTest = usageenc.transform(np.array(dfTest["usage"]).reshape(-1, 1)).toarray()

# Same column order as the training matrix: TF-IDF, gender, colour, season, usage.
tfidfX_TEST = np.hstack((tf_idf_Testdata, gender_oneHotTest, baseCol_oneHotTest,
                         season_oneHotTest, usage_oneHotTest))

# imageX for test data

imageX_test = GetImageDS(dfTest)

(21628, 1000)


In [76]:
tf_idf_Testdata.shape

(21628, 1000)

In [77]:
# Load the checkpoint of the fold that reached the best validation accuracy.
saved2_model = load_model('VGGModelX3_model_4.h5')

# Recompile with the loss the model was trained with. The output is a softmax
# over the categories, so binary cross-entropy does not match it and could give
# misleading loss/accuracy figures if the model were evaluated. Predictions are
# not affected by the compile settings.
saved2_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
result = saved2_model.predict([imageX_test, tfidfX_TEST])

In [78]:
result.shape

(21628, 27)

In [79]:
# Map each row's highest-probability column straight back to its category
# label. The one-hot columns follow the order of enc.categories_[0], so no
# hard-coded class count or one-hot round-trip is needed.
predict_array = enc.categories_[0][np.argmax(result, axis=1)].reshape(-1, 1)

# Build the submission frame column by column so the ids keep their numeric dtype.
saveFrame = pd.DataFrame({'id': dfTest["id"].to_numpy(),
                          'category': predict_array[:, 0]})
saveFrame.head()

Unnamed: 0,id,category
0,26266,Innerwear
1,22134,Topwear
2,28358,Belts
3,15554,Shoes
4,53408,Innerwear


Export the predictions as a CSV file.

In [133]:
saveFrame.to_csv('submission7_03_12.csv', header=True, index=False) 