In [15]:
import numpy as np
import pandas as pd
import tensorflow as tf
import keras
import PIL
import io
from keras.applications import resnet50
import h5py
import matplotlib.image as mat_img
from PIL import Image
import urllib.request
from keras.preprocessing.image import load_img, img_to_array
from sklearn.feature_extraction.text import TfidfVectorizer

The main idea of this challenge is to combine image features and text features to train a fully connected neural network.

To generate image features from the image URLs, I download the images, convert them into NumPy arrays (matrix-like), and pass them through ResNet50 without its fully connected part. I use ResNet50 because it offers competitive accuracy with comparatively few operations.

For the titles, I apply TF-IDF vectorization, which produces text features based on term frequencies. With the image and text features prepared, I use Keras to build a neural network with one hidden layer for the text features and two hidden layers for the image features, with dropout regularization.

The model reaches 99.9% accuracy on the training set. After training the model, I split the unclassified dataframe into eleven smaller dataframes because image processing is very time-consuming.

In [16]:
# Load the labelled training data. Columns read later in this notebook:
# 'ImageUrl', 'Title', 'BrandName' and 'CategoryName'.
train_df=pd.read_excel('Training_Data_Assessment.xlsx')

In [17]:
#Save training images
#def save_images(url):
#    import re
#    with urllib.request.urlopen(url) as URL:
#        with open('temp.jpg','wb') as f:
#            f.write(URL.read())
#        img=Image.open('temp.jpg').convert('RGB')
#        img=img.resize((224,224))
#        img.save('train_images/'+url.split('/')[-1])
#    return
#train_df['ImageUrl'].map(save_images)

In [18]:
def url_to_name(url):
    """Return the last path segment of ``url``, i.e. everything after the final '/'.

    A string without any '/' is returned unchanged; a string ending in '/'
    yields the empty string.
    """
    _head, _sep, tail = url.rpartition('/')
    return tail

In [19]:
# Derive the local image file name of every training row from its image URL.
train_df['Images']=train_df['ImageUrl'].map(url_to_name)

In [None]:
#Save images to classify
def save_images_to_classify(url):
    """Download the image at ``url`` and store it as a 224x224 RGB file.

    The file is written to ``data/images_to_classify/`` under the name
    returned by ``url_to_name(url)``, so it matches the 'Images' column.

    Parameters
    ----------
    url : str
        Address of the image to download.

    Returns
    -------
    None
        Only the saved file is produced, so the function can be used with
        ``Series.map``.
    """
    # Read the whole payload first so the connection is released before decoding.
    with urllib.request.urlopen(url) as response:
        payload=response.read()
    # Decode straight from memory instead of going through a shared 'temp.jpg';
    # the context manager closes the decoder's handle once the pixels are copied.
    with Image.open(io.BytesIO(payload)) as downloaded:
        img=downloaded.convert('RGB').resize((224,224))
    img.save('data/images_to_classify/'+url_to_name(url))
    return

In [20]:
# Load the products to classify and add an 'Images' column holding each
# row's image file name (the last segment of its URL).
to_classify_df=(
    pd.read_excel('Data To Classify_Assessment.xlsx')
    .assign(Images=lambda frame: frame['ImageUrl'].map(url_to_name))
)
to_classify_df.head(10)

Unnamed: 0,ASIN,BrandName,Title,ImageUrl,Images
0,B005DIRI6I,Portta,Portta Digital Coaxial Toslink to Analog (L/R)...,http://ecx.images-amazon.com/images/I/01KGAAk9...,01KGAAk9oOL.jpg
1,B000OYR9S8,Savage,Savage SV-107X12-56 Seamless Background Paper ...,http://ecx.images-amazon.com/images/I/01OWR5or...,01OWR5or7hL.jpg
2,B00WT9UV3Q,Avtech,AVTech AVS228 8CH HD-SDI DVR,http://ecx.images-amazon.com/images/I/11%2BWuk...,11%2BWukuvN5L.jpg
3,B010F69FRC,Dahua,Dahua NVR4416-P / EX-NVRDR-P Dual Core CPU - 1...,http://ecx.images-amazon.com/images/I/111McGzd...,111McGzdcWL.jpg
4,B00U4S0FE4,HP,HP KVM Console G3 Switch 0x1x8 - 8 Ports - USB...,http://ecx.images-amazon.com/images/I/112CVCFg...,112CVCFgDnL.jpg
5,B000JCXDQC,Techcraft,TechCraft SWP60 60-Inch Wide Flat Panel TV Cre...,http://ecx.images-amazon.com/images/I/113gKN1r...,113gKN1rFlL.jpg
6,B00YHI9ZD4,Generic,Logitech PTZ Pro Camera - T - 960-001021 by Ge...,http://ecx.images-amazon.com/images/I/113L4%2B...,113L4%2BWrNZL.jpg
7,B001795LDY,Sanus,"Sanus LT25-B1 Large Mount with Tilt for 37"" To...",http://ecx.images-amazon.com/images/I/113nNf7j...,113nNf7jYpL.jpg
8,B00LQEVYS6,NEC,NEC 6700 Lumens 1280 x 800 WXGA 6000:1 Advance...,http://ecx.images-amazon.com/images/I/114ahyEn...,114ahyEnMrL.jpg
9,B00MOWF5F2,Smart Security Club,Dahua NVR5416-16P 16ch 1.5U 16 PoE NVR 5 Megap...,http://ecx.images-amazon.com/images/I/1153odH6...,1153odH6UvL.jpg


In [None]:
#to_classify_df['ImageUrl'].map(save_images_to_classify)

In [21]:
# Load the list of target categories. The sheet is read without a header row,
# so its single column is named explicitly.
categories=pd.read_excel('Categories_Assessment.xlsx',header=None)
categories.columns=['Category']
categories.head(5)

Unnamed: 0,Category
0,Headphones
1,Cables
2,Security & Surveillance
3,Streaming Media
4,Television Accessories


In [None]:
#load ResNet50
# ImageNet-pretrained ResNet50 without its classification head, for 224x224 RGB input.
# NOTE(review): the name `model` is re-bound to the combined image+text classifier in a
# later cell, so code that runs after that point no longer reaches this network via `model`.
model=resnet50.ResNet50(include_top=False,weights='imagenet',input_shape=(224,224,3))

In [None]:
#def resnet_feature(img_name):
#    img=load_img('data/train_images//'+img_name)
#    img=img_to_array(img)
#    img=np.expand_dims(img,axis=0)
#    resnetFeature=model.predict(img)
#    return resnetFeature

In [None]:
#train_df['res_feature']=train_df['Images'].map(resnet_feature)

In [None]:
#res_features=np.array(train_df['res_feature'].tolist())
#res_features.shape

In [None]:
# Load the cached image features and flatten every sample's feature array
# into a 1-D vector, giving one row per image.
resnet_feature=np.array([np.ravel(sample) for sample in np.load('res_features.npy')])
resnet_feature.shape

In [22]:
# Generate text features
# Raw text columns of the training set as plain Python lists.
text_list=train_df['Title'].tolist()
brand=train_df['BrandName'].tolist()
# TF-IDF over unigrams and bigrams with English stop words removed; terms found in
# fewer than 5 documents or in more than 95% of documents are dropped.
vectorizer=TfidfVectorizer(
    ngram_range=(1,2),
    stop_words='english',
    sublinear_tf=False,
    use_idf=True,
    max_df=0.95,
    min_df=5,
)

In [23]:
#to_classify_df['ImageUrl'].map(save_images_to_classify)
#to_classify_df['res_features']=to_classify_df['Images'].map(resnet_feature_to_classify)

In [24]:
# Fit the TF-IDF vocabulary on the training titles and encode them in one step.
# The fitted `vectorizer` is reused later to encode the titles that need classifying.
tfidf=vectorizer.fit_transform(text_list)

In [None]:
from keras.models import Model
from keras.layers import Input, Dense
from keras.layers import Dropout, concatenate

In [None]:
# Build model to train features of image and text
# One input tensor per modality; each takes its per-sample shape from the matching
# training matrix (every dimension after the leading sample axis).
text_features=Input(shape=tfidf.shape[1:],dtype='float32')
img_features=Input(shape=resnet_feature.shape[1:],dtype='float32')

In [None]:
# Text branch: one 256-unit ReLU hidden layer followed by dropout at rate 0.5.
x_text=Dense(256,activation='relu')(text_features)
x_text=Dropout(0.5)(x_text)

In [None]:
# Image branch: two 256-unit ReLU hidden layers, each followed by dropout at rate 0.5.
x_img=Dense(256,activation='relu')(img_features)
x_img=Dropout(0.5)(x_img)
x_img=Dense(256,activation='relu')(x_img)
x_img=Dropout(0.5)(x_img)

In [None]:
# Join the image and text branches and classify with a softmax output layer.
# NOTE(review): 63 is hard-coded - presumably the number of categories; it has to
# equal the width of the one-hot target matrix (y_train) - confirm.
img_text=concatenate([x_img,x_text])
predictions=Dense(63,activation='softmax')(img_text)

In [None]:
# Assemble the two-input classifier (image features first, then text features)
# and compile it with Adam and categorical cross-entropy, tracking accuracy.
# NOTE(review): this re-binds `model`, which previously held the ResNet50 network
# loaded earlier in the notebook.
model=Model(inputs=[img_features,text_features],outputs=[predictions])
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])

In [35]:
# Create one hot vectors for categories
from sklearn.preprocessing import MultiLabelBinarizer

# Each 'CategoryName' string is split on ',' before binarising, so a name that
# contains commas contributes several labels to its row.
mlb=MultiLabelBinarizer()
y_train=mlb.fit_transform(
    [label.split(',') for label in train_df['CategoryName'].tolist()]
)

In [None]:
# Build a generator that feeds training batches, using a helper from the
# amazon_products project on GitHub.
# NOTE(review): no batch size is passed here, while the steps computation in the
# following cell assumes 32 - confirm that this matches the helper's default.
from amazon_products import text_generators
generator=text_generators.sparse_batch_generator(resnet_feature,tfidf,y_train,shuffle=True)

In [None]:
batch_size=32
# Steps per epoch = number of batches needed to cover every training row once
# (integer ceiling division).
steps=(tfidf.shape[0]+batch_size-1)//batch_size
model.fit_generator(generator,steps_per_epoch=steps,epochs=50)

In [None]:
# Persist the trained classifier; a later cell reloads this file with load_model.
model.save('stackline_img_text_model.hdf5')

In [None]:
#Convert images in data/images_to_classify into ResNet feature arrays
_RESNET_EXTRACTOR=None  # ResNet50 feature extractor, built on first use and then reused

def resnet_feature_to_classify(img_name,extractor=None):
    """Return the ResNet50 features of one image from ``data/images_to_classify/``.

    Parameters
    ----------
    img_name : str
        File name of the image inside ``data/images_to_classify/``.
    extractor : optional
        Object whose ``predict`` method maps an image batch to features. When
        omitted, a headless ImageNet ResNet50 (224x224x3 input) is built once
        and cached in ``_RESNET_EXTRACTOR``.

    Returns
    -------
    Whatever ``extractor.predict`` returns for a batch holding this one image.

    Notes
    -----
    The function deliberately does not read the global ``model``: that name is
    re-bound to the two-input image+text classifier before this cell runs, so
    relying on it would send the image to the wrong network.
    No ``preprocess_input`` step is applied, mirroring the commented-out
    extractor for the training images earlier in the notebook.
    """
    global _RESNET_EXTRACTOR
    if extractor is None:
        if _RESNET_EXTRACTOR is None:
            _RESNET_EXTRACTOR=resnet50.ResNet50(include_top=False,weights='imagenet',input_shape=(224,224,3))
        extractor=_RESNET_EXTRACTOR
    img=load_img('data/images_to_classify/'+img_name)
    # Add a leading batch axis: predict() expects a batch, not a single image.
    batch=np.expand_dims(img_to_array(img),axis=0)
    resnetFeature=extractor.predict(batch)
    return resnetFeature


In [10]:
# Split to_classify_df into ten consecutive 5000-row chunks (rows 0-49999)
# plus an eleventh chunk holding everything from row 50000 onwards.
(
    to_classify1,to_classify2,to_classify3,to_classify4,to_classify5,to_classify6,
    to_classify7,to_classify8,to_classify9,to_classify10,to_classify11,
)=(
    [to_classify_df[start:start+5000] for start in range(0,50000,5000)]
    +[to_classify_df[50000:]]
)

In [None]:
#Generate ResNet features with the ResNet50 model. I manually do this for every part and save the results to .npy files. It is very time-consuming.
#Repeat this part for every split dataframe
#to_classify3['res_features']=to_classify3['Images'].map(resnet_feature_to_classify)
#resnet_classify3=np.array(to_classify3['res_features'].tolist())
#np.save('resnet_classify3.npy',resnet_classify3)

In [25]:
# Reload the trained image+text classifier from disk under its own name (`mymodel`).
from keras.models import load_model
mymodel=load_model('stackline_img_text_model.hdf5')

In [93]:
# Use the trained model to predict categories for the unclassified data
# (shown here for chunk 7).
# NOTE: this re-binds `text_list`, which previously held the training titles.
text_list=to_classify7['Title'].tolist()

In [94]:
# Encode the chunk's titles with the already-fitted vectorizer (transform only, no re-fit).
# NOTE: this re-binds `tfidf`, which previously held the training matrix.
tfidf=vectorizer.transform(text_list)

In [95]:
# Load the cached image features for chunk 7 and flatten every sample's
# feature array into a 1-D vector, giving one row per image.
resnet_classify7=np.array([np.ravel(sample) for sample in np.load('resnet_classify7.npy')])

In [29]:
# Function to give a generator for prediction
import numpy as np
import scipy.sparse as sparse
def generator_for_predict(image_features,text_features,batch_size=32):
    """Endlessly yield ``[image_batch, text_batch]`` pairs in row order.

    Rows are served in consecutive slices of ``batch_size``; after the last
    (possibly shorter) batch the generator starts over from the first row, so
    a consumer that pulls more batches than one pass contains keeps receiving
    valid data instead of empty arrays.

    Parameters
    ----------
    image_features, text_features : 2-D numpy array or scipy sparse matrix
        Row-aligned feature matrices. Sparse input is converted to a dense
        array one batch at a time.
    batch_size : int, default 32
        Number of rows per batch.

    Yields
    ------
    list
        ``[image_batch, text_batch]``, both dense arrays.

    Raises
    ------
    ValueError
        If the two inputs have different numbers of rows. Because this is a
        generator, the check runs when the first batch is requested.
    """
    if image_features.shape[0] != text_features.shape[0]:
        raise ValueError('Features have different number of samples!')
    n_samples = image_features.shape[0]
    # Batches per pass; at least 1 so the wrap-around below is defined for empty input.
    n_batches = max(1, int(np.ceil(n_samples / batch_size)))

    counter = 0
    sample_index = np.arange(n_samples)
    while True:
        batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
        image_batch = image_features[batch_index, :]
        if sparse.issparse(image_batch):
            image_batch = image_batch.toarray()
        text_batch = text_features[batch_index, :]
        if sparse.issparse(text_batch):
            text_batch = text_batch.toarray()

        # Wrap back to the first batch once a full pass has been served.
        counter = (counter + 1) % n_batches
        yield [image_batch, text_batch]

In [96]:
# Batch generator over chunk 7: image features first, TF-IDF text features second.
# NOTE: this re-binds `generator`, which previously held the training generator.
generator=generator_for_predict(resnet_classify7,tfidf)

In [97]:
batch_size=32
# Number of batches needed to cover every row of the chunk once
# (integer ceiling division).
steps=(tfidf.shape[0]+batch_size-1)//batch_size
predict_to_classify7=mymodel.predict_generator(generator,steps=steps)

In [98]:
# Inspect the raw model output before it is converted to one-hot rows in the next cell.
predict_to_classify7

array([[  6.55707800e-06,   1.63365126e-04,   7.59741582e-04, ...,
          3.36577541e-05,   2.35780230e-04,   7.88879515e-06],
       [  1.20914424e-11,   4.83713555e-11,   3.63866576e-10, ...,
          7.58824915e-13,   3.51811608e-10,   6.04865984e-12],
       [  1.46517992e-10,   3.25827892e-10,   4.61791405e-09, ...,
          4.29598668e-09,   2.89855109e-08,   1.77839588e-09],
       ..., 
       [  1.26384905e-13,   2.71612094e-10,   1.13035102e-10, ...,
          1.43905218e-11,   3.16305271e-09,   1.82623236e-10],
       [  1.90975552e-05,   1.07671858e-05,   4.67324935e-05, ...,
          5.51407720e-06,   7.47852173e-05,   1.56427987e-07],
       [  3.67630819e-05,   3.49612804e-07,   3.66448404e-07, ...,
          2.05495301e-08,   5.16394721e-07,   1.36452227e-08]], dtype=float32)

In [99]:
# Convert prediction matrix into one-hot vectors based on largest values in arrays.
# argmax selects exactly one winner per row (the first one on ties), so every row
# ends up with a single 1; comparing against the row maximum would mark every
# tied entry. The array is overwritten in place, keeping its shape and dtype.
top_class=np.argmax(predict_to_classify7,axis=1)
predict_to_classify7[:]=0
predict_to_classify7[np.arange(len(top_class)),top_class]=1

In [100]:
# Map every one-hot row back to its category label(s) using the binarizer
# that was fitted on the training labels.
categories_prediction=mlb.inverse_transform(predict_to_classify7)

In [101]:
# Keep the first label of each row's result.
# NOTE(review): the same name is re-bound, so running this cell a second time
# would index into the label strings themselves.
categories_prediction=[list(i)[0] for i in categories_prediction]

In [102]:
# Wrap the predicted labels in a one-column frame named like the training label column.
cat_df=pd.DataFrame(categories_prediction,columns=['CategoryName'])
cat_df.head()

Unnamed: 0,CategoryName
0,Office Electronics
1,Calculators
2,Televisions
3,GPS & Navigation
4,Car Subwoofers & Amplifiers


In [103]:
# Merge the category predictions with the original data.
# Work on a copy of the chunk rather than on the slice of to_classify_df.
to_classify7=to_classify7.copy()
# Give the predictions the chunk's row labels so the column-wise concat lines up row by row.
cat_df=cat_df.set_index(to_classify7.index)
# NOTE(review): running this cell again would presumably append a second
# 'CategoryName' column - confirm before re-executing.
to_classify7=pd.concat([to_classify7,cat_df],axis=1)
to_classify7.head()

Unnamed: 0,ASIN,BrandName,Title,ImageUrl,Images,CategoryName
30000,B00MH2NKPG,Obihai,Obihai OBi1032 IP Phone with Power Supply - Up...,https://images-na.ssl-images-amazon.com/images...,413cCxpbRsL.jpg,Office Electronics
30001,B00PSTN09G,Unknown,Texas Instruments compatible USB Cable for TI ...,https://images-na.ssl-images-amazon.com/images...,413cjpT78iL.jpg,Calculators
30002,B013LHDV9A,Samsung,Samsung DB55E 55-Inch 1920x1080 Resolution Led TV,https://images-na.ssl-images-amazon.com/images...,413Ck0IlUrL.jpg,Televisions
30003,B072BVBBVB,Multi-Tech,Multi-tech MTCDP-EV2-GP-N16-1.0 CELLULAR GATEW...,https://images-na.ssl-images-amazon.com/images...,413cKz9HTAL.jpg,GPS & Navigation
30004,B00CLFM596,Pioneer,"Pioneer Gm-D9601 2,400-Watt Class D Mono Amp",https://images-na.ssl-images-amazon.com/images...,413CVQiVerL.jpg,Car Subwoofers & Amplifiers


In [27]:
# Save chunk 1 (rows 0-4999) to its own workbook; the sheet name records the row range.
to_classify1.to_excel('to_classify1.xlsx',sheet_name='0-4999')

In [39]:
# Save chunk 2 (rows 5000-9999) to its own workbook; the sheet name records the row range.
to_classify2.to_excel('to_classify2.xlsx',sheet_name='5000-9999')

In [51]:
# Save chunk 5 (rows 20000-24999) to its own workbook; the sheet name records the row range.
to_classify5.to_excel('to_classify5.xlsx',sheet_name='20000-24999')

In [63]:
# Save chunk 3 (rows 10000-14999) to its own workbook; the sheet name records the row range.
to_classify3.to_excel('to_classify3.xlsx',sheet_name='10000-14999')

In [75]:
# Save chunk 4 (rows 15000-19999) to its own workbook; the sheet name records the row range.
to_classify4.to_excel('to_classify4.xlsx',sheet_name='15000-19999')

In [40]:
# Save chunk 11 (rows 50000 onwards) to its own workbook; the sheet name records the row range.
to_classify11.to_excel('to_classify11.xlsx',sheet_name='50000-')

In [53]:
# Save chunk 10 (rows 45000-49999) to its own workbook; the sheet name records the row range.
to_classify10.to_excel('to_classify10.xlsx',sheet_name='45000-49999')

In [65]:
# Save chunk 9 (rows 40000-44999) to its own workbook; the sheet name records the row range.
to_classify9.to_excel('to_classify9.xlsx',sheet_name='40000-44999')

In [77]:
# Save chunk 6 (rows 25000-29999) to its own workbook; the sheet name records the row range.
to_classify6.to_excel('to_classify6.xlsx',sheet_name='25000-29999')

In [89]:
# Save chunk 8 to its own workbook. The chunk is to_classify_df[35000:40000],
# so the sheet name records rows 35000-39999.
to_classify8.to_excel('to_classify8.xlsx',sheet_name='35000-39999')

In [104]:
# Save chunk 7 (rows 30000-34999), now including the predicted 'CategoryName' column,
# to its own workbook; the sheet name records the row range.
to_classify7.to_excel('to_classify7.xlsx',sheet_name='30000-34999')

In [108]:
# Merge all prediction excels
# Read the eleven per-chunk workbooks back in, in chunk order
# (to_classify1.xlsx ... to_classify11.xlsx).
(
    df1,df2,df3,df4,df5,df6,
    df7,df8,df9,df10,df11,
)=[pd.read_excel('to_classify'+str(part)+'.xlsx') for part in range(1,12)]

In [112]:
# Stack the eleven parts in chunk order and write the combined result without the row index.
# NOTE(review): the part files were written with their index and read back without
# `index_col`, so that saved index presumably reappears as an extra column here - confirm.
df=pd.concat([df1,df2,df3,df4,df5,df6,df7,df8,df9,df10,df11])
df.to_excel('classified.xlsx',index=False)