In [1]:
from keras.applications.vgg16 import VGG16
from keras.utils.vis_utils import plot_model
# prepare the image for the VGG model
from keras.preprocessing.image import img_to_array
from keras.preprocessing.image import load_img
from keras.layers import Input, Flatten, Dense
from keras.models import Model
from keras import optimizers
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
import os
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
img_rows, img_cols, img_channel = 224, 224, 3
productMatrix = pd.read_csv("./data/train/ProductMatrix")

totalClasses = len(np.unique(productMatrix["TargetCode"]))
train,test = train_test_split(productMatrix,test_size = .1)

In [25]:
#train.groupby("TargetCode").count().sort_values(by = "productid",ascending = False)
submissionDFUsers = pd.read_csv("./data/test.csv")
txnDF = pd.read_csv("./data/train/train.csv")
productAggData = txnDF.groupby(["productid","UserId"])["Quantity"].count().reset_index()
productAggData.columns = ["productid","UserId","PurchaseCount"]

In [9]:
batch_size = 32

In [None]:
tmp = []
for row in train[0:10].values:
    x = np.array(row[0])
    tmp.append(x)
np.asarray(tmp).shape

In [4]:
np.max(productMatrix["TargetCode"])

11

In [5]:
def imageLoader(data, batch_size):
    
    L = len(data)
    
    def loadImageAndTarget(df):
        imagesList = []
        targetList = []
        
        for row in df.values:
            imagePath = os.path.join("./data/train/images",str(row[0]) + ".jpg")
            targets = np.zeros(shape = (1,totalClasses),dtype = "int")
            targets[0,row[1]]= 1
            image = load_img(imagePath, target_size=(img_rows, img_cols))
            # convert the image pixels to a numpy array
            image = img_to_array(image)
            imagesList.append(image)
            targetList.append(targets)
        return imagesList,targetList
        
        
    #this line is just to make the generator infinite, keras needs that    
    while True:

        batch_start = 0
        batch_end = batch_size

        while batch_start < L:
            limit = min(batch_end, L)
            
            X,Y = loadImageAndTarget(data[batch_start:limit])
            
            X=np.asarray(X).reshape(-1,img_rows, img_cols,img_channel)
            Y = np.asarray(Y).reshape(-1,totalClasses)
            yield (X,Y) #a tuple with two numpy arrays with batch_size samples     

            batch_start += batch_size   
            batch_end += batch_size

In [11]:
#--------Define the model
model = VGG16(weights=None,include_top=False,input_shape=(img_rows, img_cols, img_channel),classes = totalClasses)
model.load_weights("./data/train/weights/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5",by_name=True)

for layer in model.layers:
    layer.trainable = False
    
keras_input = Input(shape=(img_rows,img_cols,img_channel),name = 'image_input')

#Use the generated model 
output_vgg16_conv = model(keras_input)

x = Flatten(name='flatten')(output_vgg16_conv)
x = Dense(1024, activation='relu', name='fc1')(x)
x = Dense(1024, activation='relu', name='fc2')(x)
x = Dense(totalClasses, activation='softmax', name='predictions')(x)

#Create your own model 
my_model = Model(input=keras_input , output=x)

optim = optimizers.SGD(lr=0.0001, decay=1e-6, momentum=0.9, nesterov=True)
#optim = optimizers.RMSprop(lr=0.0001, rho=0.9, epsilon=None, decay=1e-6)
#optim = optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=1e-6, amsgrad=False)
my_model.compile(loss='categorical_crossentropy', optimizer=optim, metrics=['accuracy'])

#In the summary, weights and layers from VGG part will be hidden, but they will be fit during the training
#my_model.summary()



In [12]:
trainGenerator = imageLoader(train,batch_size)
testGenerator = imageLoader(test,batch_size)
fullDataGenerator = imageLoader(productMatrix,batch_size)
my_model.fit_generator(trainGenerator
                    ,steps_per_epoch= train.shape[0] // batch_size
                    , epochs=10
                    , validation_data = testGenerator
                    , validation_steps = test.shape[0] // batch_size
                    , verbose = 1
                   )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x281e90d6780>

In [13]:
# With a second Model
intermediate_model = Model(inputs=my_model.layers[0].input, 
                              outputs=my_model.get_layer("fc2").output)
output = intermediate_model.predict_generator(fullDataGenerator
                                     , steps= (productMatrix.shape[0] // batch_size) + 1
                                    )

In [14]:
print(output.shape, productMatrix.shape)

(3015, 1024) (3015, 2)


In [21]:
from sklearn.metrics.pairwise import cosine_similarity
productSimilarityMatrix = cosine_similarity(output)

In [16]:
np.argsort(-1 * similarity[0,:])

array([   0, 1864,    1, ...,  906, 2842,  846], dtype=int64)

In [None]:
x = -1 * np.array([[1,0,2],[4,1,0]])
tmp = pd.DataFrame()
#np.argsort(x,axis = 1)
for index in range(x.shape[0]):
    tmp = pd.concat([tmp,pd.DataFrame({"Similarity":[','.join(str(a) for a in x[index,:])]})],axis = 0)
tmp

In [None]:
np.sort(x,axis = 1)

In [17]:
productMatrix.loc[[0,1864,1]]

Unnamed: 0,productid,TargetCode
0,11139192,6
1864,12657628,5
1,11139194,6


In [30]:
def recommendProductsForUserId(UserID, totalRecommendations):
    #UserID = 26784
    global productSimilarityMatrix,productMatrix,productAggData
    
    train = productMatrix

    UserSpecificProducts = set(productAggData[productAggData["UserId"] == UserID]["productid"])
    AllProdsNotPurchaseByThisUser = set(productAggData["productid"]) - UserSpecificProducts

    allProducts = productMatrix["productid"].values #productMatrix.index.values

    simDF = pd.DataFrame()
    for nonPurchasedItem in list(AllProdsNotPurchaseByThisUser):
        #pdb.set_trace()
        sim = 0
        nonPurchasedItemIndexInProductMatrix = np.where(allProducts == nonPurchasedItem)

        for purchasedItem in list(UserSpecificProducts):
            purchasedItemIndexInProductMatrix = np.where(allProducts == purchasedItem)
            sim += productSimilarityMatrix[purchasedItemIndexInProductMatrix,nonPurchasedItemIndexInProductMatrix]
        sim = sim / len(UserSpecificProducts)

        simDF = pd.concat([simDF,pd.DataFrame({"ProductId":[nonPurchasedItem],"Sim":[sim]})])
        simDF = simDF.sort_values(by = "Sim",ascending = False).head(totalRecommendations)

    if len(simDF) < totalRecommendations:
        tmp = (productAggData[productAggData["UserId"] == UserID]
               .sort_values(by = "PurchaseCount", ascending = False)
               .head(10 - len(simDF))
              )

        simDF = pd.concat([simDF,pd.DataFrame({"ProductId":tmp["productid"],"Sim":tmp["PurchaseCount"]})])
    return simDF

In [None]:
submission = pd.DataFrame()
for user in submissionDFUsers["UserId"].values:
    recommendations = "[" + ",".join(str(prod) for prod in recommendProductsForUserId(user,10)["ProductId"].values) + "]"
    submission = pd.concat([submission,pd.DataFrame({"UserId":[user],"product_list":[recommendations]})], axis = 0)

submission.to_csv("./Data/submission.csv",index= False)

In [31]:
productSimilarityMatrix

array([[1.0000001 , 0.8467357 , 0.54576415, ..., 0.58784515, 0.53183925,
        0.47712192],
       [0.8467357 , 1.0000002 , 0.5081514 , ..., 0.5530091 , 0.48624814,
        0.4480817 ],
       [0.54576415, 0.5081514 , 0.9999999 , ..., 0.87206924, 0.8481258 ,
        0.5320573 ],
       ...,
       [0.58784515, 0.5530091 , 0.87206924, ..., 1.0000001 , 0.8407767 ,
        0.5221985 ],
       [0.53183925, 0.48624814, 0.8481258 , ..., 0.8407767 , 1.        ,
        0.5316389 ],
       [0.47712192, 0.4480817 , 0.5320573 , ..., 0.5221985 , 0.5316389 ,
        1.        ]], dtype=float32)