![](https://miro.medium.com/max/1400/1*gYe2FMr9lKys2Wmo1v-s7A.jpeg)

# 1. Importing Libraries

In [None]:
import os
import cv2
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelBinarizer
from keras.applications.xception import Xception,preprocess_input
import tensorflow as tf
from keras.preprocessing import image
from keras.layers import Input
from keras.backend import reshape
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import random 
from tqdm import tqdm_notebook as tqdm
import numba 
from numba import cuda 

# 2. Loading and Preprocessing of Data

In [None]:
# Root directory of the article images; load_data() walks it recursively to
# draw the random gallery of images that queries are compared against.
images_dir = '../input/h-and-m-personalized-fashion-recommendations/images'

In [None]:
# Submission template: one row per customer, in the order the output must keep.
df1 = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')
# Transaction log. article_id is kept as text because main() builds image paths
# from the raw id string (presumably to preserve leading zeros -- confirm).
df2 = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv', converters={'article_id': str})

# Every purchased article id (repeats included); main() draws its random
# fallback predictions from this list.
article_id = df2['article_id'].tolist()
# Customer order of the submission template.
sorter = df1['customer_id'].tolist()

# Only one transaction per customer is needed (the first one listed in the
# log), so reduce the log *before* merging instead of merging every
# transaction and discarding almost all rows afterwards.
df2 = df2.drop_duplicates(subset=['customer_id'], keep='first')

# A left merge keeps every customer of the template, in template order;
# customers without any transaction get NaN as article_id.
df = pd.merge(df1, df2, on='customer_id', how="left")
dfs = df[['customer_id', 'article_id', 'prediction']]
dfs = dfs.drop_duplicates(subset=['customer_id'], keep='first').reset_index(drop=True)

# The original computed this comparison and threw the result away; make the
# sanity check effective so a mis-ordered frame cannot reach the submission.
if dfs['customer_id'].tolist() != sorter:
    raise ValueError('customer order of dfs does not match sample_submission.csv')

# Free the large intermediate frames.
del df, df1, df2

In [None]:
# Notebook cell: bare expression that displays the prepared dataframe
# (has no effect when the file is run as a plain script).
dfs

In [None]:
def getImagePaths(path):
    """
    Collect the full path of every file found beneath a directory.

    The directory is walked recursively with os.walk, so files in nested
    sub-directories are included as well.

    parameters: path(string) - Root directory to walk
    returns: list of strings - One joined "directory/filename" path per file
    """
    return [
        os.path.join(folder, file_name)
        for folder, _, file_names in os.walk(path)
        for file_name in file_names
    ]

def preprocess_img(img_path):
    """
    Load one image from disk and turn it into a single-image batch for Xception.

    parameters: img_path(string) - Path of the image file
    returns: array with a leading batch axis of size 1, resized to 256x256 and
             passed through the Xception preprocess_input function
    raises: OSError - if OpenCV cannot read the file (missing, unreadable or
            not a supported image)
    """
    dsize = (256, 256)
    new_image = cv2.imread(img_path)
    # cv2.imread signals failure by returning None instead of raising; fail
    # here with a clear message rather than with an opaque resize error.
    if new_image is None:
        raise OSError('Could not read image: {}'.format(img_path))
    # OpenCV decodes colour images in BGR order, while the ImageNet weights
    # used by the model expect RGB input.
    new_image = cv2.cvtColor(new_image, cv2.COLOR_BGR2RGB)
    new_image = cv2.resize(new_image, dsize, interpolation=cv2.INTER_NEAREST)
    # Add the batch axis that model.predict expects.
    new_image = np.expand_dims(new_image, axis=0)
    new_image = preprocess_input(new_image)
    return new_image

def load_data(n_samples=2000):
    """
    Draw a random gallery of images from images_dir.

    parameters: n_samples(int) - Number of images to draw (default 2000);
                capped at the number of images available so that a small
                directory does not make random.sample raise
    returns: (output, images_id) - the sampled image paths and, in the same
             order, the id of each image (file name without its extension)
    """
    all_paths = getImagePaths(images_dir)
    output = random.sample(all_paths, min(n_samples, len(all_paths)))
    # The id is the file name up to the first dot -- the same rule
    # show_result uses. (Previously only the first character was kept.)
    images_id = [s.split('/')[-1].split('.')[0] for s in output]
    return output, images_id

# 3. Defining the model and extracting features for all the training data

In [None]:
def model():
    """
    Build the frozen feature extractor.

    returns: Xception network created with ImageNet weights and without its
             top (classification) part, with every layer marked as
             non-trainable
    """
    backbone = Xception(weights='imagenet', include_top=False)
    # The network is only used for inference, so freeze each layer.
    for frozen_layer in backbone.layers:
        frozen_layer.trainable = False
    return backbone

def feature_extraction(image_data, model):
    """
    Run the model on the given input and return its output as a flat vector.

    parameters: image_data - Input handed unchanged to model.predict
                model - Object exposing a predict(image_data) method
    returns: 1-D numpy array holding every value of the prediction
    """
    raw_output = model.predict(image_data)
    return np.array(raw_output).flatten()

# 4. Finding the most similar image through cosine similarity

In [None]:
def result_vector_cosine(model, feature_vector, new_img, n_results=1):
    """
    Find the gallery entries closest to a query image by cosine distance.

    parameters: model - Object exposing predict(); used to embed new_img
                feature_vector - 2-D array-like, one flattened feature
                                 vector per gallery image
                new_img - Preprocessed query image handed to model.predict
                n_results(int) - Number of neighbours to return (default 1,
                                 the previously hard-coded value)
    returns: indices array from NearestNeighbors.kneighbors for the single
             query, i.e. row 0 holds positions into feature_vector ordered
             from nearest to farthest
    """
    new_feature = np.array(model.predict(new_img)).flatten()
    # kneighbors raises when asked for more neighbours than fitted samples,
    # so never request more than the gallery holds.
    n_neighbors = min(n_results, len(feature_vector))
    # NOTE: the index is rebuilt on every call; callers issuing many queries
    # against the same gallery pay that cost each time.
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, metric="cosine").fit(feature_vector)
    _, indices = nbrs.kneighbors([new_feature])
    return indices

# 5. Result
## Vector Cosine

In [None]:
# Only the first rows (up to index 16000) get an image-based prediction until I find a way to run the code with multiprocessing.
# The remaining predictions are chosen randomly.


def input_show(data):
    """
    Display the query image with matplotlib under the title "Query Image".

    parameters: data - Image data in any form plt.imshow accepts
    """
    # Both calls act on matplotlib's current axes.
    plt.title("Query Image")
    plt.imshow(data)

def show_result(data, result, n_results=1):
    """
    Translate neighbour indices into a space-separated string of image ids.

    parameters: data - Sequence of image paths that the indices refer to
                result - Indices as returned by result_vector_cosine; only
                         the first row (result[0]) is read
                n_results(int) - How many leading entries of result[0] to
                                 report (default 1, the previously
                                 hard-coded value)
    returns: string - The ids joined by single spaces; an id is the file
             name of the path up to its first dot
    raises: IndexError - if result[0] holds fewer than n_results entries
    """
    ids = []
    for position in range(n_results):
        neighbour_path = data[result[0][position]]
        ids.append(neighbour_path.split('/')[-1].split('.')[0])
    return ' '.join(ids)
        
def get_dataframe():
    """
    Return the module-level dataframe dfs.

    The object itself is returned, not a copy, so changes a caller makes to
    it are visible through the global name as well.
    """
    # Reading a module-level name needs no `global` declaration; the stray
    # module-level `global dfs` statement that followed this function had
    # no effect and was removed.
    return dfs

def main():
    """
    Build the prediction column and write the submission file.

    Steps:
      1. Sample a gallery of images (load_data) and extract one feature
         vector per gallery image with the frozen model.
      2. For the rows with index 0..16000, look up the image of the row's
         article_id and predict the id of its nearest gallery image. If that
         fails for any reason (for example the article has no image), fall
         back to 12 randomly drawn article ids.
      3. Every later row gets 12 randomly drawn article ids.
      4. Store the predictions in the dataframe and write the columns
         customer_id and prediction to 'submission.csv'.

    Note: the 'prediction' column is assigned on the object returned by
    get_dataframe(), i.e. the shared module-level dataframe is modified.
    """
    dfs = get_dataframe()
    query_articles = dfs['article_id'].tolist()

    # images_id is returned by load_data but not needed here.
    output, _ = load_data()
    main_model = model()

    # One flattened feature vector per gallery image.
    feature_vec = np.array(
        [feature_extraction(preprocess_img(img_path), main_model) for img_path in output]
    )

    prediction = []
    for i, p in enumerate(tqdm(query_articles)):
        if i <= 16000:
            try:
                path = '../input/h-and-m-personalized-fashion-recommendations/images/' + str(p)[:3] + '/' + str(p) + '.jpg'
                result = result_vector_cosine(main_model, feature_vec, preprocess_img(path))
                prediction.append(show_result(output, result))
            except Exception:
                # Deliberate best-effort fallback. `Exception` instead of a
                # bare except keeps Ctrl-C / SystemExit working during the
                # long loop.
                prediction.append(' '.join(random.sample(article_id, 12)))
        else:
            prediction.append(' '.join(random.sample(article_id, 12)))

    dfs['prediction'] = prediction
    dfs = dfs[['customer_id', 'prediction']]
    # Written with the .csv extension (previously the file was named just
    # 'submission').
    dfs.to_csv('submission.csv', index=False)
    
           

# Run the pipeline when this file is executed directly (or as a notebook
# cell), but not when it is imported as a module.
if __name__=='__main__':
    main()