In [None]:
import numpy as np
import pandas as pd
import warnings as w
from scipy import spatial
from tqdm.notebook import tqdm
import random, math, cv2, os, string, re, gc

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers as L
import tensorflow.keras as K
from sklearn.model_selection import train_test_split
import cudf, cuml, cupy
from cuml.neighbors import NearestNeighbors

import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
%matplotlib inline
# Use seaborn's white-grid theme for every plot drawn in this notebook.
sns.set(style="whitegrid")


# Silence all Python warnings for the rest of the session.
w.filterwarnings('ignore')

# Directories holding the competition images; image file names are appended directly to these
# prefixes, so the trailing slash is required.
TRAIN_BASE = '../input/shopee-product-matching/train_images/'
TEST_BASE = '../input/shopee-product-matching/test_images/'
# Seed passed to seed_everything() and used as random_state in train_test_split_data().
SEED = 101
# Side length (pixels) images are resized to and the input size the embedding network is built with.
IMG_SIZE = 250
# NOTE(review): EPOCHS is not referenced by any visible cell — presumably left over from training.
EPOCHS = 10

# Competition tables: the sample submission, the test postings and the training postings.
sample_sub = pd.read_csv("../input/shopee-product-matching/sample_submission.csv")
test = pd.read_csv("../input/shopee-product-matching/test.csv")
train = pd.read_csv("../input/shopee-product-matching/train.csv")

# Disabled experiment: enlarges the test frame by concatenating it with itself.
# test = pd.concat([test, test, test[:2000]], axis = 0)
# test.reset_index(drop = True, inplace = True)
# print("shape: ", test.shape)
# test.drop('label_group', axis = 1, inplace = True)
# Last expression of the cell: shows the first rows of the test frame.
test.head()

In [None]:
# GPU memory budget, in GB, granted to TensorFlow; the messages below attribute the remainder to RAPIDS.
CAP = 2.0
# List of physical GPUs visible to TensorFlow; an empty list is falsy, so the name doubles as a flag.
IS_GPU_AVAIL = tf.config.experimental.list_physical_devices('GPU')
if IS_GPU_AVAIL:
    try:
        # Limit TensorFlow on the first GPU to a single virtual device of 1024*CAP (memory_limit is in MB).
        tf.config.experimental.set_virtual_device_configuration(IS_GPU_AVAIL[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*CAP)])
        # NOTE(review): lgpu is not used by any visible cell.
        lgpu = tf.config.experimental.list_logical_devices('GPU')
        
    except RuntimeError as ex:
        # Presumably raised when the GPU was already initialised before this cell ran — TODO confirm.
        # The error is only printed, so the notebook continues without the cap.
        print(ex)
        
# NOTE(review): both messages print unconditionally (even with no GPU or after a failed configuration),
# and the 16 below is hard-coded — presumably the card's total RAM in GB; confirm for the target hardware.
print(f'Tensorflow GPU space {CAP}GB GPU RAM')
print(f'RAPIDS GPU space {16 - CAP}GB GPU RAM')

In [None]:
def get_image_predictions(df, embeddings,threshold = 3.4):
    """Collect, for every posting, the posting ids whose embedding lies within ``threshold``.

    A nearest-neighbour index is fitted on ``embeddings`` and queried with the same
    ``embeddings``; for each row, the neighbours whose distance is strictly below
    ``threshold`` are kept.

    Args:
        df: DataFrame with a ``posting_id`` column. Row ``i`` of ``df`` (by position)
            must correspond to row ``i`` of ``embeddings``.
        embeddings: 2-D array of shape ``(n_samples, n_features)``.
        threshold: Exclusive upper bound on the neighbour distance.

    Returns:
        A list with one entry per embedding row; each entry is an array of the
        matching ``posting_id`` values.
    """
    n_samples = embeddings.shape[0]
    if n_samples == 0:
        # Nothing to fit or query.
        return []

    # Look at up to 50 neighbours, but never request more neighbours than there are
    # fitted rows: a k-NN query with k > n_samples is rejected by the estimator.
    KNN = min(50, n_samples)

    model = NearestNeighbors(n_neighbors = KNN)
    model.fit(embeddings)
    distances, indices = model.kneighbors(embeddings)

    predictions = []
    for k in tqdm(range(n_samples)):
        # Positions (within the k-NN result) of neighbours that are close enough.
        idx = np.where(distances[k,] < threshold)[0]
        ids = indices[k,idx]
        # Neighbour indices are positional, hence iloc.
        posting_ids = df['posting_id'].iloc[ids].values
        predictions.append(posting_ids)

    # Release the index and the neighbour matrices before returning.
    del model, distances, indices
    gc.collect()
    return predictions

def seed_everything(seed):
    """Seed every random number source this notebook relies on.

    Exports ``PYTHONHASHSEED`` and seeds Python's ``random`` module, NumPy's
    global generator and TensorFlow's global generator with ``seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(seed)
    
def process_data(df):
    """Build lookup tables between ``label_group`` values and integer positions.

    Args:
        df: DataFrame with a ``label_group`` column.

    Returns:
        A 3-tuple ``(label_to_encoded, encoded_to_label, classes)`` where

        * ``label_to_encoded`` maps integer position -> ``label_group`` value,
        * ``encoded_to_label`` maps ``label_group`` value -> integer position,
        * ``classes`` is ``df.label_group.nunique()``.

        Positions follow the order of first appearance in ``df``. Note that the
        dictionary names read the opposite way round from their contents.
    """
    label_to_encoded = dict(enumerate(df.label_group.unique()))
    encoded_to_label = {
        group: position
        for position, group in enumerate(df.label_group.unique())
    }
    return label_to_encoded, encoded_to_label, df.label_group.nunique()

# Seed all random number generators with the notebook-wide SEED.
seed_everything(SEED)
# Build the label_group lookup dictionaries and the class count from the training frame.
label_to_encoded, encoded_to_label, NUM_CLASSES = process_data(train)

# For every training column, print how many distinct values it holds next to the frame's shape.
for col in train.columns:
    print(f"Number of unique {col} entries : ", train[col].nunique(), " against the dataset of size ", train.shape)
    
class DataGeneratorForTest(K.utils.Sequence):
    """Keras ``Sequence`` that yields batches of resized images for inference.

    Each batch is a float array of shape ``(batch_len, img_size, img_size, 3)``
    with pixel values divided by 255. No labels are produced.

    Args:
        df: DataFrame with an ``image`` column holding file names relative to ``filepath``.
        batchSize: Number of images per batch.
        filepath: Directory prefix prepended to each file name (must end with a separator).
        img_size: Side length, in pixels, every image is resized to.
    """
    def __init__(self, df, batchSize, filepath = TEST_BASE, 
                 img_size = IMG_SIZE):
        super().__init__()
        self.df = df
        self.indexes = np.arange(len(df))
        self.path = filepath
        self.batch = batchSize 
        # Honour the caller's size instead of silently falling back to the global IMG_SIZE.
        self.img_size = img_size
        
    def __len__(self):
        '''Total number of steps in a epoch'''
        # Round up so the final, possibly shorter, batch is served too; rounding down
        # would drop the trailing rows and leave the predictions shorter than df.
        return int(np.ceil(self.df.shape[0]/self.batch))
    
    def __getitem__(self, index):
        '''Generate One batch of files'''
        indexes = self.indexes[index*self.batch:(index+1)*self.batch]
        temp_df = self.df.iloc[indexes]
        # Sized from the slice, so a short last batch gets a correspondingly short array.
        X = np.zeros((len(indexes), self.img_size, self.img_size, 3))
        for idx, image_name in enumerate(temp_df['image']):
            image_path = self.path + image_name
            img = cv2.imread(image_path)
            if img is None:
                # cv2.imread signals a missing or undecodable file by returning None.
                raise OSError(f"Could not read image: {image_path}")
            # NOTE(review): cv2.imread yields BGR channel order and pixels are scaled to [0, 1];
            # confirm that this matches what the downstream network expects.
            X[idx,] = cv2.resize(img, (self.img_size, self.img_size)) / 255
        return X

def f1_score(y_true, y_pred):
    """Row-wise F1 between two Series of whitespace-separated id strings.

    Args:
        y_true: pandas Series of strings; each string lists the true ids.
        y_pred: pandas Series of strings; each string lists the predicted ids.

    Returns:
        NumPy array with one F1 value per row, computed as
        ``2 * |true & pred| / (|pred| + |true|)`` on the de-duplicated ids.
    """
    true_sets = y_true.apply(lambda ids: set(ids.split()))
    pred_sets = y_pred.apply(lambda ids: set(ids.split()))

    overlap = np.array([len(truth & guess) for truth, guess in zip(true_sets, pred_sets)])
    pred_sizes = pred_sets.apply(len).values
    true_sizes = true_sets.apply(len).values

    return 2 * overlap / (pred_sizes + true_sizes)

def clean(text):
    """Normalise a product title to lower-case ASCII letters separated by single spaces.

    Steps, in order:
      1. coerce the input to ``str`` (so non-string values such as NaN do not raise),
      2. delete every ASCII punctuation character (``"t-shirt"`` becomes ``"tshirt"``),
      3. lower-case the text,
      4. replace every remaining character that is not an ASCII letter with a space,
      5. collapse runs of spaces into one.

    Digits, emoji and any other non-letter symbols are therefore removed by step 4.
    Leading/trailing spaces are not stripped.

    Args:
        text: The raw title; any object accepted by ``str()``.

    Returns:
        The cleaned string.
    """
    # Coerce first: iterating a non-string (e.g. a float NaN) would raise TypeError.
    text = str(text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Lower-case before filtering, because case mapping can turn non-ASCII characters into ASCII ones.
    text = text.lower()
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = re.sub(' +', ' ', text)
    return text


def train_test_split_data(df, features, label, test_size = 0.33):
    """Shuffle ``df`` and split it into training and validation parts.

    Args:
        df: Source DataFrame.
        features: Column name(s) selected as model inputs.
        label: Column name(s) selected as targets.
        test_size: Share of the rows assigned to the validation part.

    Returns:
        ``(train_x, val_x, train_y, val_y)``; the split is reproducible because
        the global ``SEED`` is used as ``random_state``.
    """
    inputs = df[features]
    targets = df[label]
    inputs_train, inputs_val, targets_train, targets_val = train_test_split(
        inputs,
        targets,
        test_size = test_size,
        random_state = SEED,
        shuffle = True,
    )
    return inputs_train, inputs_val, targets_train, targets_val

def learning_rate_scheduler():
    """Create the Keras learning-rate callback used for training.

    The schedule returns 0.0001 for epochs 0-4; from epoch 5 onwards it returns
    ``0.0001 * exp(-0.1 * epoch)``, where ``epoch`` is the absolute epoch index.

    Returns:
        A ``LearningRateScheduler`` callback with verbose output enabled.
    """
    base_rate = 0.0001
    decay = 0.1

    def schedule(epoch):
        # Constant phase first, exponential decay afterwards.
        return base_rate if epoch < 5 else base_rate * math.exp(-decay * epoch)

    return K.callbacks.LearningRateScheduler(schedule, verbose = True)

# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()
# if predict_test:
# Replace the title column of the test frame with its cleaned version (see clean()).
test['title'] = test['title'].progress_apply(clean)
# else:
#     train['title'] = train['title'].progress_apply(clean)

# complete_generator = None
# if predict_test:
# Keyword arguments for the inference generator: batches of 3 images read from TEST_BASE,
# resized to IMG_SIZE x IMG_SIZE.
params = {'batchSize': 3,
          'filepath': TEST_BASE,         
          'img_size' : IMG_SIZE}
# Only the image column is handed over; the generator yields image batches without labels.
complete_generator = DataGeneratorForTest(test[['image']], **params)
# else:
#     params = {'batchSize': 5,
#           'code_to_labels': encoded_to_label,
#           'filepath': TRAIN_BASE,
#           'shuffle': False,
#           'img_size' : IMG_SIZE,
#           'classes' : NUM_CLASSES}
#     complete_generator = DataGenerator(train[['image', 'label_group']], **params)

# Run a garbage-collection pass before the model is loaded in the next cell.
gc.collect()

In [None]:
# def get_model(pretrained_layer = K.applications.EfficientNetB0(
#         include_top=False, weights='../input/notop-weights/efficientnetb0_notop.h5', input_shape=(IMG_SIZE, IMG_SIZE, 3)), 
#               classes = NUM_CLASSES):
    
#     inp = L.Input(shape = (IMG_SIZE, IMG_SIZE, 3))
    
#     x = pretrained_layer(inp)
#     x = L.GlobalAveragePooling2D()(x)
#     x = L.BatchNormalization()(x)
#     image_ = K.models.Model(inputs = inp, 
#                            outputs = x)
#     return image_

# pretrained_layer2 = K.applications.InceptionV3(include_top=False, 
#                                                weights='../input/notop-weights/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5', 
#                                                input_shape=(IMG_SIZE, IMG_SIZE, 3))
# image_embeddings = get_model(pretrained_layer = pretrained_layer2)
# image_embeddings.summary()

In [None]:
# Build EfficientNetB0 without its classification head, loading weights from a local .h5 file;
# pooling='avg' applies global average pooling so the model emits one feature vector per image.
image_embeddings = K.applications.EfficientNetB0(
        include_top=False, weights='../input/notop-weights/efficientnetb0_notop.h5', input_shape=(IMG_SIZE, IMG_SIZE, 3), pooling = 'avg')
# Disabled alternative backbone:
# image_embeddings = K.applications.InceptionV3(include_top=False, 
#                                                weights='../input/notop-weights/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5', 
#                                                input_shape=(IMG_SIZE, IMG_SIZE, 3), pooling = 'avg')
# NOTE: the name image_embeddings is rebound here — from the Keras model to the array of its
# predictions over complete_generator. After this line the model object is no longer referenced.
image_embeddings = image_embeddings.predict(complete_generator,
                                verbose = 1)
# Last expression of the cell: displayed as a (label, shape) tuple.
"Image embedding shape is :- ", image_embeddings.shape

In [None]:
# def combine_preds(x):
#     all_combined = x['image_preds']+" "+ x['text_preds']
#     return ' '.join( set(all_combined.split(" ")) )

# def get_nearest_neighors(df, embeds, n = 50, image = True, predict_score = PREDICT_SCORE):
#     model = NearestNeighbors(n_neighbors = n)
#     model.fit(embeds)
#     dist_arr, idx_arr = model.kneighbors(embeds)
#     if predict_score:
#         scores = []
#         if image:
#             print("Predicting for image")
#             scores = np.arange(1.0,6.0,0.2)
#         else:
#             print("Predicting for text")
#             scores = np.arange(20,35,0.5)
            
#         allscores = []
#         for th in scores:
#             preds = []
#             for idx in range(embeds.shape[0]):
#                 index_clear_of_th = np.where(dist_arr[idx,] < th)[0]
#                 preds.append(' '.join(df.posting_id.iloc[idx_arr[idx,index_clear_of_th]].values))
            
#             df["pred_values"] = preds
#             df["f1_score"] = f1_score(df['matches'], df['pred_values'])
#             print(f"f1 Score for the threshold {th} is {df['f1_score'].mean()}")
#             allscores.append(df['f1_score'].mean())
        
#         score_df = pd.DataFrame({"All_Scores" : allscores, "Thresholds" : scores})
#         best_record = score_df[score_df.All_Scores == score_df.All_Scores.max()]
#         print(f"Best iteration is with score {best_record.All_Scores.values} and threshold {best_record.Thresholds.values}")
        
#         preds = []
#         th = best_record.Thresholds.values[0]
            
#         for idx in range(embeds.shape[0]):
#             index_clear_of_th = np.where(dist_arr[idx,] < th)[0]
#             preds.append(" ".join(df.posting_id.iloc[idx_arr[idx,index_clear_of_th]].values))
            
#     else:
#         preds = []
#         th = 0
#         if image:
#             print("Predicting for image")
#             th = 2.4
#         else:
#             print("Predicting for text")
#             th = 24.0
            
#         for idx in range(embeds.shape[0]):
#             index_clear_of_th = np.where(dist_arr[idx,] < th)[0]
#             preds.append(" ".join(df.posting_id.iloc[idx_arr[idx,index_clear_of_th]].values))
            
#     return df, preds

# if PREDICT_SCORE:
#     tmp = train.groupby(['label_group'])['posting_id'].unique().to_dict()
#     train['matches'] = train['label_group'].map(tmp)
#     train['matches'] = train['matches'].apply(lambda x: ' '.join(x))
    
#     train, image_preds = get_nearest_neighors(train, image_embeddings, n = 50, image = True)

#     train['image_preds'] = image_preds
#     train['matches'] = image_preds
#     train[['posting_id', 'matches']].to_csv('submission.csv', index = False)
#     print(train.head())
# else:
#     test, image_preds = get_nearest_neighors(test, image_embeddings, n = 50, image = True)
    
#     test['image_preds'] = image_preds
#     test['matches'] = image_preds
#     test[['posting_id', 'matches']].to_csv('submission.csv', index = False)
#     print(test.head())

In [None]:
# Neighbour search over the image embeddings; 2.75 overrides the function's default distance threshold.
image_predictions = get_image_predictions(test, image_embeddings, threshold = 2.75)
# One array of matching posting ids per test row.
test['image_predictions'] = image_predictions
# test['text_predictions'] = text_predictions
# test['matches'] = test.apply(combine_predictions, axis = 1)
# Build the submission string per row: unique posting ids (np.unique also sorts them) joined by spaces.
test['matches'] = test.apply(lambda x: " ".join(np.unique(x['image_predictions'])), axis = 1)#" ".join(set(list(test['image_predictions'])))
# Write only the two submission columns, without the index.
test[['posting_id', 'matches']].to_csv('submission.csv', index = False)

In [None]:
# Show the first rows of the test frame, now including the image_predictions and matches columns.
test.head()