In [None]:
import numpy as np
import pandas as pd
import keras
import tensorflow as tf 
from math import ceil
from tqdm import tqdm

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OrdinalEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
#import efficientnet.tfkeras as efn

IMAGE_SIZE = [512, 512]
N_CLASSES = 11014
CHANNELS = 3

# References
# https://www.kaggle.com/ragnar123/unsupervised-baseline-arcface/data

# Read the train / test tables and the pre-built pairwise table
df_train = pd.read_csv('../input/shopee-product-matching/train.csv')
df_train_pairs = pd.read_csv('../input/pairwise-dataset/pairwise.csv')
df_test = pd.read_csv('../input/shopee-product-matching/test.csv')

# Prefix the bare image file names with their directory.
# NOTE(review): the CSVs are read from ../input/shopee-product-matching/ while the
# image directories are rooted directly at ../input/ -- confirm that is where the
# images actually live.
df_train = df_train.assign(image='../input/train_images/' + df_train['image'])
df_train_pairs = df_train_pairs.assign(
  image_1='../input/train_images/' + df_train_pairs['image_1'],
  image_2='../input/train_images/' + df_train_pairs['image_2'],
)
df_test = df_test.assign(image='../input/test_images/' + df_test['image'])

# Sanity check: every label group must be present in the training table and on
# both sides of the pairwise table
assert len(pd.unique(df_train['label_group'])) == N_CLASSES
assert len(pd.unique(df_train_pairs['label_group_1'])) == N_CLASSES
assert len(pd.unique(df_train_pairs['label_group_2'])) == N_CLASSES

In [None]:
####################################################################
# Add matches for validation
####################################################################
def add_matches(df):
  """
  Attach to every row the posting ids that share its label group.

  df = data frame with 'label_group' and 'posting_id' columns

  Writes a 'matches' column onto df in place (one array of unique posting ids
  per row, not a space-joined string) and returns the same frame.
  """
  ids_by_group = df.groupby(['label_group'])['posting_id'].unique().to_dict()
  df['matches'] = df['label_group'].map(ids_by_group)
  return df

# add_matches writes the 'matches' column onto the frame it receives and returns that same frame
df_train = add_matches(df_train)

In [None]:
####################################################################
# Text Preprocessing
####################################################################

# Imputer for text
def string_impute(x):
  """Fill missing entries with the literal 'missing', then cast every entry to str."""
  filled = x.fillna('missing')
  return filled.astype(str)

# Wrap the imputer so it can be the first step of a pipeline
imputer = FunctionTransformer(func=string_impute, validate=False)

# Impute -> word-level TF-IDF over 1- to 3-grams -> no-op final step
text_word_pipeline = make_pipeline(
  imputer,
  TfidfVectorizer(
    analyzer='word',
    token_pattern=r"(?u)\b\w+\b",  # Also keep single-character tokens
    ngram_range=(1, 3),
    strip_accents='unicode',
    lowercase=True,
    min_df=10,
    max_df=0.5,
    binary=False,
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False,
    norm='l2',
    dtype=np.float32,
  ),
  'passthrough',
)

# Fit the vocabulary on the training titles only, then sort the indices of each
# resulting matrix in place
text_train = text_word_pipeline.fit_transform(df_train['title'])
text_train.sort_indices()

text_test = text_word_pipeline.transform(df_test['title'])
text_test.sort_indices()

# Both sides of every training pair go through the same fitted pipeline
text_train_pairs_1 = text_word_pipeline.transform(df_train_pairs['title_1'])
text_train_pairs_1.sort_indices()

text_train_pairs_2 = text_word_pipeline.transform(df_train_pairs['title_2'])
text_train_pairs_2.sort_indices()

In [None]:
####################################################################
# Define models
####################################################################

def build_cosine_sim_model(
  input_shape,
  sparse_input,
  embed_layer,
  opt,
  **kwargs
):
  """
  Build and compile a two-input model that scores a pair by cosine similarity

  input_shape = shape of each of the two inputs (batch dimension excluded)

  sparse_input = True or False.  Forwarded as `sparse` to both Input layers

  embed_layer = callable returning the layer that embeds the inputs; it is
  called once, so both inputs share the same layer instance

  opt = optimizer the model is compiled with

  **kwargs = passed to embed_layer

  Returns the compiled model.  Its inputs are named 'input_1' and 'input_2',
  its output layer is named 'sim', and it is compiled with a mean squared
  error loss.  The model summary is printed as a side effect.
  """
  layers = tf.keras.layers

  # The two sides of the pair
  twin_inputs = [
    layers.Input(shape=input_shape, name=input_name, sparse=sparse_input)
    for input_name in ('input_1', 'input_2')
  ]

  # One embedding layer applied to both sides, so they land in the same space
  encoder = embed_layer(**kwargs)
  twin_embeddings = [encoder(side) for side in twin_inputs]

  # Dot over the last axis with normalize=True is the cosine similarity
  cosine = layers.Dot(-1, normalize=True, name='sim')(twin_embeddings)

  # Pair in, similarity out
  siamese = tf.keras.models.Model(inputs=twin_inputs, outputs=cosine)

  # Consider CosineSimilarity() loss too
  siamese.compile(optimizer=opt, loss=[tf.keras.losses.MeanSquaredError()])
  siamese.summary()
  return siamese

In [None]:
####################################################################
# Fit text model
####################################################################
from annoy import AnnoyIndex
from sklearn.preprocessing import normalize

# Width of the shared text embedding
EMBED_DIM = 512

# Build text model
def build_text_model():
  """
  Build the siamese text model on top of the TF-IDF features.

  Each input has one entry per TF-IDF column of text_train and is sparse.  The
  shared embedding is a bias-free Dense layer named 'shared_embed' with
  EMBED_DIM units; the model is compiled with Adam at a 1e-4 learning rate.
  """
  return build_cosine_sim_model(
    # Keras expects a shape tuple; without the trailing comma this is a bare int
    input_shape=(text_train.shape[1],),
    sparse_input=True,
    embed_layer=tf.keras.layers.Dense,
    opt=tf.keras.optimizers.Adam(1e-4),
    units=EMBED_DIM,
    use_bias=False,
    name='shared_embed')

# Instantiate the siamese text model
text_model = build_text_model()

# Keys mirror the names given to the two Input layers
x = dict(input_1=text_train_pairs_1, input_2=text_train_pairs_2)
y = df_train_pairs['target'].to_numpy()

# Train on the title pairs against the pair target
text_model.fit(x=x, y=y, batch_size=256, epochs=6)

In [None]:
####################################################################
# Predict text model
####################################################################

# Predict embeddings
# TODO: make this an actual MODEL when we define the fit
# Single-input model: first text input -> output of the shared embedding layer.
# The layer is fetched by the name it is given in build_text_model.
shared_embed_layer = text_model.get_layer('shared_embed')
text_embeddings_model = tf.keras.models.Model(
  inputs=text_model.inputs[0],
  outputs=shared_embed_layer.output)
  
# Function to find, for every title in a data frame, its nearest neighbours in text-embedding space
def text_to_matches(df, seed=42, n_neighbors=50, n_trees=100):
  """
  Find, for every row of df, its nearest neighbours in text-embedding space.

  df = data frame with 'title' and 'posting_id' columns

  seed = seed handed to the Annoy index (previously accepted but ignored in
  favour of a hard-coded 42)

  n_neighbors = how many neighbours to retrieve per row

  n_trees = number of trees the Annoy index is built with

  Adds two columns to df in place and returns the same frame:
    all_neighbors = list of posting ids of the retrieved neighbours
    all_dist = array of the values Annoy returns for the 'dot' metric on the
    L2-normalised embeddings (downstream code treats larger as more similar)
  """

  # Use sklearn to vectorize the text
  text_sparse = text_word_pipeline.transform(df['title'])
  text_sparse.sort_indices()

  # Now use our embedding model to turn the sparse text vectors into a dense embedding
  text_embed = text_embeddings_model.predict(text_sparse, batch_size=256)
  text_embed = normalize(text_embed)
  n_rows, embed_dim = text_embed.shape

  # Build a knn model for matches.
  # Annoy documents that the seed should be set before items are added.
  knn_model = AnnoyIndex(embed_dim, 'dot')
  knn_model.set_seed(seed)
  for i in range(n_rows):
    knn_model.add_item(i, text_embed[i, :])
  knn_model.build(n_trees)

  # Pull the ids out once so neighbour indices can be translated with a single
  # array lookup per row instead of one .iloc call per neighbour
  posting_ids = df['posting_id'].to_numpy()

  # Loop over each row and match
  all_neighbors = []
  all_dist = []
  for i in tqdm(range(n_rows)):
    neighbors, sim = knn_model.get_nns_by_vector(
      text_embed[i, :], n=n_neighbors, include_distances=True)
    all_neighbors.append(posting_ids[np.asarray(neighbors, dtype=int)].tolist())
    all_dist.append(np.asarray(sim))

  df['all_neighbors'] = all_neighbors
  df['all_dist'] = all_dist
  return df

# Adds the 'all_neighbors' and 'all_dist' columns to the training frame (text_to_matches returns the frame it was given)
df_train = text_to_matches(df_train)

In [None]:
####################################################################
# Find a good threshold
####################################################################
from scipy.optimize import minimize, minimize_scalar

def f1_score(y_true, y_pred):
    """
    F1 overlap between two collections of ids.

    y_true = ids that should have been returned (array or any sized sequence)

    y_pred = ids that were returned (array or any sized sequence)

    Returns 2 * |common| / (len(y_true) + len(y_pred)), where the common ids
    are counted once each (np.intersect1d returns unique values).  Two empty
    inputs score 0.0 instead of raising ZeroDivisionError.
    """
    n_true = len(y_true)
    n_pred = len(y_pred)
    if n_true + n_pred == 0:
        return 0.0
    n_common = len(np.intersect1d(y_true, y_pred))
    return 2 * n_common / (n_true + n_pred)

def matches_above_thresh(thresh, df):
  """
  Pick, for every row, the neighbours whose similarity exceeds thresh.

  thresh = cut-off; a neighbour is kept when its 'all_dist' value is > thresh

  df = data frame with 'all_dist' and 'all_neighbors' columns, aligned
  element-wise within each row

  Returns a list with one array of neighbour ids per row, in row order.  A row
  with fewer than two neighbours above the cut-off falls back to its first two
  neighbours.  Rows are walked positionally, so the frame's index does not
  need to be 0..n-1.
  """
  y_pred = []
  for dist, neighbors in zip(df['all_dist'], df['all_neighbors']):
    nn = np.asarray(neighbors)
    out = nn[np.flatnonzero(np.asarray(dist) > thresh)]
    if len(out) < 2:
      out = nn[0:2]
    y_pred.append(out)

  return y_pred
    
def calc_f1(thresh, df=df_train):
  """
  Negated mean per-row F1 at a given threshold (negated so it can be minimised).

  thresh = cut-off forwarded to matches_above_thresh

  df = data frame with 'matches', 'all_dist' and 'all_neighbors' columns;
  defaults to the df_train object bound when this function was defined

  Rows are paired positionally, so the frame's index does not need to be
  0..n-1.
  """
  y_true = df['matches']
  y_pred = matches_above_thresh(thresh, df)

  assert len(y_true) == len(y_pred)

  total = 0
  for truth, predicted in zip(y_true, y_pred):
    total += f1_score(truth, predicted)

  return -1 * (total / len(y_true))
  
# Spot-check the objective at a few thresholds.  Printed explicitly: a notebook
# cell only displays its last expression, so bare calls would be discarded.
for probe in (-1, 0, .5, .8, 1):
  print(probe, calc_f1(probe))

# The scores are dot products of L2-normalised embeddings, so the only
# meaningful thresholds lie in [-1, 1]; keep the search inside that interval.
best_thresh = minimize_scalar(calc_f1, bounds=(-1, 1), method='bounded')
print(best_thresh['x'])

In [None]:
####################################################################
# Submit
####################################################################

# Retrieve neighbours for the test titles
df_test = text_to_matches(df_test)

# Apply the threshold found on the training data
matches_test = matches_above_thresh(best_thresh['x'], df_test)

# One space-separated string of posting ids per row
df_test['matches'] = list(map(' '.join, matches_test))

submission_columns = ['posting_id', 'matches']
df_test[submission_columns].to_csv('submission.csv', index=False)