In [None]:
# import library
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.nn import Parameter
from torch.nn import functional as F
from torch.utils.data import Dataset,DataLoader
import matplotlib.pyplot as plt
from PIL import Image

import transformers
import gc

from torch.nn import Parameter
from torch.nn import functional as F

from cuml.neighbors import NearestNeighbors

from tqdm import tqdm
import pickle as pkl
! pwd

In [None]:
# global values and config
NUM_WORKERS = 2
BATCH_SIZE = 16
PRETRAINED_MODEL = "../input/xlnet-model/xlnet-base-cased"
# Fine-tuned weights, kept as a plain state dict for model.load_state_dict().
# map_location="cpu" keeps this long-lived module-level copy in host memory, so
# it does not occupy device memory for the whole session and the file can also
# be read on a machine without a GPU.
# The last entry of the checkpoint is dropped; presumably it belongs to a
# training-only head that ShopeeModel does not define -- TODO confirm against
# the training notebook.
# NOTE(review): torch.load unpickles the file -- only load checkpoints you trust.
FINE_TUNING_MODEL = dict(
    list(
        torch.load(
            "../input/xlnet-model/fine_tuning_xlnet_base_cased.bin",
            map_location="cpu",
        ).items()
    )[:-1]
)
MAX_LENGTH = 128

print(torch.cuda.is_available())
# Use the GPU when present instead of assuming it; fall back to CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [None]:
def load_data(drop=False):
    """Read the Shopee training CSV and attach the ground-truth ``matches`` column.

    Args:
        drop: when True, the ``image``, ``image_phash`` and ``label_group``
            columns are removed from the returned frame.

    Returns:
        pandas.DataFrame with one extra column ``matches``: the posting_ids
        that share the row's ``label_group``, joined by single spaces
        (e.g. "train_1 train_2"), in order of first appearance.
    """
    train_file = "../input/shopee-product-matching/train.csv"
    df = pd.read_csv(train_file)
    # Build the space-joined id string once per label group (not once per row),
    # then broadcast it back to every row of that group.
    matches_by_group = (
        df.groupby('label_group')['posting_id']
        .agg(lambda ids: ' '.join(ids.unique()))
    )
    df['matches'] = df['label_group'].map(matches_by_group)
    if drop:
        # `columns=` instead of a positional axis argument: the positional form
        # is rejected by pandas >= 2.0.
        df = df.drop(columns=['image', 'image_phash', 'label_group'])

    return df


In [None]:
class ShopeeTextData(Dataset):
    """Dataset yielding the tokenized ``title`` of each row of a data frame.

    Every item is an ``(input_ids, attention_mask)`` pair produced by the
    tokenizer loaded from PRETRAINED_MODEL, truncated / padded to MAX_LENGTH.
    """

    def __init__(self, data):
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
        # reset_index() gives positional 0..n-1 row labels; the previous index
        # is kept as an extra column.
        self.data = data.reset_index()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        encoded = self.tokenizer(
            self.data.iloc[index]["title"],
            max_length=MAX_LENGTH,
            truncation=True,
            padding='max_length',
            return_tensors="pt",
        )
        # The tokenizer output carries a leading batch axis of size one for a
        # single title; [0] strips it.
        return encoded['input_ids'][0], encoded['attention_mask'][0]

In [None]:
class ShopeeModel(nn.Module):
    """Text encoder: pretrained transformer -> first-position state -> unit vector."""

    def __init__(self, n_classes=11014 , pretrained_model=PRETRAINED_MODEL):
        # n_classes is accepted but not used anywhere in this class.
        super().__init__()
        self.transformer = transformers.AutoModel.from_pretrained(pretrained_model)

    def forward(self, input_ids, attention_mask):
        # First element of the transformer output, indexed below as
        # [batch_size, seq_length, hidden_size].
        hidden_states = self.transformer(input_ids, attention_mask)[0]
        # Keep only sequence position 0 -> [batch_size, hidden_size].
        # NOTE(review): which token sits at position 0 depends on the
        # tokenizer's padding side -- confirm this matches how the checkpoint
        # was fine-tuned.
        first_position = hidden_states[:, 0, :]
        # F.normalize defaults: L2 norm along dim 1.
        return F.normalize(first_position)

In [None]:
def generate_text_embeddings(df):
    """Embed every title in ``df`` with the fine-tuned ShopeeModel.

    Args:
        df: data frame with a ``title`` column (consumed by ShopeeTextData).

    Returns:
        numpy.ndarray with one row per row of ``df``, in the same order
        (the loader neither shuffles nor drops the last batch).

    Raises:
        ValueError: from ``np.concatenate`` when ``df`` has no rows, because
            no batch is produced.
    """
    model = ShopeeModel()
    # fine-tuned Shopee weights
    model.load_state_dict(FINE_TUNING_MODEL)
    model = model.to(device)
    model.eval()

    # prepare data
    text_data = ShopeeTextData(df)
    text_dataloader = DataLoader(
        text_data,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        # Pinned host memory only helps host-to-GPU copies.
        pin_memory=(device.type == 'cuda'),
        drop_last=False,
    )

    # generate text_embeddings
    embeddings = []
    with torch.no_grad():
        for input_ids, attention_mask in tqdm(text_dataloader):
            # Send batches to the same device as the model rather than
            # hard-coding .cuda().
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            feature = model(input_ids, attention_mask)
            embeddings.append(feature.cpu().numpy())

    text_embeddings = np.concatenate(embeddings)

    # delete and collect garbage
    del embeddings, model
    gc.collect()
    if device.type == 'cuda':
        # Return PyTorch's cached blocks to the driver so other GPU libraries
        # used later in the notebook can allocate that memory.
        torch.cuda.empty_cache()

    print('text_embeddings shape:', text_embeddings.shape)
    return text_embeddings


In [None]:
def f1_score(y_true, y_pred):
    """Row-wise F1 between two Series of whitespace-separated id strings.

    Each string is split into a set of ids; rows are paired by position.
    Returns a numpy array holding ``2 * |true & pred| / (|true| + |pred|)``
    for every row.
    """
    true_sets = y_true.apply(lambda ids: set(ids.split()))
    pred_sets = y_pred.apply(lambda ids: set(ids.split()))

    overlap = np.array([len(truth & guess) for truth, guess in zip(true_sets, pred_sets)])
    n_pred = pred_sets.apply(len).values
    n_true = true_sets.apply(len).values

    return 2 * overlap / (n_pred + n_true)

def get_neighbors_knn(df, text_embeddings, KNN=50, threshold=0.9):
    """Predict matches for every row via a nearest-neighbour search.

    Args:
        df: data frame with a ``posting_id`` column, row-aligned (by position)
            with ``text_embeddings``.
        text_embeddings: 2-D array, one embedding per row of ``df``.
        KNN: maximum number of neighbours retrieved per row. It is clamped to
            the number of rows so that small inputs do not request more
            neighbours than there are samples.
        threshold: a neighbour is kept only when its distance (as returned by
            ``NearestNeighbors.kneighbors`` with the default metric) is
            strictly below this value.

    Returns:
        list of str, one per row: the kept neighbours' posting_ids joined by
        single spaces, in the order the search returned them. An empty list
        when there are no rows.
    """
    n_id = text_embeddings.shape[0]
    if n_id == 0:
        return []

    # create KNN model and calculate distances
    model = NearestNeighbors(n_neighbors=min(KNN, n_id))
    model.fit(text_embeddings)
    distances, ids = model.kneighbors(text_embeddings)

    # Pull the id column out once instead of re-indexing the frame per row.
    posting_id_values = df['posting_id'].values

    predictions = []
    # iterate all data rows and pick ids whose distance < threshold
    for row in range(n_id):
        col = np.where(distances[row,] < threshold)[0]
        pos = ids[row, col]
        predictions.append(' '.join(posting_id_values[pos]))

    del model, distances, ids
    gc.collect()
    return predictions

In [None]:
# plot img in same label
def get_list_img_same_label(label, train_data_labels):
    """Collect image paths and captions for every entry whose label equals ``label``.

    Reads the module-level ``train_data_imgs`` and ``train_data_titles``, which
    must be indexable by the same keys 0..len(train_data_labels)-1.

    Returns:
        (list_name, list_title): image file paths under the train_images
        directory, and captions of the form "<index>\\n<title>".
    """
    image_dir = "../input/shopee-product-matching/train_images/"
    hits = [
        idx for idx in range(len(train_data_labels))
        if label == train_data_labels[idx]
    ]
    list_name = [image_dir + train_data_imgs[idx] for idx in hits]
    list_title = [str(idx) + "\n" + train_data_titles[idx] for idx in hits]
    return list_name, list_title

def show_data_img(list_name, list_title):
    """Draw the given image files in one 10x10-inch figure, each with its title.

    Args:
        list_name: image file paths, one subplot per path.
        list_title: captions, indexed in parallel with ``list_name``.
    """
    n_images = len(list_name)
    plt.figure(figsize=(10, 10))

    # Grid shape: one more column for every four images, and enough rows that
    # rows * cols always exceeds the image count.
    n_cols = n_images // 4 + 1
    n_rows = n_images // n_cols + 1

    for position, img_file in enumerate(list_name, start=1):
        img_title = list_title[position - 1]
        plt.subplot(n_rows, n_cols, position)
        plt.axis(False)
        img = Image.open(img_file)
        plt.title(img_title)
        plt.imshow(img)
    plt.show()

In [None]:
# prepare data
df = load_data()
# A bare `df.head()` in the middle of a cell is evaluated and thrown away;
# display() actually renders the preview.
display(df.head())

# plot image
# These module-level names are read as globals by get_list_img_same_label.
train_data_names, train_data_imgs, train_data_titles = df["posting_id"], df["image"], df["title"]
train_data_labels = df["label_group"]
# choose 1 label to plot: the label group of the row labelled `num`
num = 412
label = train_data_labels[num]

show_data_img(*get_list_img_same_label(label, train_data_labels))

In [None]:
text_embeddings = generate_text_embeddings(df)
# Optional: uncomment to cache the embeddings on disk with pickle.
# with open('text_embeddings.pkl', 'wb') as file:
#     pkl.dump(text_embeddings, file)

In [None]:
# Retrieve neighbours for every row, then score them against the ground truth.
text_predictions = get_neighbors_knn(df, text_embeddings)

result = df.loc[:, ['posting_id', 'matches']].assign(
    predictions=text_predictions,
    # evaluated after `predictions` has been attached
    f1=lambda frame: f1_score(frame['matches'], frame['predictions']),
)
display(result)

# Plot the F1 of every 200th row; the title reports the mean over all rows.
mean_f1 = result['f1'].mean()
plt.figure()
result['f1'].iloc[::200].plot()
plt.title("f1 mean: " + str(mean_f1))


In [None]:
# submit: posting_id plus the predicted matches, written without the index
mysubmission = df.loc[:, ['posting_id', 'matches']].assign(matches=text_predictions)
mysubmission.to_csv('submission.csv', index=False)
display(mysubmission)