# Load Libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')
import tqdm
from sklearn.neighbors import NearestNeighbors

In [None]:
# root = '../input'

In [None]:
# # download the pretrained model and save it locally

# # load the pretrained model and weights
# model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

# # Load pretrained model/tokenizer
# tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
# model = model_class.from_pretrained(pretrained_weights)

# # model.save_pretrained(root)
# # tokenizer.save_pretrained(root)

In [None]:
# load the train data
# path1 = root + '/shopee-product-matching/train.csv'
# path2 = root + '/shopee-product-matching/test.csv'
# train = pd.read_csv(path1)
# df_test = test = pd.read_csv(path2)

In [None]:
# divide train set into train(80%) and dev(20%) sets

# df_dev = train.sample(frac=0.2)
# df_train = train[~train.index.isin(df_dev.index)]

## Load the tokenizer and model from DistilBERT

In [None]:
# Path of a local checkpoint directory (named distilbert-base-indonesian);
# both the tokenizer and the encoder weights are loaded from it.
transformer_model = '../input/distilbert-base-indonesian/distilbert-base-indonesian'
# Auto* classes pick the concrete tokenizer/model class from the checkpoint.
TOKENIZER = ppb.AutoTokenizer.from_pretrained(transformer_model)
MODEL = ppb.AutoModel.from_pretrained(transformer_model)

In [None]:
# Reproducibility: seed the CPU and CUDA generators and ask cuDNN for
# deterministic kernels.
SEED = 567
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Run on the first GPU when one is visible, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

## Generate Dataset, DataLoader

In [None]:
# generate the dataset
from torch.utils.data import Dataset

class Task2Dataset(Dataset):
    """Map-style dataset that tokenizes one text per DataFrame row.

    In ``"train"`` / ``"dev"`` mode the text is read from column position 3
    and the label from column position 4 of ``df``. In ``"test"`` mode the
    text is read from the ``title`` column and a constant dummy label of 0
    is returned.

    Args:
        mode: One of ``"train"``, ``"dev"`` or ``"test"``.
        df: DataFrame holding the rows to serve.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping
            with ``"input_ids"`` and ``"token_type_ids"`` entries.
    """

    # Column positions used in train/dev mode.
    _TEXT_COL = 3
    _LABEL_COL = 4

    def __init__(self, mode, df, tokenizer):
        assert mode in ["train", "dev", "test"], f"unknown mode: {mode!r}"
        self.mode = mode
        self.df = df
        self.len = len(self.df)
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        """Return ``(token_ids, token_type_ids, label)`` tensors for row ``idx``.

        ``idx`` is always interpreted as a row *position*.
        """
        if self.mode == "test":
            # Positional lookup: ``df.title[idx]`` is label-based and fails
            # (or returns another row) when the index is not 0..n-1, e.g.
            # for a sampled or filtered frame.
            text = self.df["title"].iloc[idx]
            label_tensor = torch.tensor(0)  # placeholder, no labels at test time
        else:
            text = self.df.iloc[idx, self._TEXT_COL]
            label_tensor = torch.tensor(self.df.iloc[idx, self._LABEL_COL])

        encoded_text = self.tokenizer.encode_plus(
            text, add_special_tokens=True, return_token_type_ids=True
        )
        tokens_tensor = torch.tensor(encoded_text['input_ids'])
        segments_tensor = torch.tensor(encoded_text['token_type_ids'])

        return (tokens_tensor, segments_tensor, label_tensor)

    def __len__(self):
        """Return the number of rows captured at construction time."""
        return self.len
    
    
# initialize Datasets with the tokenizer loaded above (TOKENIZER)
# trainset = Task2Dataset("train", df_train, tokenizer=TOKENIZER)
# devset = Task2Dataset("dev", df_dev, tokenizer=TOKENIZER)
# testset = Task2Dataset("test", df_test, tokenizer=TOKENIZER)

In [None]:
# # test
# idx = 1
# text = train.iloc[idx, 3]
# label = train.iloc[idx, 4]
# label_tensor = torch.tensor(label)

# encoded_text = TOKENIZER.encode_plus(text, add_special_tokens=True, return_token_type_ids = True)
# encoded_text

In [None]:
# generate the dataloader
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def collate_fn_padd(samples):
    """Pad a list of variable-length samples into one batch.

    Args:
        samples: Sequence of ``(token_ids, token_type_ids, label)`` tensor
            triples, where the first two are 1-D and share a length.

    Returns:
        Tuple ``(tokens, segments, masks, labels)``. ``tokens`` and
        ``segments`` are zero-padded to the longest sample with shape
        ``(batch, max_len)``. ``masks`` is a ``torch.long`` tensor of the
        same shape with 1 on real positions and 0 on padding. ``labels`` is
        the stacked label tensor.
    """
    token_seqs = [sample[0] for sample in samples]
    segment_seqs = [sample[1] for sample in samples]
    label_ids = torch.stack([sample[2] for sample in samples])

    tokens_tensors = pad_sequence(token_seqs, batch_first=True)
    segments_tensors = pad_sequence(segment_seqs, batch_first=True)

    # Build the mask from the true sequence lengths instead of testing
    # ``tokens != 0``: a genuine token whose id is 0 must stay attended.
    lengths = torch.tensor(
        [seq.size(0) for seq in token_seqs], device=tokens_tensors.device
    )
    positions = torch.arange(tokens_tensors.size(1), device=tokens_tensors.device)
    masks_tensors = (positions.unsqueeze(0) < lengths.unsqueeze(1)).long()

    return tokens_tensors, segments_tensors, masks_tensors, label_ids


# Number of samples per batch; passed as ``batch_size`` to the DataLoader.
BATCH_SIZE = 16

# trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, 
#                          collate_fn=collate_fn_padd)
# devloader = DataLoader(devset, batch_size=BATCH_SIZE, collate_fn=collate_fn_padd)
# # testloader = DataLoader(testset, batch_size=BATCH_SIZE, 
# #                          collate_fn=collate_fn_padd)

In [None]:
# # test
# data = next(iter(trainloader))

# tokens_tensors, segments_tensors, \
#     masks_tensors, label_ids = data

# print(f"""
# tokens_tensors.shape   = {tokens_tensors.shape} 
# {tokens_tensors}
# ------------------------
# segments_tensors.shape = {segments_tensors.shape}
# {segments_tensors}
# ------------------------
# masks_tensors.shape    = {masks_tensors.shape}
# {masks_tensors}
# ------------------------
# label_ids.shape        = {label_ids.shape}
# {label_ids}
# """)

In [None]:
# # the model information
# print("""
# name            module
# ----------------------""")
# for name, module in MODEL.named_children():
#     if name == "bert":
#         for n, _ in module.named_children():
#             print(f"{name}: {n}")
#     else:
#         print("{:15} {}".format(name, module))

## test prediction function

In [None]:
def test_predict(model, dataloader, device):
    """Embed every batch of ``dataloader`` with ``model`` and stack the results.

    For each batch the model is called with ``input_ids`` and
    ``attention_mask``; the first position of the first model output
    (``output[0][:, 0, :]``) is kept as the feature vector of each sample.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose result is indexable, with the hidden states at index 0.
        dataloader: Iterable of ``(tokens, segments, masks, labels)`` batches;
            only ``tokens`` and ``masks`` are used.
        device: ``torch.device`` on which inference runs.

    Returns:
        ``numpy.ndarray`` with one row of features per sample, in iteration
        order.

    Raises:
        ValueError: If ``dataloader`` yields no batches (from
            ``np.concatenate``).
    """
    # Move the weights once, not once per batch.
    model.to(device)
    model.eval()
    batch_features = []

    with torch.no_grad():
        for tokens_tensors, _segments, masks_tensors, _labels in dataloader:
            # Segment ids and labels are not consumed, so they stay on the host.
            tokens_tensors = tokens_tensors.to(device)
            masks_tensors = masks_tensors.to(device)

            outputs = model(input_ids=tokens_tensors, attention_mask=masks_tensors)
            features = outputs[0][:, 0, :]

            if features.dim() != 2:
                # Fallback for outputs with extra trailing dimensions:
                # pool them away so that each sample yields a flat vector.
                features = torch.nn.AdaptiveAvgPool2d(1)(features).cpu().view(
                    -1, features.shape[1]
                )

            batch_features.append(features.cpu().numpy())

    return np.concatenate(batch_features)


# The name matches pytest's ``test_*`` pattern; opt out of collection so the
# helper is never mistaken for a test case.
test_predict.__test__ = False

def get_similar(df,embeddings,threshold=0.36,KNN = 50):
    """Return, for every row, the posting ids of its close neighbours.

    A cosine-distance nearest-neighbour index is fitted on ``embeddings``
    and queried with the same embeddings in chunks of 4096 rows. For each
    row, the neighbours whose distance is strictly below ``threshold`` are
    kept.

    Args:
        df: DataFrame with a ``posting_id`` column, row-aligned with
            ``embeddings``.
        embeddings: 2-D array with one embedding per row of ``df``.
        threshold: Exclusive upper bound on the cosine distance of a match.
        KNN: Maximum number of neighbours inspected per row. It is capped at
            the number of embeddings, because the index cannot return more
            neighbours than it holds samples.

    Returns:
        List with one array of ``posting_id`` values per row, in row order.
    """
    n_samples = len(embeddings)
    if n_samples == 0:
        return []

    # Cap rather than special-casing one dataset size: asking for more
    # neighbours than fitted samples makes ``kneighbors`` raise.
    KNN = min(KNN, n_samples)
    model = NearestNeighbors(n_neighbors=KNN,metric='cosine')
    model.fit(embeddings)

    # Resolve the id column once; neighbour indices are row positions.
    posting_ids = df.posting_id.values
    preds = []

    CHUNK = 1024*4  # rows queried per kneighbors call, bounds peak memory

    print('Finding similar images...')
    for start in range(0, n_samples, CHUNK):
        stop = min(start + CHUNK, n_samples)
        distances, indices = model.kneighbors(embeddings[start:stop])

        for dist_row, idx_row in zip(distances, indices):
            preds.append(posting_ids[idx_row[dist_row < threshold]])

    print(f"embed={embeddings.shape[1]}_KNN={KNN}_distances={threshold}")

    return preds


def compute_F1(df):
    """Return the mean per-row F1 score of the predicted matches.

    For each row, the predictions are taken from the last column of ``df``
    and compared with the ``posting_id`` values of all rows that share the
    row's ``label_group``.

    Args:
        df: DataFrame with ``posting_id`` and ``label_group`` columns whose
            last column holds the predictions for each row. A prediction is
            either a sequence of posting ids or a whitespace-separated
            string of them.

    Returns:
        The average F1 over all rows as a float; 0.0 for an empty frame.
        Rows with no predictions, or with no correct prediction, score 0.
    """
    n_rows = len(df)
    if n_rows == 0:
        return 0.0

    label_groups = df['label_group'].values

    # Group the ground truth once instead of filtering the frame per row.
    members_by_group = {}
    for group, posting_id in zip(label_groups, df['posting_id'].values):
        members_by_group.setdefault(group, []).append(posting_id)
    truth_by_group = {g: set(ids) for g, ids in members_by_group.items()}

    total = 0.0
    for group, preds in zip(label_groups, df.iloc[:, -1].values):
        if isinstance(preds, str):
            # Submission format: ids joined with spaces.
            preds = preds.split()
        if len(preds) == 0:
            continue  # nothing predicted: F1 is 0, avoid dividing by zero

        hits = len(set(preds) & truth_by_group[group])
        if hits == 0:
            continue  # precision and recall are both 0

        precision = hits / len(preds)
        recall = hits / len(members_by_group[group])
        total += 2 * precision * recall / (precision + recall)

    return total / n_rows

## Generate the submission file



In [None]:
# Embed the test titles, look up near-duplicate postings and write the
# submission file (one row per posting, matches joined with spaces).
test = pd.read_csv('../input/shopee-product-matching/test.csv')

testset = Task2Dataset("test", test, tokenizer=TOKENIZER)
testloader = DataLoader(
    testset,
    batch_size=BATCH_SIZE,
    collate_fn=collate_fn_padd,
)

test_embeddings = test_predict(MODEL, testloader, device)
matched_ids = get_similar(test, test_embeddings, 0.9/100)
test['matches'] = [" ".join(ids) for ids in matched_ids]

submission_columns = ['posting_id', 'matches']
test[submission_columns].to_csv('submission.csv', index=False)

# Read the file back as a quick sanity check of what was written.
sub = pd.read_csv('submission.csv')
sub.head()

In [None]:
# # train = pd.read_csv('../input/shopee-product-matching/train.csv')
# # trainset = Task2Dataset("train", train, tokenizer=TOKENIZER)

# # trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, 
# #                          collate_fn=collate_fn_padd)

# dev_embeddings = test_predict(MODEL,devloader,device)
# df_dev['matches'] = get_similar(df_dev,dev_embeddings,0.9/100)

# df_dev['matches'] = df_dev['matches'].apply(lambda x: " ".join(x))

# df_dev[['posting_id','matches']].to_csv('submission.csv',index=False)
# sub = pd.read_csv('submission.csv')
# sub.head()

In [None]:
# train[train['label_group'] == 4093212188]