In [None]:
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
import cv2,math,gc,sys

import torch
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
from torch.nn import Parameter

!pip install "../input/efficient-net/dist/efficientnet_pytorch-0.7.0.tar"
from efficientnet_pytorch import EfficientNet

!pip install "../input/faissgpuwheel/faiss_gpu-1.7.0-cp37-cp37m-manylinux2014_x86_64.whl"
import faiss

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import cudf, cuml, cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors

sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')
import timm

import warnings
warnings.simplefilter('ignore')

torch.backends.cudnn.benchmark = True

import transformers
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup,get_cosine_schedule_with_warmup
from transformers import get_cosine_with_hard_restarts_schedule_with_warmup

In [None]:
class cfg:
    """Settings for the image-embedding ensemble: input size, ArcFace head and checkpoints."""

    # Target size passed to cv2.resize in load_image.
    img_size = (380, 380)
    # Not referenced by the code visible in this notebook.
    feavec_num1 = 512
    feavec_num2 = 1280
    # Forwarded as ArcMarginProduct's scale `s` by the EfficientNet `Model`.
    fea_norm = 64
    # Forwarded as ArcMarginProduct's margin `m` by the EfficientNet `Model`.
    margin = 0.35
    # DataLoader batch size used by image_embeddings.
    batch = 50

    # Position i of wpath / mname / clsize describes the i-th ensemble member:
    # checkpoint file, backbone name and number of ArcFace classes.
    wpath = [
        "../input/shopee-weight/w_eff6_s380_cl8812_fold1_v2.pt",
        "../input/shopee-weight/w_effb3_s380_cl8811_fold2_0.80.pt",
        "../input/shopee-weight/w_effb5_s380_cl8811_fold3.pt",
        "../input/shopee-weight/w_effb4_s380_cl8811_fold4.pt",
        "../input/shopee-weight/w_effb3_s380_cl8811_fold5_m0.35.pt",
        "../input/shopee-weight/w_effb0_s380_cl11014_fullfold_fc512.pt",
        "../input/shopee-weight/w_effb0_s380_cl11014_fullfold_acc0.980_fc512.pt",
    ]
    # NOTE(review): the last entry names efficientnet_b1 while its checkpoint file
    # name says "effb0" -- confirm which backbone that checkpoint was trained with.
    mname = [
        'efficientnet-b6',
        'efficientnet-b3',
        'efficientnet-b5',
        'efficientnet-b4',
        'efficientnet-b3',
        "efficientnet_b0",
        "efficientnet_b1",
    ]
    clsize = [8812, 8811, 8811, 8811, 8811, 11014, 11014]

In [None]:
class textcfg:
    """DataLoader settings read by get_embedding_bert."""

    BATCH_SIZE = 8
    NUM_WORKERS = 2

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Tokenizer loaded from a local directory; ShopeeDataset.__getitem__ calls it on every title.
TOKENIZER = transformers.AutoTokenizer.from_pretrained('../input/bert-base-uncased')

# Keyword arguments expanded as ShopeeNet(**model_params) inside get_embedding_bert.
# (A 'loss_module' entry was commented out in the original and is therefore not passed.)
model_params = dict(
    n_classes=8812,  # the original also carried 11014 as a commented-out alternative
    model_name='../input/bert-base-uncased',
    pooling='clf',
    use_fc=False,
    fc_dim=1280,
    dropout=0.0,
    s=30.0,
    margin=0.50,
    ls_eps=0.0,
    theta_zero=0.785,
)

class ShopeeDataset(Dataset):
    """Dataset that tokenizes the ``title`` column of a frame for the BERT encoder.

    Each item is a pair ``(input_ids, attention_mask)`` produced by the
    module-level ``TOKENIZER`` with padding/truncation to 64 tokens.
    """

    def __init__(self, csv):
        # reset_index() gives positional 0..n-1 labels; the previous index is kept as a column.
        self.csv = csv.reset_index()

    def __len__(self):
        n_rows, _ = self.csv.shape
        return n_rows

    def __getitem__(self, index):
        title = self.csv.iloc[index].title
        encoded = TOKENIZER(title, padding='max_length', truncation=True, max_length=64, return_tensors="pt")
        # return_tensors="pt" adds a leading batch axis of size 1; [0] strips it.
        return encoded['input_ids'][0], encoded['attention_mask'][0]
    
class ShopeeNet(nn.Module):
    """Transformer encoder + linear projection, with an ArcFace head used when labels are given.

    Args:
        n_classes: Number of classes of the ArcMarginProduct head.
        model_name: Name or local path handed to ``AutoModel.from_pretrained``.
        pooling: Stored on the instance; not read by the visible code.
        use_fc: Stored on the instance. The ``fc`` projection is always applied
            in ``forward``, so this flag no longer changes the computation.
        fc_dim: Output width of the ``fc`` projection (the embedding size).
        dropout, s, margin, ls_eps, theta_zero: Accepted for signature
            compatibility but not used; the ArcMarginProduct head is built
            with its own default ``s`` / ``m``.
        loss_module: Stored on the instance; not read by the visible code.
    """

    def __init__(self,
                 n_classes,
                 model_name='bert-base-uncased',
                 pooling='mean_pooling',
                 use_fc=False,
                 fc_dim=512,
                 dropout=0.0,
                 loss_module='softmax',
                 s=30.0,
                 margin=0.50,
                 ls_eps=0.0,
                 theta_zero=0.785):

        super(ShopeeNet, self).__init__()

        # Fix: honour the caller-supplied model_name instead of a hard-coded path
        # (the notebook's model_params passes '../input/bert-base-uncased').
        self.transformer = transformers.AutoModel.from_pretrained(model_name)
        final_in_features = self.transformer.config.hidden_size

        self.pooling = pooling
        self.use_fc = use_fc
        self.fc = nn.Linear(final_in_features, fc_dim)
        self._init_params()
        self.loss_module = loss_module
        self.final = ArcMarginProduct(fc_dim, n_classes)

    def _init_params(self):
        """Xavier-normal weights and zero bias for the projection layer."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)

    def forward(self, input_ids,attention_mask, label=None):
        """Return L2-normalised rows: the projected embedding, or the ArcFace logits if ``label`` is given."""
        feature = self.extract_feat(input_ids,attention_mask)
        feature = self.fc(feature)
        if label is not None:
            logits = self.final(feature, label)
        else:
            logits = feature
        return F.normalize(logits,dim=1)

    def extract_feat(self, input_ids,attention_mask):
        """Run the transformer and return the hidden vector at sequence position 0.

        Fix: the former ``use_fc`` branch referenced ``self.dropout``,
        ``self.bn`` and ``self.relu`` (never created in ``__init__``) and
        applied ``self.fc`` a second time, so ``use_fc=True`` always raised.
        The projection now happens exactly once, in ``forward``.
        """
        x = self.transformer(input_ids=input_ids,attention_mask=attention_mask)
        # x[0]: first element of the model output; index 0 along the sequence axis
        # (presumably the [CLS] token for a BERT tokenizer -- confirm).
        return x[0][:,0,:]
    
def get_embedding_bert(df):
    """Embed every title in ``df`` with the fine-tuned ShopeeNet.

    Args:
        df: Frame with a ``title`` column (consumed through ShopeeDataset).

    Returns:
        float32 array of shape ``(len(df), model_params['fc_dim'])`` holding the
        L2-normalised embeddings, in row order.
    """
    test_dataset = ShopeeDataset(csv=df)
    loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=textcfg.BATCH_SIZE,
        shuffle=False,
        pin_memory=True,
        drop_last=False,
        num_workers=textcfg.NUM_WORKERS)
    model = ShopeeNet(**model_params).to(device)
    # map_location keeps loading consistent with the image checkpoints and lets a
    # GPU-saved checkpoint load when only a CPU is available.
    model.load_state_dict(torch.load('../input/shopee-weight/bertmodel_acc0.813.pt', map_location=device))
    model.eval()
    print('start collection')
    dim = model_params['fc_dim']
    # Collect per-batch arrays and concatenate once; np.append inside the loop
    # re-copied everything gathered so far on every batch (quadratic).
    # The leading empty float32 block preserves the result dtype and the
    # (0, dim) shape for an empty frame.
    chunks = [np.empty((0,dim),dtype='float32')]
    n_rows = 0
    with torch.no_grad():
        for idx,d in enumerate(loader):
            input_ids, attention_mask = d[0].to(device),d[1].to(device)
            outputs = model(input_ids,attention_mask)
            batch = outputs.cpu().detach().numpy()
            chunks.append(batch)
            n_rows += batch.shape[0]

            if idx%500==0:
                print(idx,len(loader))
                print((n_rows, dim))
    embedded = np.concatenate(chunks, axis=0)
    print(embedded.shape)
    return embedded

def f1_score(y_true, y_pred):
    """Row-wise F1 between two Series of space-separated id strings.

    Each string is split on whitespace and treated as a set; the score of a row
    is ``2 * |true & pred| / (|true| + |pred|)``.

    Returns:
        numpy array with one score per row.
    """
    true_sets = y_true.apply(lambda ids: set(ids.split()))
    pred_sets = y_pred.apply(lambda ids: set(ids.split()))
    shared = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)])
    sizes = pred_sets.apply(lambda s: len(s)).values + true_sets.apply(lambda s: len(s)).values
    return 2 * shared / sizes

def predict_bert(df,embeddings,topk=50,threshold=0.63):
    """Match postings by nearest-neighbour search over BERT embeddings.

    Builds a faiss ``IndexFlatL2`` over ``embeddings``, looks up the ``topk``
    nearest rows of every row and keeps those whose returned distance is below
    ``threshold``.

    Args:
        df: Frame with a ``posting_id`` column, row-aligned with ``embeddings``.
        embeddings: 2-D array, one row per posting.
        topk: Number of neighbours requested per row.
        threshold: Upper bound (exclusive) on the faiss distance of a match.

    Returns:
        ``df`` with a ``pred_bert`` column (array of posting_ids per row); when
        ``COMPUTE_CV`` is set, also ``pred_bertonly`` and ``f1_bert``.
    """
    _, dim = embeddings.shape
    index = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(dim))
    index.add(embeddings)
    distances, neighbours = index.search(x=embeddings, k=topk)

    df['pred_bert'] = ''
    matches = []
    for row in range(embeddings.shape[0]):
        close = np.where(distances[row,] < threshold)[0]
        matches.append(df['posting_id'].iloc[neighbours[row, close]].values)
    df['pred_bert'] = matches

    if not COMPUTE_CV:
        return df

    df['pred_bertonly'] = df.pred_bert.apply(lambda x: ' '.join(x))
    df['f1_bert'] = f1_score(df['target'], df['pred_bertonly'])
    score = df['f1_bert'].mean()
    print(f'Our f1 score for threshold {threshold} is {score}')
    return df

In [None]:
# Mode switch read throughout the notebook: True scores the pipeline on train.csv
# (labels available); False runs on test.csv and writes submission.csv.
COMPUTE_CV = False

#make target clustering
if COMPUTE_CV:
    df = pd.read_csv("../input/shopee-product-matching/train.csv")
    # Ground truth per row: the posting_ids that share the row's label_group,
    # joined into one space-separated string.
    tmp = df.groupby('label_group').posting_id.agg('unique').to_dict()
    df['target'] = df.label_group.map(tmp)
    df['target'] = df['target'].apply(lambda x: ' '.join(x))
    # GPU copy of the frame; get_text_predictions reads df_cu.title.
    df_cu = cudf.DataFrame(df)
else:
    df = pd.read_csv("../input/shopee-product-matching/test.csv")
    df_cu = cudf.DataFrame(df)
    # Shrink the image batch size when the test file has exactly 3 rows.
    if len(df)==3:
        cfg.batch = 3
    
print('df shape is', df.shape )
df.head()

# Use Image Embeddings

In [None]:
class ArcMarginProduct(nn.Module):
    """Additive angular margin (ArcFace-style) classification head.

    For the labelled class the logit is ``s * cos(theta + m)``; for every other
    class it is ``s * cos(theta)``, where ``theta`` is the angle between the
    L2-normalised input row and the L2-normalised class weight.

    Args:
        in_features: Size of each input embedding.
        out_features: Number of classes (rows of ``weight``).
        s: Scale applied to the logits.
        m: Angular margin in radians.
        easy_margin: If True, the margin is only applied where ``cos(theta) > 0``.
    """

    def __init__(self, in_features, out_features, s=30.0, m=0.30, easy_margin=False):
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.weight = Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        # Constants for cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m).
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        # For cos(theta) <= th, theta + m would pass pi; forward() then falls
        # back to the linear penalty cos(theta) - mm.
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, input, label):
        """Compute the margin-adjusted, scaled logits.

        Args:
            input: 2-D tensor of embeddings, one row per sample.
            label: Integer class index per sample (any shape that flattens to
                one entry per row of ``input``).

        Returns:
            Tensor with one row per sample and ``out_features`` columns.
        """
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------------
        # Fix: allocate the mask on the device of the logits rather than on the
        # notebook-global `device`, so the layer works wherever it was moved to.
        one_hot = torch.zeros(cosine.size(), device=cosine.device)
        one_hot.scatter_(1, label.view(-1, 1).long().to(cosine.device), 1)
        # Margin-adjusted value for the target class, plain cosine elsewhere.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s
        return output


class Model(nn.Module):
    """EfficientNet backbone + linear projection + ArcFace head for product images.

    Args:
        name: Architecture name handed to ``EfficientNet.from_name``.
        clustersize: Number of classes of the ArcFace head.
        feavec: Width of the projected embedding.
    """

    def __init__(self,name,clustersize,feavec=512):
        super(Model, self).__init__()
        self.eff = EfficientNet.from_name(name)
        # 1000 inputs: assumes the backbone keeps its 1000-way output layer -- TODO confirm.
        self.out = nn.Linear(1000, feavec)
        self.margin = ArcMarginProduct(
            in_features=feavec,
            out_features=clustersize,
            s=cfg.fea_norm,
            m=cfg.margin,
        )

    def forward(self, x, labels=None):
        """Return ArcFace logits when ``labels`` is given, else the L2-normalised embedding."""
        embedding = self.out(self.eff(x))
        if labels is None:
            return F.normalize(embedding, dim=1)
        return self.margin(embedding, labels)

class timmModel(nn.Module):
    """timm backbone + dropout + linear projection + ArcFace head for product images.

    Args:
        name: Architecture name handed to ``timm.create_model`` (no pretrained weights).
        clustersize: Number of classes of the ArcFace head.
        feavec: Width of the projected embedding.
    """

    def __init__(self,name,clustersize,feavec=512):
        super(timmModel, self).__init__()
        self.eff = timm.create_model(name, pretrained=False)
        # 1000 inputs: assumes the backbone keeps its 1000-way output layer -- TODO confirm.
        self.fc = nn.Linear(1000, feavec)
        self.dropout = nn.Dropout(p=0.3)
        self.margin = ArcMarginProduct(in_features=feavec, out_features=clustersize)
        # Registered, so it is part of the state_dict, but forward() does not apply it.
        self.bn = nn.BatchNorm1d(512)
        self._init_params()

    def _init_params(self):
        """Xavier-normal fc weights; fc bias 0; batch-norm weight 1 and bias 0."""
        nn.init.xavier_normal_(self.fc.weight)
        for tensor, value in ((self.fc.bias, 0), (self.bn.weight, 1), (self.bn.bias, 0)):
            nn.init.constant_(tensor, value)

    def forward(self, x, labels=None):
        """Return ArcFace logits when ``labels`` is given, else the L2-normalised embedding."""
        projected = self.fc(self.dropout(self.eff(x)))
        if labels is None:
            return F.normalize(projected, dim=1)
        return self.margin(projected, labels)

In [None]:
# Build the seven ensemble members and load their checkpoints. Entry i of
# cfg.mname / cfg.clsize / cfg.wpath belongs to model(i+1). Each network is moved
# to `device` and converted with .half() before its weights are loaded.
# Members 1-5 use the EfficientNet `Model` wrapper.
model1 = Model(name=cfg.mname[0],clustersize=cfg.clsize[0]).to(device).half()
model1.load_state_dict(torch.load(cfg.wpath[0], map_location=device))

model2 = Model(name=cfg.mname[1],clustersize=cfg.clsize[1]).to(device).half()
model2.load_state_dict(torch.load(cfg.wpath[1], map_location=device))

model3 = Model(name=cfg.mname[2],clustersize=cfg.clsize[2]).to(device).half()
model3.load_state_dict(torch.load(cfg.wpath[2], map_location=device))

model4 = Model(name=cfg.mname[3],clustersize=cfg.clsize[3]).to(device).half()
model4.load_state_dict(torch.load(cfg.wpath[3], map_location=device))

model5 = Model(name=cfg.mname[4],clustersize=cfg.clsize[4]).to(device).half()
model5.load_state_dict(torch.load(cfg.wpath[4], map_location=device))

# Members 6-7 use the timm-based wrapper.
model6 = timmModel(name=cfg.mname[5],clustersize=cfg.clsize[5]).to(device).half()
model6.load_state_dict(torch.load(cfg.wpath[5], map_location=device))

# NOTE(review): cfg.mname[6] is "efficientnet_b1" while cfg.wpath[6] is named "effb0" -- confirm they match.
model7 = timmModel(name=cfg.mname[6],clustersize=cfg.clsize[6]).to(device).half()
model7.load_state_dict(torch.load(cfg.wpath[6], map_location=device))

In [None]:
# make image Datasets
def load_image(file_name):
    """Read one product image and return it as a float tensor in channel-first layout.

    The file is looked up under ``train_images`` when ``COMPUTE_CV`` is set and
    under ``test_images`` otherwise, converted from BGR to RGB, resized to
    ``cfg.img_size`` and divided by 255.

    Args:
        file_name: File name relative to the image folder.

    Returns:
        Float tensor with the channel axis first.

    Raises:
        FileNotFoundError: If OpenCV cannot read the file.
    """
    folder = 'train_images' if COMPUTE_CV else 'test_images'
    file_path = f'/kaggle/input/shopee-product-matching/{folder}/{file_name}'

    img = cv2.imread(file_path)
    # cv2.imread signals a missing or undecodable file by returning None; fail
    # with the offending path instead of an opaque cvtColor assertion.
    if img is None:
        raise FileNotFoundError(f'cv2.imread could not read image: {file_path}')
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, cfg.img_size)
    tensor_img = torch.tensor(img)
    # Move the last (channel) axis to the front, then scale to floats.
    tensor_img = tensor_img.permute(( 2, 0, 1)).float()/255.0
    return tensor_img

class valDataset(Dataset):
    """Dataset over the ``image`` column of a frame; items are tensors from ``load_image``."""

    def __init__(self, df):
        # Keep only the file names; pixels are read lazily per item.
        self.img = df.image.values

    def __len__(self):
        return len(self.img)

    def __getitem__(self, idx):
        file_name = self.img[idx]
        return load_image(file_name)

In [None]:
def image_embeddings(df):
    """Embed every image in ``df`` with the seven-model ensemble.

    Uses the notebook-level ``model1`` .. ``model7``; each batch is converted
    with ``.half()`` before inference and the seven outputs are averaged.

    Args:
        df: Frame with an ``image`` column of file names.

    Returns:
        float32 array of shape ``(len(df), 512)`` with the averaged embeddings,
        in row order.
    """
    dataset = valDataset(df)
    loader = DataLoader(dataset,
                        batch_size=cfg.batch,
                        shuffle=False,
                        num_workers=2,
                        pin_memory=True,
                        drop_last=False)

    print('start collection')
    feavec = 512
    # Collect per-batch arrays and concatenate once; np.append inside the loop
    # re-copied everything gathered so far on every batch (quadratic).
    # The leading empty float32 block keeps the result float32 (as np.append
    # onto a float32 array did) and yields shape (0, feavec) for an empty frame.
    chunks = [np.empty((0,feavec),dtype='float32')]
    n_rows = 0
    with torch.no_grad():
        for idx,images in enumerate(loader):
            images = images.to(device,non_blocking=True).half()
            outputs1 = model1(images)
            outputs2 = model2(images)
            outputs3 = model3(images)
            outputs4 = model4(images)
            outputs5 = model5(images)
            outputs6 = model6(images)
            outputs7 = model7(images)
            outputs = (outputs1 + outputs2 + outputs3 + outputs4 + outputs5 + outputs6 + outputs7)/7
            batch = outputs.cpu().detach().numpy()
            chunks.append(batch)
            n_rows += batch.shape[0]

            if idx%100==0:
                print(idx,len(loader))
                print((n_rows, feavec))
    return np.concatenate(chunks, axis=0)

In [None]:
def f1_score(y_true, y_pred):
    """Per-row F1 of predicted vs. true id sets.

    Both arguments are Series of whitespace-separated id strings. For each row
    the ids are turned into sets and scored as
    ``2 * |true & pred| / (|pred| + |true|)``.

    Returns:
        numpy array of per-row scores.
    """
    def to_sets(series):
        return series.apply(lambda text: set(text.split()))

    truth, guess = to_sets(y_true), to_sets(y_pred)
    hits = np.array([len(pair[0] & pair[1]) for pair in zip(truth, guess)])
    guess_sizes = guess.apply(lambda members: len(members)).values
    truth_sizes = truth.apply(lambda members: len(members)).values
    return 2 * hits / (guess_sizes + truth_sizes)

def predict_img(df,embeddings,topk=50,threshold=0.63):
    """Match postings by nearest-neighbour search over image embeddings.

    A faiss ``IndexFlatL2`` is built over ``embeddings``; for every row the
    ``topk`` nearest rows are fetched and those with a returned distance below
    ``threshold`` are kept.

    Args:
        df: Frame with a ``posting_id`` column, row-aligned with ``embeddings``.
        embeddings: 2-D array, one row per posting.
        topk: Number of neighbours requested per row.
        threshold: Upper bound (exclusive) on the faiss distance of a match.

    Returns:
        ``df`` with a ``pred_images`` column (array of posting_ids per row);
        when ``COMPUTE_CV`` is set, also ``pred_imgonly`` and ``f1_img``.
    """
    n_rows, dim = embeddings.shape
    flat_index = faiss.IndexFlatL2(dim)
    searcher = faiss.index_cpu_to_all_gpus(flat_index)
    searcher.add(embeddings)
    dist, nbrs = searcher.search(x=embeddings, k=topk)

    df['pred_images'] = ''
    per_row = []
    for row in range(embeddings.shape[0]):
        keep = np.where(dist[row,] < threshold)[0]
        matched_rows = nbrs[row, keep]
        per_row.append(df['posting_id'].iloc[matched_rows].values)
    df['pred_images'] = per_row

    if not COMPUTE_CV:
        return df

    df['pred_imgonly'] = df.pred_images.apply(lambda x: ' '.join(x))
    df['f1_img'] = f1_score(df['target'], df['pred_imgonly'])
    score = df['f1_img'].mean()
    print(f'Our f1 score for threshold {threshold} is {score}')
    return df

def predict_text(df,embeddings,topk=50,threshold=0.63):
    """Match postings by nearest-neighbour search over text embeddings.

    A faiss ``IndexFlatL2`` is built over ``embeddings``; for every row the
    ``topk`` nearest rows are fetched and those with a returned distance below
    ``threshold`` are kept.

    Args:
        df: Frame with a ``posting_id`` column, row-aligned with ``embeddings``.
        embeddings: 2-D array, one row per posting.
        topk: Number of neighbours requested per row.
        threshold: Upper bound (exclusive) on the faiss distance of a match.

    Returns:
        ``df`` with a ``pred_text`` column (array of posting_ids per row); when
        ``COMPUTE_CV`` is set, also ``pred_textonly`` and ``f1_text``.
    """
    N,D = embeddings.shape
    cpu_index = faiss.IndexFlatL2(D)
    gpu_index = faiss.index_cpu_to_all_gpus(cpu_index)
    gpu_index.add(embeddings)
    cluster_distance,cluster_index = gpu_index.search(x=embeddings, k=topk)

    df['pred_text'] = ''
    pred = []
    for k in range(embeddings.shape[0]):
        idx = np.where(cluster_distance[k,] < threshold)[0]
        ids = cluster_index[k,idx]
        posting_ids = df['posting_id'].iloc[ids].values
        pred.append(posting_ids)
    df['pred_text'] = pred
    if COMPUTE_CV:
        # Fix: score the text predictions made above. The original joined
        # df.pred_images here (copy-paste from predict_img), so the reported
        # "text" F1 was the image F1, or a failure if pred_images did not exist.
        df['pred_textonly'] = df.pred_text.apply(lambda x: ' '.join(x))
        df['f1_text'] = f1_score(df['target'], df['pred_textonly'])
        score = df['f1_text'].mean()
        print(f'Our f1 score for threshold {threshold} is {score}')
    return df

# Use Text Embeddings

In [None]:
def get_text_predictions(df, max_features = 25000,threshold=0.7):
    """Match postings whose TF-IDF title vectors have a dot product above ``threshold``.

    Args:
        df: pandas frame with ``title`` and ``posting_id`` columns.
        max_features: Vocabulary size limit of the TF-IDF vectorizer.
        threshold: Lower bound (exclusive) on the similarity of a match.

    Returns:
        ``df`` with a ``pred_text`` column (array of posting_ids per row); when
        ``COMPUTE_CV`` is set, also ``pred_textonly`` and ``f1_text``.
    """
    from cuml.feature_extraction.text import TfidfVectorizer
    model = TfidfVectorizer(stop_words = 'english', binary = True, max_features = max_features)
    # Fix: vectorise the titles of the frame that was passed in. The original
    # read the notebook-global `df_cu`, so the result silently depended on that
    # global staying row-aligned with `df`.
    titles = cudf.from_pandas(df['title'])
    text_embeddings = model.fit_transform(titles).toarray()
    preds = []
    CHUNK = 1024*4

    print('Finding similar titles...')
    # Compare CHUNK rows at a time against all rows to bound the size of the
    # similarity block held in GPU memory.
    for a in range(0, len(df), CHUNK):
        b = min(a + CHUNK, len(df))
        print('chunk',a,'to',b)

        # COSINE SIMILARITY DISTANCE
        # (dot products; these are cosine similarities provided the vectorizer
        # L2-normalises rows, which is presumably its default -- confirm)
        cts = cupy.matmul( text_embeddings, text_embeddings[a:b].T).T

        for k in range(b-a):
            IDX = cupy.where(cts[k,]>threshold)[0]
            o = df.iloc[cupy.asnumpy(IDX)].posting_id.values
            preds.append(o)
    df['pred_text'] = preds
    # Release the dense TF-IDF matrix before later GPU work.
    del model,text_embeddings
    gc.collect()
    if COMPUTE_CV:
        df['pred_textonly'] = df.pred_text.apply(lambda x: ' '.join(x))
        df['f1_text'] = f1_score(df['target'], df['pred_textonly'])
        score = df['f1_text'].mean()
        print(f'Our f1 score for threshold {threshold} is {score}')
    return df

In [None]:
class textvalDataset(Dataset):
    """Dataset over a sequence of numeric rows; each item is returned as a float32 tensor."""

    def __init__(self, textlist):
        self.text = textlist

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        row = self.text[idx]
        return torch.tensor(row).float()

class Model(nn.Module):
    """Two stacked linear layers over TF-IDF title vectors, with an ArcFace head.

    NOTE: this reuses the name ``Model`` of the EfficientNet wrapper defined
    earlier in the notebook; once this cell has run, ``Model`` refers to this
    text model.

    Args:
        clustersize: Number of classes of the ArcFace head.
        feavec: Width of the output embedding.
    """

    def __init__(self,clustersize,feavec=512):
        super(Model, self).__init__()
        # 24939 matches max_features of the vectorizer in get_deeptext_predictions.
        self.linear1 = nn.Linear(24939, 4000)
        self.linear2 = nn.Linear(4000, feavec)
        self.dropout = nn.Dropout(p=0.5)
        # Registered but not applied in forward().
        self.relu = nn.ReLU()
        self.margin = ArcMarginProduct(
            in_features=feavec,
            out_features=clustersize,
            s=64,
            m=0.7,
        )

    def forward(self, x, labels=None):
        """Return ArcFace logits when ``labels`` is given, else the L2-normalised embedding."""
        hidden = self.dropout(self.linear2(self.linear1(x)))
        if labels is None:
            return F.normalize(hidden, dim=1)
        return self.margin(hidden, labels)
    

def get_deeptext_predictions(df):
    """Embed titles with two linear TF-IDF models (fold 1 and fold 2).

    The TF-IDF vocabulary is fitted on the titles of ``df`` together with those
    of train.csv, then only the rows of ``df`` are transformed and embedded.

    Args:
        df: pandas frame with a ``title`` column.

    Returns:
        Tuple ``(embedded1, embedded2)`` of float32 arrays of shape
        ``(len(df), 512)``, one per model, in row order.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    df_t = pd.read_csv("../input/shopee-product-matching/train.csv")
    vectorizer = TfidfVectorizer(stop_words = 'english', binary = True, max_features = 24939)
    vectorizer.fit(pd.concat([df,df_t],axis=0).title)
    text = vectorizer.transform(df.title).toarray()
    batch = 50
    if len(df)==3:
        batch=3
    test_dataset = textvalDataset(text)
    test_loader = DataLoader(test_dataset,
                            batch_size=batch,
                            shuffle=False,
                            num_workers=2,
                            pin_memory=True)
    model_t1 = Model(8811)
    model_t2 = Model(8811)
    model_t1 = model_t1.to(device)
    model_t2 = model_t2.to(device)
    # map_location keeps loading consistent with the image checkpoints and lets a
    # GPU-saved checkpoint load when only a CPU is available.
    model_t1.load_state_dict(torch.load('../input/shopee-weight/w_lin_e5_fold1.pt', map_location=device))
    model_t2.load_state_dict(torch.load('../input/shopee-weight/w_lin_e5_fold2.pt', map_location=device))
    model_t1.eval()
    model_t2.eval()
    print('start collection')
    # Collect per-batch arrays and concatenate once; np.append inside the loop
    # re-copied everything gathered so far on every batch (quadratic).
    chunks1 = [np.empty((0,512),dtype='float32')]
    chunks2 = [np.empty((0,512),dtype='float32')]
    n_rows = 0
    with torch.no_grad():
        for idx,(images) in enumerate(test_loader):
            images = images.to(device,non_blocking=True)
            out1 = model_t1(images).cpu().detach().numpy()
            out2 = model_t2(images).cpu().detach().numpy()
            chunks1.append(out1)
            chunks2.append(out2)
            n_rows += out1.shape[0]

            if idx%100==0:
                print(idx,len(test_loader))
                print((n_rows, 512))
                print((n_rows, 512))
    embedded1 = np.concatenate(chunks1, axis=0)
    embedded2 = np.concatenate(chunks2, axis=0)
    print(embedded1.shape,embedded2.shape)
    return embedded1,embedded2

In [None]:
#bert_embeddings = get_embedding_bert(df)

In [None]:
#text_embeddings = get_deeptext_predictions(df)
# Put every ensemble member into eval mode before extracting embeddings.
model1.eval()
model2.eval()
model3.eval()
model4.eval()
model5.eval()
model6.eval()
model7.eval()
# NOTE(review): this rebinds the name `image_embeddings` from the function to the
# array it returns, so re-running this cell without first re-running the cell
# that defines the function fails (an ndarray is not callable).
image_embeddings = image_embeddings(df)

In [None]:
# Drop the notebook's references to the seven image models.
del model1,model2,model3,model4,model5,model6,model7
# `del` alone leaves the freed blocks in PyTorch's CUDA caching allocator, where
# faiss and cupy/cuML (used in the following cells) cannot use them. Collect
# garbage and return the cached memory to the device.
gc.collect()
torch.cuda.empty_cache()

In [None]:
#text_embeddings1,text_embeddings2 = get_deeptext_predictions(df)

# Carry out image prediction

In [None]:
#text_embeddings = (text_embeddings1+text_embeddings2)/2
#img_text_embeddings = (image_embeddings + 0.4*text_embeddings)/1.4

In [None]:
# CV mode only: score image-only matching at distance threshold 0.137
# (predict_img prints the mean F1).
if COMPUTE_CV:
    df = predict_img(df,image_embeddings,topk=50,threshold=0.137)
    #df = predict_img(df,img_text_embeddings,topk=50,threshold=0.106)

In [None]:
# NOTE(review): "2,3,4" and "3,4" below presumably name the model subsets these
# thresholds were tuned for -- confirm.
#2,3,4
# Image matching used downstream: fills df['pred_images'] with the posting_ids
# whose faiss distance is below 0.11 (runs in both CV and submission mode).
df = predict_img(df,image_embeddings,topk=50,threshold=0.11)
#df = predict_bert(df,bert_embeddings,topk=50,threshold=0.54)
#3,4
#df = predict_img(df,image_embeddings,topk=50,threshold=0.30)

In [None]:
# Candidate distance thresholds for the CV sweep (variable keeps its original spelling).
theresholds=np.linspace(0.12,0.14,10)
if COMPUTE_CV:
    for threshold in theresholds:
        # Fix: sweep on a copy. predict_img writes df['pred_images'] in place, so
        # sweeping on `df` itself left the predictions of the LAST sweep threshold
        # (0.14) in the frame, and the final combined F1 no longer reflected the
        # 0.11 threshold chosen in the previous cell. The per-threshold score is
        # still printed by predict_img.
        predict_img(df.copy(),image_embeddings,topk=50,threshold=threshold)

# Carry out text predictions

In [None]:
# TF-IDF title matching: fills df['pred_text'] with the posting_ids whose title
# similarity exceeds 0.75.
df = get_text_predictions(df, max_features = 25000,threshold=0.75)
df.head()

# combine_predictions

In [None]:
# Map every image_phash value to the unique posting_ids that carry it.
tmp = df.groupby('image_phash').posting_id.agg('unique').to_dict()
# oof_hash: per row, the posting_ids sharing that row's image_phash
# (read by combine_predictions_addphash).
df['oof_hash'] = df.image_phash.map(tmp)

In [None]:
def combine_predictions(row):
    """Union of a row's image and text matches as one space-separated string.

    ``np.unique`` removes duplicates and returns the ids in sorted order.
    """
    merged = np.concatenate((row['pred_images'], row['pred_text']))
    unique_ids = np.unique(merged)
    return ' '.join(unique_ids)

def combine_predictions_addphash(row):
    """Union of a row's image, text, phash and BERT matches as one space-separated string.

    ``np.unique`` removes duplicates and returns the ids in sorted order.
    """
    sources = ('pred_images', 'pred_text', 'oof_hash', 'pred_bert')
    merged = np.concatenate([row[column] for column in sources])
    return ' '.join(np.unique(merged))

def combine_predictions_bert(row):
    """Union of a row's image and BERT matches as one space-separated string.

    ``np.unique`` removes duplicates and returns the ids in sorted order.
    """
    image_ids = row['pred_images']
    bert_ids = row['pred_bert']
    return ' '.join(np.unique(np.concatenate([image_ids, bert_ids])))

In [None]:
# Preview the frame with the prediction columns added so far.
df.head()

In [None]:
# Final matches per posting: union of image-based and TF-IDF-based predictions.
# (Image-only alternative kept from the original:
#  df['matches'] = df['pred_images'].apply(lambda x: ' '.join(x)) )
df['matches'] = df.apply(combine_predictions, axis=1)
if not COMPUTE_CV:
    # Submission mode: one "posting_id,matches" line per row, after a header line.
    with open('submission.csv', 'w') as outf:
        print('posting_id,matches', file=outf)
        for idnum, match in zip(df['posting_id'], df['matches']):
            print(f'{idnum},{match}', file=outf)
else:
    # CV mode: report the mean row-wise F1 against the label_group targets.
    df['f1'] = f1_score(df['target'], df['matches'])
    score = df['f1'].mean()
    print(f'Final f1 score is {score}')

In [None]:
# Sanity check of the written file. submission.csv is only produced in
# submission mode (COMPUTE_CV False); in CV mode reading it would fail, or show a
# stale file from an earlier run, so only read it back when it was just written.
if not COMPUTE_CV:
    df_t = pd.read_csv("submission.csv")
    print(df_t)