In [1]:
import os
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn import functional as F
import transformers

import gc
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

## TfidfVectorizer model

In [2]:
def read_dataset(is_train=True):
    """Load one data split and build the matching image paths.

    :param is_train: truthy -> read ``train.csv`` / ``train_images/``,
        falsy -> read ``test.csv`` / ``test_images/``.
    :return: ``(df, image_paths)`` where ``image_paths`` is the ``image``
        column prefixed with the split's image directory.
    """
    split = 'train' if is_train else 'test'
    df = pd.read_csv(f'{split}.csv')
    # String-prefixing a Series yields one relative path per row.
    image_paths = f'{split}_images/' + df['image']
    return df, image_paths

In [3]:
def combine_predictions(row):
    """Merge a row's text and phash matches into one submission string.

    :param row: mapping/Series with array-like ``text_predictions`` and
        ``phash`` entries.
    :return: the de-duplicated (sorted by ``np.unique``) ids joined by spaces.
    """
    merged = np.concatenate((row['text_predictions'], row['phash']))
    unique_ids = np.unique(merged)
    return ' '.join(unique_ids)

In [4]:
def get_text_predictions_torch(df, max_features=25_000, th=0.75,
                               device='cuda:0', chunk_size=1024 * 4):
    """Find, for every title, the postings whose TF-IDF similarity exceeds ``th``.

    Titles are vectorised with a binary TF-IDF model and compared chunk by
    chunk on ``device`` so the full N x N similarity matrix is never
    materialised at once.

    :param df: frame with ``title`` and ``posting_id`` columns.
    :param max_features: vocabulary cap passed to ``TfidfVectorizer``.
    :param th: a posting is a match when its similarity is strictly above this.
    :param device: torch device the embeddings are moved to.
    :param chunk_size: number of query rows compared per matmul.
    :return: list with one array of matching ``posting_id`` values per row of
        ``df`` (in row order).
    """
    model = TfidfVectorizer(stop_words='english', binary=True, max_features=max_features)
    # float16 halves the memory of the dense matrix before it goes to the device.
    text_embeddings = model.fit_transform(df['title']).toarray().astype(np.float16)
    text_embeddings = torch.from_numpy(text_embeddings).to(device)

    # Hoisted out of the loop: indexing a plain array is far cheaper than
    # building a DataFrame slice with df.iloc for every single row.
    posting_ids = df['posting_id'].values
    preds = []

    print('Finding similar titles...')
    n_rows = len(df)
    for start in tqdm(range(0, n_rows, chunk_size)):
        stop = min(start + chunk_size, n_rows)

        # Dot products between the chunk and all rows. TfidfVectorizer
        # L2-normalises rows by default, so these act as cosine similarities.
        cts = torch.matmul(text_embeddings, text_embeddings[start:stop].T).T

        for k in range(stop - start):
            idx = torch.where(cts[k] > th)[0].cpu().numpy()
            preds.append(posting_ids[idx])

    # Release host and device memory held by the dense embedding matrix.
    del model, text_embeddings
    gc.collect()
    torch.cuda.empty_cache()
    return preds

In [5]:
# Load the training split (read_dataset defaults to is_train=True) and preview it.
df,image_paths = read_dataset()
df.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069


In [6]:
# TF-IDF neighbour search over all titles; the similarity threshold is left at the function default.
text_predictions = get_text_predictions_torch(df, max_features=25_000)

  0%|          | 0/9 [00:00<?, ?it/s]

Finding similar titles...


100%|██████████| 9/9 [00:26<00:00,  2.92s/it]


### phash

In [7]:
# Map each perceptual hash to the unique posting ids that share it, then
# attach that list to every row so identical-hash postings match each other.
phash = df.groupby('image_phash')['posting_id'].agg('unique').to_dict()
df['phash'] = df['image_phash'].map(phash)
df.head()

Unnamed: 0,posting_id,image,image_phash,title,label_group,phash
0,train_129225211,0000a68812bc7e98c42888dfb1c07da0.jpg,94974f937d4c2433,Paper Bag Victoria Secret,249114794,[train_129225211]
1,train_3386243561,00039780dfc94d01db8676fe789ecd05.jpg,af3f9460c2838f0f,"Double Tape 3M VHB 12 mm x 4,5 m ORIGINAL / DO...",2937985045,[train_3386243561]
2,train_2288590299,000a190fdd715a2a36faed16e2c65df7.jpg,b94cb00ed3e50f78,Maling TTS Canned Pork Luncheon Meat 397 gr,2395904891,[train_2288590299]
3,train_2406599165,00117e4fc239b1b641ff08340b429633.jpg,8514fc58eafea283,Daster Batik Lengan pendek - Motif Acak / Camp...,4093212188,[train_2406599165]
4,train_3369186413,00136d1cf4edede0203f32f05f660588.jpg,a6f319f924ad708c,Nescafe \xc3\x89clair Latte 220ml,3648931069,[train_3369186413]


### TfidfVectorizer + phash

In [8]:
# Attach the TF-IDF matches, merge them with the phash matches into a
# space-separated string per posting, and write the two-column submission file.
df['text_predictions'] = text_predictions
df['matches'] = df.apply(combine_predictions, axis=1)
df[['posting_id', 'matches']].to_csv('submission.csv', index=False)

Leaderboard (LB) score for this submission: 0.652

In [10]:
def getMetric(col):
    """Build a row-wise F1 scorer for the prediction column ``col``.

    The returned callable expects a row exposing ``target`` (ground-truth ids)
    and ``row[col]`` (predicted ids) and returns
    ``2 * |target ∩ pred| / (|target| + |pred|)``.
    """
    def f1score(row):
        actual = row.target
        predicted = row[col]
        n_shared = len(np.intersect1d(actual, predicted))
        return 2 * n_shared / (len(actual) + len(predicted))

    return f1score

def combine_for_cv(row):
    """Return the de-duplicated union of a row's phash and text matches.

    Unlike the submission variant, the ids stay in an array (not joined into a
    string) so they can be scored directly.
    """
    merged = np.concatenate((row['phash'], row['text_predictions']))
    return np.unique(merged)

# Local CV estimate for TF-IDF + phash: rebuild both prediction columns,
# merge them per row, and score against postings sharing the same label_group.
df['text_predictions'] = text_predictions
phash = df.groupby('image_phash')['posting_id'].agg('unique').to_dict()
df['phash'] = df['image_phash'].map(phash)
df['matches_CV'] = df.apply(combine_for_cv, axis=1)

# Ground truth: every posting id that shares this row's label_group.
tmp = df.groupby('label_group')['posting_id'].agg('unique').to_dict()
df['target'] = df['label_group'].map(tmp)

MyCVScore = df.apply(getMetric('matches_CV'), axis=1)
print('CV score =', MyCVScore.mean())

CV score = 0.6569883060104749


## Transformer

In [12]:
class CFG:
    """Static configuration shared by the transformer cells of this notebook."""

    batch_size = 16      # DataLoader batch size used when embedding titles
    seed = 42
    device = 'cuda'      # device the text model and batches are moved to
    classes = 11014      # single source of truth for the class count

    # NOTE(review): presumably the ArcFace scale/margin used at training time
    # (the checkpoint name mentions arcface); not read by the code shown here.
    scale = 30
    margin = 0.5

    CV = False           # toggles threshold search / scoring mode

    num_workers = 4      # DataLoader worker processes
    transformer_model = 'sentence-transformer-models/paraphrase-xlm-r-multilingual-v1/0_Transformer'
    text_model_path = 'best-multilingual-model/sentence_transfomer_xlm_best_loss_num_epochs_25_arcface.bin'

    # Keyword arguments forwarded to ShopeeTextNet(**model_params).
    # 'n_classes' reuses `classes` so the two values cannot drift apart.
    model_params = {
        'n_classes': classes,
        'model_name': transformer_model,
        'use_fc': False,
        'fc_dim': 512,
        'dropout': 0.3,
    }

In [14]:
# Module-level tokenizer loaded from CFG.transformer_model; ShopeeTextDataset.__getitem__ reads this global.
tokenizer = transformers.AutoTokenizer.from_pretrained(CFG.transformer_model)

In [None]:
class ShopeeTextDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenises the ``title`` column of a frame.

    The base class is referenced as ``torch.utils.data.Dataset`` because a
    bare ``Dataset`` name is never imported in this notebook.

    :param csv: DataFrame with a ``title`` column.
    :param max_length: fixed token length every title is padded/truncated to.
    """

    def __init__(self, csv, max_length=128):
        # reset_index gives positional rows 0..n-1 regardless of the caller's index.
        self.csv = csv.reset_index()
        self.max_length = max_length

    def __len__(self):
        return self.csv.shape[0]

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask)`` tensors for row ``index``."""
        row = self.csv.iloc[index]

        # Uses the module-level `tokenizer`; return_tensors="pt" adds a leading
        # batch dimension of 1, which the [0] below strips again.
        encoded = tokenizer(
            row.title,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        input_ids = encoded['input_ids'][0]
        attention_mask = encoded['attention_mask'][0]

        return input_ids, attention_mask
    
    
class ShopeeTextNet(nn.Module):
    """Transformer text encoder that outputs L2-normalised embeddings."""

    def __init__(self,
                 n_classes,
                 model_name='bert-base-uncased',
                 use_fc=False,
                 fc_dim=512,
                 dropout=0.0):
        """
        :param n_classes: accepted so existing ``**model_params`` calls keep
            working; not used by this inference-only module.
        :param model_name: name or path handed to
            ``transformers.AutoModel.from_pretrained``.
        :param use_fc: when True, apply dropout -> linear -> batch-norm on top
            of the transformer feature.
        :param fc_dim: output width of the optional linear layer.
        :param dropout: dropout probability used in the optional head.
        """
        super().__init__()

        self.transformer = transformers.AutoModel.from_pretrained(model_name)
        self.use_fc = use_fc

        if use_fc:
            hidden_size = self.transformer.config.hidden_size
            self.dropout = nn.Dropout(p=dropout)
            self.fc = nn.Linear(hidden_size, fc_dim)
            self.bn = nn.BatchNorm1d(fc_dim)
            self._init_params()

    def _init_params(self):
        """Initialise the optional head: Xavier-normal weights, zero biases, unit BN scale."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, input_ids, attention_mask):
        """Return the extracted features normalised with ``F.normalize`` defaults."""
        feature = self.extract_feat(input_ids, attention_mask)
        return F.normalize(feature)

    def extract_feat(self, input_ids, attention_mask):
        """Run the transformer and return the (optionally projected) first-token feature."""
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)

        # outputs[0] is indexed as (batch, token, hidden); keep only token 0.
        features = outputs[0][:, 0, :]

        if self.use_fc:
            features = self.dropout(features)
            features = self.fc(features)
            features = self.bn(features)

        return features

In [None]:
def get_text_embeddings(df):
    """Embed every title in ``df`` with the fine-tuned text model.

    :param df: frame with a ``title`` column (consumed by ShopeeTextDataset).
    :return: NumPy array with one embedding row per row of ``df``, in order.
    """
    model = ShopeeTextNet(**CFG.model_params)

    # CFG defines the checkpoint path as `text_model_path` (lower case).
    # Loading onto the CPU first avoids depending on the device the checkpoint
    # was saved from; the model is moved to CFG.device afterwards.
    state_dict = torch.load(CFG.text_model_path, map_location='cpu')
    # The last checkpoint entry is dropped before loading.
    # NOTE(review): presumably the training-only classification-head weight
    # that ShopeeTextNet does not define; confirm against the training code.
    model.load_state_dict(dict(list(state_dict.items())[:-1]))
    model = model.to(CFG.device)
    model.eval()

    text_dataset = ShopeeTextDataset(df)
    text_loader = torch.utils.data.DataLoader(
        text_dataset,
        batch_size=CFG.batch_size,
        pin_memory=True,
        drop_last=False,   # keep the final partial batch so every row is embedded
        num_workers=CFG.num_workers
    )

    embeds = []
    with torch.no_grad():
        for input_ids, attention_mask in tqdm(text_loader):
            # Use CFG.device for the batches too, matching where the model lives.
            input_ids = input_ids.to(CFG.device)
            attention_mask = attention_mask.to(CFG.device)
            feat = model(input_ids, attention_mask)
            embeds.append(feat.detach().cpu().numpy())

    del model
    text_embeddings = np.concatenate(embeds)
    print(f'Our text embeddings shape is {text_embeddings.shape}')
    del embeds
    gc.collect()
    torch.cuda.empty_cache()
    return text_embeddings

In [None]:
def f1_score(y_true, y_pred):
    """Row-wise F1 between two Series of space-separated id strings.

    :param y_true: Series of ground-truth id strings.
    :param y_pred: Series of predicted id strings.
    :return: NumPy array with ``2 * |true ∩ pred| / (|pred| + |true|)`` per row.
    """
    true_sets = y_true.apply(lambda ids: set(ids.split()))
    pred_sets = y_pred.apply(lambda ids: set(ids.split()))

    overlap = np.array([len(t & p) for t, p in zip(true_sets, pred_sets)])
    pred_sizes = pred_sets.apply(len).values
    true_sizes = true_sets.apply(len).values

    return 2 * overlap / (pred_sizes + true_sizes)

In [None]:
def _neighbours_above_threshold(embeddings, posting_ids, threshold, chunk=1024 * 4, verbose=False):
    '''Return one array of posting ids per row whose similarity exceeds ``threshold``.'''
    preds = []
    total = len(embeddings)
    for a in range(0, total, chunk):
        b = min(a + chunk, total)
        if verbose:
            print('chunk', a, 'to', b)

        # Similarities between rows a..b and every row, computed in one matmul.
        cts = torch.matmul(embeddings, embeddings[a:b].T).T

        for k in range(b - a):
            idx = torch.where(cts[k] > threshold)[0].cpu().numpy()
            preds.append(posting_ids[idx])
    return preds


def get_neighbours_cos_sim(df, embeddings, threshold=0.6):
    '''
    Threshold-based neighbour search on the embedding matrix.

    When using cos_sim use normalized features else use normal features.

    The search runs with torch on ``CFG.device`` (the file never imports
    cupy, and torch is already used for the TF-IDF search).

    :param df: frame with a ``posting_id`` column. In CV mode it must also
        provide ground truth, either as a ``matches`` column of space-joined
        ids or as a ``label_group`` column to derive it from.
    :param embeddings: array-like with one embedding row per row of ``df``.
    :param threshold: similarity cut-off used when ``CFG.CV`` is False.
    :return: ``(df, preds)`` where ``preds`` holds one array of matching
        posting ids per row. With ``CFG.CV`` True the candidate thresholds
        0.5..0.65 are scored, ``pred_matches``/``f1`` columns are written to
        ``df`` for the best one, and ``preds`` are that threshold's matches.
    '''
    embeddings = torch.as_tensor(embeddings, device=CFG.device)
    posting_ids = df['posting_id'].values

    if CFG.CV:
        # Ground truth as space-joined ids, the format f1_score expects.
        if 'matches' in df.columns:
            truth = df['matches']
        elif 'label_group' in df.columns:
            groups = df.groupby('label_group')['posting_id'].agg('unique').to_dict()
            truth = df['label_group'].map(groups).apply(' '.join)
        else:
            raise KeyError("CV mode needs a 'matches' or 'label_group' column in df")

        thresholds = list(np.arange(0.5, 0.7, 0.05))
        scores = []
        preds = None
        best_seen = None
        for candidate in thresholds:
            print('Finding similar titles...for threshold :', candidate)
            candidate_preds = _neighbours_above_threshold(embeddings, posting_ids, candidate)

            pred_matches = pd.Series([' '.join(o) for o in candidate_preds], index=df.index)
            f1 = f1_score(truth, pred_matches)
            score = f1.mean()
            print(f'Our f1 score for threshold {candidate} is {score}')
            scores.append(score)

            # Keep the arrays of the first best-scoring threshold so callers get
            # the same structure as in non-CV mode.
            if preds is None or score > best_seen:
                best_seen = score
                preds = candidate_preds
                df['pred_matches'] = pred_matches
                df['f1'] = f1

        thresholds_scores = pd.DataFrame({'thresholds': thresholds, 'scores': scores})
        max_score = thresholds_scores[thresholds_scores['scores'] == thresholds_scores['scores'].max()]
        best_threshold = max_score['thresholds'].values[0]
        best_score = max_score['scores'].values[0]
        print(f'Our best score is {best_score} and has a threshold {best_threshold}')

    else:
        print('Finding similar texts...for threshold :', threshold)
        preds = _neighbours_above_threshold(embeddings, posting_ids, threshold, verbose=True)

    # Release the device copy of the embedding matrix.
    del embeddings
    torch.cuda.empty_cache()
    return df, preds

In [None]:
# read_dataset() returns a (df, image_paths) pair, so unpack exactly two names.
df, image_paths = read_dataset()
df.head()

In [None]:
# Embed every title with the fine-tuned transformer; one embedding row per row of df.
text_embeddings = get_text_embeddings(df)

In [None]:
# Threshold-based neighbour search on the transformer embeddings (default threshold of the function).
df, text_predictions = get_neighbours_cos_sim(df, text_embeddings)

### CV Score for transformer

In [None]:
# Local CV estimate for transformer + phash: merge both prediction sources
# per row and score against postings sharing the same label_group.
df['text_predictions'] = text_predictions
phash = df.groupby('image_phash')['posting_id'].agg('unique').to_dict()
df['phash'] = df['image_phash'].map(phash)
df['matches_CV'] = df.apply(combine_for_cv, axis=1)

# Ground truth: every posting id that shares this row's label_group.
tmp = df.groupby('label_group')['posting_id'].agg('unique').to_dict()
df['target'] = df['label_group'].map(tmp)

MyCVScore = df.apply(getMetric('matches_CV'), axis=1)
print('CV score =', MyCVScore.mean())