# Calculate CV with only Pytorch
I had trouble installing RAPIDS on Colab, which led me to calculate CV with torch instead.

In this notebook, we use PyTorch on CUDA to calculate cosine similarity, which is quite fast!

The computation is as fast as cuML neighbors.

(Wish that rapids were easy to install on remote machines as well..)

![](https://i.imgflip.com/561676.jpg)

In [None]:
!pip install timm

In [None]:
# Preliminaries
from tqdm import tqdm
import math
import random
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Visuals and CV2
import cv2

# albumentations for augs
import albumentations
from albumentations.pytorch.transforms import ToTensorV2

#torch
import torch
import timm
import torch
import torch.nn as nn
from torch.nn import Parameter
from torch.nn import functional as F
from torch.utils.data import Dataset,DataLoader

import warnings
warnings.filterwarnings('ignore')

# Configuration

In [None]:
# Size passed to albumentations.Resize as (DIM[0], DIM[1]) in get_valid_transforms.
DIM = (512,512)

# DataLoader settings. TRAIN_BATCH_SIZE and EPOCHS are not referenced by the
# code visible in this notebook (only validation inference is run here).
NUM_WORKERS = 8
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
EPOCHS = 20
# Seed handed to seed_torch().
SEED = 42

# NOTE(review): hard-coded to CUDA, so the notebook expects a GPU runtime.
device = torch.device('cuda')

model_name = 'efficientnet_b3' #efficientnet_b0-b7
# Width of one embedding row; must equal the number of feature channels the
# backbone emits. 1536 presumably matches efficientnet_b3 - confirm whenever
# model_name is changed.
num_ch = 1536

# Utils

In [None]:
def seed_torch(seed=42):
    """Seed the Python, NumPy and PyTorch random generators with one value.

    Args:
        seed: Integer seed applied to every generator (default 42).
    """
    # Environment values are text, hence the str() conversion.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # stdlib random, NumPy, PyTorch (CPU) and PyTorch (CUDA), in that order.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Restrict cuDNN to deterministic algorithms.
    torch.backends.cudnn.deterministic = True
    
# Seed every random generator once, before any data or model code runs.
seed_torch(SEED)

In [None]:
def get_valid_transforms():
    """Build the preprocessing pipeline applied to validation images.

    Returns:
        An ``albumentations.Compose`` that resizes to ``DIM``, applies
        ``Normalize`` with its default arguments and converts the result
        with ``ToTensorV2``.
    """
    height, width = DIM[0], DIM[1]
    steps = [
        albumentations.Resize(height, width, always_apply=True),
        albumentations.Normalize(),
        ToTensorV2(p=1.0),
    ]
    return albumentations.Compose(steps)

# Dataset

In [None]:
class ShopeeDataset(Dataset):
    """Dataset yielding ``(image, label_group)`` pairs described by a DataFrame.

    Args:
        csv: DataFrame with at least ``filepath`` and ``label_group`` columns.
        transforms: Optional callable invoked as ``transforms(image=image)``
            whose result is indexed with ``'image'``.
    """

    def __init__(self, csv, transforms=None):
        # reset_index() relabels rows 0..N-1 (the previous index is kept as a
        # column), so positional and label-based lookups agree.
        self.csv = csv.reset_index()
        self.augmentations = transforms

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return self.csv.shape[0]

    def __getitem__(self, index):
        """Load, colour-convert and (optionally) transform the image at ``index``.

        Returns:
            Tuple ``(image, label)`` where ``label`` is ``row.label_group``
            wrapped in a tensor.

        Raises:
            FileNotFoundError: If OpenCV cannot read the file at ``row.filepath``.
        """
        row = self.csv.iloc[index]

        image = cv2.imread(row.filepath)
        if image is None:
            # cv2.imread reports failure by returning None instead of raising;
            # fail here with the offending path rather than inside cvtColor.
            raise FileNotFoundError(f'Could not read image: {row.filepath}')
        # OpenCV loads BGR; convert to RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.augmentations:
            image = self.augmentations(image=image)['image']

        return image, torch.tensor(row.label_group)

# Model

In [None]:
class Net(nn.Module):
    """Embedding network: a timm backbone followed by global average pooling.

    Args:
        model_name: Architecture name passed to ``timm.create_model``.
    """

    def __init__(self,
                 model_name='efficientnet_b0'):
        super().__init__()
        print('Building Model Backbone for {} model'.format(model_name))

        self.backbone = timm.create_model(model_name, pretrained=True)
        # Replace the head with no-ops; the backbone is then expected to hand
        # back an un-pooled feature map (assumes the timm model exposes
        # `classifier` and `global_pool` - true for the efficientnet family).
        self.backbone.classifier = nn.Identity()
        self.backbone.global_pool = nn.Identity()

        self.pooling = nn.AdaptiveAvgPool2d(1)

    def forward(self, x, label=None):
        """Return the embedding of ``x``.

        ``label`` is accepted for call-site compatibility but is not used, so
        it is optional and ``model(x)`` works.
        """
        return self.extract_feat(x)

    def extract_feat(self, x):
        """Run the backbone, average-pool to 1x1 and flatten to one row per sample."""
        x = self.backbone(x)
        # flatten(1) keeps the batch axis and, unlike view(batch_size, -1),
        # also works when the batch is empty.
        return self.pooling(x).flatten(1)

# Get embeddings

In [None]:
def get_img_emb(data_loader, model, criterion, device):
    """Run ``model.extract_feat`` over a loader and collect the embeddings.

    Args:
        data_loader: Loader yielding ``(image, target)`` batches. It must
            expose ``.dataset`` (total sample count) and ``len()`` (number of
            batches, for the progress bar).
        model: Module providing ``extract_feat(image)`` that returns one
            feature row per sample.
        criterion: Unused; kept so existing call sites keep working.
        device: Device the image batches are moved to before inference.

    Returns:
        ``numpy.ndarray`` of shape ``(len(data_loader.dataset), n_features)``
        where row ``k`` holds the embedding of the ``k``-th sample yielded.
    """
    model.eval()
    n_samples = len(data_loader.dataset)
    outs = None
    offset = 0
    with torch.no_grad():
        for batch in tqdm(data_loader, total=len(data_loader)):
            image = batch[0].to(device)
            # Inference
            feats = model.extract_feat(image).cpu().numpy()
            if outs is None:
                # Allocate once the feature width is known.
                outs = np.zeros([n_samples, feats.shape[1]])
            # Advance by the rows actually written: the last batch may be
            # shorter than the others, so `batch_index * batch_size` would
            # point at the wrong rows.
            n_rows = feats.shape[0]
            outs[offset:offset + n_rows] = feats
            offset += n_rows
    if outs is None:
        # The loader produced no batches.
        outs = np.zeros([n_samples, 0])
    return outs

# Setup dataloader

In [None]:
from sklearn.model_selection import GroupKFold
df_train = pd.read_csv("../input/shopee-product-matching/train.csv")
# Grouping by label_group keeps all rows of one group in the same fold.
skf = GroupKFold(5)
# -1 is a placeholder; every row is overwritten with its validation fold below.
df_train['fold'] = -1
for i, (train_idx, valid_idx) in enumerate(skf.split(X=df_train, groups=df_train['label_group'])):
    df_train.loc[valid_idx, 'fold'] = i
# train_df is another name for the same DataFrame object, not a copy.
train_df = df_train
df_train.tail()

# `data` is likewise an alias, so the new column also appears on df_train.
data = df_train
# Path of each image file on disk, built from the `image` column.
data['filepath'] = data['image'].apply(lambda x: os.path.join('../input/shopee-product-matching/train_images', x))
len(data)

In [None]:
# Rows of fold 0 form the validation split; all other folds form `train`.
fold = 0
# drop=True gives both splits a fresh 0..N-1 index.
train = data[data['fold']!=fold].reset_index(drop=True)
valid = data[data['fold']==fold].reset_index(drop=True)

In [None]:
# Validation images with the resize/normalize/to-tensor pipeline.
valid_dataset = ShopeeDataset(
    csv=valid,
    transforms=get_valid_transforms(),
)

# shuffle=False and drop_last=False: every row of `valid` is visited once, in
# order, so embedding row k can be matched back to valid row k in get_cv.
valid_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=VALID_BATCH_SIZE,
    num_workers=NUM_WORKERS,
    shuffle=False,
    pin_memory=True,
    drop_last=False,
)

In [None]:
# Defining Device
# (rebinds `device`, which the configuration cell already set to CUDA)
device = torch.device("cuda")

# Defining Model for specific fold
# Net builds the timm backbone named by `model_name` with pretrained=True.
model = Net(model_name)
model = model.to(device)

In [None]:
def getMetric(col):
    """Build a row-wise F1 scorer comparing ``row.target`` with ``row[col]``.

    Args:
        col: Key of the row entry that holds the predicted items.

    Returns:
        A function taking one row and returning
        ``2 * |target ∩ prediction| / (|target| + |prediction|)``.
    """
    def f1score(row):
        truth = row.target
        guess = row[col]
        # intersect1d yields the sorted unique items present in both inputs.
        shared = np.intersect1d(truth, guess)
        return 2 * len(shared) / (len(truth) + len(guess))
    return f1score

from sklearn.preprocessing import normalize
import gc

def get_cv(df, outs):
    """Score embeddings with a mean row-wise F1 over a grid of distance thresholds.

    For each threshold, every row's prediction is the set of ``posting_id``
    values whose cosine distance to that row is below the threshold; the
    prediction is compared with the ``posting_id`` values sharing the row's
    ``label_group``.

    Args:
        df: DataFrame with ``label_group`` and ``posting_id`` columns, whose
            row ``k`` corresponds to ``outs[k]``. It is modified in place:
            ``target``, ``preds`` and ``f1`` columns are added/overwritten
            (``preds`` and ``f1`` hold the values of the last threshold tried).
        outs: 2-D NumPy array of embeddings, one row per row of ``df``.

    Returns:
        The best mean F1 found over the threshold grid.
    """
    # start 0.2, step 0.1, stop 0.8; with a float step, rounding decides
    # whether the end point itself is part of the grid.
    thresholds = list(np.arange(0.2, 0.8, 0.1))
    scores = []

    # set target: all posting_ids that share the row's label_group
    group_members = df.groupby('label_group').posting_id.agg('unique').to_dict()
    df['target'] = df.label_group.map(group_members)

    # Normalize rows so that a dot product equals cosine similarity
    outsn = normalize(outs)

    # Use the GPU when there is one, otherwise fall back to the CPU.
    run_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    outsn_torch = torch.from_numpy(outsn).to(run_device)

    # Cosine distance for every pair, moved to NumPy once so the threshold
    # loop below does not convert tensors row by row.
    distances = (1 - torch.matmul(outsn_torch, outsn_torch.T)).cpu().numpy()
    del outsn_torch

    posting_ids = df.posting_id.values
    for threshold in thresholds:
        within = distances < threshold
        df["preds"] = [posting_ids[np.flatnonzero(row)] for row in within]
        df['f1'] = df.apply(getMetric("preds"), axis=1)
        score = df['f1'].mean()
        print(f'Our f1 score for threshold {threshold} is {score}')
        scores.append(score)

    # argmax returns the first maximum, i.e. the smallest best threshold.
    best_idx = int(np.argmax(scores))
    best_threshold = thresholds[best_idx]
    best_score = scores[best_idx]
    print(f'Our best score is {best_score} and has a threshold {best_threshold}')

    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return best_score

# Run CV

In [None]:
# get embeddings
outs = get_img_emb(valid_loader,model,None,device)

# calculate CV
best = get_cv(valid, outs)