# About Notebook
RAPIDS (CuPy) computes cosine similarity quite quickly, but we can do it even faster in PyTorch. 
I love PyTorch: it is flexible and lets you do many amazing things.  
In this notebook we focus on a simple TF-IDF similarity calculation.

# Import Packages

In [None]:
import os
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn import functional as F

import gc
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer


# Utils

In [None]:
def read_dataset(data_dir='../input/shopee-product-matching'):
    """Load the test metadata and build the path of every test image.

    Args:
        data_dir: Directory that contains ``test.csv`` and the
            ``test_images/`` folder. The default is the location this
            notebook originally hard-coded, so ``read_dataset()`` behaves
            exactly as before.

    Returns:
        tuple: ``(df, image_paths)`` where ``df`` is the DataFrame read from
        ``test.csv`` and ``image_paths`` is a Series, aligned with ``df``,
        holding ``<data_dir>/test_images/<image>`` for each row.
    """
    df = pd.read_csv(f'{data_dir}/test.csv')
    # String + Series broadcasts the prefix over every file name in 'image'.
    image_paths = f'{data_dir}/test_images/' + df['image']

    return df, image_paths

In [None]:
def combine_predictions(row):
    """Merge a row's text and phash matches into one submission string.

    Args:
        row: Mapping-like row exposing the array-likes
            ``row['text_predictions']`` and ``row['phash']``.

    Returns:
        str: The distinct ids from both sources, in the sorted order
        produced by ``np.unique``, joined by single spaces.
    """
    merged = np.concatenate((row['text_predictions'], row['phash']))
    distinct_ids = np.unique(merged)  # de-duplicates and sorts
    return ' '.join(distinct_ids)

# Text Predictions

In [None]:
def get_text_predictions_torch(df, max_features=25_000, th=0.75,
                               chunk_size=1024, device=None):
    """Find, for every row of ``df``, the rows with a similar title.

    Titles are embedded with a binary TF-IDF vectorizer (English stop words
    removed, vocabulary capped at ``max_features``). With the vectorizer's
    default L2 normalisation the dot product of two embeddings is their
    cosine similarity. Similarities are computed chunk by chunk with a
    matrix product on ``device``.

    Args:
        df: DataFrame with a ``title`` column (text) and a ``posting_id``
            column.
        max_features: Maximum vocabulary size passed to ``TfidfVectorizer``.
        th: Similarity threshold; a row matches when its similarity is
            strictly greater than ``th``.
        chunk_size: Number of query rows compared against the whole matrix
            per matrix product. Must be positive.
        device: Torch device (string or ``torch.device``) for the
            computation. ``None`` selects ``'cuda:0'`` when CUDA is
            available and falls back to ``'cpu'`` otherwise.

    Returns:
        list: One NumPy array per row of ``df`` (in row order) holding the
        ``posting_id`` values of all rows whose similarity to that row
        exceeds ``th``.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError('chunk_size must be a positive integer')

    if device is None:
        device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    model = TfidfVectorizer(stop_words='english', binary=True,
                            max_features=max_features)
    sparse_embeddings = model.fit_transform(df['title'])
    n_rows = sparse_embeddings.shape[0]

    # Densify slice by slice straight into a float16 buffer. Assigning into
    # the buffer performs the same cast as ``.astype(np.float16)``, but the
    # full dense matrix is never materialised at the vectorizer's wider dtype.
    dense = np.empty(sparse_embeddings.shape, dtype=np.float16)
    for start in range(0, n_rows, chunk_size):
        stop = min(start + chunk_size, n_rows)
        dense[start:stop] = sparse_embeddings[start:stop].toarray()

    text_embeddings = torch.from_numpy(dense).to(device)
    if device.type != 'cuda':
        # Half-precision matmul is not available off-GPU in every PyTorch
        # version, so compute in float32 there (values are still the
        # float16-rounded embeddings).
        text_embeddings = text_embeddings.float()

    posting_ids = df['posting_id'].values
    preds = []
    for start in tqdm(range(0, n_rows, chunk_size)):
        stop = min(start + chunk_size, n_rows)
        # Row k of ``sims`` holds the similarity of query row ``start + k``
        # to every row of the dataset.
        sims = torch.matmul(text_embeddings, text_embeddings[start:stop].T).T
        # One device-to-host transfer per chunk instead of one per row.
        matches = (sims > th).cpu().numpy()
        for row_mask in matches:
            preds.append(posting_ids[row_mask])

    # Drop the large buffers before returning to keep peak memory down.
    del model, sparse_embeddings, dense, text_embeddings
    gc.collect()
    return preds

# Calculating Predictions

In [None]:
# Load the test metadata and the per-row image paths via read_dataset().
df,image_paths = read_dataset()
# Bare expression: shown as the cell output when run in a notebook.
df.head()

## Text TFIDF

In [None]:
# One array of matching posting_ids per row of df; the similarity threshold
# is left at the function's default.
text_predictions = get_text_predictions_torch(df, max_features=25_000)

## Phash

In [None]:
# Map every perceptual hash to the distinct posting_ids that share it, then
# attach that list to each row so identical-hash images match each other.
phash = df.groupby('image_phash')['posting_id'].unique().to_dict()
df['phash'] = df['image_phash'].map(phash)
df.head()


# Preparing Submission

In [None]:
# Attach the TF-IDF matches, merge them with the phash matches row by row,
# and write the two submission columns to disk.
df['text_predictions'] = text_predictions
df['matches'] = df.apply(combine_predictions, axis=1)
df.to_csv('submission.csv', columns=['posting_id', 'matches'], index=False)

# Notes
It is cool that we can use **fp16**:
* in this code we simply use `np.float16`, which becomes `torch.float16` after `torch.from_numpy`
* it consumes less memory
* it is faster  
In my experiments (2080 Ti), `get_text_predictions_torch` on CV runs in 0.25 versus 0.34 for RAPIDS (both measured without `TfidfVectorizer`).  
But speed is not the main thing: I don't like extra dependencies, and PyTorch code is more flexible. (I will tell you what kind of flexibility I mean if I win.)