# Cosine 

In [1]:
# default_exp cosine

In [2]:
# export
import numpy as np
import pandas as pd
from forgebox.category import Category

## Cosine Similarity

In [3]:
# export
class CosineSearch:
    """
    Build a index search on cosine distance

    cos = CosineSearch(base_array)
    idx_order = cos(vec)
    """

    def __init__(self, base):
        assert len(base.shape) == 2,\
            f"Base array has to be 2 dimentional, input is {len(base.shape)}"
        self.base = base
        self.base_norm = self.calc_base_norm(self.base)
        self.normed_base = self.base/self.base_norm[:, None]
        self.dim = self.base.shape[1]

    def __len__(self): return base.shape[0]

    @staticmethod
    def calc_base_norm(base: np.ndarray) -> np.ndarray:
        return np.sqrt(np.power(base, 2).sum(1))

    def search(self, vec: np.ndarray, return_similarity: bool = False):
        if return_similarity:
            similarity = (vec * self.normed_base /
                          (np.power(vec, 2).sum())).sum(1)
            order = similarity.argsort()[::-1]
            return order, similarity[order]
        return self(vec)

    def __call__(self, vec: np.ndarray) -> np.ndarray:
        """
        Return the order index of the closest vector to the furthest
        vec: an 1 dimentional vector
        """
        return (vec * self.normed_base).sum(1).argsort()[::-1]


class CosineSearchWithCategory(CosineSearch):
    """
    Combine with the category manager
    The class can return a dataframe with category information
    
    search_dataframe
    """

    def __init__(self, base: np.ndarray, category: np.ndarray):
        super().__init__(base)
        self.category = category
        assert len(self.category) >= len(self), "category number too small"

    def search_dataframe(
        self, vec, return_similarity=True
    ) -> pd.DataFrame:
        """
        return a dataframe from the closest
            category to the furthest
        """
        if return_similarity:
            idx, similarity = self.search(vec, return_similarity)
            return pd.DataFrame({
                "category": self.category.i2c[idx],
                "idx": idx,
                "similarity": similarity})
        idx = self.search(vec, return_similarity)
        return pd.DataFrame({
            "category": self.category.i2c[idx],
            "idx": idx})

## Test search

In [4]:
base = np.random.rand(50000,100)-.2
vec = base[200]

In [5]:
cosine = CosineSearch(base)

In [6]:
cosine(vec)

array([  200, 42154,  2439, ..., 45360, 13398,  9083])

In [7]:
cosine.search(vec, return_similarity=True)

(array([  200, 42154,  2439, ..., 45360, 13398,  9083]),
 array([0.22396417, 0.16316632, 0.16174796, ..., 0.06436053, 0.06367614,
        0.0608117 ]))

In [8]:
# cos_cat = CosineSearchWithCategory(base, Category(list(f"c{i}" for i in range(len(base)))))

In [9]:
%%time
for i in range(100):
    cosine(vec)

CPU times: user 1.21 s, sys: 147 ms, total: 1.36 s
Wall time: 1.37 s
