/
similarity.py
63 lines (52 loc) · 2.4 KB
/
similarity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# -*- coding: utf-8 -*-
"""
@author:XuMing(xuming624@qq.com)
@description:
Compute similarity:
1. Compute the similarity between two sentences
2. Retrieve the most similar sentences to a query from a corpus of documents.
"""
from typing import List, Union, Dict
class SimilarityABC:
    """
    Interface for similarity computation and search.

    In all instances, there is a corpus against which we want to perform the
    similarity search. For each similarity search, the input is a document or
    a corpus, and the output is the similarities to individual corpus documents.

    Note: this class does not derive from ``abc.ABC``, so it can be
    instantiated. The unimplemented methods raise ``NotImplementedError`` only
    when they are called; subclasses are expected to override them.
    ``search`` is already implemented and delegates to ``most_similar``.
    """

    def add_corpus(self, corpus: Union[List[str], Dict[str, str]]):
        """
        Extend the corpus with new documents.

        :param corpus: List[str] of documents, or Dict[str, str]
            (presumably corpus_id -> document text; confirm against the
            implementing subclass)
        :raises NotImplementedError: always, unless overridden by a subclass
        """
        raise NotImplementedError("cannot instantiate Abstract Base Class")

    def similarity(self, a: Union[str, List[str]], b: Union[str, List[str]]):
        """
        Compute similarity between two texts.

        :param a: list of str or str
        :param b: list of str or str
        :return: similarity score(s). The interface describes the result as a
            matrix with res[i][j] = cos_sim(a[i], b[j]) (a torch.Tensor in the
            original notes); the concrete type is decided by the subclass.
        :raises NotImplementedError: always, unless overridden by a subclass
        """
        raise NotImplementedError("cannot instantiate Abstract Base Class")

    def distance(self, a: Union[str, List[str]], b: Union[str, List[str]]):
        """
        Compute cosine distance between two texts.

        :param a: list of str or str
        :param b: list of str or str
        :return: distance value(s); the concrete type is decided by the subclass
        :raises NotImplementedError: always, unless overridden by a subclass
        """
        raise NotImplementedError("cannot instantiate Abstract Base Class")

    def most_similar(self, queries: Union[str, List[str], Dict[int, str]], topn: int = 10):
        """
        Find the topn most similar texts to the query against the corpus.

        :param queries: Dict[int(query_id), str(query_text)] or List[str] or str
        :param topn: int, number of results per query (default 10)
        :return: nested mapping {query_id: {corpus_id: similarity_score}, ...}
            (NOTE(review): key types depend on the subclass; confirm there)
        :raises NotImplementedError: always, unless overridden by a subclass
        """
        raise NotImplementedError("cannot instantiate Abstract Base Class")

    def search(self, queries: Union[str, List[str], Dict[int, str]], topn: int = 10):
        """
        Find the topn most similar texts to the query against the corpus.

        Alias for ``most_similar``: the call is forwarded unchanged, so
        overriding ``most_similar`` in a subclass is enough to make ``search``
        work as well.

        :param queries: Dict[int(query_id), str(query_text)] or List[str] or str
        :param topn: int, number of results per query (default 10)
        :return: whatever ``most_similar`` returns, i.e.
            {query_id: {corpus_id: similarity_score}, ...}
        """
        return self.most_similar(queries, topn=topn)