In [36]:
import sys

from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
import pandas as pd
from tqdm import tqdm

# Add the parent directory to the import path so the project-local `src`
# package can be imported from this notebook.
sys.path.append("..")
from src.config import settings

# Register tqdm's pandas integration; `progress_apply` is used further down.
tqdm.pandas()

In [74]:
def get_correct_sentiment_rate(retrieved_docs: list, sentiment: str) -> float:
    """Return the fraction of retrieved hits whose sentiment equals ``sentiment``.

    Args:
        retrieved_docs: Search hits. Each hit is read as
            ``hit["_source"]["sentiment"]``.
        sentiment: The sentiment label the hits are compared against.

    Returns:
        Number of matching hits divided by the number of hits, or ``0.0`` when
        ``retrieved_docs`` is empty (nothing retrieved means nothing correct).

    Raises:
        KeyError: If a hit lacks ``"_source"`` or ``"sentiment"``.
    """
    # Guard against a search that returned no hits; dividing by len() would
    # otherwise raise ZeroDivisionError.
    if not retrieved_docs:
        return 0.0

    matches = sum(
        1 for hit in retrieved_docs if hit["_source"]["sentiment"] == sentiment
    )
    return matches / len(retrieved_docs)


def elastic_search_knn(
    query: str,
    es_client: Elasticsearch,
    sentence_transformer: SentenceTransformer,
    k: int = 5,
    num_candidates: int = 10000,
) -> list:
    """Run a kNN vector search for ``query`` and return the matching hits.

    The query text is embedded with ``sentence_transformer.encode`` and sent as
    the ``query_vector`` of a ``knn`` search on the ``review_vector`` field of
    the index named by ``settings.index_name``.

    Args:
        query: Text to embed and search for.
        es_client: Elasticsearch client used to execute the search.
        sentence_transformer: Model whose ``encode`` method produces the
            query vector.
        k: Number of nearest neighbours requested (default 5, the value that
            was previously hard-coded).
        num_candidates: Value sent as the ``num_candidates`` kNN option
            (default 10000, the value that was previously hard-coded).

    Returns:
        The list found at ``response["hits"]["hits"]``; each hit's ``_source``
        is restricted to ``review``, ``score``, ``sentiment`` and ``id``.
    """
    vector = sentence_transformer.encode(query)
    knn = {
        "field": "review_vector",
        "query_vector": vector,
        "k": k,
        "num_candidates": num_candidates,
    }
    response = es_client.search(
        index=settings.index_name,
        body={
            "knn": knn,
            # Request k hits explicitly so a larger k is not truncated by the
            # search API's default page size (NOTE(review): default is
            # believed to be 10 -- confirm against the Elasticsearch docs).
            "size": k,
            "_source": ["review", "score", "sentiment", "id"],
        },
    )

    return response["hits"]["hits"]


def elastic_search_cosine_similarity(
    query: str,
    es_client: Elasticsearch,
    sentence_transformer: SentenceTransformer,
    size: int = 5,
) -> list:
    """Score every document by cosine similarity to ``query`` and return the top hits.

    The query text is embedded with ``sentence_transformer.encode``; a
    ``script_score`` query over ``match_all`` then scores each document in the
    index named by ``settings.index_name`` against its ``review_vector`` field.

    Args:
        query: Text to embed and compare against the stored vectors.
        es_client: Elasticsearch client used to execute the search.
        sentence_transformer: Model whose ``encode`` method produces the
            query vector.
        size: Number of hits to request (default 5, the value that was
            previously hard-coded).

    Returns:
        The list found at ``response["hits"]["hits"]``; each hit's ``_source``
        is restricted to ``review``, ``score``, ``sentiment`` and ``id``.
    """
    vector = sentence_transformer.encode(query)
    script = {
        # The script adds 1.0 to the cosine similarity -- presumably to keep
        # scores non-negative, since cosine similarity can be negative.
        "source": "cosineSimilarity(params.query_vector, 'review_vector') + 1.0",
        "params": {"query_vector": vector},
    }
    response = es_client.search(
        index=settings.index_name,
        body={
            "query": {
                "script_score": {
                    "query": {"match_all": {}},
                    "script": script,
                }
            },
            "size": size,
            "_source": ["review", "score", "sentiment", "id"],
        },
    )
    return response["hits"]["hits"]


def elastic_search_query(
    query: str, es_client: Elasticsearch, size: int = 5
) -> list:
    """Run a full-text search for ``query`` on the ``review`` field.

    Sends a ``bool``/``must`` query wrapping a ``best_fields`` ``multi_match``
    against the index named by ``settings.index_name``.

    Args:
        query: Text to match against the ``review`` field.
        es_client: Elasticsearch client used to execute the search.
        size: Number of hits to request (default 5, the value that was
            previously hard-coded).

    Returns:
        The list found at ``response["hits"]["hits"]``; each hit's ``_source``
        is restricted to ``review``, ``score``, ``sentiment`` and ``id``.
    """
    text_match = {
        "multi_match": {
            "query": query,
            "fields": ["review"],
            "type": "best_fields",
        }
    }
    response = es_client.search(
        index=settings.index_name,
        body={
            "query": {"bool": {"must": text_match}},
            "size": size,
            "_source": ["review", "score", "sentiment", "id"],
        },
    )
    return response["hits"]["hits"]

In [5]:
# Elasticsearch client pointed at the node on localhost:9200; passed to the
# search helpers defined in this notebook.
es_client = Elasticsearch(["http://localhost:9200"])
# Embedding model; the model name is read from the project settings.
sentence_transformer = SentenceTransformer(settings.sentence_transformer_model)



In [44]:
# Single-query check: send one query string through the kNN search.
query = "I love this movie!"

results = elastic_search_knn(query, es_client, sentence_transformer)
# Fraction of the returned hits whose stored sentiment is "positive".
get_correct_sentiment_rate(results, "positive")

1.0

In [30]:
# Evaluation data; the "review" and "sentiment" columns are read from it below.
reviews_eval = pd.read_csv("../data/valid.csv")

In [45]:
# For every evaluation row, run a kNN search with the row's review text and
# store the fraction of returned hits whose sentiment equals the row's own
# sentiment. This adds a "correct_sentiment_rate" column to reviews_eval and
# issues one search per row, with progress shown by tqdm.
reviews_eval["correct_sentiment_rate"] = reviews_eval.progress_apply(
    lambda x: get_correct_sentiment_rate(
        elastic_search_knn(x["review"], es_client, sentence_transformer), x["sentiment"]
    ),
    axis=1,
)
# Mean of the per-row rates across the evaluation set.
reviews_eval["correct_sentiment_rate"].mean()

100%|██████████| 2500/2500 [02:01<00:00, 20.64it/s]
