# Relevant Search

## Loading TMDB.json into local Elasticsearch

In [40]:
import importlib

# reload() needs the module to be bound already. This cell sits above the
# main import cell, so import my_utils here to keep a fresh-kernel
# "Run All" from failing with a NameError.
import my_utils

importlib.reload(my_utils)

Imported my_utils module


<module 'my_utils' from '/Users/heinss/private-projects/notes-relevant-search/Jupyter/my_utils.py'>

In [17]:
import my_utils
from my_utils import host, index, indexBaseUrl, headers
import requests  # HTTP lib
import json  # json parsing

# docker run --name elasticsearch --net elastic -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" -e "xpack.security.enabled=false" -t docker.elastic.co/elasticsearch/elasticsearch:8.6.1

In [18]:
def extract(path='tmdb.json'):
    """Load the TMDB movie dump from a JSON file.

    Args:
        path: Location of the JSON file. Defaults to ``'tmdb.json'``,
            resolved against the current working directory.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # The context manager closes the handle even if decoding fails. The
    # encoding is explicit so the result does not depend on the platform's
    # default locale.
    with open(path, encoding='utf-8') as f:
        return json.load(f)


def reindex(analysisSettings=None, mappingSettings=None, movieDict=None):
    """Drop and recreate the index, then bulk-load the given movies.

    Args:
        analysisSettings: Value placed under ``settings.index.analysis`` in
            the index-creation body. Defaults to an empty dict.
        mappingSettings: Value placed under ``mappings`` in the
            index-creation body. Defaults to an empty dict.
        movieDict: Mapping whose values are movie documents. Each document
            must have an ``"id"`` key, which is used as the document
            ``_id``. Defaults to an empty dict.

    Returns:
        The ``requests`` response of the ``_bulk`` call.

    Raises:
        requests.HTTPError: If creating the index fails.
    """
    # None sentinels instead of ``{}`` defaults, so no dict object is shared
    # between calls.
    if analysisSettings is None:
        analysisSettings = {}
    if mappingSettings is None:
        mappingSettings = {}
    if movieDict is None:
        movieDict = {}

    settings = {
        "settings": {
            "number_of_shards": 1,
            "index": {
                "analysis": analysisSettings,
            }
        },
        "mappings": mappingSettings
    }
    settingsJson = json.dumps(settings)
    print(settingsJson)

    # Deleting is best-effort: the index may simply not exist yet.
    requests.delete(host + index)

    # Creation must succeed. Otherwise the bulk request below would go ahead
    # without the settings and mappings requested here.
    createResponse = requests.put(host + index, data=settingsJson, headers=headers)
    createResponse.raise_for_status()

    # One action line plus one document line per movie, each newline-terminated.
    bulkLines = []
    for movie in movieDict.values():
        addCmd = {"index": {"_id": movie["id"]}}
        bulkLines.append(json.dumps(addCmd) + "\n")
        bulkLines.append(json.dumps(movie) + "\n")
    bulkMovies = "".join(bulkLines)

    response = requests.post(indexBaseUrl + "/_bulk",
                             data=bulkMovies, headers=headers)
    return response


# Initial load: index every movie with the default (empty) analysis and
# mapping settings.
movieDict = extract()
# Last expression of the cell, so the returned bulk response is displayed.
reindex(movieDict=movieDict)


{"settings": {"number_of_shards": 1, "index": {"analysis": {}}}, "mappings": {}}


<Response [200]>

## The search function

In [19]:
def search(query: dict) -> None:
    """Run a search request and show its hits.

    The query is handed to ``my_utils.getSearchHits`` and the result is
    passed on to ``my_utils.parseSearchHits``. Nothing is returned.

    Args:
        query: The search request body as a dict, e.g.
            ``{"query": {...}, "size": 15}``.
    """
    hits = my_utils.getSearchHits(query)
    my_utils.parseSearchHits(hits)


In [20]:
# Free-text input, exactly as a user would type it.
userSearch = 'basketball with cartoon aliens'

# Search both text fields; matches in the title are boosted by a factor of 10.
baseQuery = dict(
    query=dict(
        multi_match=dict(
            query=userSearch,
            fields=["title^10", "overview"],
        ),
    ),
)

# baseQuery is reused by later cells, so the result size is merged into a
# separate request dict.
query = my_utils.mergeDicts(baseQuery, dict(size=15))
search(query)


Num	RelevanceScore	Movie Title
1	85.6		Aliens
2	73.7		The Basketball Diaries
3	71.3		Cowboys & Aliens
4	61.1		Monsters vs Aliens
5	53.5		Aliens vs Predator: Requiem
6	53.5		Aliens in the Attic
7	45.2		Dances with Wolves
8	45.2		Friends with Benefits
9	45.2		Fire with Fire
10	45.2		Friends with Kids
11	39.6		Interview with the Vampire
12	39.6		From Russia With Love
13	39.6		Gone with the Wind
14	39.6		Just Go With It
15	39.6		My Week with Marilyn


### Validate API

In [21]:
def explain(query: dict) -> None:
    """Show how Elasticsearch interprets a query, using the validate API.

    Sends the query to ``<indexBaseUrl>/_validate/query?explain`` and hands
    the raw response text to ``my_utils.toJsonPrettyPrint``. Nothing is
    returned.

    Args:
        query: The request body as a dict; it is serialised with
            ``json.dumps`` before being sent.
    """
    url = indexBaseUrl + "/_validate/query?explain"
    response = requests.get(url, data=json.dumps(query), headers=headers)
    my_utils.toJsonPrettyPrint(response.text)

In [22]:
# Check how Elasticsearch expands the multi_match query in baseQuery.
explain(baseQuery)

{
  "_shards": {
    "total": 1,
    "successful": 1,
    "failed": 0
  },
  "valid": true,
  "explanations": [
    {
      "index": "tmdb",
      "valid": true,
      "explanation": "((title:basketball title:with title:cartoon title:aliens)^10.0 | (overview:basketball overview:with overview:cartoon overview:aliens))"
    }
  ]
}


### Analyze API

In [23]:
# Run a sample title through the built-in "standard" analyzer to see the
# tokens it produces.
analyzeQuery = dict(analyzer="standard", text="Fire with Fire")
analyzeUrl = f"{indexBaseUrl}/_analyze"
response = requests.get(analyzeUrl, headers=headers, data=json.dumps(analyzeQuery))
my_utils.toJsonPrettyPrint(response.text)

{
  "tokens": [
    {
      "token": "fire",
      "start_offset": 0,
      "end_offset": 4,
      "type": "<ALPHANUM>",
      "position": 0
    },
    {
      "token": "with",
      "start_offset": 5,
      "end_offset": 9,
      "type": "<ALPHANUM>",
      "position": 1
    },
    {
      "token": "fire",
      "start_offset": 10,
      "end_offset": 14,
      "type": "<ALPHANUM>",
      "position": 2
    }
  ]
}


## Re-indexing with the English analyzer

In [24]:
# Map both free-text fields as `text` with the built-in "english" analyzer.
mappingSettings = {
    "properties": {
        fieldName: {"type": "text", "analyzer": "english"}
        for fieldName in ("title", "overview")
    }
}

# Reload the dump and rebuild the index with the new mapping.
movieDict = extract()
reindex(movieDict=movieDict, mappingSettings=mappingSettings)

{"settings": {"number_of_shards": 1, "index": {"analysis": {}}}, "mappings": {"properties": {"title": {"type": "text", "analyzer": "english"}, "overview": {"type": "text", "analyzer": "english"}}}}


<Response [200]>

The field mapping confirms that `title` now uses the `english` analyzer:

In [25]:
# Ask the field-mapping endpoint what the index currently holds for `title`.
titleMappingUrl = indexBaseUrl + "/_mapping/field/title"
currentTitleMapping = requests.get(titleMappingUrl)
my_utils.toJsonPrettyPrint(currentTitleMapping.text)

{
  "tmdb": {
    "mappings": {
      "title": {
        "full_name": "title",
        "mapping": {
          "title": {
            "type": "text",
            "analyzer": "english"
          }
        }
      }
    }
  }
}


### Re-analyzing "Fire with Fire" with the new index

In [26]:
# No analyzer is named this time: passing "field" lets _analyze derive the
# analyzer from the mapping of `title`.
analyzeQuery = dict(field="title", text="Fire with Fire")
analyzeUrl = f"{indexBaseUrl}/_analyze"
response = requests.get(analyzeUrl, headers=headers, data=json.dumps(analyzeQuery))
my_utils.toJsonPrettyPrint(response.text)

{
  "tokens": [
    {
      "token": "fire",
      "start_offset": 0,
      "end_offset": 4,
      "type": "<ALPHANUM>",
      "position": 0
    },
    {
      "token": "fire",
      "start_offset": 10,
      "end_offset": 14,
      "type": "<ALPHANUM>",
      "position": 2
    }
  ]
}


"With" is now also removed from the title and overview tokens in the explanation:

In [27]:
# Same query as before the reindex; only the field analyzers have changed.
explain(baseQuery)

{
  "_shards": {
    "total": 1,
    "successful": 1,
    "failed": 0
  },
  "valid": true,
  "explanations": [
    {
      "index": "tmdb",
      "valid": true,
      "explanation": "((title:basketbal title:cartoon title:alien)^10.0 | (overview:basketbal overview:cartoon overview:alien))"
    }
  ]
}


In [28]:
# Re-run the same 15-hit search against the re-indexed data.
search(query)

Num	RelevanceScore	Movie Title
1	78.8		The Basketball Diaries
2	74.1		Alien
3	74.1		Aliens
4	74.1		Alien³
5	59.7		Cowboys & Aliens
6	59.7		Aliens in the Attic
7	59.7		Alien: Resurrection
8	50.0		Monsters vs Aliens
9	43.0		Aliens vs Predator: Requiem
10	43.0		AVP: Alien vs. Predator
11	12.9		Space Jam
12	 7.5		Grown Ups
13	 7.5		Speed Racer
14	 7.2		Semi-Pro
15	 7.2		The Flintstones


In [29]:
userSearch = 'basketball with cartoon aliens'
# Same multi_match as baseQuery, plus a request for per-hit score explanations.
explainSearchQuery = {
    "query": {
        "multi_match": {
            "query": userSearch,
            "fields": ["title^10", "overview"]
        }
    },
    # A real boolean (serialised as JSON `true`) instead of the string "true",
    # so the request does not rely on lenient string-to-boolean parsing.
    "explain": True,
    "size": 15
}

### Raw `_explanation` contents from hits

In [45]:
# Fetch the hits in this cell: it comes before the cell that originally
# assigned hitshits, so on a top-to-bottom run the name would not exist yet.
hitshits = my_utils.getSearchHitsHits(explainSearchQuery)
print(json.dumps(hitshits[0]['_explanation'], indent=2))

{
  "value": 78.76023,
  "description": "max of:",
  "details": [
    {
      "value": 78.76023,
      "description": "sum of:",
      "details": [
        {
          "value": 78.76023,
          "description": "weight(title:basketbal in 784) [PerFieldSimilarity], result of:",
          "details": [
            {
              "value": 78.76023,
              "description": "score(freq=1.0), computed as boost * idf * tf from:",
              "details": [
                {
                  "value": 22.0,
                  "description": "boost",
                  "details": []
                },
                {
                  "value": 7.6180873,
                  "description": "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
                  "details": [
                    {
                      "value": 1,
                      "description": "n, number of documents containing term",
                      "details": []
                    },
                    {

### Prettier output, optionally truncated with maxLevel

In [38]:
# Run the explain-enabled query. The result is indexed like a list of hit
# dicts below — presumably the `hits.hits` array; verify in my_utils.
hitshits = my_utils.getSearchHitsHits(explainSearchQuery)

In [34]:
# Show the top hit's title followed by its full score explanation as a tree.
my_utils.titleAndExplanation(hitshits[0])

title: The Basketball Diaries
└──78.76023 (max of:)
   └──78.76023 (sum of:)
      └──78.76023 (weight(title:basketbal in 784) [PerFieldSimilarity], result of:)
         └──78.76023 (score(freq=1.0), computed as boost * idf * tf from:)
            └──22.0 (boost)
            └──7.6180873 (idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:)
               └──1 (n, number of documents containing term)
               └──3051 (N, total number of documents with field)
            └──0.4699356 (tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:)
               └──1.0 (freq, occurrences of term within document)
               └──1.2 (k1, term saturation parameter)
               └──0.75 (b, length normalization parameter)
               └──2.0 (dl, length of field)
               └──2.1740413 (avgdl, average length of field)


In [41]:
# Same hit with a second argument of 5 — presumably the maxLevel depth limit
# (verify in my_utils); deeper nodes are rendered as "...".
my_utils.titleAndExplanation(hitshits[0], 5)

title: The Basketball Diaries
└──78.76023 (max of:)
   └──78.76023 (sum of:)
      └──78.76023 (weight(title:basketbal in 784) [PerFieldSimilarity], result of:)
         └──78.76023 (score(freq=1.0), computed as boost * idf * tf from:)
            └──22.0 (boost)
               ...
            └──7.6180873 (idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:)
               ...
            └──0.4699356 (tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:)
               ...
