# Relevant Search

## Loading TMDB.json into local Elasticsearch

In [70]:
import importlib
if my_utils is not None:
    importlib.reload(my_utils)

Imported my_utils module


In [6]:
import my_utils
from my_utils import host, index, indexBaseUrl, headers
import requests  # HTTP lib
import json  # json parsing

# docker run --name elasticsearch --net elastic -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" -e "xpack.security.enabled=false" -t docker.elastic.co/elasticsearch/elasticsearch:8.6.1

In [7]:
def extract():
    f = open('tmdb.json')
    if f:
        return json.loads(f.read())


def reindex(analysisSettings={}, mappingSettings={}, movieDict={}):
    settings = {
        "settings": {
            "number_of_shards": 1,
            "index": {
                "analysis": analysisSettings,
            }
        },
        "mappings": mappingSettings
    }

    settingsJson = json.dumps(settings)

    # print(mappingSettings)
    # if mappingSettings:
        # settings['mappings'] = json.dumps(mappingSettings)
        
    print(settingsJson)

    requests.delete(host + index)
    requests.put(host + index, data=settingsJson, headers=headers)

    bulkMovies = ""
    for id, movie in movieDict.items():
        addCmd = {"index": {"_id": movie["id"]}}
        bulkMovies += json.dumps(addCmd) + "\n" + json.dumps(movie) + "\n"

    response = requests.post(indexBaseUrl + "/_bulk",
                             data=bulkMovies, headers=headers)
    return response


movieDict = extract()
reindex(movieDict=movieDict)


{"settings": {"number_of_shards": 1, "index": {"analysis": {}}}, "mappings": {}}


<Response [200]>

## The search function

In [20]:
def search(query: str):
    hits = my_utils.getSearchHits(indexBaseUrl ,query)
    my_utils.parseSearchHits(hits)


In [21]:
userSearch = 'basketball with cartoon aliens'
baseQuery = {
    "query": {
        "multi_match": {
            "query": userSearch,
            "fields": ["title^10", "overview"]
        }
    }
}

query = my_utils.mergeDicts(baseQuery, {"size": 15})
search(query)


Num	RelevanceScore	Movie Title
1	78.8		The Basketball Diaries
2	74.1		Alien
3	74.1		Aliens
4	74.1		Alien³
5	59.7		Cowboys & Aliens
6	59.7		Aliens in the Attic
7	59.7		Alien: Resurrection
8	50.0		Monsters vs Aliens
9	43.0		Aliens vs Predator: Requiem
10	43.0		AVP: Alien vs. Predator
11	12.9		Space Jam
12	 7.5		Grown Ups
13	 7.5		Speed Racer
14	 7.2		Semi-Pro
15	 7.2		The Flintstones


### Validate API

In [32]:
def explain(query: str): 
    url = indexBaseUrl + "/_validate/query?explain"
    response = requests.get(url, data = json.dumps(query), headers=headers)
    my_utils.toJsonPrettyPrint(response.text)

In [33]:
explain(baseQuery)

{
  "_shards": {
    "total": 1,
    "successful": 1,
    "failed": 0
  },
  "valid": true,
  "explanations": [
    {
      "index": "tmdb",
      "valid": true,
      "explanation": "((overview:basketbal overview:cartoon overview:alien) | (title:basketbal title:cartoon title:alien)^10.0)"
    }
  ]
}


### Analyze API

In [34]:
analyzeQuery = {
    "analyzer": "standard",
    "text": "Fire with Fire"
}
response = requests.get(indexBaseUrl + "/_analyze", data=json.dumps(analyzeQuery), headers=headers)
my_utils.toJsonPrettyPrint(response.text)

{
  "tokens": [
    {
      "token": "fire",
      "start_offset": 0,
      "end_offset": 4,
      "type": "<ALPHANUM>",
      "position": 0
    },
    {
      "token": "with",
      "start_offset": 5,
      "end_offset": 9,
      "type": "<ALPHANUM>",
      "position": 1
    },
    {
      "token": "fire",
      "start_offset": 10,
      "end_offset": 14,
      "type": "<ALPHANUM>",
      "position": 2
    }
  ]
}


## Re-indexing with the English analyzer

In [35]:
mappingSettings = {
    "properties": {
        "title": {
            "type": "text",
            "analyzer": "english"
        },
        "overview": {
            "type": "text",
            "analyzer": "english"
        }
    }
}

movieDict=extract()
reindex(mappingSettings=mappingSettings, movieDict=movieDict)

{"settings": {"number_of_shards": 1, "index": {"analysis": {}}}, "mappings": {"properties": {"title": {"type": "text", "analyzer": "english"}, "overview": {"type": "text", "analyzer": "english"}}}}


<Response [200]>

We can see that the title now uses the english analyzer:

In [36]:
currentTitleMapping = requests.get(indexBaseUrl + "/_mapping/field/title")
my_utils.toJsonPrettyPrint(currentTitleMapping.text)

{
  "tmdb": {
    "mappings": {
      "title": {
        "full_name": "title",
        "mapping": {
          "title": {
            "type": "text",
            "analyzer": "english"
          }
        }
      }
    }
  }
}


### Re-analyzing "Fire with Fire" with the new index

In [37]:
analyzeQuery = {
    "field": "title",
    "text": "Fire with Fire"
}
response = requests.get(indexBaseUrl + "/_analyze", data=json.dumps(analyzeQuery), headers=headers)
my_utils.toJsonPrettyPrint(response.text)

{
  "tokens": [
    {
      "token": "fire",
      "start_offset": 0,
      "end_offset": 4,
      "type": "<ALPHANUM>",
      "position": 0
    },
    {
      "token": "fire",
      "start_offset": 10,
      "end_offset": 14,
      "type": "<ALPHANUM>",
      "position": 2
    }
  ]
}


"With" is now also removed from the title and overview tokens in the explanation:

In [38]:
explain(baseQuery)

{
  "_shards": {
    "total": 1,
    "successful": 1,
    "failed": 0
  },
  "valid": true,
  "explanations": [
    {
      "index": "tmdb",
      "valid": true,
      "explanation": "((overview:basketbal overview:cartoon overview:alien) | (title:basketbal title:cartoon title:alien)^10.0)"
    }
  ]
}


In [39]:
search(query)

Num	RelevanceScore	Movie Title


In [40]:
userSearch = 'basketball with cartoon aliens'
explainSearchQuery = {
    "query": {
        "multi_match": {
            "query": userSearch,
            "fields": ["title^10", "overview"]
        }
    },
    "explain": "true",
    "size": 15
}

### Raw `_explanation` contents from hits

In [42]:
hitshits = my_utils.getSearchHitsHits(indexBaseUrl, explainSearchQuery)

In [43]:
print(json.dumps(hitshits[0]['_explanation'], indent = 2))

{
  "value": 78.76023,
  "description": "max of:",
  "details": [
    {
      "value": 78.76023,
      "description": "sum of:",
      "details": [
        {
          "value": 78.76023,
          "description": "weight(title:basketbal in 1346) [PerFieldSimilarity], result of:",
          "details": [
            {
              "value": 78.76023,
              "description": "score(freq=1.0), computed as boost * idf * tf from:",
              "details": [
                {
                  "value": 22.0,
                  "description": "boost",
                  "details": []
                },
                {
                  "value": 7.6180873,
                  "description": "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
                  "details": [
                    {
                      "value": 1,
                      "description": "n, number of documents containing term",
                      "details": []
                    },
                    

### Prettier and with maxLevel

In [44]:
my_utils.titleAndExplanation(hitshits[0])

title: The Basketball Diaries
└──78.76023 (max of:)
   └──78.76023 (sum of:)
      └──78.76023 (weight(title:basketbal in 1346) [PerFieldSimilarity], result of:)
         └──78.76023 (score(freq=1.0), computed as boost * idf * tf from:)
            └──22.0 (boost)
            └──7.6180873 (idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:)
               └──1 (n, number of documents containing term)
               └──3051 (N, total number of documents with field)
            └──0.4699356 (tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:)
               └──1.0 (freq, occurrences of term within document)
               └──1.2 (k1, term saturation parameter)
               └──0.75 (b, length normalization parameter)
               └──2.0 (dl, length of field)
               └──2.1740413 (avgdl, average length of field)


In [45]:
my_utils.titleAndExplanation(hitshits[0], 5)

title: The Basketball Diaries
└──78.76023 (max of:)
   └──78.76023 (sum of:)
      └──78.76023 (weight(title:basketbal in 1346) [PerFieldSimilarity], result of:)
         └──78.76023 (score(freq=1.0), computed as boost * idf * tf from:)
            └──22.0 (boost)
               ...
            └──7.6180873 (idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:)
               ...
            └──0.4699356 (tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:)
               ...


In [46]:
spaceJamHit = hitshits[10]
my_utils.titleAndExplanation(spaceJamHit)

title: Space Jam
└──12.882349 (max of:)
   └──12.882349 (sum of:)
      └──7.8759747 (weight(overview:basketbal in 1357) [PerFieldSimilarity], result of:)
         └──7.8759747 (score(freq=1.0), computed as boost * idf * tf from:)
            └──2.2 (boost)
            └──5.8831587 (idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:)
               └──8 (n, number of documents containing term)
               └──3050 (N, total number of documents with field)
            └──0.60851467 (tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:)
               └──1.0 (freq, occurrences of term within document)
               └──1.2 (k1, term saturation parameter)
               └──0.75 (b, length normalization parameter)
               └──14.0 (dl, length of field)
               └──36.697704 (avgdl, average length of field)
      └──5.0063744 (weight(overview:alien in 1357) [PerFieldSimilarity], result of:)
         └──5.0063744 (score(freq=1.0), computed as boost * idf *

In [58]:
alienHit = hitshits[1]
my_utils.titleAndExplanation(alienHit)

title: Alien
└──74.090744 (max of:)
   └──3.3211904 (sum of:)
      └──3.3211904 (weight(overview:alien in 229) [PerFieldSimilarity], result of:)
         └──3.3211904 (score(freq=1.0), computed as boost * idf * tf from:)
            └──2.2 (boost)
            └──3.739638 (idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:)
               └──72 (n, number of documents containing term)
               └──3050 (N, total number of documents with field)
            └──0.40368396 (tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:)
               └──1.0 (freq, occurrences of term within document)
               └──1.2 (k1, term saturation parameter)
               └──0.75 (b, length normalization parameter)
               └──48.0 (dl, length of field (approximate))
               └──36.697704 (avgdl, average length of field)
   └──74.090744 (sum of:)
      └──74.090744 (weight(title:alien in 229) [PerFieldSimilarity], result of:)
         └──74.090744 (score(freq=1.0)

In [59]:
alienHit['_source']['overview']

'During its return to the earth, commercial spaceship Nostromo intercepts a distress signal from a distant planet. When a three-member team of the crew discovers a chamber containing thousands of eggs on the planet, a creature inside one of the eggs attacks an explorer. The entire crew is unaware of the impending nightmare set to descend upon them when the alien parasite planted inside its unfortunate host is birthed.'

In [57]:
spaceJamHit['_source']['overview']

'Michael Jordan agrees to help the Looney Tunes play a basketball game against alien slavers to determine their freedom.'

## Fixing our ranking for "Space Jam" vs. "Alien"

In [62]:
my_utils.titleAndExplanation(alienHit, 5)

title: Alien
└──74.090744 (max of:)
   └──3.3211904 (sum of:)
      └──3.3211904 (weight(overview:alien in 229) [PerFieldSimilarity], result of:)
         └──3.3211904 (score(freq=1.0), computed as boost * idf * tf from:)
            └──2.2 (boost)
               ...
            └──3.739638 (idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:)
               ...
            └──0.40368396 (tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:)
               ...
   └──74.090744 (sum of:)
      └──74.090744 (weight(title:alien in 229) [PerFieldSimilarity], result of:)
         └──74.090744 (score(freq=1.0), computed as boost * idf * tf from:)
            └──22.0 (boost)
               ...
            └──5.7722607 (idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:)
               ...
            └──0.5834389 (tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:)
               ...


In [63]:
my_utils.titleAndExplanation(spaceJamHit, 5)

title: Space Jam
└──12.882349 (max of:)
   └──12.882349 (sum of:)
      └──7.8759747 (weight(overview:basketbal in 1357) [PerFieldSimilarity], result of:)
         └──7.8759747 (score(freq=1.0), computed as boost * idf * tf from:)
            └──2.2 (boost)
               ...
            └──5.8831587 (idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:)
               ...
            └──0.60851467 (tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:)
               ...
      └──5.0063744 (weight(overview:alien in 1357) [PerFieldSimilarity], result of:)
         └──5.0063744 (score(freq=1.0), computed as boost * idf * tf from:)
            └──2.2 (boost)
               ...
            └──3.739638 (idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:)
               ...
            └──0.60851467 (tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:)
               ...


In [67]:
revisedExplainSearchQuery = {
    "query": {
        "multi_match": {
            "query": userSearch,
            "fields": ["title^0.1", "overview"]
        }
    },
    "explain": "true",
    "size": 15
}

search(revisedExplainSearchQuery)

Num	RelevanceScore	Movie Title
1	12.9		Space Jam
2	 7.5		Grown Ups
3	 7.5		Speed Racer
4	 7.2		Semi-Pro
5	 7.2		The Flintstones
6	 6.9		Coach Carter
7	 6.8		White Men Can't Jump
8	 5.8		Meet Dave
9	 5.8		Aliens vs Predator: Requiem
10	 5.4		Bedazzled
11	 5.3		High School Musical
12	 5.3		The Thing
13	 5.2		The Darkest Hour
14	 5.2		Invasion of the Body Snatchers
15	 5.1		Slither


In [68]:
revisedHitshits = my_utils.getSearchHitsHits(revisedExplainSearchQuery)

In [69]:
spaceJam = revisedHitshits[0]
my_utils.titleAndExplanation(spaceJamHit, 5)

title: Space Jam
└──12.882349 (max of:)
   └──12.882349 (sum of:)
      └──7.8759747 (weight(overview:basketbal in 1357) [PerFieldSimilarity], result of:)
         └──7.8759747 (score(freq=1.0), computed as boost * idf * tf from:)
            └──2.2 (boost)
               ...
            └──5.8831587 (idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:)
               ...
            └──0.60851467 (tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:)
               ...
      └──5.0063744 (weight(overview:alien in 1357) [PerFieldSimilarity], result of:)
         └──5.0063744 (score(freq=1.0), computed as boost * idf * tf from:)
            └──2.2 (boost)
               ...
            └──3.739638 (idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:)
               ...
            └──0.60851467 (tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:)
               ...
