# Relevant Search

## Loading TMDB.json into local Elasticsearch

In [1]:
import requests  # HTTP lib
import json  # json parsing

# A local single-node Elasticsearch must be listening on port 9200 before any
# cell below is run; the command used for this notebook was:
# docker run --name elasticsearch --net elastic -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" -e "xpack.security.enabled=false" -t docker.elastic.co/elasticsearch/elasticsearch:8.6.1
# Base URL of the node; note the trailing slash, which the concatenation below relies on.
host = "http://localhost:9200/"
# Name of the index that every request in this notebook targets.
index = "tmdb"
# Root URL of the index; endpoint paths such as "/_search" are appended to it.
indexBaseUrl = host + index
# Sent with every request that carries a JSON body.
headers = {"Content-Type": "application/json"}

In [2]:
def toJsonPrettyPrint(response):
    """Print a JSON document with two-space indentation.

    Args:
        response: JSON text, e.g. the ``.text`` of an HTTP response.
    """
    parsed = json.loads(response)
    pretty = json.dumps(parsed, indent=2)
    print(pretty)

In [101]:
def extract(path='tmdb.json'):
    """Load the movie collection from a JSON file.

    Args:
        path: Location of the JSON file. Defaults to ``tmdb.json`` in the
            current working directory.

    Returns:
        The decoded JSON document. ``reindex`` iterates it with ``.items()``,
        so it is expected to be a dict.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # The context manager closes the handle even if decoding fails, and the
    # explicit encoding keeps non-ASCII titles intact regardless of the
    # platform's default locale.
    with open(path, encoding='utf-8') as f:
        return json.load(f)


def reindex(analysisSettings=None, mappingSettings=None, movieDict=None):
    """Drop and recreate the index, then bulk-load the movies into it.

    Args:
        analysisSettings: Dict placed under ``settings.index.analysis`` in the
            index-creation body. ``None`` or empty means no custom analysis.
        mappingSettings: Dict sent as the ``mappings`` section of the
            index-creation body. ``None`` or empty means no explicit mappings.
        movieDict: Dict of movie documents. Only the values are used; each
            must have an ``"id"`` key, which becomes the document ``_id``.

    Returns:
        The ``requests`` response of the ``_bulk`` call.

    Raises:
        requests.HTTPError: If the index could not be created with the
            requested settings.
    """
    settings = {
        "settings": {
            "number_of_shards": 1,
            "index": {
                "analysis": analysisSettings or {},
            }
        },
        "mappings": mappingSettings or {}
    }

    settingsJson = json.dumps(settings)
    print(settingsJson)

    # Best effort: the response is deliberately not checked, since there may
    # be no previous index to delete.
    requests.delete(indexBaseUrl)

    # Fail loudly if the settings/mappings are rejected; continuing would load
    # the documents without the requested configuration.
    createResponse = requests.put(indexBaseUrl, data=settingsJson, headers=headers)
    createResponse.raise_for_status()

    # The bulk body is newline-delimited JSON: an action line followed by the
    # document itself, each terminated by "\n".
    bulkLines = []
    for movie in (movieDict or {}).values():
        bulkLines.append(json.dumps({"index": {"_id": movie["id"]}}))
        bulkLines.append(json.dumps(movie))
    bulkMovies = "".join(line + "\n" for line in bulkLines)

    response = requests.post(indexBaseUrl + "/_bulk",
                             data=bulkMovies, headers=headers)
    return response


# Load the movies from disk and index them without custom analysis or mappings.
movieDict = extract()
reindex(movieDict=movieDict)


{"settings": {"number_of_shards": 1, "index": {"analysis": {}}}, "mappings": {}}


<Response [200]>

## The search function

In [102]:
def search(query: dict):
    """Run a search against the index and print the ranked hits.

    Args:
        query: Search request body as a dict; it is serialised to JSON
            before being sent.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    url = indexBaseUrl + "/_search"
    response = requests.get(url, data=json.dumps(query), headers=headers)
    # Surface HTTP errors directly; otherwise a failed request shows up as an
    # unrelated KeyError on 'hits' below.
    response.raise_for_status()

    searchHits = json.loads(response.text)['hits']

    print("Num\tRelevanceScore\tMovie Title")
    for rank, hit in enumerate(searchHits['hits'], start=1):
        # Round to one decimal and right-align so the score column lines up.
        score = str(round(hit['_score'], 1)).rjust(4, ' ')
        print("%s\t%s\t\t%s" % (rank, score, hit['_source']['title']))


In [103]:
# Free-text search as a user would type it.
userSearch = 'basketball with cartoon aliens'
query = {
    "query": {
        "multi_match": {
            "query": userSearch,
            # "^10" boosts title matches by a factor of 10 relative to overview.
            "fields": ["title^10", "overview"]
        }
    },
    # Ask for 15 hits.
    "size": 15
}
search(query)


Num	RelevanceScore	Movie Title
1	85.6		Aliens
2	73.7		The Basketball Diaries
3	71.3		Cowboys & Aliens
4	61.1		Monsters vs Aliens
5	53.5		Aliens vs Predator: Requiem
6	53.5		Aliens in the Attic
7	45.2		Dances with Wolves
8	45.2		Friends with Benefits
9	45.2		Fire with Fire
10	45.2		Friends with Kids
11	39.6		Interview with the Vampire
12	39.6		From Russia With Love
13	39.6		Gone with the Wind
14	39.6		Just Go With It
15	39.6		My Week with Marilyn


### Validate API

In [71]:
def explain(query: dict):
    """Pretty-print the validate API's explanation of a query.

    Args:
        query: Search request body as a dict; it is serialised to JSON
            before being sent.
    """
    url = indexBaseUrl + "/_validate/query?explain"
    response = requests.get(url, data=json.dumps(query), headers=headers)
    toJsonPrettyPrint(response.text)

In [7]:
# Show how the multi_match query is rewritten for the index created without explicit mappings.
explain(query)

{
  "_shards": {
    "total": 1,
    "successful": 1,
    "failed": 0
  },
  "valid": true,
  "explanations": [
    {
      "index": "tmdb",
      "valid": true,
      "explanation": "((title:basketball title:with title:cartoon title:aliens)^10.0 | (overview:basketball overview:with overview:cartoon overview:aliens))"
    }
  ]
}


### Analyze API

In [72]:
# Ask the analyze API which tokens the "standard" analyzer produces for a sample title.
analyzeQuery = {
    "analyzer": "standard",
    "text": "Fire with Fire"
}
response = requests.get(indexBaseUrl + "/_analyze", data=json.dumps(analyzeQuery), headers=headers)
toJsonPrettyPrint(response.text)

{
  "tokens": [
    {
      "token": "fire",
      "start_offset": 0,
      "end_offset": 4,
      "type": "<ALPHANUM>",
      "position": 0
    },
    {
      "token": "with",
      "start_offset": 5,
      "end_offset": 9,
      "type": "<ALPHANUM>",
      "position": 1
    },
    {
      "token": "fire",
      "start_offset": 10,
      "end_offset": 14,
      "type": "<ALPHANUM>",
      "position": 2
    }
  ]
}


## Re-indexing with the English analyzer

In [104]:
# Map both free-text fields as "text" analysed with the "english" analyzer.
mappingSettings = {
    "properties": {
        fieldName: {"type": "text", "analyzer": "english"}
        for fieldName in ("title", "overview")
    }
}

# Reload the movies and rebuild the index with the new mappings.
movieDict = extract()
reindex(movieDict=movieDict, mappingSettings=mappingSettings)

{"settings": {"number_of_shards": 1, "index": {"analysis": {}}}, "mappings": {"properties": {"title": {"type": "text", "analyzer": "english"}, "overview": {"type": "text", "analyzer": "english"}}}}


<Response [200]>

We can see that the title now uses the english analyzer:

In [95]:
# Fetch the mapping of the "title" field only, to confirm which analyzer it uses.
currentTitleMapping = requests.get(indexBaseUrl + "/_mapping/field/title")
toJsonPrettyPrint(currentTitleMapping.text)

{
  "tmdb": {
    "mappings": {
      "title": {
        "full_name": "title",
        "mapping": {
          "title": {
            "type": "text",
            "analyzer": "english"
          }
        }
      }
    }
  }
}


### Re-analyzing "Fire with Fire" with the new index

In [96]:
# Name a field instead of an analyzer this time, so the text is analysed the
# way the "title" field is configured in the index.
analyzeQuery = {
    "field": "title",
    "text": "Fire with Fire"
}
response = requests.get(indexBaseUrl + "/_analyze", data=json.dumps(analyzeQuery), headers=headers)
toJsonPrettyPrint(response.text)

{
  "tokens": [
    {
      "token": "fire",
      "start_offset": 0,
      "end_offset": 4,
      "type": "<ALPHANUM>",
      "position": 0
    },
    {
      "token": "fire",
      "start_offset": 10,
      "end_offset": 14,
      "type": "<ALPHANUM>",
      "position": 2
    }
  ]
}


The stop word "with" is now dropped from both the title and the overview clauses of the query explanation, and the remaining terms are stemmed (e.g. "aliens" becomes "alien"):

In [74]:
# Same query as before, now explained against the index that uses the english analyzer.
explain(query)

{
  "_shards": {
    "total": 1,
    "successful": 1,
    "failed": 0
  },
  "valid": true,
  "explanations": [
    {
      "index": "tmdb",
      "valid": true,
      "explanation": "((title:basketbal title:cartoon title:alien)^10.0 | (overview:basketbal overview:cartoon overview:alien))"
    }
  ]
}


In [105]:
# Re-run the unchanged query to compare the ranking with the earlier result.
search(query)

Num	RelevanceScore	Movie Title
1	78.8		The Basketball Diaries
2	74.1		Alien
3	74.1		Aliens
4	74.1		Alien³
5	59.7		Cowboys & Aliens
6	59.7		Aliens in the Attic
7	59.7		Alien: Resurrection
8	50.0		Monsters vs Aliens
9	43.0		Aliens vs Predator: Requiem
10	43.0		AVP: Alien vs. Predator
11	12.9		Space Jam
12	 7.5		Grown Ups
13	 7.5		Speed Racer
14	 7.2		Semi-Pro
15	 7.2		The Flintstones
