In [23]:
import requests
import json

In [24]:
def pretty(parsed):
    """Print ``parsed`` as JSON text, indented by four spaces with keys sorted."""
    rendered = json.dumps(parsed, sort_keys=True, indent=4)
    print(rendered)

In [25]:
def extract(path='tmdb.json'):
    """Load and return the parsed JSON content of ``path``.

    Args:
        path: location of the JSON file to read; defaults to ``tmdb.json``
            in the current working directory.

    Returns:
        The decoded JSON value (whatever top-level type the file contains).

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # The context manager closes the handle even when parsing fails; the
    # explicit encoding keeps the result independent of the platform locale.
    with open(path, encoding='utf-8') as f:
        return json.load(f)

In [26]:
# Parse the JSON file once and keep the result in `data` for later cells.
print("Extracting data")
data = extract()
print("Extraction complete")

Extracting data
Extraction complete


In [27]:
def reindex(analysisSettings=None, mappingSettings=None, movieDict=None):
    """Drop and recreate the local ``tmdb`` index, then bulk-index ``movieDict``.

    Args:
        analysisSettings: dict placed under ``settings.index.analysis`` in the
            index-creation body. ``None`` (the default) sends an empty object.
        mappingSettings: dict sent as the top-level ``mappings`` of the
            index-creation body; left out of the request when empty or ``None``.
        movieDict: mapping whose values are movie documents. Each document
            must contain an ``"id"`` key, which becomes the document ``_id``;
            the mapping's own keys are not used.

    Prints the delete response, the settings that were sent, the create-index
    response, and finally either "Indexing complete" or a description of what
    went wrong with the bulk request. Returns None.
    """
    # None sentinels replace the shared mutable `{}` defaults.
    if analysisSettings is None:
        analysisSettings = {}
    if movieDict is None:
        movieDict = {}

    settings = {
        "settings": {
            "number_of_shards": 1,
            "index": {
                "analysis": analysisSettings,
            }
        }
    }
    headers = {
        "Content-Type": "application/json"
    }
    if mappingSettings:
        settings["mappings"] = mappingSettings

    resp = requests.delete("http://localhost:9200/tmdb")
    print("Delete: ", resp)
    print("settings: ", json.dumps(settings))
    resp = requests.put("http://localhost:9200/tmdb", headers=headers, data=json.dumps(settings))
    print("Put settings: ", resp.status_code, resp.json())

    # Bulk body: one action line followed by one document line per movie.
    # Collect the lines and join once instead of growing a string in the loop.
    bulkLines = []
    for movie in movieDict.values():
        addCmd = {"index": {"_index": "tmdb",
                            "_type": "movie",
                            "_id": movie["id"]}}
        bulkLines.append(json.dumps(addCmd))
        bulkLines.append(json.dumps(movie))

    # With no documents there is nothing to send, so skip the bulk request.
    if bulkLines:
        bulkMovies = "\n".join(bulkLines) + "\n"
        resp = requests.post("http://localhost:9200/_bulk", headers=headers, data=bulkMovies)
        if resp.status_code >= 400:
            print("Bulk indexing failed: ", resp.status_code, resp.text)
            return
        bulkResult = resp.json()
        # The bulk API can answer 200 while individual items failed; it
        # signals that through the top-level "errors" flag.
        if bulkResult.get("errors"):
            failed = sum(1 for item in bulkResult.get("items", [])
                         if "error" in item.get("index", {}))
            print("Bulk indexing reported errors for %s document(s)" % failed)
            return
    print("Indexing complete")

In [28]:
# Recreate the tmdb index with reindex()'s default analysis and mapping
# arguments, then bulk-load every document held in `data`.
reindex(movieDict=data)

Delete:  <Response [200]>
settings:  {"settings": {"number_of_shards": 1, "index": {"analysis": {}}}}
Put settings:  200 {'acknowledged': True, 'shards_acknowledged': True, 'index': 'tmdb'}
Indexing complete


In [29]:
def search(query):
    """Run ``query`` against the local tmdb index and print the ranked hits.

    Args:
        query: search request body as a dict; it is serialized to JSON and
            sent as the body of the request.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.

    Prints a header row, then one tab-separated line per hit holding the
    1-based rank, the hit's ``_score`` and its ``_source`` title.
    Returns None.
    """
    headers = {
        "Content-Type": "application/json"
    }
    url = "http://localhost:9200/tmdb/movie/_search"
    httpResp = requests.get(url, headers=headers, data=json.dumps(query))
    # An error reply may not contain a "hits" key; fail here with the HTTP
    # status instead of an unexplained KeyError on the next line.
    httpResp.raise_for_status()
    searchHits = httpResp.json()['hits']
    print("Num\tRelevance Score\t\tMovie Title")
    for rank, hit in enumerate(searchHits["hits"], start=1):
        print("%s\t%s\t\t%s" % (rank, hit['_score'], hit['_source']["title"]))

In [30]:
# Free-text search string, used verbatim as the multi_match query text.
userSearch = 'basketball with cartoon aliens'

# Match the text against both title and overview; the "^10" suffix on title
# is Elasticsearch's per-field boost syntax.
query = {"query": {"multi_match": {
    "query": userSearch,
    "fields": ["title^10", "overview"],
}}}

In [31]:
# Send `query` to the tmdb index and print each hit's rank, score and title.
search(query)

Num	Relevance Score		Movie Title
1	76.18483		Aliens
2	63.674076		Cowboys & Aliens
3	47.931763		Aliens vs Predator: Requiem
4	46.03087		Dances with Wolves
5	46.03087		Friends with Benefits
6	40.340702		Interview with the Vampire
7	40.340702		From Russia With Love
8	40.340702		Gone with the Wind
9	40.340702		Just Go With It
10	32.34417		The Girl with the Dragon Tattoo
