## Search from Tabular Data

## Connect to Cluster

In [5]:
import json
def print_json(x):
    """Pretty-print a JSON-serializable object to stdout.

    Args:
        x: Any object accepted by ``json.dumps`` (dict, list, str, number, ...).

    Returns:
        None. The text is written to stdout using a two-space indent.
    """
    rendered = json.dumps(x, indent=2)
    print(rendered)

In [6]:
import os

from opensearchpy import OpenSearch
from IPython.display import JSON

# Connection settings default to the local demo cluster (localhost:9200,
# admin/admin) but can be overridden through environment variables, so real
# credentials never have to be written into the notebook.
client = OpenSearch(
    hosts = [{
        "host": os.environ.get("OPENSEARCH_HOST", "localhost"),
        "port": int(os.environ.get("OPENSEARCH_PORT", "9200")),
    }],
    http_auth = (
        os.environ.get("OPENSEARCH_USER", "admin"),
        os.environ.get("OPENSEARCH_PASSWORD", "admin"),
    ),
    use_ssl = True,
    # NOTE(review): certificate and hostname verification are switched off,
    # presumably because the local cluster uses a self-signed certificate.
    # Do not reuse these three settings against a shared or production cluster.
    verify_certs = False,
    ssl_assert_hostname = False,
    ssl_show_warn = False,
)

# Round-trip to the cluster to confirm the connection works.
print_json(client.info())

{
  "name": "fc81333c71df",
  "cluster_name": "docker-cluster",
  "cluster_uuid": "_Xsvjrs0TJOF3p6QlsWnJA",
  "version": {
    "distribution": "opensearch",
    "number": "2.9.0",
    "build_type": "tar",
    "build_hash": "1164221ee2b8ba3560f0ff492309867beea28433",
    "build_date": "2023-07-18T21:23:29.367080729Z",
    "build_snapshot": false,
    "lucene_version": "9.7.0",
    "minimum_wire_compatibility_version": "7.10.0",
    "minimum_index_compatibility_version": "7.0.0"
  },
  "tagline": "The OpenSearch Project: https://opensearch.org/"
}


## Read Data

In [9]:
# Read Data
import pandas as pd
import janitor
# Source: Wikipedia movie plots dataset (de-duplicated CSV hosted on GitHub).
csv_url = "https://raw.githubusercontent.com/kiq005/movie-recommendation/master/src/dataset/wiki_movie_plots_deduped.csv"
movies_raw = pd.read_csv(csv_url)

# Keep only complete rows, take a fixed-seed sample of 5000 so the run is
# repeatable, renumber the rows from 0, and normalise the column names
# (`clean_names` is provided by the `janitor` import above).
documents = (
    movies_raw
    .dropna()
    .sample(n=5000, random_state=42)
    .reset_index(drop=True)
    .clean_names()
)

documents.head()

Unnamed: 0,release_year,title,origin_ethnicity,director,cast,genre,wiki_page,plot
0,1984,Songwriter,American,Alan Rudolph,"Willie Nelson, Kris Kristofferson, Melinda Dillon",drama,https://en.wikipedia.org/wiki/Songwriter_(1984...,"The film concerns Doc Jenkins, (Willie Nelson)..."
1,2012,Da Thadiya (ഡാ തടിയാ),Malayalam,Aashiq Abu,"Shekhar Menon, Ann Augustine, Sreenath Bhasi, ...","romance, comedy",https://en.wikipedia.org/wiki/Da_Thadiya,The film tells the love story of an obese yout...
2,1987,Ironweed,American,Héctor Babenco,"Jack Nicholson, Meryl Streep, Tom Waits, Fred ...",drama,https://en.wikipedia.org/wiki/Ironweed_(film),Francis Phelan (Jack Nicholson) is a washed-up...
3,2015,Kaliyachan,Malayalam,Farooq Abdul Rahman,"Manoj K Jayan, and Vaiga",drama,https://en.wikipedia.org/wiki/Kaliyachan,"The film is about the Kathakali actor, Kunhira..."
4,1971,Kati Patang,Bollywood,Shakti Samanta,"Rajesh Khanna, Asha Parekh",romance,https://en.wikipedia.org/wiki/Kati_Patang_(197...,"Madhavi ""Madhu"" (Asha Parekh) is an orphan liv..."


## Create Index

In [13]:
# Create Index
import yaml
# Explicit mapping for the `movies` index.
# Field names must match the cleaned DataFrame columns exactly
# (`release_year`, `origin_ethnicity`, ...). A mapping entry whose name does
# not match any document field is never used, and the real field silently
# falls back to dynamic mapping instead of the analyzer/type declared here.
body = yaml.safe_load("""
mappings:
  properties:
    title:
      type: text
      analyzer: english
    origin_ethnicity:
      type: text
      analyzer: standard
    director:
      type: text
      analyzer: standard
    cast:
      type: text
      analyzer: standard
    genre:
      type: text
      analyzer: standard
    plot:
      type: text
      analyzer: english
    release_year:
      type: integer
    wiki_page:
      type: keyword

""")
# Pass the index name by keyword, consistent with the other client calls.
response = client.indices.create(index = "movies", body = body)
print_json(response)

{
  "acknowledged": true,
  "shards_acknowledged": true,
  "index": "movies"
}


### Add documents sequentially

In [15]:
# Send one index request per row, using the row position as the document id.
records = documents.to_dict("records")
for doc_id, record in enumerate(records):
    client.index(index="movies", id=doc_id, body=record)

# Refresh the index, then report how many documents it holds.
client.indices.refresh(index="movies")
client.cat.count(index="movies", format="json")

[{'epoch': '1695693521', 'timestamp': '01:58:41', 'count': '5000'}]

### Add documents in bulk

In [16]:
from opensearchpy.helpers import bulk
# Tag every record with its target index and a deterministic `_id` (the row
# position, i.e. the same ids the sequential loop uses for `movies`).
# Without an explicit `_id` OpenSearch auto-generates one per action, so
# re-running this cell would add a second copy of every document instead of
# overwriting the existing ones.
documents_bulk = documents.assign(
    _index = "movies_bulk",
    _id = documents.index.astype(str),
)
# The bulk helper lifts `_index` / `_id` out of each record as action metadata
# and sends the remaining keys as the document source.
bulk(client, documents_bulk.to_dict(orient = "records"))

# Refresh the index, then report how many documents it holds.
client.indices.refresh(index = "movies_bulk")
client.cat.count(index = "movies_bulk", format="json")

[{'epoch': '1695693587', 'timestamp': '01:59:47', 'count': '5000'}]

## Search Documents

In [None]:
def display_search_response(response):
    """Flatten a search response into a single DataFrame.

    Args:
        response: Search response dict containing ``response["hits"]["hits"]``,
            a list of hit dicts that each carry a ``"_source"`` entry.

    Returns:
        A DataFrame with one row per hit: the hit-level columns (including the
        raw ``_source`` dict) followed by one column per ``_source`` field.
    """
    hit_records = response["hits"]["hits"]
    hits_frame = pd.DataFrame(hit_records)
    sources_frame = pd.DataFrame([record["_source"] for record in hit_records])
    # Column-wise concat: both frames share the same positional row index.
    return pd.concat([hits_frame, sources_frame], axis=1)

In [35]:
# Phrase search on `cast`: hits must contain the phrase "shammi kapoor" and
# must not contain the phrase "sharmila tagore".
body_yaml = """
query:
  bool:
    must:
      match_phrase:
        cast: shammi kapoor
    filter:
      bool:
        must_not:
          match_phrase:
            cast: sharmila tagore
"""
search_body = yaml.safe_load(body_yaml)
response = client.search(index="movies", body=search_body)
display_search_response(response)

Unnamed: 0,_index,_id,_score,_source,release_year,title,origin_ethnicity,director,cast,genre,wiki_page,plot
0,movies,4736,13.789314,"{'release_year': 1964, 'title': 'Rajkumar', 'o...",1964,Rajkumar,Bollywood,Unknown,"Shammi Kapoor, Sadhana",unknown,https://en.wikipedia.org/wiki/Rajkumar_(1964_f...,The Maharaja is eager to see his foreign-retur...
1,movies,2448,11.969725,"{'release_year': 1978, 'title': 'Shalimar', 'o...",1978,Shalimar,Bollywood,Krishna Shah,"Dharmendra, Zeenat Aman, Shammi Kapoor",drama,https://en.wikipedia.org/wiki/Shalimar_(1978_f...,"On the run from the police, S.S. Kumar, a thie..."
2,movies,3840,11.228866,"{'release_year': 2011, 'title': 'Rockstar', 'o...",2011,Rockstar,Bollywood,Imtiaz Ali,"Ranbir Kapoor, Nargis Fakhri, Shammi Kapoor",romance/drama,https://en.wikipedia.org/wiki/Rockstar_(2011_f...,A large crowd gathers outside the Verona Arena...
3,movies,1149,9.470373,"{'release_year': 1961, 'title': 'Junglee', 'or...",1961,Junglee,Bollywood,Subodh Mukherjee,"Shammi Kapoor, Saira Banu, Anoop Kumar, Lalita...",romance,https://en.wikipedia.org/wiki/Junglee,Chandrashekhar/Shekhar (Shammi Kapoor) belongs...
4,movies,959,9.000531,"{'release_year': 1960, 'title': 'College Girl'...",1960,College Girl,Bollywood,T. Prakash Rao,"Shammi Kapoor, Vyjayanthimala, Om Prakash, Tab...",romance drama,https://en.wikipedia.org/wiki/College_Girl_(19...,"In India, girls from their very birth are take..."
5,movies,636,6.68051,"{'release_year': 1962, 'title': 'Dil Tera Diwa...",1962,Dil Tera Diwana,Bollywood,B. R. Panthulu,"Shammi Kapoor, Mala Sinha, Mehmood, Shobha Kho...",romantic drama,https://en.wikipedia.org/wiki/Dil_Tera_Diwana_...,"Wayward, brash, and disobedient Mohan (Shammi ..."
