## Connect to ElasticSearch

In [1]:
from pprint import pprint
from elasticsearch import Elasticsearch

es = Elasticsearch('http://localhost:9200')
client_info = es.info()
print('Connected to Elasticsearch!')
pprint(client_info.body)

Connected to Elasticsearch!
{'cluster_name': 'docker-cluster',
 'cluster_uuid': 'FQt-ffZfTpeh0Snf3pUAQw',
 'name': 'ae8b5b4be42b',
 'tagline': 'You Know, for Search',
 'version': {'build_date': '2024-08-05T10:05:34.233336849Z',
             'build_flavor': 'default',
             'build_hash': '1a77947f34deddb41af25e6f0ddb8e830159c179',
             'build_snapshot': False,
             'build_type': 'docker',
             'lucene_version': '9.11.1',
             'minimum_index_compatibility_version': '7.0.0',
             'minimum_wire_compatibility_version': '7.17.0',
             'number': '8.15.0'}}


In [12]:
import json

# Load your original JSON file
with open("../data/beatmusic.tracks.json", "r") as f:
    data = json.load(f)

# Define recursive cleaner function
def clean_value(value):
    if isinstance(value, dict):
        if "$oid" in value:
            return value["$oid"]
        elif "$numberLong" in value:
            return value["$numberLong"]
        else:
            return {k: clean_value(v) for k, v in value.items()}
    elif isinstance(value, list):
        return [clean_value(v) for v in value]
    else:
        return value

# Clean all documents
cleaned_data = [clean_value(doc) for doc in data]

# Save cleaned data into a new file
with open("../data/beatmusic.tracks.cleaned.json", "w") as f:
    json.dump(cleaned_data, f, indent=2)

print("Data cleaned and saved as beatmusic.tracks.cleaned.json")


✅ Data cleaned and saved as beatmusic.tracks.cleaned.json


In [13]:
es.indices.delete(index='my_tracks', ignore_unavailable=True)
es.indices.create(index='my_tracks')

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'my_tracks'})

In [14]:
import json
from tqdm import tqdm
from elasticsearch import Elasticsearch

# Load data
dummy_data = json.load(open("../data/beatmusic.tracks.cleaned.json"))

# Connect to Elasticsearch
es = Elasticsearch("http://localhost:9200")

for document in tqdm(dummy_data, total=len(dummy_data)):
    # Remove _id from the document
    doc_id = document.pop("_id", None)
    
    # Index document (with or without id)
    if doc_id:
        es.index(index="my_tracks", id=doc_id, document=document)
    else:
        es.index(index="my_tracks", document=document)


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9619/9619 [00:14<00:00, 659.34it/s]


In [15]:
# Search first 10 documents
response = es.search(
    index="my_tracks",
    body={
        "query": {
            "match_all": {}
        },
        "size": 2
    }
)

# Display the results
for i, hit in enumerate(response["hits"]["hits"], start=1):
    print(f"\nRecord {i}:")
    print(hit["_source"])


Record 1:
{'track_id': 12520, 'title': '(67) Donegal Rambler', 'album_name': 'Open Sets', 'track_category': 1983, 'track_image': '1542675275Donegal.gif', 'media_file': '1542675275_67__Donegal_Rambler.mp3', 'min': 0, 'sec': 0, 'duration': 73, 'priority': 3, 'track_status': 1, 'track_isfree': 1, 'create_time': '2018-11-20 00:54:35.000', 'artist': 'William Paterson & Kevin Murphy', 'year': 2018, 'createdAt': '1758626407714', 'updatedAt': '1758626407714', 'album_id': '67f621b47ddaa422361dd5d5'}

Record 2:
{'track_id': 12528, 'title': '(66) Donegal Rambler', 'album_name': 'Open Sets', 'track_category': 1983, 'track_image': '1542675388Donegal.gif', 'media_file': '1542675388_66__Donegal_Rambler.mp3', 'min': 0, 'sec': 0, 'duration': 75, 'priority': 2, 'track_status': 1, 'track_isfree': 1, 'create_time': '2018-11-20 00:56:28.000', 'artist': 'William Paterson & Kevin Murphy', 'year': 2018, 'createdAt': '1758626407714', 'updatedAt': '1758626407714', 'album_id': '67f621b47ddaa422361dd5d5'}


In [19]:
response = es.search(
    index='my_tracks',
    body={
        "query":{
            "match":{
                "album_id": "67f621b47ddaa422361dd5d3"
            }
        }
    }
)

n_hits = response['hits']['total']['value']
print(f"Found {n_hits} documents in my_index")


Found 281 documents in my_index


## Multiple filters

In [25]:
response = es.search(
    index='my_tracks',
    body={
        "query": {
            "bool": {
                "must": [
                    {"match": {"album_id": "67f621b47ddaa422361dd5d3"}},
                    # {"match": {"album_name": "Grade Music"}},
                    {"match":{"title":"bobby casey"}}
                ]
            }
        },
        "size": 10
    }
)

n_hits = response["hits"]["total"]["value"]
print(f"Found {n_hits} documents in my_tracks")

for hit in response["hits"]["hits"]:
    # Pretty print each _source document
    print(json.dumps(hit["_source"], indent=4, ensure_ascii=False))
    print("-" * 80)


Found 1 documents in my_tracks
{
    "track_id": 1251,
    "title": "Light Jig (116) Bobby Casey’s, Morrison’s, Court Town Harbour",
    "album_name": "Grade Music",
    "track_category": 121,
    "track_image": "1541518822A_Grade_Music.gif",
    "media_file": "1541518822Light_Jig__116__Bobby_Casey___s__Morrison___s__Court_Two_Harbour.mp3",
    "min": 0,
    "sec": 0,
    "duration": 209,
    "priority": 20,
    "track_status": 1,
    "track_isfree": 0,
    "create_time": "2024-01-11 07:24:52.000",
    "artist": "Kevin Murphy",
    "year": 2018,
    "createdAt": "1758626407714",
    "updatedAt": "1758626407714",
    "album_id": "67f621b47ddaa422361dd5d3"
}
--------------------------------------------------------------------------------
