In [None]:
# NOTE(review): presumably pinned below 8.0 because the cells below call
# es.search(index=..., body=...) with the 7.x-style `body` argument - confirm.
!pip install "elasticsearch<8.0.0"

In [None]:
from elasticsearch import Elasticsearch

# Elasticsearch connection settings
ELASTICSEARCH_HOST = "http://es.robota.datains.id:80"  # Replace with your Elasticsearch host
INDEX_NAME = "hotel_reviews"  # Replace with the name of the index to read from

# Create the shared Elasticsearch client used by the cells below
es = Elasticsearch(hosts=[ELASTICSEARCH_HOST])


def get_large_data_from_elasticsearch(
    index, size=10000, batch_size=1000, client=None
):
    """
    Fetch up to ``size`` review hits from Elasticsearch using ``search_after``.

    Only documents that have both ``review.message`` and ``sentiment`` and
    whose ``review.date`` lies between 2022-01-22 and now are matched. Hits are
    sorted by ``review.date`` descending, with ``_id`` ascending as tiebreaker.

    :param index: Name of the Elasticsearch index to query.
    :param size: Maximum number of hits to return.
    :param batch_size: Maximum number of hits requested per search call.
    :param client: Object exposing ``search(index=..., body=...)``; defaults
        to the module-level ``es`` client.
    :return: List of hit dicts as returned by the search (at most ``size``).
    :raises ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if client is None:
        client = es  # fall back to the shared module-level client

    query = {
        "query": {
            "bool": {
                "must": [
                    {"exists": {"field": "review.message"}},
                    {"exists": {"field": "sentiment"}},
                    {
                        "range": {
                            "review.date": {
                                "gte": "2022-01-22",
                                "lte": "now",
                            }
                        }
                    },
                ]
            }
        },
        "sort": [
            {"review.date": {"order": "desc"}},
            {"_id": "asc"},  # tiebreaker so search_after has a total order
        ],
        # "language" is requested because convert_json_to_df reads
        # language.language; without it that column is always empty.
        "_source": ["review", "sentiment", "sentiment_general", "language"],
    }

    results = []
    while len(results) < size:
        # Ask only for what is still needed so the last page is not
        # over-fetched and then thrown away.
        query["size"] = min(batch_size, size - len(results))

        response = client.search(index=index, body=query)
        hits = response.get("hits", {}).get("hits", [])
        if not hits:
            break  # no more matching documents

        results.extend(hits)

        # Continue after the sort values of the last hit of this page.
        query["search_after"] = hits[-1]["sort"]

    return results[:size]

In [None]:
import json
import pandas as pd

def _as_dict(value):
    """Return ``value`` if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def convert_json_to_df(data):
    """
    Flatten Elasticsearch hits into a pandas DataFrame.

    Each hit becomes one row with the columns ``review_message`` (from
    ``review.message``), ``language`` (from ``language.language``),
    ``sentiment_general`` and one ``sentiment_<key>`` column per key found
    under ``sentiment``. Fields missing from a hit default to an empty string
    for the three fixed columns; a null or non-dict ``review``, ``language``
    or ``sentiment`` value is treated as missing.

    :param data: Iterable of hit dicts, each carrying a ``_source`` dict.
    :return: DataFrame with one row per hit.
    :raises KeyError: If a hit has no ``_source`` key.
    """
    records = []
    for hit in data:
        source = hit["_source"]
        row = {
            "review_message": _as_dict(source.get("review")).get("message", ""),
            "language": _as_dict(source.get("language")).get("language", ""),
            "sentiment_general": source.get("sentiment_general", ""),
        }

        # Spread every sentiment.* entry into its own prefixed column.
        for key, value in _as_dict(source.get("sentiment")).items():
            row[f"sentiment_{key}"] = value

        records.append(row)

    return pd.DataFrame(records)


In [None]:
# Fetch up to 10,000 matching documents from the configured index
results = get_large_data_from_elasticsearch(index=INDEX_NAME, size=10000)

  response = es.search(index=index, body=query)


In [None]:
# Example usage: flatten the fetched hits into a DataFrame

df = convert_json_to_df(data=results)

In [None]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0,review_message,language,sentiment_general,sentiment_meal,sentiment_surrounding,sentiment_service,sentiment_location,sentiment_staff,sentiment_facility,sentiment_value,sentiment_room,sentiment_quality
0,New hotel in a strategic location in Jakarta. ...,,,neutral,positive,neutral,positive,neutral,neutral,neutral,neutral,neutral
1,Nice room dengan amenities yang lengkap. Yang ...,,,,,,,,,,negative,
2,Makan di hotel ini sangat menyenangkan! Menu y...,,,positive,,positive,,positive,positive,,,positive
3,"buka bareng rekan disini, makanan enak dan sta...",,,positive,,,,positive,,,,
4,Close to the city center,,,neutral,positive,neutral,positive,neutral,neutral,neutral,neutral,neutral
