# Task - 1: Data Ingestion into Elasticsearch:

- Preprocess the dataset and convert it to a format compatible with Elasticsearch (e.g., JSON or CSV).

- Index the data using Logstash or Python Bulk API.

- Map each feature correctly (categorical, numeric, boolean).

In [2]:
import pandas as pd
import json

# Load the dataset
df = pd.read_csv('healthcare-dataset-stroke-data.csv')  # replace with your actual file path

# Replace 'N/A' with None
df.replace('N/A', None, inplace=True)

# Fill missing bmi values with median
df['bmi'] = df['bmi'].astype(float)
df['bmi'].fillna(df['bmi'].median(), inplace=True)

# Convert boolean-like columns
df['hypertension'] = df['hypertension'].astype(bool)
df['heart_disease'] = df['heart_disease'].astype(bool)
df['stroke'] = df['stroke'].astype(bool)

# Optional: Convert categorical to lowercase for consistency
categorical_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
df[categorical_cols] = df[categorical_cols].apply(lambda x: x.str.lower())

# Convert to JSON format for bulk indexing
docs = [
    {"index": {"_index": "stroke_data", "_id": int(row["id"])}}
    for _, row in df.iterrows()
]
docs += df.drop(columns=["id"]).to_dict(orient='records')

# Save to NDJSON file
with open('stroke_bulk_data.json', 'w') as f:
    for _, row in df.iterrows():
        index_cmd = {"index": {"_index": "stroke_data", "_id": int(row["id"])}}
        data_doc = row.drop("id").to_dict()
        json.dump(index_cmd, f)
        f.write('\n')
        json.dump(data_doc, f)
        f.write('\n')



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['bmi'].fillna(df['bmi'].median(), inplace=True)


In [2]:
pip install elasticsearch


Collecting elasticsearchNote: you may need to restart the kernel to use updated packages.

  Downloading elasticsearch-8.17.2-py3-none-any.whl.metadata (8.8 kB)
Collecting elastic-transport<9,>=8.15.1 (from elasticsearch)
  Downloading elastic_transport-8.17.1-py3-none-any.whl.metadata (3.8 kB)
Collecting urllib3<3,>=1.26.2 (from elastic-transport<9,>=8.15.1->elasticsearch)
  Downloading urllib3-2.4.0-py3-none-any.whl.metadata (6.5 kB)
Collecting certifi (from elastic-transport<9,>=8.15.1->elasticsearch)
  Downloading certifi-2025.1.31-py3-none-any.whl.metadata (2.5 kB)
Downloading elasticsearch-8.17.2-py3-none-any.whl (717 kB)
   ---------------------------------------- 0.0/718.0 kB ? eta -:--:--
   ---------------------------------------- 0.0/718.0 kB ? eta -:--:--
   -------------- ------------------------- 262.1/718.0 kB ? eta -:--:--
   ----------------------------- ---------- 524.3/718.0 kB 2.4 MB/s eta 0:00:01
   ---------------------------------------- 718.0/718.0 kB 1.5 MB/s e

In [14]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import json

# your Elasticsearch password
password = "uN=9kPhqvt+WJvd=S3vt"

# connect to Elasticsearch
es = Elasticsearch("https://localhost:9200", basic_auth=("elastic", password), verify_certs=False)

# delete existing index if present
if es.indices.exists(index="stroke_data"):
    es.indices.delete(index="stroke_data")

# define mapping using correct data types
mapping = {
    "mappings": {
        "properties": {
            # categorical fields
            "gender": { "type": "keyword" },
            "ever_married": { "type": "keyword" },
            "work_type": { "type": "keyword" },
            "Residence_type": { "type": "keyword" },
            "smoking_status": { "type": "keyword" },

            # numeric fields
            "age": { "type": "float" },
            "avg_glucose_level": { "type": "float" },
            "bmi": { "type": "float" },

            # boolean fields
            "hypertension": { "type": "boolean" },
            "heart_disease": { "type": "boolean" },
            "stroke": { "type": "boolean" }
        }
    }
}

# create index
es.indices.create(index="stroke_data", body=mapping)

# load bulk data from JSON file
with open("stroke_bulk_data.json", 'r') as f:
    lines = f.readlines()

actions = []
i = 0
while i < len(lines):
    meta = json.loads(lines[i])
    doc = json.loads(lines[i+1])
    actions.append({
        "_index": "stroke_data",
        "_id": meta["index"].get("_id"),
        "_source": doc
    })
    i += 2

# bulk insert
success, _ = bulk(es, actions)
print(f"✅ Indexed {success} documents into 'stroke_data'")




✅ Indexed 5110 documents into 'stroke_data'




In [11]:
res = es.search(index="stroke_data", size=5)
for doc in res['hits']['hits']:
    print(doc['_source'])


{'gender': 'male', 'age': 67.0, 'hypertension': False, 'heart_disease': True, 'ever_married': 'yes', 'work_type': 'private', 'Residence_type': 'urban', 'avg_glucose_level': 228.69, 'bmi': 36.6, 'smoking_status': 'formerly smoked', 'stroke': True}
{'gender': 'female', 'age': 61.0, 'hypertension': False, 'heart_disease': False, 'ever_married': 'yes', 'work_type': 'self-employed', 'Residence_type': 'rural', 'avg_glucose_level': 202.21, 'bmi': 28.1, 'smoking_status': 'never smoked', 'stroke': True}
{'gender': 'male', 'age': 80.0, 'hypertension': False, 'heart_disease': True, 'ever_married': 'yes', 'work_type': 'private', 'Residence_type': 'rural', 'avg_glucose_level': 105.92, 'bmi': 32.5, 'smoking_status': 'never smoked', 'stroke': True}
{'gender': 'female', 'age': 49.0, 'hypertension': False, 'heart_disease': False, 'ever_married': 'yes', 'work_type': 'private', 'Residence_type': 'urban', 'avg_glucose_level': 171.23, 'bmi': 34.4, 'smoking_status': 'smokes', 'stroke': True}
{'gender': 'fem



In [12]:
mapping = es.indices.get_mapping(index="stroke_data")
print(json.dumps(mapping.body, indent=2))


{
  "stroke_data": {
    "mappings": {
      "properties": {
        "Residence_type": {
          "type": "keyword"
        },
        "age": {
          "type": "float"
        },
        "avg_glucose_level": {
          "type": "float"
        },
        "bmi": {
          "type": "float"
        },
        "ever_married": {
          "type": "keyword"
        },
        "gender": {
          "type": "keyword"
        },
        "heart_disease": {
          "type": "boolean"
        },
        "hypertension": {
          "type": "boolean"
        },
        "smoking_status": {
          "type": "keyword"
        },
        "stroke": {
          "type": "boolean"
        },
        "work_type": {
          "type": "keyword"
        }
      }
    }
  }
}




## Task - 2: Kibana Dashboard Design:

- Visualize each dataset feature using suitable charts:

- Pie/Donut for gender, marital status

- Histogram/Line for age, glucose levels

- Bar/Stacked Bar for hypertension, heart disease, smoking status

- Heatmaps and alerts for stroke risk tracking
 