# Simulate Plans Data

In [1]:
import numpy as np
import json
import requests
import us
from elasticsearch import Elasticsearch
from faker import Factory

In [3]:
# Elasticsearch client built with the library's default connection settings.
# NOTE(review): the original note says the instance is hosted on AWS EC2, while the
# curl checks in this notebook target localhost:9200 — confirm which host is intended.
es = Elasticsearch()

### Generate Simulated Data

In [3]:
# Simulate some data for testing

In [38]:
def simulate_data(output_path, N=10000, seed=1):
    '''
    Simulate healthcare plans data and write it to a JSON file.

    Each plan is a dict with the keys plan_name, level, premium, premium_q1,
    plan_rank_0..2, url, state and providers (a list of name/address dicts
    sampled without replacement from a shared pool of fake providers).

    Parameters
    ----------
    output_path : str
        Path of the JSON file to write; an existing file is overwritten.
    N : int
        Number of plans to generate.
    seed : int
        Seed applied to both Faker and NumPy's global random state so that
        repeated runs produce the same file. Defaults to 1, the value that
        was previously hard-coded.

    Returns
    -------
    None
    '''
    # Create fake names generator and seed both random sources
    fake = Factory.create()
    fake.seed(seed)
    np.random.seed(seed)
    # Set parameters
    n_providers_pool = 50
    n_providers_per_plan = 10
    # Initial data pool
    levels = ["Platinum", "Gold", "Silver", "Bronze", "Catastrophic"]
    # np.random.choice needs a 1-D sequence: a dict keys view is not one on
    # Python 3, and its order is not guaranteed to match across interpreters.
    # Sorting gives the same seeded draw everywhere.
    states = sorted(us.states.mapping('abbr', 'name').keys())
    providers_pool = [dict(name=fake.name(), address=fake.address())
                      for _ in range(n_providers_pool)]

    # Output (keyword order is kept so the random streams are consumed in
    # the same sequence as before)
    result = [dict(
                plan_name = fake.company(),
                level = np.random.choice(levels),
                premium = np.random.normal(loc=100, scale=15),
                premium_q1 = np.random.normal(loc=100, scale=15),
                plan_rank_0 = np.random.random(),
                plan_rank_1 = np.random.random(),
                plan_rank_2 = np.random.random(),
                url = fake.url(),
                state = np.random.choice(states),
                providers = list(np.random.choice(providers_pool, size=n_providers_per_plan, replace=False))
              ) for _ in range(N)]

    # Close the file deterministically so the JSON is fully flushed to disk
    # before any later cell (e.g. `!head`) reads it.
    with open(output_path, "w") as output_file:
        json.dump(result, output_file, indent=2)

In [39]:
# Generate 10,000 simulated plans and write them to sim_plans_data_v4.json.
simulate_data("sim_plans_data_v4.json", 10000)

In [40]:
# Preview the first 100 lines of the generated JSON file.
!head -n100 sim_plans_data_v4.json

[
  {
    "premium_q1": 93.26683288418842, 
    "premium": 87.96740742027036, 
    "providers": [
      {
        "name": "Joyce Harrison", 
        "address": "688 Dakota Fords Suite 317\nFlynnhaven, KY 97043-2846"
      }, 
      {
        "name": "Jordan Peters", 
        "address": "22570 Whitney Inlet Suite 170\nFloresstad, ID 91786-7499"
      }, 
      {
        "name": "Robert Clark", 
        "address": "64379 Anthony Junction Suite 170\nPort Rachel, RI 44771"
      }, 
      {
        "name": "Daniel Burnett", 
        "address": "57634 Bowers Mount\nSouth Thomas, AS 95315-9758"
      }, 
      {
        "name": "Michelle Jones", 
        "address": "31001 Gonzalez Squares Apt. 198\nBrownbury, NC 95308-9963"
      }, 
      {
        "name": "George Hernandez", 
        "address": "2020 Kennedy Circles Suite 274\nRossport, RI 91798"
      }, 
      {
        "name": "Steve Davis", 
        "address": "USNV Owens\nFPO AA 60596"
      }, 
      

### Define ES Mapping

In [17]:
# Create Elasticsearch mapping

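Index: `data`  
Type: `plan`  
Fields:  
- plan_name (string, analyzed; `raw` not-analyzed sub-field)
- premium (float)
- premium_q1 (float, not indexed)
- level (string, analyzed; `raw` not-analyzed sub-field)
- url (string, not indexed)
- plan_rank_0, plan_rank_1, plan_rank_2 (float)
- state (string, not analyzed)
- providers (nested)
    - name: string, analyzed; `raw` not-analyzed sub-field
    - address: string, not indexed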

In [12]:
# Drop any existing "data" index so it can be recreated with the mapping defined in the next cell.
!curl -XDELETE 'localhost:9200/data'

{"acknowledged":true}

In [13]:
def define_plan_mappings(es):
    '''
    Create the "data" index with its settings and the "plan" type mapping.

    Parameters
    ----------
    es : object
        Client exposing ``indices.create(index=..., body=...)``; the request
        body built here is passed to it unchanged.

    Returns
    -------
    None
    '''
    def searchable_with_raw():
        # Analyzed string with a not_analyzed "raw" sub-field; a fresh dict
        # is returned on every call so no field spec is shared between keys.
        return {
            "type": "string",
            "index": "analyzed",
            "fields": {
                "raw": {"type": "string", "index": "not_analyzed"}
            }
        }

    # Sub-documents stored under each plan's "providers" list
    provider_fields = {
        "name": searchable_with_raw(),
        "address": {"type": "string", "index": "no"}
    }

    # Top-level fields of a plan document
    plan_fields = {
        "plan_name": searchable_with_raw(),
        "premium": {"type": "float"},
        "premium_q1": {"type": "float", "index": "no"},
        "level": searchable_with_raw(),
        "url": {"index": "no", "type": "string"},
        "plan_rank_0": {"type": "float"},
        "plan_rank_1": {"type": "float"},
        "plan_rank_2": {"type": "float"},
        "state": {"type": "string", "index": "not_analyzed"},
        "providers": {"type": "nested", "properties": provider_fields}
    }

    index_settings = {
        "index": {
            "number_of_shards": 5,
            "number_of_replicas": 1
        }
    }

    request_body = {
        "settings": index_settings,
        "mappings": {
            "plan": {"properties": plan_fields}
        }
    }

    # Define mappings in ES
    es.indices.create(index="data", body=request_body)

In [14]:
# Create the "data" index, including the "plan" mapping, through the client.
define_plan_mappings(es)

In [15]:
# Read back the stored mapping for type "plan" to confirm it was applied.
!curl 'localhost:9200/data/_mapping/plan?pretty'

{
  "data" : {
    "mappings" : {
      "plan" : {
        "properties" : {
          "level" : {
            "type" : "string",
            "fields" : {
              "raw" : {
                "type" : "string",
                "index" : "not_analyzed"
              }
            }
          },
          "plan_name" : {
            "type" : "string",
            "fields" : {
              "raw" : {
                "type" : "string",
                "index" : "not_analyzed"
              }
            }
          },
          "plan_rank_0" : {
            "type" : "float"
          },
          "plan_rank_1" : {
            "type" : "float"
          },
          "plan_rank_2" : {
            "type" : "float"
          },
          "premium" : {
            "type" : "float"
          },
          "premium_q1" : {
            "type" : "float",
            "index" : "no"
          },
          "providers" : {
            "type" : "nested",
       

### Load Data

In [8]:
def load_data(input_path, es):
    '''
    Index every plan stored in a JSON file into the "data" index.

    Parameters
    ----------
    input_path : str
        Path of a JSON file containing a list of plan documents.
    es : object
        Client exposing ``index(index=..., doc_type=..., id=..., body=...)``;
        it is called once per plan.

    Returns
    -------
    None
    '''
    # Get data from file; the context manager closes the handle as soon as
    # parsing finishes instead of leaving it to the garbage collector.
    with open(input_path) as input_file:
        data = json.load(input_file)
    # Add each plan, using its position in the list as the document id
    for i, plan in enumerate(data):
        es.index(index='data', doc_type='plan', id=i, body=plan)

In [16]:
# Index every plan from the simulated file; document ids are the positions in the list.
load_data("sim_plans_data_v4.json", es)

In [58]:
# Check data

In [17]:
# List all indices with their document counts and sizes.
# NOTE(review): docs.count for "data" (110000) exceeds the 10000 plans loaded — presumably
# each nested provider is counted as its own document (10000 x (1 + 10)); confirm.
!curl 'localhost:9200/_cat/indices?v'

health status index                  pri rep docs.count docs.deleted store.size pri.store.size 
yellow open   data                     5   1     110000            0     10.2mb         10.2mb 
yellow open   get-together             2   1         20            0     28.4kb         28.4kb 
yellow open   myindex                  5   1          0            0       800b           800b 
yellow open   november_2014_invoices   5   1          0            0       800b           800b 
yellow open   blog                     5   1          1            0      3.6kb          3.6kb 
yellow open   december_2014_invoices   5   1          0            0       800b           800b 
yellow open   logs                     5   1          1            0      3.7kb          3.7kb 


In [18]:
# Fetch the plan indexed with id 0 to spot-check the stored document.
!curl 'localhost:9200/data/plan/0?pretty'

{
  "_index" : "data",
  "_type" : "plan",
  "_id" : "0",
  "_version" : 1,
  "found" : true,
  "_source" : {
    "premium_q1" : 93.26683288418842,
    "premium" : 87.96740742027036,
    "level" : "Bronze",
    "url" : "http://www.adams.com/",
    "providers" : [ {
      "name" : "Joyce Harrison",
      "address" : "688 Dakota Fords Suite 317\nFlynnhaven, KY 97043-2846"
    }, {
      "name" : "Jordan Peters",
      "address" : "22570 Whitney Inlet Suite 170\nFloresstad, ID 91786-7499"
    }, {
      "name" : "Robert Clark",
      "address" : "64379 Anthony Junction Suite 170\nPort Rachel, RI 44771"
    }, {
      "name" : "Daniel Burnett",
      "address" : "57634 Bowers Mount\nSouth Thomas, AS 95315-9758"
    }, {
      "name" : "Michelle Jones",
      "address" : "31001 Gonzalez Squares Apt. 198\nBrownbury, NC 95308-9963"
    }, {
      "name" : "George Hernandez",
      "address" : "2020 Kennedy Circles Suite 274\nRossport, RI 91798"
    }, {
      "na

In [48]:
# Fetch the plan indexed with id 1 to spot-check a second document.
!curl 'localhost:9200/data/plan/1?pretty'

{
  "_index" : "data",
  "_type" : "plan",
  "_id" : "1",
  "_version" : 1,
  "found" : true,
  "_source" : {
    "premium_q1" : 89.744082112385,
    "premium" : 113.51283923896618,
    "level" : "Gold",
    "url" : "http://www.keller.net/",
    "providers" : [ {
      "name" : "Steve Davis",
      "address" : "USNV Owens\nFPO AA 60596"
    }, {
      "name" : "Joyce Harrison",
      "address" : "688 Dakota Fords Suite 317\nFlynnhaven, KY 97043-2846"
    }, {
      "name" : "Judith Salazar",
      "address" : "5479 Smith Rest\nAngelberg, PR 63564"
    }, {
      "name" : "Jose Pugh",
      "address" : "4951 Bobby Park Suite 147\nNorth Daltonburgh, CO 65008"
    }, {
      "name" : "Alexis Peterson",
      "address" : "71320 Robert Keys\nPhelpsberg, OH 53938"
    }, {
      "name" : "Ricky Newman",
      "address" : "28594 Gabriel Stream\nNew Antonioland, VT 29411"
    }, {
      "name" : "Cheryl Hancock",
      "address" : "18526 Klein Brook\nLake Brandon