# Simulate Plans Data

In [1]:
import numpy as np
import json
import requests
import us
from elasticsearch import Elasticsearch
from faker import Factory

In [2]:
# Elasticsearch client for the instance hosted on AWS EC2 (HTTP, port 9200).
# The same host is addressed directly by the `!curl` cells further down.
es = Elasticsearch('http://ec2-54-215-248-141.us-west-1.compute.amazonaws.com:9200')

### Generate Simulated Data

In [16]:
# Simulate some data for testing

In [13]:
def simulate_data(output_path, n_plans=10000, seed=1):
    '''
    Simulate healthcare plans data and write it to a file in json format.

    Each plan is a dict with the keys plan_name, premium, level, url,
    state and providers. The whole list is dumped as one JSON array.

    Parameters
    ----------
    output_path : path of the JSON file to (over)write.
    n_plans : number of plans to generate (default 10000).
    seed : seed applied to both the Faker generator and numpy's global
        random state (default 1), so repeated runs give the same file.

    Returns
    -------
    None. The only effect is the file written at output_path.
    '''
    # Create fake names generator
    fake = Factory.create()
    # NOTE(review): recent Faker releases reject instance-level seed() in
    # favour of Faker.seed()/seed_instance() - confirm against the installed version.
    fake.seed(seed)
    np.random.seed(seed)
    # Set parameters
    n_providers_pool = 50
    n_providers_per_plan = 10
    # Initial data pool
    levels = ["Platinum", "Gold", "Silver", "Bronze", "Catastrophic"]
    # list(...) so np.random.choice gets a real 1-D sequence even where
    # dict.keys() returns a view object instead of a list.
    states = list(us.states.mapping('abbr', 'name').keys())
    providers_pool = [fake.name() for _ in range(n_providers_pool)]

    # Simulate data points. The order of these draws is kept as-is because
    # both generators are stateful: reordering would change the output.
    plan_name_list = [fake.company() for _ in range(n_plans)]
    premium_array = np.random.normal(loc=100, scale=15, size=n_plans)
    level_array = np.random.choice(levels, size=n_plans)
    url_list = [fake.url() for _ in range(n_plans)]
    state_array = np.random.choice(states, size=n_plans)
    # replace=False: a provider appears at most once within a single plan
    providers_list = [list(np.random.choice(providers_pool,
                                            size=n_providers_per_plan,
                                            replace=False))
                      for _ in range(n_plans)]

    # Combine columns
    result = [dict(plan_name=plan_name_list[i],
                   premium=premium_array[i],
                   level=level_array[i],
                   url=url_list[i],
                   state=state_array[i],
                   providers=providers_list[i],
                   ) for i in range(n_plans)]
    # Context manager guarantees the file is flushed and closed
    with open(output_path, "w") as output_file:
        json.dump(result, output_file, indent=2)

In [14]:
# Generate the simulated plans and write them to sim_plans_data.json
# (relative path, so the file lands in the kernel's working directory).
simulate_data("sim_plans_data.json")

In [18]:
# Peek at the first 100 lines of the generated file to eyeball the record layout
!head -n100 sim_plans_data.json

[
  {
    "state": "ND", 
    "premium": 124.36518045494863, 
    "level": "Silver", 
    "url": "http://www.tremblay.com/", 
    "providers": [
      "Jamal Barrows Jr.", 
      "Clinton Brekke PhD", 
      "Mr. Eusebio Leuschke V", 
      "Patrice Leannon", 
      "Winnifred Medhurst", 
      "Dr. Raelynn Schumm", 
      "Mortimer Kling", 
      "Chauncey Kutch DVM", 
      "Jessenia Barrows", 
      "Kyra Erdman"
    ], 
    "plan_name": "Gutmann, Stokes and Hackett"
  }, 
  {
    "state": "CT", 
    "premium": 90.823653795248873, 
    "level": "Catastrophic", 
    "url": "http://cummings.com/", 
    "providers": [
      "Kristin Stokes", 
      "Dr. Raelynn Schumm", 
      "Phyliss Botsford", 
      "Dr. Keanna Ernser PhD", 
      "Casey Purdy", 
      "Dedrick Schmeler", 
      "Velia Abshire", 
      "Mrs. Alanna Corkery", 
      "Rhoda Larson", 
      "Ms. Creola Johnston MD"
    ], 
    "plan_name": "Roberts, Jerde and Fritsch"
  }, 
  {


### Define ES Mapping

In [17]:
# Create Elasticsearch mapping

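Index: `data`  
Type: `plan`  
Fields:  
- plan_name (string, analyzed, plus a not-analyzed `raw` sub-field)
- premium (float)
- level (string, analyzed, plus a not-analyzed `raw` sub-field)
- url (string, not analyzed)
- state (string, not analyzed)
- providers (string, analyzed, plus a not-analyzed `raw` sub-field)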

In [4]:
# Drop the `data` index so the mapping can be created from scratch.
# DESTRUCTIVE: removes every indexed plan. A 404 index_not_found_exception
# just means the index did not exist yet.
!curl -XDELETE 'http://ec2-54-215-248-141.us-west-1.compute.amazonaws.com:9200/data'

{"error":{"root_cause":[{"type":"index_not_found_exception","reason":"no such index","index":"data","resource.type":"index_or_alias","resource.id":"data"}],"type":"index_not_found_exception","reason":"no such index","index":"data","resource.type":"index_or_alias","resource.id":"data"},"status":404}

In [3]:
def define_plan_mappings(es):
    '''
    Create the `data` index with its settings and the `plan` type mapping.

    Parameters
    ----------
    es : client object exposing `indices.create(index=..., body=...)`;
        the notebook passes an `Elasticsearch` instance. The target host
        is whatever that client was constructed with.

    Returns
    -------
    None. The response of `indices.create` is discarded.
    '''
    index = 'data'
    doc_type = 'plan'

    def analyzed_with_raw():
        # String field indexed as "analyzed", with a "raw" sub-field indexed
        # as "not_analyzed". A fresh dict is built on every call so the
        # fields never share nested objects.
        return {
            "type": "string",
            "index": "analyzed",
            "fields": {
                "raw": {
                    "type": "string",
                    "index": "not_analyzed"
                }
            }
        }

    def not_analyzed():
        # String field stored as a single not_analyzed value
        return {
            "type": "string",
            "index": "not_analyzed"
        }

    # Put settings
    settings = {
        "settings": {
            "index": {
                "number_of_shards": 5,
                "number_of_replicas": 1
            }
        },

        "mappings": {
            doc_type: {
                "properties": {
                    "plan_name": analyzed_with_raw(),
                    "premium": {
                        "type": "float"
                    },
                    "level": analyzed_with_raw(),
                    "url": not_analyzed(),
                    "state": not_analyzed(),
                    "providers": analyzed_with_raw()
                }
            }
        }
    }

    # Define mappings in ES (index name comes from the single `index` local)
    es.indices.create(index=index, body=settings)

In [5]:
# Create the `data` index and its `plan` mapping on the cluster behind `es`
define_plan_mappings(es)

In [6]:
# Read the `plan` mapping back from the `data` index to confirm it was applied
!curl 'http://ec2-54-215-248-141.us-west-1.compute.amazonaws.com:9200/data/_mapping/plan?pretty'

{
  "data" : {
    "mappings" : {
      "plan" : {
        "properties" : {
          "level" : {
            "type" : "string",
            "fields" : {
              "raw" : {
                "type" : "string",
                "index" : "not_analyzed"
              }
            }
          },
          "plan_name" : {
            "type" : "string",
            "fields" : {
              "raw" : {
                "type" : "string",
                "index" : "not_analyzed"
              }
            }
          },
          "premium" : {
            "type" : "float"
          },
          "providers" : {
            "type" : "string",
            "fields" : {
              "raw" : {
                "type" : "string",
                "index" : "not_analyzed"
              }
            }
          },
          "state" : {
            "type" : "string",
            "index" : "not_analyzed"
          },
          "url" : {
            "type" : "st

### Load Data

In [11]:
def load_data(input_path, es):
    '''
    Index every plan from a JSON file into the `data` index as type `plan`.

    Parameters
    ----------
    input_path : path of a JSON file containing a list of plan documents.
    es : client object exposing `index(index=..., doc_type=..., id=..., body=...)`;
        the notebook passes an `Elasticsearch` instance.

    Returns
    -------
    None. One `es.index` call is issued per plan; the document id is the
    plan's position in the file, so re-loading the same file overwrites
    the same ids rather than adding new documents.
    '''
    # Get data from file; the context manager closes the handle promptly
    with open(input_path) as input_file:
        data = json.load(input_file)
    # Add each plan
    for i, plan in enumerate(data):
        es.index(index='data', doc_type='plan', id=i, body=plan)

In [12]:
# Index the simulated plans from sim_plans_data.json (one request per plan)
load_data("sim_plans_data.json", es)

In [58]:
# Check data

In [13]:
# List indices with health and document counts; docs.count for `data`
# should equal the number of simulated plans
!curl 'http://ec2-54-215-248-141.us-west-1.compute.amazonaws.com:9200/_cat/indices?v'

health status index pri rep docs.count docs.deleted store.size pri.store.size 
yellow open   data    5   1      10000            0      4.1mb          4.1mb 


In [14]:
# Fetch the plan stored under id 0 (the first record of the input file)
!curl 'http://ec2-54-215-248-141.us-west-1.compute.amazonaws.com:9200/data/plan/0?pretty'

{
  "_index" : "data",
  "_type" : "plan",
  "_id" : "0",
  "_version" : 1,
  "found" : true,
  "_source" : {
    "premium" : 124.36518045494863,
    "providers" : [ "Jamal Barrows Jr.", "Clinton Brekke PhD", "Mr. Eusebio Leuschke V", "Patrice Leannon", "Winnifred Medhurst", "Dr. Raelynn Schumm", "Mortimer Kling", "Chauncey Kutch DVM", "Jessenia Barrows", "Kyra Erdman" ],
    "url" : "http://www.tremblay.com/",
    "level" : "Silver",
    "state" : "ND",
    "plan_name" : "Gutmann, Stokes and Hackett"
  }
}


In [15]:
# Look a plan up by its full name via the not_analyzed `plan_name.raw` sub-field
!curl 'http://ec2-54-215-248-141.us-west-1.compute.amazonaws.com:9200/data/plan/_search?pretty' -d '{\
"query": {\
"match": {\
"plan_name.raw": "Hirthe, Marks and Ernser"}}}'

{
  "took" : 53,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "hits" : {
    "total" : 1,
    "max_score" : 7.8855095,
    "hits" : [ {
      "_index" : "data",
      "_type" : "plan",
      "_id" : "21",
      "_score" : 7.8855095,
      "_source" : {
        "premium" : 117.17085564759421,
        "providers" : [ "Claiborne McDermott", "Clinton Brekke PhD", "Jamal Barrows Jr.", "Velia Abshire", "Mr. Alessandro Smitham V", "Dedrick Schmeler", "Rosanna Torphy", "Jiles Cummings", "Dr. Raelynn Schumm", "Haywood Kunde" ],
        "url" : "http://www.friesen.com/",
        "level" : "Gold",
        "state" : "ND",
        "plan_name" : "Hirthe, Marks and Ernser"
      }
    } ]
  }
}
