This Python code searches the ERIC API and uploads the results to an Elastic index named "search-eric".

In [81]:
import json
import os
import time

import numpy as np
import pandas as pd
import requests

In [82]:
#url = "https://api.ies.ed.gov/eric/?search=mental%20health%20risks%20in%20construction%20industry&rows=200&format=json&start=0"
#responseJson = requests.get(url).json()

In [83]:
#responseJson

In [84]:
def getEricRecords(search, fields = None, start=0, rows=2000, timeout=120):
    """Fetch one page of ERIC search results and return the decoded JSON as a DataFrame.

    Parameters
    ----------
    search : str
        Query text. It is concatenated into the URL verbatim, so it must
        already be URL-encoded (e.g. ``%20`` for spaces).
    fields : str or list of str, optional
        Field name(s) to request; sent as a comma-separated ``fields`` parameter.
    start : int
        Offset of the first record to return.
    rows : int
        Number of records to request for this page.
    timeout : float
        Seconds to wait for the HTTP request before giving up.

    Returns
    -------
    pandas.DataFrame
        Built directly from the JSON payload. Other cells in this notebook
        read its ``numFound`` and ``docs`` rows.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    """
    url = (
        'https://api.ies.ed.gov/eric/?'
        + 'search=' + search
        + '&rows=' + str(rows)
        + '&format=json&start=' + str(start)
    )
    if fields:
        # A bare string would otherwise be joined character by character.
        if isinstance(fields, str):
            fields = [fields]
        # No space after the comma: a literal space is not valid inside a URL.
        url = url + '&fields=' + ','.join(fields)
    print(url)
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
    response.raise_for_status()
    return pd.DataFrame(response.json())

In [85]:
def getRecordCount(search):
    """Return the number of ERIC records matching ``search``.

    Parameters
    ----------
    search : str
        Query text, passed unchanged to ``getEricRecords``.

    Returns
    -------
    int
        The ``numFound`` value reported by the API for this query.
    """
    # Only the numFound metadata is needed, so request a single row instead
    # of downloading a full default-sized page of documents.
    dataFrame = getEricRecords(search, rows=1)
    # .iloc[0] is explicit positional access; plain [0] on a label-indexed
    # Series is deprecated in recent pandas releases.
    totalRecords = int(dataFrame.loc['numFound'].iloc[0])
    print('Search', search, 'returned', totalRecords, 'records')
    return totalRecords

In [86]:
def cleanElementsUsingList(x):
    """Flatten a list cell into a single comma-separated string.

    Values that are not lists are handed back untouched. An empty list, or a
    list holding only one empty string, becomes ``None``; any other list is
    joined with ``', '``.
    """
    if isinstance(x, list):
        isBlank = len(x) == 0 or (len(x) == 1 and x[0] == '')
        return None if isBlank else ', '.join(x)
    return x

In [87]:
def getAllEricRecords(search, fields = None, cleanElements = True, maxRecords = 32000):
    """Page through the ERIC API and collect the matching records in one DataFrame.

    Parameters
    ----------
    search : str
        Query text, passed unchanged to ``getEricRecords``.
    fields : list of str, optional
        Field names to request, passed unchanged to ``getEricRecords``.
    cleanElements : bool
        When True, every cell is passed through ``cleanElementsUsingList``.
    maxRecords : int or None
        Upper bound on the number of records collected. The default keeps the
        32000 limit this function previously hard-coded; ``None`` removes it.

    Returns
    -------
    pandas.DataFrame
        One row per record; empty when the search has no results.
    """
    pageSize = 2000
    pages = []
    nextFirstRecord = 0
    totalRecords = None  # unknown until the first page reports numFound

    while totalRecords is None or nextFirstRecord < totalRecords:
        dataFrame = getEricRecords(search, fields, nextFirstRecord, pageSize)
        if totalRecords is None:
            # The first page already carries the match count, so no separate
            # counting request is needed.
            numFound = int(dataFrame.loc['numFound'].iloc[0])
            if numFound == 0:
                print ('Search', search, 'has no results')
                return pd.DataFrame()
            totalRecords = numFound if maxRecords is None else min(numFound, maxRecords)
        docs = dataFrame.loc['docs'].iloc[0]
        if not docs:
            # An empty page means there is nothing further to fetch.
            break
        pages.append(pd.DataFrame(docs))
        nextFirstRecord += pageSize

    if not pages:
        return pd.DataFrame()

    # Concatenate once at the end; growing a frame inside the loop re-copies
    # all earlier rows on every iteration.
    records = pd.concat(pages, sort=False, ignore_index=True).iloc[:totalRecords]
    if not cleanElements:
        return records
    # DataFrame.map replaces the deprecated applymap in newer pandas; look it
    # up on the class so a data column named "map" cannot shadow it.
    if hasattr(pd.DataFrame, 'map'):
        return pd.DataFrame.map(records, cleanElementsUsingList)
    return records.applymap(cleanElementsUsingList)

In [88]:
# Candidate queries; the two commented-out lines are alternatives to the active one.
# Spaces are written as %20 because the query text is placed into the request
# URL as-is by getEricRecords.
#search = "mental%20health%20risks%20in%20construction%20industry"
#search = 'subject:mental%20health%20risks%20in%20construction%20industry AND publicationdateyear:2019'
search = 'subject:mental%20health%20risks%20in%20construction%20industry'
# Page through the API results and keep them in `records`, which the
# Elasticsearch upload further down reads.
records = getAllEricRecords(search)
# Print a summary of the downloaded frame (columns, dtypes, non-null counts).
records.info()

https://api.ies.ed.gov/eric/?search=subject:mental%20health%20risks%20in%20construction%20industry&rows=2000&format=json&start=0
https://api.ies.ed.gov/eric/?search=subject:mental%20health%20risks%20in%20construction%20industry&rows=2000&format=json&start=2000
https://api.ies.ed.gov/eric/?search=subject:mental%20health%20risks%20in%20construction%20industry&rows=2000&format=json&start=4000
https://api.ies.ed.gov/eric/?search=subject:mental%20health%20risks%20in%20construction%20industry&rows=2000&format=json&start=6000
https://api.ies.ed.gov/eric/?search=subject:mental%20health%20risks%20in%20construction%20industry&rows=2000&format=json&start=8000
https://api.ies.ed.gov/eric/?search=subject:mental%20health%20risks%20in%20construction%20industry&rows=2000&format=json&start=10000
https://api.ies.ed.gov/eric/?search=subject:mental%20health%20risks%20in%20construction%20industry&rows=2000&format=json&start=12000
https://api.ies.ed.gov/eric/?search=subject:mental%20health%20risks%20in%20co

In [89]:
#records.to_csv("/home/shiva/Downloads/eric_response.csv")

In [90]:
#records

Write output to Elastic Index

In [91]:
from elasticsearch import Elasticsearch
import eland as ed
import pandas as pd
from re import sub

## Setup Variables

#elasticHost      = 'localhost'
#elasticUsername  = 'elastic'
#elasticPassword  = 'elastic'
#elasticScheme    = 'http'
#elasticPort      =  9200
#elasticTimeout   =  100
elasticOpaqueId  = 'python-eland-requests'

# Connection secrets come from the environment so they are never stored in the
# notebook. Set ELASTIC_CLOUD_ID and ELASTIC_PASSWORD (and optionally
# ELASTIC_USERNAME, which defaults to "elastic") before running this cell;
# a missing required variable raises KeyError here rather than failing later.
# NOTE(review): the password that used to be hard-coded in this cell should be
# treated as exposed and rotated.
es = Elasticsearch(
    cloud_id=os.environ["ELASTIC_CLOUD_ID"],
    basic_auth=(
        os.environ.get("ELASTIC_USERNAME", "elastic"),
        os.environ["ELASTIC_PASSWORD"],
    ),
    opaque_id=elasticOpaqueId
)

## Setup Connection to Elasticsearch
##es = Elasticsearch(
##    [elasticHost],
##    http_auth=(elasticUsername, elasticPassword),
##    scheme=elasticScheme,
##    port=elasticPort,
##    request_timeout=elasticTimeout,
##    opaque_id=elasticOpaqueId
##)

## Create a Pandas Dataframe of Data to be Loaded into Elasticsearch
##df = pd.read_json('')

## Replace NaN (null) Values with Zero
# fillna() without inplace returns a new frame, so `records` keeps its original
# values and column names. Assigning `df = records` and filling in place would
# modify `records` itself and make re-running this cell change the result.
df = records.fillna(0)

# Rename the Columns to be Camel Case
def camel_case_string(string):
    """Convert a snake_case or kebab-case name to camelCase.

    Runs of ``_`` and ``-`` are treated as word separators, each word is
    title-cased, the words are joined, and the first character is lower-cased.
    A name with nothing left after removing separators (empty, or made only of
    ``_``/``-``) is returned unchanged instead of raising IndexError.
    """
    joined = sub(r"(_|-)+", " ", string).title().replace(" ", "")
    if not joined:
        return string
    return joined[0].lower() + joined[1:]
# Apply the camelCase conversion to every column name of the frame being uploaded.
df.columns = [camel_case_string(x) for x in df.columns]

## Save the Data into Elasticsearch
# `df` is rebound to whatever pandas_to_eland returns
# (presumably an eland DataFrame backed by the index - confirm in the eland docs).
df = ed.pandas_to_eland(
    pd_df=df,
    es_client=es,
    # Where the data will live in Elasticsearch
    es_dest_index="search-eric",
    # Optional per-column type overrides; currently disabled, so eland's
    # default type mapping is used for every column.
    #es_type_overrides={
    #    "name": "text",
    #    "year": "date"
    #},
    # If the index already exists, append the new rows to it (not replace)
    es_if_exists="append",
    # Wait for data to be indexed before returning
    es_refresh=True,
)


  for column, dtype in dataframe.dtypes.iteritems():
