In [56]:
from elasticsearch_dsl import Document, Date, Integer, Float, Keyword, Text, GeoPoint, Object, InnerDoc, Nested
from elasticsearch_dsl.connections import connections

import json
import requests
import datetime

# Register the elasticsearch_dsl connection used by the Document calls below,
# pointing at an Elasticsearch node on localhost. The returned client is kept
# in `pooledConnection`, although nothing in the visible source refers to that
# name again.
pooledConnection = connections.create_connection(hosts=['localhost'])


In [61]:

# read the given file and parse the contents (from json to dict)
def prepareDictFromJsonString(filename):
    """Load a JSON document from ``filename`` and return the parsed value.

    The file is opened as UTF-8 text (the encoding JSON interchange files are
    expected to use) and is closed as soon as parsing finishes, even when
    parsing fails.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON value (a dict for the vocabulary file used here).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # `with` guarantees the handle is released; the previous version leaked it.
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

# data structure of one indexed book document (example values)
#{
# "name": "Data Mining, Southeast Asia Edition", # book name
# "authors": [ "Jiawei Han" ],
# "publisher": "Elsevier",
# "publish_date": "2006-04-06",
# "desc": ".....",
# "isbn10": "...",
# "isbn13": "...",
# "pages": 800,
# "print_type": "book",
# "book_category": ["computer"],
# "average_rating": 3.5,
# "ratings_count": 23,
# "image": "http://books.google.com/books/content?id=AfL0t-YzOrEC&printsec=frontcover&img=1&zoom=5&edge=curl&source=gbs_api",
# "language": "en",
# "search_category": "..." # vocabulary category whose token produced this hit
#}

class GoogleBook(Document):
    """Elasticsearch document describing one volume returned by the Google Books API.

    Instances are created and filled in by ``searchForBookByApi`` and written
    to the ``google_book_demo`` index by ``persistToES``.
    """

    # Book title: analysed as English text, with an un-analysed `raw` keyword
    # sub-field.
    name = Text(analyzer='english', fields={'raw': Keyword()})
    # List of author names (multi-valued).
    authors = Text(multi=True)
    publisher = Text()
    publish_date = Date()
    # Free-text description, analysed as English.
    desc = Text(analyzer='english')
    isbn10 = Keyword()
    isbn13 = Keyword()
    # Page count.
    pages = Integer()
    # Declared multi-valued, but searchForBookByApi assigns a single string.
    print_type = Keyword(multi=True)
    # Categories reported for the book (multi-valued).
    book_category = Keyword(multi=True)
    average_rating = Float()
    ratings_count = Integer()
    # Thumbnail URL, stored as an exact keyword.
    image = Keyword()
    # Declared multi-valued, but searchForBookByApi assigns a single string.
    language = Keyword(multi=True)
    # Vocabulary category whose token produced this hit; analysed as English
    # text with an un-analysed `raw` keyword sub-field.
    search_category = Text(analyzer='english', fields={'raw': Keyword()})
    
    class Index:
        # Name of the Elasticsearch index the documents are stored in.
        name = "google_book_demo"
        
    class Meta:
        # NOTE(review): elasticsearch_dsl appears to look for `doc_type`, not
        # `docType`, so this attribute is presumably ignored - confirm against
        # the installed library version before relying on it.
        docType = "_doc"

        
# Create the mappings for GoogleBook in Elasticsearch. This runs as soon as the
# module / notebook cell is executed, so it presumably needs the connection
# registered earlier in the file to be reachable - confirm.
GoogleBook.init()
        
    
# Endpoint and request settings for the Google Books volume search.
_BOOKS_API_URL = "https://www.googleapis.com/books/v1/volumes"
_BOOKS_API_MAX_RESULTS = 20
# Upper bound (seconds) for one HTTP request; without it a stalled connection
# would block the whole run indefinitely.
_BOOKS_API_TIMEOUT_SECONDS = 30

# `publishedDate` layouts that are tried, in this order, before giving up.
_PUBLISHED_DATE_FORMATS = ('%Y-%m-%d', '%Y', '%Y-%m')

# volumeInfo key -> GoogleBook attribute, for values that are copied unchanged.
_COPIED_FIELDS = (
    ('language', 'language'),
    ('title', 'name'),
    ('publisher', 'publisher'),
    ('description', 'desc'),
    ('printType', 'print_type'),
)


def _parsePublishedDate(rawDate):
    """Parse a `publishedDate` value, falling back to 1970-01-01 if nothing matches."""
    for fmt in _PUBLISHED_DATE_FORMATS:
        try:
            return datetime.datetime.strptime(rawDate, fmt)
        except (ValueError, TypeError):
            # ValueError: the layout does not match; TypeError: not a string.
            continue
    return datetime.datetime(1970, 1, 1)


def _buildBook(info, searchCategory):
    """Build one GoogleBook from a `volumeInfo` dict and its search category."""
    inst = GoogleBook(search_category=searchCategory)

    # language, title, publisher, description, print type
    for apiKey, attrName in _COPIED_FIELDS:
        if apiKey in info:
            setattr(inst, attrName, info[apiKey])

    # image
    if 'imageLinks' in info and 'smallThumbnail' in info['imageLinks']:
        inst.image = info['imageLinks']['smallThumbnail']

    # authors array
    for author in info.get('authors', []):
        inst.authors.append(author)

    # publish date (left unset when the API does not report one)
    if 'publishedDate' in info:
        inst.publish_date = _parsePublishedDate(info['publishedDate'])

    # isbn10 and isbn13; entries lacking a type or identifier are ignored
    for ident in info.get('industryIdentifiers', []):
        identifier = ident.get('identifier')
        if identifier is None:
            continue
        if ident.get('type') == 'ISBN_10':
            inst.isbn10 = identifier
        elif ident.get('type') == 'ISBN_13':
            inst.isbn13 = identifier

    # category(s)
    for category in info.get('categories', []):
        inst.book_category.append(category)

    # numeric fields default to zero when the API omits them
    inst.average_rating = float(info.get('averageRating', 0.0))
    inst.ratings_count = int(info.get('ratingsCount', 0))
    inst.pages = int(info.get('pageCount', 0))

    return inst


# search for book volumes based on given token / keyword (using Google books api)
def searchForBookByApi(vocabDict):
    """Query the Google Books API for every vocabulary token and build documents.

    Args:
        vocabDict: Dict with a ``'vocabulary'`` list; each entry carries a
            ``'category'`` label and a ``'tokens'`` list of search keywords.

    Returns:
        A list of unsaved ``GoogleBook`` instances, one per returned volume,
        each tagged with the category of the token that found it. Result items
        that carry no ``volumeInfo`` are skipped.

    Raises:
        KeyError: If ``vocabDict`` or one of its entries lacks an expected key.
        requests.RequestException: If a request fails or exceeds the timeout.
    """
    entityList = []

    for catItem in vocabDict['vocabulary']:
        searchCategory = catItem['category']

        for token in catItem['tokens']:
            # A fresh parameter dict per request; nothing is shared or mutated.
            params = {"q": token, "maxResults": _BOOKS_API_MAX_RESULTS}
            jsonData = requests.get(
                url=_BOOKS_API_URL,
                params=params,
                timeout=_BOOKS_API_TIMEOUT_SECONDS,
            ).json()

            # Responses without an 'items' entry contribute nothing.
            for item in jsonData.get('items', []):
                info = item.get('volumeInfo')
                if info is None:
                    continue
                entityList.append(_buildBook(info, searchCategory))

    return entityList


def persistToES(entityList):
    """Save every document in ``entityList`` by calling its ``save()`` method.

    Documents are saved one at a time, in list order. Nothing is returned.
    """
    for doc in entityList:
        doc.save()
        

In [62]:

# main flow: load the vocabulary, fetch matching books, then store them

# 'vocabulary' is a path relative to the current working directory.
vocabDict = prepareDictFromJsonString('vocabulary')
# One API request is issued per vocabulary token; all results are held in memory.
entityList = searchForBookByApi(vocabDict)
print('start to persist...')
# Each collected document is saved individually.
persistToES(entityList)



start to persist...
