# Index into Elasticsearch Example

In [174]:
from datetime import datetime
from elasticsearch import Elasticsearch
import json
import os

import hashlib
import eland as ed
import pandas as pd
from sqlalchemy import create_engine

In [175]:
es = Elasticsearch("http://localhost:9200")

# Let's index some real data

In [176]:
engine = create_engine('postgresql://username:password@localhost:5432/default_database?gssencmode=disable')
# select all from papers table
df = pd.read_sql_table('papers', engine)

In [177]:
df

Unnamed: 0,name,scholar_link,doi,bibtext,pdf_name,year,scholar_page,journal,downloaded,downloaded_from,authors
0,The free-energy principle: a unified brain theory?,https://www.nature.com/articles/nrn2787/boxes/bx1,10.1038/nrn2787,True,The free-energy principle_ a unified brain theory_.pdf,2010,"https://scholar.google.com/scholar?hl=en&q=Free Energy Principle&as_vis=1&as_sdt=1,5&start=%d&as_ylo=2010",Nature Reviews Neuroscience,True,SciHub,Karl Friston
1,How particular is the physics of the free energy principle?,https://www.sciencedirect.com/science/article/pii/S1571064521000749,10.1016/j.plrev.2022.05.002,True,,2022,"https://scholar.google.com/scholar?hl=en&q=Free Energy Principle&as_vis=1&as_sdt=1,5&start=%d&as_ylo=2010",Physics of Life Reviews,False,,Karl Friston
2,Is the free-energy principle neurocentric?,https://www.nature.com/articles/nrn2787-c2,10.1038/nrn2787-c2,True,Is the free-energy principle neurocentric_.pdf,2010,"https://scholar.google.com/scholar?hl=en&q=Free Energy Principle&as_vis=1&as_sdt=1,5&start=%d&as_ylo=2010",Nature Reviews Neuroscience,True,SciHub,Karl Friston
3,Some interesting observations on the free energy principle,https://www.mdpi.com/1235352,10.3390/e23081076,True,,2021,"https://scholar.google.com/scholar?hl=en&q=Free Energy Principle&as_vis=1&as_sdt=1,5&start=%d&as_ylo=2010",Entropy,False,,Karl J. Friston and Lancelot Da Costa and Thomas Parr
4,The math is not the territory: navigating the free energy principle,https://link.springer.com/article/10.1007/s10539-021-09807-0,10.1007/s10539-021-09807-0,True,The math is not the territory_ navigating the free energy principle.pdf,2021,"https://scholar.google.com/scholar?hl=en&q=Free Energy Principle&as_vis=1&as_sdt=1,5&start=%d&as_ylo=2010",Biology {&}amp$mathsemicolon$ Philosophy,True,SciHub,Mel Andrews
5,A technical critique of some parts of the free energy principle,https://www.mdpi.com/1099-4300/23/3/293,10.3390/e23030293,True,A technical critique of some parts of the free energy principle.pdf,2021,"https://scholar.google.com/scholar?hl=en&q=Free Energy Principle&as_vis=1&as_sdt=1,5&start=%d&as_ylo=2010",Entropy,True,SciHub,Martin Biehl and Felix A. Pollock and Ryota Kanai
6,What does the free energy principle tell us about the brain?,https://arxiv.org/abs/1901.07945,,False,What does the free energy principle tell us about the brain_.pdf,2019,"https://scholar.google.com/scholar?hl=en&q=Free Energy Principle&as_vis=1&as_sdt=1,5&start=%d&as_ylo=2010",,True,SciHub,SJ Gershman
7,"The free energy principle: it's not about what it takes, it's about what took you there",https://link.springer.com/article/10.1007/s10539-021-09787-1,10.1007/s10539-021-09787-1,True,The free energy principle_ it_s not about what it takes_ it_s about what took you there.pdf,2021,"https://scholar.google.com/scholar?hl=en&q=Free Energy Principle&as_vis=1&as_sdt=1,5&start=%d&as_ylo=2010",Biology {&}amp$mathsemicolon$ Philosophy,True,SciHub,Axel Constant
8,Living models or life modelled? On the use of models in the free energy principle,https://journals.sagepub.com/doi/abs/10.1177/1059712320918678,10.1177/1059712320918678,True,Living models or life modelled_ On the use of models in the free energy principle.pdf,2020,"https://scholar.google.com/scholar?hl=en&q=Free Energy Principle&as_vis=1&as_sdt=1,5&start=%d&as_ylo=2010",Adaptive Behavior,True,SciHub,Thomas van Es
9,"Self-supervision, normativity and the free energy principle",https://link.springer.com/article/10.1007/s11229-020-02622-2,10.1007/s11229-020-02622-2,True,Self-supervision_ normativity and the free energy principle.pdf,2020,"https://scholar.google.com/scholar?hl=en&q=Free Energy Principle&as_vis=1&as_sdt=1,5&start=%d&as_ylo=2010",Synthese,True,SciHub,Jakob Hohwy


In [178]:
df['authors']=df['authors'].str.split('and')
# create md5 hash of each row of the dataframe
df['md5'] = df.apply(lambda x: hashlib.md5(str(x).encode('utf-8')).hexdigest(), axis=1)
pdfs_dir = '../downloads'
# filter the dataframe to only include pdfs that exist in the pdfs directory
df = df[~df.pdf_name.isnull()]
# df = df[df.pdf_name.apply(lambda x: os.path.exists(os.path.join(pdfs_dir, x)))]

In [179]:
# pandas setting show full string for each cell
pd.set_option('display.max_colwidth', -1)
df['path_exists'] = df.pdf_name.apply(lambda x: os.path.exists(os.path.join(pdfs_dir, x)))
df[['pdf_name', 'path_exists']]

  pd.set_option('display.max_colwidth', -1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['path_exists'] = df.pdf_name.apply(lambda x: os.path.exists(os.path.join(pdfs_dir, x)))


Unnamed: 0,pdf_name,path_exists
0,The free-energy principle_ a unified brain theory_.pdf,True
2,Is the free-energy principle neurocentric_.pdf,True
4,The math is not the territory_ navigating the free energy principle.pdf,True
5,A technical critique of some parts of the free energy principle.pdf,True
6,What does the free energy principle tell us about the brain_.pdf,True
7,The free energy principle_ it_s not about what it takes_ it_s about what took you there.pdf,True
8,Living models or life modelled_ On the use of models in the free energy principle.pdf,True
9,Self-supervision_ normativity and the free energy principle.pdf,True
10,First principles in the life sciences_ the free-energy principle_ organicism_ and mechanism.pdf,True
11,The free energy principle made simpler but not too simple.pdf,True


Users might want to filter on paper name, journal, year, authors. So let's include all of these in opensearch.

In [180]:
def create_document(row):
    return {
        'authors': row['authors'],
        'year': row['year'],
        'timestamp': datetime.now(),
        'journal': row['journal'],
        'title': row['name'],
        'id': row['md5'],
    }

path = '../data/ocr/'
index_name = 'free-energy-principle'

In [195]:
# Delete index if it exists. For testing.
es.indices.delete(index=index_name, ignore=[400, 404])

  es.indices.delete(index=index_name, ignore=[400, 404])


ObjectApiResponse({'acknowledged': True})

In [194]:
for ix, file in enumerate(os.listdir(path)):
    pdf_name = file.split('.')[0]
    with open(path+file, 'r') as f:
        data = json.load(f)
        for page in data['pages']:
            for block in page['blocks']:
                text = block['text']
                # create dict from row of df with name column equal to file_name
                df_row = df.loc[df['pdf_name'].str.split('.').str[0] == pdf_name]
                # if row_of_interest is empty, skip to next file
                if len(df_row) == 0:
                    print('No row found for file: ' + pdf_name)
                    continue
                row_of_interest = df_row.to_dict('records')[0]
                doc = create_document(row_of_interest)
                doc['text'] = text
                resp = es.index(index=index_name, document=doc, id=row_of_interest['md5'])
                print('Indexed document: ' + str(ix) + ' of ' + str(len(os.listdir(path))))

es.indices.refresh(index=index_name)

resp = es.search(index=index_name, query={"match_all": {}})
print("Got %d Hits:" % resp['hits']['total']['value'])

Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8
Indexed document: 0 of 8


In [190]:
resp['hits']

{'total': {'value': 7, 'relation': 'eq'},
 'max_score': 1.0,
 'hits': [{'_index': 'free-energy-principle',
   '_id': '09c77536c10b63dd1a622978b4e6e225',
   '_score': 1.0,
   '_source': {'authors': ['Karl Friston'],
    'year': 2010,
    'timestamp': '2022-07-01T17:52:22.129345',
    'journal': 'Nature Reviews Neuroscience',
    'title': 'The free-energy principle: a unified brain theory?',
    'id': '09c77536c10b63dd1a622978b4e6e225',
    'text': 'SUPPLEMENTARY\nINFORMATION\nSee online article: 1 box)|52 (bas) | 51 (bas) | 54 (bas) |\nALL LINKS ARE ACTIVE IN THE ONLINE POT'}},
  {'_index': 'free-energy-principle',
   '_id': 'fd9e5ea50214557164cb202326bb2fb0',
   '_score': 1.0,
   '_source': {'authors': ['Andrew Sims'],
    'year': 2016,
    'timestamp': '2022-07-01T17:52:22.465213',
    'journal': 'Philosophical Psychology',
    'title': 'A problem of scope for the free energy principle as a theory of cognition',
    'id': 'fd9e5ea50214557164cb202326bb2fb0',
    'text': 'Lyon, P. (2005

In [78]:
# Connect to free-energy-principle index via localhost:9200 elasticsearch node. Back to pandas style code for familiarity.
df = ed.DataFrame(es, es_index_pattern=index_name)

In [79]:
df

Unnamed: 0,authors,id,journal,text,timestamp,title,year
60777de6dde381179751a31d91aefc60,"[Jinjian Wu , Guangming Shi , Weisi Lin , Anmin Liu , Fei Qi]",60777de6dde381179751a31d91aefc60,{IEEE} Transactions on Multimedia,"This article has been accepted for publication in a future issue of this journal, but has not been fully edited. Content may change prior to final publication.",2022-07-01 16:19:57.207938,Just noticeable difference estimation for images with free-energy principle,2013
ea2b8e44843b8b95c7ff76f231d228f0,"[Ajith Anil Meera , Martijn Wisse]",ea2b8e44843b8b95c7ff76f231d228f0,{IEEE},We thank Prof. Robert Babuska for providing us with\nthe motor data. We also thank Karl Friston and Sherin\nGrimbergen for the insightful discussions on FEP.,2022-07-01 16:19:57.393259,Free energy principle based state and input observer design for linear systems with colored noise,2020
8ad0e694e6b424be1d4fa4a3e85c97c1,[Mark Solms],8ad0e694e6b424be1d4fa4a3e85c97c1,Frontiers in Psychology,"Squire, L. (2004). Memory systems of the brain: A brief history and current\nperspective. Neurobiology of Learning and Memory. 82: 171-177",2022-07-01 16:19:58.647032,The hard problem of consciousness and the free energy principle,2019
88af8f92586380586112650e3c27dab3,"[Takuya Isomura , Kiyoshi Kotani , Yasuhiko Jimbo]",88af8f92586380586112650e3c27dab3,{PLOS} Computational Biology,"1. Belouchrani A, Abed-Meraim K, Cardoso JF, Moulines E (1997) A blind source separation technique\nusing second-order statistics. Signal Processing IEEE Trans on 45(2): 434-444.",2022-07-01 16:19:59.056078,Cultured cortical neurons can perform blind source separation according to the free-energy principle,2015
d877aafb31e46a5beff496121452f9ec,"[Wenhan Zhu , Guangtao Zhai , Yutao Liu , Ning Lin , Xiaokang Yang]",d877aafb31e46a5beff496121452f9ec,{IEEE},"This work was supported by the National Science Founda-\ntion of China (61521062, 61527804) and Science and Technol-\nogy Commission of Shanghai Municipality (15DZ0500200).",2022-07-01 16:19:59.190375,Reduced-reference image quality assessment in free-energy principle and sparse representation,2018
1eb30b5ac534e71c1f32a8b416163eb4,[Michael D. Kirchhoff],1eb30b5ac534e71c1f32a8b416163eb4,Australasian Journal of Philosophy,"Friston, Karl 2011. Embodied Inference: Or 1 Think Therefore I Am, If I Am What I Think', in The Implica-\ntions of Embodiment (Cognition and Communication), ed. Wolfgang Tschacher and Claudia Ber...",2022-07-01 16:19:59.616018,Species of realization and the free energy principle,2014


# Index in bulk

In [169]:
# https://github.com/climatepolicyradar/navigator/blob/aa5c1de51d9ff29d7c8f3bd2d9a577cfcaf1f9ba/search-index/app/index.py

In [170]:
from elasticsearch.helpers import bulk, streaming_bulk

In [None]:
from typing import Iterable


class ElasticSearchSIndexer:
    """Methods for indexing data into Elasticsearch."""

    def __init__(self, es_client, index_name):
        self.es_client = es_client
        self.index_name = index_name
        self.es_client.indices.create(index=index_name, ignore=400)

    def _index_body(self) -> dict:
        """Return the body of the index request."""
        return {
            '_index': self.index_name,
            '_type': self.doc_type,
            '_source': {
                'timestamp': datetime.now(),
                'text': self.text,
                'title': self.title,
                'authors': self.authors,
                'journal': self.journal,
                'year': self.year,
            }
        }

    def bulk_index(self, es: Elasticsearch, actions: Iterable[dict]):
        """Bulk load."""
        bulk(es, actions)