# Importing Libraries

In [1]:
pip install elasticsearch

Note: you may need to restart the kernel to use updated packages.


In [2]:
import json
import logging
from pprint import pprint
from time import sleep

import requests
from bs4 import BeautifulSoup
from elasticsearch import Elasticsearch

In [3]:
def connect_elasticsearch():
    """ connect and check the elasticsearch connection
    """
    _es = None
    _es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    if _es.ping():
        print('Elasticsearch Connected')
    else:
        print('Elasticsearch could not connect!')
    return _es

In [4]:
es = connect_elasticsearch()

Elasticsearch Connected


# Indexing

In [5]:
def create_index(es_object, index_name, settings):
    """ To create an index
    """
    created = False
    try:
        if not es_object.indices.exists(index_name):
            # Ignore 400 means to ignore "Index Already Exist" error.
            es_object.indices.create(index=index_name, ignore=400, body=settings)
            print('Created Index')
        created = True
    except Exception as ex:
        print(str(ex))
    finally:
        return created

#### Default Model

In [6]:
# index settings
base_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "books": {
            "dynamic": "strict",
            "properties": {
                "id": {
                    "type": "integer"
                },
                "title": {
                    "type": "text"
                },
                "author": {
                    "type": "text"
                },
                "abstract": {
                    "type": "text"
                },
            }
        }
    }
}
# create_index(es, 'est_base', base_settings)

#### BM25 Model

In [7]:
bm25_model = {
    "settings" : {
        "analysis" : {
            "analyzer" : {
                "my_analyzer":{ 
                   "type":"custom",
                   "tokenizer":"standard",
                   "filter":[
                      "lowercase"
                   ]
                },
                "my_stop_analyzer":{ 
                   "type":"custom",
                   "tokenizer":"standard",
                   "filter":[
                      "lowercase",
                      "english_stop"
                   ]
                }
            }
        },
        "index" : {
            "similarity" : {
                "bm25" : {
                    "type" : "BM25",
                    "b" : 0.75,
                    "k1" : 1.2
                }
            }
        }
    },
    "mappings" : {
        "properties" : {
            "id" : {"type" : "text"},
            "title" : {"type" : "text", "analyzer" : "my_analyzer", "search_analyzer":"my_stop_analyzer", "similarity" : "bm25"},
            "author" : {"type" : "text"},
            "abstract" : {"type" : "text", "analyzer" : "my_analyzer", "search_analyzer":"my_stop_analyzer", "similarity" : "bm25"}
        }
    }
}
# create_index(es, 'est_bm25', bm25_model)

#### DFR Model

In [8]:
dfr_model = {
    "settings" : {
        "analysis" : {
            "analyzer" : {
                "my_analyzer":{ 
                   "type":"custom",
                   "tokenizer":"standard",
                   "filter":[
                      "lowercase"
                   ]
                },
                "my_stop_analyzer":{ 
                   "type":"custom",
                   "tokenizer":"standard",
                   "filter":[
                      "lowercase",
                      "english_stop"
                   ]
                }
            }
        },
        "index" : {
            "similarity" : {
                "dfr" : {
                    "type" : "DFR",
                    "basic_model" : 'in',
                    "after_effect" : 'b',
                    "normalization" : 'no'
                }
            }
        }
    },
    "mappings" : {
        "properties" : {
            "id" : {"type" : "integer"},
            "title" : {"type" : "text", "analyzer" : "my_analyzer", "search_analyzer":"my_stop_analyzer", "similarity" : "dfr"},
            "author" : {"type" : "text"},
            "abstract" : {"type" : "text", "analyzer" : "my_analyzer", "search_analyzer":"my_stop_analyzer", "similarity" : "dfr"}
        }
    }
}
# create_index(es, 'est_bm25', bm25_model)

### IB Model

In [9]:
ib_model = {
    "settings" : {
        "analysis" : {
            "analyzer" : {
                "my_analyzer":{ 
                   "type":"custom",
                   "tokenizer":"standard",
                   "filter":[
                      "lowercase"
                   ]
                },
                "my_stop_analyzer":{ 
                   "type":"custom",
                   "tokenizer":"standard",
                   "filter":[
                      "lowercase",
                      "english_stop"
                   ]
                }
            }
        },
        "index" : {
            "similarity" : {
                "ib" : {
                    "type" : "IB",
                    "distribution" : "spl",
                    "lambda" : "ttf",
                    "normalization" : "z"
                }
            }
        }
    },
    "mappings" : {
        "properties" : {
            "id" : {"type" : "integer"},
            "title" : {"type" : "text", "analyzer" : "my_analyzer", "search_analyzer":"my_stop_analyzer", "similarity" : "ib"},
            "author" : {"type" : "text"},
            "abstract" : {"type" : "text", "analyzer" : "my_analyzer", "search_analyzer":"my_stop_analyzer", "similarity" : "ib"}
        }
    }
}
# create_index(es, 'est_ib', ib_model)

In [10]:
def store_record(elastic_object, index_name, record):
    """ store documents in created indexes
    """
    is_stored = True
    try:
        outcome = elastic_object.index(index=index_name, doc_type='books', body=record)
        print(outcome)
    except Exception as ex:
        print('Error in indexing data')
        print(str(ex))
        is_stored = False
    finally:
        return is_stored

In [14]:
def loadData():
    """ extract and index the data
    """
    with open('cisi-kaggle/CISI.ALL') as f:
        lines = ""
        for l in f.readlines():
            lines += "\n" + l.strip() if l.startswith(".") else " " + l.strip()
        lines = lines.lstrip("\n").split("\n")
    
    for l in lines:
        if l.startswith(".I"):
            doc_set = {}
            doc_id = int(l.split(" ")[1].strip())
            doc_set["id"] = doc_id
        elif l.startswith(".T"):
            doc_set["title"] = l.strip()[3:]
        elif l.startswith(".A"):
            doc_set["author"] = l.strip()[3:]
        elif l.startswith(".W"):
            doc_set["abstract"] = l.strip()[3:]
        else:
            result = json.dumps(doc_set)
            if es is not None:
                if create_index(es, 'est_base', base_settings):
                    store_record(es, 'est_base', result)
                if create_index(es, 'est_bm25', bm25_model):
                    store_record(es, 'est_bm25', result)
                if create_index(es, 'est_ib', ib_model):
                    store_record(es, 'est_ib', result)
                if create_index(es, 'est_dfr', dfr_model):
                    store_record(es, 'est_dfr', result)
    print('Data indexed successfully')

In [55]:
# read the dataset and index the documents
# UNCOMMENT TO CREATE
# loadData()

In [12]:
# To delete an existing Elasticsearch index
# UNCOMMENT TO DELETE
# es.indices.delete(index='est_dfr')

{'acknowledged': True}

# Retrieval

In [16]:
def search(es_object, index_name, search):
    """ search the index to display the related documents
    """
    res = es_object.search(index=index_name, body=search)
    return res

#### **Retrieval using BM25**

Retrieving all the books title for which the author name is 'Palmour' using BM25 model.

In [17]:
# write a query to retrieve the related documents
retrieval_index = 'est_bm25'

query =  {'match_phrase': {'author': 'Palmour'}}
search_object = {'_source': ['title'], 'query': {'match_phrase': {'author': 'Palmour'}}}
search_results = search(es, 'est_bm25', json.dumps(search_object))
pprint(search_results)

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'hits': {'hits': [{'_id': '2Yc-1ngByg1E2ufPGkgR',
                    '_index': 'est_bm25',
                    '_score': 6.903818,
                    '_source': {'title': 'Access to Periodical Resources'},
                    '_type': 'books'},
                   {'_id': 'y4c_1ngByg1E2ufPa1J6',
                    '_index': 'est_bm25',
                    '_score': 6.903818,
                    '_source': {'title': 'Resources and Bibliographic Support '
                                         'for a Nationwide Library Program '
                                         'Final Report to the National '
                                         'Commission for Libraries and '
                                         'Information Science'},
                    '_type': 'books'},
                   {'_id': '1Yc_1ngByg1E2ufPjFOO',
                    '_index': 'est_bm25',
                    '_score': 6.903818,
         

In [18]:
for b in search_results['hits']['hits']:
    print("Book Title: {}".format(b['_source']['title']))

Book Title: Access to Periodical Resources
Book Title: Resources and Bibliographic Support for a Nationwide Library Program Final Report to the National Commission for Libraries and Information Science
Book Title: A Study of the Characteristics, Cost and Magnitude of Interlibrary Loans in Academic Libraries


#### **Retrieval using DFR**

Retrieving all the books title for which the author name is 'Palmour' using DFR model.

In [19]:
# write a query to retrieve the related documents
retrieval_index = 'est_dfr'

query =  {'match_phrase': {'author': 'Palmour'}}
search_object = {'_source': ['title'], 'query': {'match_phrase': {'author': 'Palmour'}}}
search_results = search(es, retrieval_index, json.dumps(search_object))
pprint(search_results)

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'hits': {'hits': [{'_id': '2oc-1ngByg1E2ufPGkhE',
                    '_index': 'est_dfr',
                    '_score': 6.903818,
                    '_source': {'title': 'Access to Periodical Resources'},
                    '_type': 'books'},
                   {'_id': 'zIc_1ngByg1E2ufPa1Kc',
                    '_index': 'est_dfr',
                    '_score': 6.903818,
                    '_source': {'title': 'Resources and Bibliographic Support '
                                         'for a Nationwide Library Program '
                                         'Final Report to the National '
                                         'Commission for Libraries and '
                                         'Information Science'},
                    '_type': 'books'},
                   {'_id': '1oc_1ngByg1E2ufPjFOz',
                    '_index': 'est_dfr',
                    '_score': 6.903818,
            

In [20]:
for b in search_results['hits']['hits']:
    print("Book Title: {}".format(b['_source']['title']))

Book Title: Access to Periodical Resources
Book Title: Resources and Bibliographic Support for a Nationwide Library Program Final Report to the National Commission for Libraries and Information Science
Book Title: A Study of the Characteristics, Cost and Magnitude of Interlibrary Loans in Academic Libraries


#### **Retrieval using IB**

Retrieving all the books title for which the author name is 'Palmour' using IB model.

In [21]:
# write a query to retrieve the related documents
retrieval_index = 'est_ib'

query =  {'match_phrase': {'author': 'Palmour'}}
search_object = {'_source': ['title'], 'query': {'match_phrase': {'author': 'Palmour'}}}
search_results = search(es, retrieval_index, json.dumps(search_object))
pprint(search_results)

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'hits': {'hits': [{'_id': 'Tv4gnngBYdI-tU1F124x',
                    '_index': 'est_ib',
                    '_score': 6.903818,
                    '_source': {'title': 'Access to Periodical Resources'},
                    '_type': 'books'},
                   {'_id': 'R_4hnngBYdI-tU1FhXM1',
                    '_index': 'est_ib',
                    '_score': 6.903818,
                    '_source': {'title': 'Resources and Bibliographic Support '
                                         'for a Nationwide Library Program '
                                         'Final Report to the National '
                                         'Commission for Libraries and '
                                         'Information Science'},
                    '_type': 'books'},
                   {'_id': 'zP4hnngBYdI-tU1FlnMh',
                    '_index': 'est_ib',
                    '_score': 6.903818,
               

In [24]:
for b in search_results['hits']['hits']:
    print("Book Title: {}".format(b['_source']['title']))

Book Title: Access to Periodical Resources
Book Title: Resources and Bibliographic Support for a Nationwide Library Program Final Report to the National Commission for Libraries and Information Science
Book Title: A Study of the Characteristics, Cost and Magnitude of Interlibrary Loans in Academic Libraries


# Evaluation

#### **Evaluating a single query**

In [25]:
# code to retrieve the related documents for the query being evaluated
q = "Image recognition and any other methods of automatically \
    transforming printed text into computer-ready form."

search_object = {"query": { "multi_match": { "query": q, "fields": ["abstract"] }}}
# search(es, 'est_bm25', json.dumps(search_object))

We use rating on a scale from zero to one, which means a document with a rating of 1 and above will be considered as “relevant” by setting the "relevant_rating_threshold" value as 1.
We also choose to evaluate the query only on the top 5 documents by setting the value of k as 5. This means that when those top 5 results contain one document with rating one or higher, the precision will be 1 / 5 = 0.2.

In [49]:
# name of an index
index_ = 'est_bm25'

# search request’s ID, used to group the result details later
req_id = 'author_query'

# query being evaluated
q = "Image recognition and any other methods of automatically \
    transforming printed text into computer-ready form."

# request section
rank_body = {
  "requests": [
    {
      "id": req_id,                                  
      "request": {                                              
          "query": { "multi_match": { "query": q, "fields": ["abstract"] }}
      },
      "ratings": [                                              
        { "_index": index_, "_id": 'G4c-1ngByg1E2ufP9E_e', "rating": 1 }
      ]
    }
  ],
   "metric": {
    "precision": {
      "k": 5,
      "relevant_rating_threshold": 1,
      "ignore_unlabeled": 'false'
    }
  }
}

The ranking evaluation API (rank_eval) allows you to evaluate the quality of ranked search results over a set of typical search queries. Given this set of queries and a list of manually rated documents, the rank_eval endpoint calculates and returns typical information retrieval metrics like mean reciprocal rank, precision etc. 

Top 5 results contain one document with rating one or higher, therefore the precision will be 1 / 5 = 0.2.

In [50]:
# call ranking evaluation API with specific index and query
pprint(es.rank_eval(index="est_bm25", body=rank_body))

{'details': {'author_query': {'hits': [{'hit': {'_id': 'w4c-1ngByg1E2ufP6k4d',
                                                '_index': 'est_bm25',
                                                '_score': 15.648968,
                                                '_type': 'books'},
                                        'rating': None},
                                       {'hit': {'_id': 'XYc-1ngByg1E2ufPd0tI',
                                                '_index': 'est_bm25',
                                                '_score': 15.393477,
                                                '_type': 'books'},
                                        'rating': None},
                                       {'hit': {'_id': 'G4c-1ngByg1E2ufP9E_e',
                                                '_index': 'est_bm25',
                                                '_score': 14.280847,
                                                '_type': 'books'},
                               

#### **Bulk Evaluation**

In [28]:
def getQueriesList():
    """ get the bulk queries to test
    """
    queries_list = {}
    with open('cisi-kaggle/CISI.QRY') as f:
        lines = ""
        for l in f.readlines():
            lines += "\n" + l.strip() if l.startswith(".") else " " + l.strip()
        lines = lines.lstrip("\n").split("\n")
    for l in lines:
        if l.startswith(".I"):
            doc_set = {}
            doc_id = int(l.split(" ")[1].strip())
            doc_set["id"] = doc_id
        elif l.startswith(".W"):
            doc_set["abstract"] = l.strip()[3:]
            queries_list[doc_set["id"]] = doc_set["abstract"]
    return queries_list

In [29]:
def getEvaluationData():
    """ get the evaluation data (queries and the related documents)
    """
    bulkq_ = []
    with open('cisi-kaggle/CISI2.REL') as f:
        query_id = ""
        for l in f.readlines():
            rel_queries = {}
            qry_id = l.lstrip(" ").strip("\n").split("\t")[0].split(" ")[0]
            rel_rating = l.lstrip(" ").strip("\n").split("\t")[1]
            doc_id = l.lstrip(" ").strip("\n").split("\t")[0].split(" ")[-1]
            if query_id == qry_id:
                l = bulkq_[-1]
                l['rel'].append(doc_id)
            else:
                rel_queries['qry_id'] = qry_id
                rel_queries['rating'] = rel_rating
                rel_queries['rel'] = [doc_id]
                bulkq_.append(rel_queries)
            query_id = qry_id
    return bulkq_

In [30]:
evaluation_data = getEvaluationData()
all_queries = getQueriesList()

def getQueryText(id):
    """ get the query text
        Args:
            id: document id
    """
    t = all_queries[id]
    return t

def getIndexId(index_model, doc_id):
    """ get the index id
        Args:
            index_model: index model name
            doc_id: document id
    """
    esb = es.search(index=index_model, doc_type="books", body={'query': {'match_phrase': {'id': doc_id}}})
    return esb['hits']['hits'][0]['_id']
    
def getRatings(index, rel):
    """ get the ratings list for requests
        Args:
            index: index model name
            rel:  relavance documents list
    """
    ratings = []
    for i in rel:
        ratings.append(
            { 
                "_index": index, 
                "_id": getIndexId(index, i), 
                "rating": 1 
            }
        )
    return ratings

def get_requests(index):
    """ create requests for evaluating the data
        Args:
            index: index model name
    """
    requests = []
    
    for query in evaluation_data:
        requests.append(
            {
                "id": query['qry_id'],
                "request": {
                    "query": { 
                        "multi_match": { 
                            "query": getQueryText(int(query['qry_id'])),
                            "fields": [
                                "abstract"
                            ] 
                        }
                    }
                },
                "ratings": getRatings(index, query['rel'])
            }
        )
    return requests

##### **BM25 Bulk Evaluation**

In [73]:
# PRECISION
index_ = 'est_bm25'
bulk_requests = get_requests(index_)
bulk_rank_body = {
  "requests": bulk_requests,
    "metric": {
    "precision": {
      "k": 5,
      "relevant_rating_threshold": 1,
      "ignore_unlabeled": 'false'
    }
  }
}
# call ranking evaluation API with specific index and query
model_score = es.rank_eval(index=index_, body=bulk_rank_body)
precision = model_score['metric_score']
print("BM25 precision: {}".format(precision))

BM25 precision: 0.33157894736842103


In [74]:
# MEAN RECIPROCAL RANK
index_ = 'est_bm25'
bulk_requests = get_requests(index_)
bulk_rank_body = {
  "requests": bulk_requests,
    "metric": {
        "mean_reciprocal_rank": {
          "k": 20,
          "relevant_rating_threshold": 1
        }
    }
}
# call ranking evaluation API with specific index and query
mrr_score = es.rank_eval(index=index_, body=bulk_rank_body)
print("BM25 mean reciprocal rank: {}".format(mrr_score['metric_score']))

BM25 mean reciprocal rank: 0.5813352454219327


##### **DFR Bulk Evaluation**

In [75]:
# PRECISION
index_ = 'est_dfr'
bulk_requests = get_requests(index_)
bulk_rank_body = {
  "requests": bulk_requests,
    "metric": {
    "precision": {
      "k": 5,
      "relevant_rating_threshold": 1,
      "ignore_unlabeled": 'false'
    }
  }
}
# call ranking evaluation API with specific index and query
model_score = es.rank_eval(index=index_, body=bulk_rank_body)
precision = model_score['metric_score']
print("DFR precision: {}".format(precision))

DFR precision: 0.33157894736842103


In [77]:
# MEAN RECIPROCAL RANK
index_ = 'est_dfr'
bulk_requests = get_requests(index_)
bulk_rank_body = {
  "requests": bulk_requests,
    "metric": {
        "mean_reciprocal_rank": {
          "k": 20,
          "relevant_rating_threshold": 1
        }
    }
}
# call ranking evaluation API with specific index and query
mrr_score = es.rank_eval(index=index_, body=bulk_rank_body)
print("DFR mean reciprocal rank: {}".format(mrr_score['metric_score']))

DFR mean reciprocal rank: 0.5813352454219327


##### **IB Bulk Evaluation**

In [78]:
# PRECISION
index_ = 'est_ib'
bulk_requests = get_requests(index_)
bulk_rank_body = {
  "requests": bulk_requests,
    "metric": {
    "precision": {
      "k": 5,
      "relevant_rating_threshold": 1,
      "ignore_unlabeled": 'false'
    }
  }
}
# call ranking evaluation API with specific index and query
model_score = es.rank_eval(index=index_, body=bulk_rank_body)
precision = model_score['metric_score']
print("IB precision: {}".format(precision))

IB precision: 0.33157894736842103


In [79]:
# MEAN RECIPROCAL RANK
index_ = 'est_ib'
bulk_requests = get_requests(index_)
bulk_rank_body = {
  "requests": bulk_requests,
    "metric": {
        "mean_reciprocal_rank": {
          "k": 20,
          "relevant_rating_threshold": 1
        }
    }
}
# call ranking evaluation API with specific index and query
mrr_score = es.rank_eval(index=index_, body=bulk_rank_body)
print("DFR mean reciprocal rank: {}".format(mrr_score['metric_score']))

DFR mean reciprocal rank: 0.5813352454219327
