In [1]:
import os, sys, pandas as pd, numpy as np, string, json, math
from random import randrange, shuffle
from elasticsearch import Elasticsearch

In [9]:
class ESConnector():

    """
    Builds an Elasticsearch client from the settings in the ``config`` module.

    Attributes:
        connector: ``Elasticsearch`` client created from
            ``config.ELASTIC_SEARCH_HOST`` and ``config.ELASTIC_SEARCH_PORT``.
        ES_INDEX_NAME: Copy of ``config.ES_INDEX_NAME``.
    """

    def __init__(self):
        # Function-scope imports: ``config`` is only read when a connector
        # is actually instantiated, not when this class is defined.
        import config
        from elasticsearch import Elasticsearch

        es_hosts = [
            {
                "host": config.ELASTIC_SEARCH_HOST,
                "port": config.ELASTIC_SEARCH_PORT,
            }
        ]

        self.connector = Elasticsearch(es_hosts)
        self.ES_INDEX_NAME = config.ES_INDEX_NAME
    
    

In [None]:
class ServeSearchResults():

    """
    Uses the input keyword provided to search, process and pass on the results.

    The Elasticsearch connector is resolved lazily: nothing is instantiated
    when this class is defined, and the connector given to the constructor is
    the one ``search_reports`` uses unless a call overrides it explicitly.
    """

    def __init__(self, connector=None):

        """
        Args:
            connector: Object exposing ``.connector`` (a client with a
                ``search(index=..., body=...)`` method) and ``.ES_INDEX_NAME``.
                When omitted, a new ``ESConnector()`` is created at call time.
        """

        # A default of ``ESConnector()`` in the signature would be evaluated
        # once, while the class body runs, and shared by every instance.
        # A ``None`` sentinel defers that work until an instance is created.
        self.connector = ESConnector() if connector is None else connector

    @staticmethod
    def _build_query(kw, field):

        """
        Builds the Elasticsearch request body for a metric-name keyword search.

        Args:
            kw: Raw keyword; leading/trailing whitespace is stripped.
            field: Base field name. The ``.custom``, ``.standard`` and ``.raw``
                sub-fields of it are queried.

        Returns:
            dict with ``size`` and a ``bool``/``should`` query made of seven
            ``query_string`` clauses.
        """

        # Strip any trailing/leading whitespace as it might give unwanted results
        kw = kw.strip()

        # Keyword wrapped in wildcards
        wildcard_term = "*" + kw + "*"

        # Keyword manually split on "_", re-joined with spaces, then wrapped in wildcards
        split_term = "*" + " ".join(kw.split("_")).strip() + "*"

        # Per the original author's notes the index has custom, standard and
        # raw sub-fields with separate analyzers (see the index mapping in the
        # config file).
        custom_fname = field + ".custom"
        std_fname = field + ".standard"
        raw_fname = field + ".raw"

        def clause(query, fname, boost, **options):
            # One ``query_string`` clause against a single sub-field.
            return {
                "query_string": {
                    "query": query,
                    "fields": [fname],
                    "boost": boost,
                    **options,
                }
            }

        # NOTE(review): Elasticsearch documents fuzziness as an edit distance
        # of 0-2 or AUTO; confirm how the target version treats 10 / 100.
        should_clauses = [
            # Custom sub-field is indexed after splitting on underscores
            # (author's note), hence a wildcard query against it.
            clause(wildcard_term, custom_fname, 1),
            # Non-wildcard search so the term is not forced to sit in between text
            clause(kw, custom_fname, 1, fuzziness=100, fuzzy_max_expansions=1000),
            # Wildcard search on the raw sub-field, boosted to prioritize exact matches
            clause(wildcard_term, raw_fname, 3),
            # Wildcard search on the standard analyzer for general query/indexed terms
            clause(wildcard_term, std_fname, 1.5, fuzziness=10),
            # Manually split tokens, because the custom analyzer was sometimes
            # not working at query time (author's note)
            clause(split_term, custom_fname, 1),
            # Prioritizing exact matches for the split form
            clause(split_term, raw_fname, 3),
            # A general search for the split form
            clause(split_term, std_fname, 1.5, fuzziness=10),
        ]

        return {
            "size": 10000,  # return up to 10000 hits in one response
            "query": {
                "bool": {
                    # Any clause may produce a match; each is weighted separately
                    "should": should_clauses
                }
            },
        }

    def search_reports(self,
                       kw="sales",
                       es_connector=None,
                       index=None,
                       field="metric_names"
                       ):

        """
        Searches for report names using metric name keyword

        Args:
            kw: Keyword to search for.
            es_connector: Client with a ``search(index=..., body=...)`` method.
                Defaults to ``self.connector.connector``.
            index: Index to search. Defaults to ``self.connector.ES_INDEX_NAME``.
            field: Base field whose ``.custom`` / ``.standard`` / ``.raw``
                sub-fields are queried.

        Returns:
            pandas.DataFrame produced by ``pd.json_normalize`` over
            ``results["hits"]["hits"]`` (one row per hit).
        """

        # Resolve defaults at call time from the injected connector instead of
        # constructing fresh ``ESConnector`` objects in the signature.
        if es_connector is None:
            es_connector = self.connector.connector

        if index is None:
            index = self.connector.ES_INDEX_NAME

        results = es_connector.search(index=index,
                                      body=self._build_query(kw, field))

        return pd.json_normalize(results["hits"]["hits"])

    def get_metric_names(self, kw):

        """
        Placeholder: intended to process search results and return only
        metric names. Not implemented yet; currently returns None.
        """

        # TODO: implement once the expected return shape is decided.
        return None



In [11]:
def _build_report_query(kw, field):

    """
    Builds the Elasticsearch request body for a metric-name keyword search.

    Args:
        kw: Keyword, already stripped of surrounding whitespace.
        field: Base field name; its ``.custom``, ``.standard`` and ``.raw``
            sub-fields are queried.

    Returns:
        dict with ``size`` and a ``bool``/``should`` query made of seven
        ``query_string`` clauses.
    """

    # Keyword wrapped in wildcards
    wildcard_term = "*" + kw + "*"

    # Keyword manually split on "_", re-joined with spaces, then wrapped in wildcards
    split_term = "*" + " ".join(kw.split("_")).strip() + "*"

    # Per the original author's notes the index has custom, standard and raw
    # sub-fields with separate analyzers (see the index mapping in the config file).
    custom_fname = field + ".custom"
    std_fname = field + ".standard"
    raw_fname = field + ".raw"

    def clause(query, fname, boost, **options):
        # One ``query_string`` clause against a single sub-field.
        return {
            "query_string": {
                "query": query,
                "fields": [fname],
                "boost": boost,
                **options,
            }
        }

    # NOTE(review): Elasticsearch documents fuzziness as an edit distance of
    # 0-2 or AUTO; confirm how the target version treats 10 / 100.
    should_clauses = [
        # Custom sub-field is indexed after splitting on underscores
        # (author's note), hence a wildcard query against it.
        clause(wildcard_term, custom_fname, 1),
        # Non-wildcard search so the term is not forced to sit in between text
        clause(kw, custom_fname, 1, fuzziness=100, fuzzy_max_expansions=1000),
        # Wildcard search on the raw sub-field, boosted to prioritize exact matches
        clause(wildcard_term, raw_fname, 3),
        # Wildcard search on the standard analyzer for general query/indexed terms
        clause(wildcard_term, std_fname, 1.5, fuzziness=10),
        # Manually split tokens, because the custom analyzer was sometimes
        # not working at query time (author's note)
        clause(split_term, custom_fname, 1),
        # Prioritizing exact matches for the split form
        clause(split_term, raw_fname, 3),
        # A general search for the split form
        clause(split_term, std_fname, 1.5, fuzziness=10),
    ]

    return {
        "size": 10000,  # return up to 10000 hits in one response
        "query": {
            "bool": {
                # Any clause may produce a match; each is weighted separately
                "should": should_clauses
            }
        },
    }


def search_reports(kw="sales",
                   es_connector_obj=None,
                   index="",
                   field="metric_names"

                  ):

    """
    Searches for report names using metric name keyword

    Args:
        kw: Keyword to search for; surrounding whitespace is stripped.
        es_connector_obj: Object exposing ``.connector`` (a client with a
            ``search(index=..., body=...)`` method) and ``.ES_INDEX_NAME``.
            When omitted, a new ``ESConnector()`` is created for this call.
        index: Index to search. An empty string means
            ``es_connector_obj.ES_INDEX_NAME``.
        field: Base field whose ``.custom`` / ``.standard`` / ``.raw``
            sub-fields are queried.

    Returns:
        pandas.DataFrame produced by ``pd.json_normalize`` over
        ``results["hits"]["hits"]`` (one row per hit).
    """

    # A default of ``ESConnector()`` in the signature would be evaluated once,
    # when this ``def`` runs, and then shared by every call. Build it lazily.
    if es_connector_obj is None:
        es_connector_obj = ESConnector()

    es_connector = es_connector_obj.connector

    if index == "":
        index = es_connector_obj.ES_INDEX_NAME

    # Strip any trailing/leading whitespace as it might give unwanted results
    kw = kw.strip()

    results = es_connector.search(index=index,
                                  body=_build_report_query(kw, field))

    return pd.json_normalize(results["hits"]["hits"])

# Look up "total calls" and list the highest-scoring hits first; ties on
# score are ordered by metric name, also descending.
(
    search_reports(kw="total calls", field="metric_names", index="")
    .sort_values(by=["_score", "_source.metric_names"], ascending=False)
)

Unnamed: 0,_index,_type,_id,_score,_source.metric_names,_source.repname,_source.colname
65,test2,_doc,119,9.231398,total calls_9,reports_synthetic_155_v1_19feb20.xlsx,adelaid_61
68,test2,_doc,156,9.231398,total calls_9,reports_synthetic_111_v1_19feb20.xlsx,qld_41
76,test2,_doc,382,9.231398,total calls_9,reports_synthetic_52_v1_19feb20.xlsx,sydney_83
79,test2,_doc,405,9.231398,total calls_9,reports_synthetic_153_v1_19feb20.xlsx,qld_41
90,test2,_doc,603,9.231398,total calls_9,reports_synthetic_38_v1_19feb20.xlsx,adelaid_61
...,...,...,...,...,...,...,...
507,test2,_doc,1378,2.000000,roaming_calls_1,reports_synthetic_138_v1_19feb20.xlsx,qld_258
575,test2,_doc,161,2.000000,roaming_calls_1,reports_synthetic_178_v1_19feb20.xlsx,auckland_11
587,test2,_doc,305,2.000000,roaming_calls_1,reports_synthetic_190_v1_19feb20.xlsx,sydney_47
534,test2,_doc,1842,2.000000,roaming_calls_0,reports_synthetic_125_v1_19feb20.xlsx,nsw_45
