In [10]:
import re
from elasticsearch import Elasticsearch
from datetime import datetime

In [11]:
ELASTIC_USER = "elastic"
ELASTIC_PWD = "changeme"
ELASTIC_INDEX = "ap_dataset"
ELASTIC_TYPE = "document"

from elasticsearch import Elasticsearch, RequestsHttpConnection
es = Elasticsearch(connection_class=RequestsHttpConnection,
                  http_auth=(ELASTIC_USER, ELASTIC_PWD))

In [12]:
def get_avg_dlen(es):
    query_aggs = """
    {
      "query": {
        "match_all": {}
      },
      "aggs": {
        "doc_lengths": {
          "stats": {
            "script": {
              "lang": "groovy",
              "inline": "_doc['doc_length']"
            }
          }
        }
      }
    }
    """
    return es.search(index=ELASTIC_INDEX, doc_type=ELASTIC_TYPE, body=query_aggs)["aggregations"]['doc_lengths']['avg']

In [13]:
class Term:
    def __init__(self, term, tf, df):
        self.term = term
        self.tf = tf
        self.df = df

class Document:
    def __init__(self, id, dlen):
        self.id = id
        self.dlen = dlen
        self.terms = []
        self.queries = {}
    
    def add_words(self, doc_words):
        word_lst = []
        for word in doc_words:
            word_lst.append(Term(word['term'], word['tf'], word['df']))
        self.terms = word_lst
        self.words = Set(map(lambda w: w.term, self.terms))
        
    def filter_terms_query(self, q_id, query):
        self.queries[q_id] = self.words.intersection(Set(query))
        
class Query:
    def __init__(self, doc_id, score):
        self.doc_id = doc_id
        self.score = score

In [14]:
class Scoring:
    def __init__(self, list_docs):
        self.docs = list_docs
        
    def logTF(self, tf):
        return Math.log(tf)

    def okapiTF(self, tf, dlen, avgdlen):
        return tf / (tf + 0.5 + 1.5 * (dlen/avgdlen))

    def tf_doc_query(self, doc, query, num_docs, tfidf=False):
        sumtf = 0.0
        #tokenize query
        for word in query:
            tf = getTF(doc, word)
            temp = okapiTF(tf, dlen, avgdlen)
            if tfidf:
                #TODO: Get df of words
                temp * log(num_docs/df)
            sumtf += temp
        return sumtf

    def tfidf(self, doc, query):
        return tf_doc_query(doc, query, True)

    def bm25(self, doc, query, k1, k2, b):
        sumtf =0.0
        for word in query:
            sumtf += log(num_docs+0.5/df+0.5) * (tf + k1*tf)/(tf+k1((1-b) + b * dlen/avgdlen)) * (tfq + k2*tfq)/(tfq + k2)

    def ulm_laplace(self):
        sumScore = 0.0
        for word in query:
            sumScore += Math.log((tf +1)/(dlen+vocab_size))
        return sumScore

    def ulm_mercer(self):
        pass

In [15]:
#### def getTF():
avgdlen = get_avg_dlen(es)
avgdlen

282.38710172654055

In [16]:
def get_tf(doc, term, withdf=False):
    tv_doc = es.termvectors(index=ELASTIC_INDEX, doc_type=ELASTIC_TYPE, id=doc, positions=False, payloads=False, offsets=False, term_statistics=True)['term_vectors']['text']['terms'][term]
    if withdf:
        return {"tf": tv_doc['term_freq'], "df":tv_doc['doc_freq']}
    else:
        return tv_doc['term_freq']

In [17]:
get_tf("AP890703-0285", "weather", True)

{'df': 3596, 'tf': 28}

In [18]:
num_docs = es.termvectors(index=ELASTIC_INDEX, doc_type=ELASTIC_TYPE, id="AP890703-0285")['term_vectors']['text']['field_statistics']['doc_count']
num_docs
# TODO: Use the num_docs to page records and load it into the Document class
# TODO: Use the multi termvector API
# TODO: Get the vocab size


84678

In [19]:
def get_df_temp(es, term):
    df_search_qry = {
      "script_fields": {
        "df": {
          "script": {
          "lang": "groovy",
          "inline": "_index['text']['01'].df()"
                }
            }
          }
    }
    return es.search(index=ELASTIC_INDEX, doc_type=ELASTIC_TYPE,body=df_search_qry)

def get_df(es, term):
    return es.count(index=ELASTIC_INDEX, doc_type=ELASTIC_TYPE, body={"query": {"match": {"text": term}}})['count']
    
get_df(es, "offici")

31555

In [30]:
DELETE_LIST = ["document","discuss", "report", "type", "identifi", "predict", "cite", "describ", "determin", "sole", "basic", "side", "produc", "instanc", 
               "base", "second", "take", "anticipate", "make"]
#TODO: Remove tokens with high TF
def get_tokens(es, query):
    data = es.indices.analyze(index=ELASTIC_INDEX, analyzer="my_english", body={"text": query})["tokens"]
    return list(filter(lambda x: x not in DELETE_LIST, map(lambda x: x["token"], data)))

In [34]:
get_tokens(es, """Document must describe or identify supporters of the National Rifle Association (NRA), or its assets.""")

['support', 'nation', 'rifl', 'associ', 'nra', 'asset']

In [39]:
def search_term(es, term):
    search_qry = {
      "query": {
        "match": {
          "text": term
          }
      }
    }
    res = es.search(index=ELASTIC_INDEX, doc_type=ELASTIC_TYPE, _source=["_id", "doc_length"], size=get_df(es, term), body=search_qry)
    return (list(map(lambda x: {"id": x['_id'], "dlen": x['_source']['doc_length']}, res['hits']['hits'])))

len(search_term(es, "government"))

GET http://localhost:9200/ap_dataset/document/_search?_source=_id%2Cdoc_length&size=29253 [status:500 request:0.035s]


TransportError: TransportError(500, 'search_phase_execution_exception', 'Result window is too large, from + size must be less than or equal to: [10000] but was [29253]. See the scroll api for a more efficient way to request large data sets. This limit can be set by changing the [index.max_result_window] index level setting.')

In [6]:
mtermvectors_ids = ["AP890101-0009", "AP890601-0343"]
def get_tf_docs(es, term, doc_ids):
    res = es.mtermvectors(index=ELASTIC_INDEX, doc_type=ELASTIC_TYPE, offsets="false", payloads="false", positions="false", term_statistics="true", field_statistics="true", ids=doc_ids)['docs']
    return list(map(lambda x: {"id": x['_id'], "term": term, "tf": x['term_vectors']['text']['terms'][term]['term_freq'], "df": x['term_vectors']['text']['terms'][term]['doc_freq']}, res))

get_tf_docs(es, '01', mtermvectors_ids)

[{'df': 10438, 'id': 'AP890101-0009', 'term': '01', 'tf': 4},
 {'df': 10438, 'id': 'AP890601-0343', 'term': '01', 'tf': 2}]

In [None]:
doc_list = {}
token = get_tokens(es, "Document will discuss allegations, or measures being taken against, corrupt public officials of any governmental jurisdiction worldwide.")[0]
for doc in search_term(es, token):
    if (doc['id'] in list(doc_list.keys())):
        doc_list[doc['id']]['terms']
    else:
        doc_list[doc['id']] = {}
        doc_list[doc['id']]['dlen'] = doc['dlen']
        for doc_id in list(doc_list.keys()):
            doc_list[doc['id']]['terms'] = []
            doc_list[doc['id']]['terms'].append(get_tf(doc_id, token, True))

6359

In [39]:
es.field_stats(index=ELASTIC_INDEX, fields=['doc_length'])['indices']['_all']['fields']['doc_length']['doc_count']

84678

In [3]:
res = es.get(index=ELASTIC_INDEX, doc_type=ELASTIC_TYPE, id="AP890310-0090")
print(res)

# res = es.search(index="test-index", body={"query": {"match_all": {}}})
# print("Got %d Hits:" % res['hits']['total'])
# for hit in res['hits']['hits']:
#     print("%(timestamp)s: %(body)s" % hit["_source"])

{'_id': 'AP890310-0090', '_type': 'document', '_version': 1, 'found': True, '_source': {'text': "AP890310-0090 \nAP-NR-03-10-89 0957EST\nr a PM-ComaWoman     03-10 0374\nPM-Coma Woman,0385\nComatose Woman Transferred To New Jersey Rehabilitation Center\nBy MELANIE MADER\nAssociated Press Writer\nEDISON, N.J. (AP) \n\n   A comatose New York woman who received an\nabortion after an emotional legal battle started an evaluation and\ntherapy program today at a rehabilitation center in this central New\nJersey community.\n   ``We're committed to long-term care. ... We're talking months,\nnot weeks,'' said Dr. Keith D. Cicerone, clinical director of trauma\nrehabilitation at John F. Kennedy Medical Center's Robert Wood\nJohnson Jr. Rehabilitation Institute.\n   Doctors will evaluate Nancy Klein and make an initial prognosis\non her condition in four to six weeks, Cicerone said. He declined to\ngive her condition or specific details of treatment, citing a\nrequest by the family.\n   Mrs. Klein

In [5]:
tv_doc = es.termvectors(index=ELASTIC_INDEX, doc_type=ELASTIC_TYPE, id="AP890310-0090")
tv_doc

{'_id': 'AP890310-0090',
 '_index': 'ap_dataset',
 '_type': 'document',
 '_version': 1,
 'found': True,
 'term_vectors': {'text': {'field_statistics': {'doc_count': 30020,
    'sum_doc_freq': 5524322,
    'sum_ttf': 8434096},
   'terms': {'0090': {'term_freq': 1,
     'tokens': [{'end_offset': 13, 'position': 1, 'start_offset': 9}]},
    '03': {'term_freq': 2,
     'tokens': [{'end_offset': 23, 'position': 4, 'start_offset': 21},
      {'end_offset': 61, 'position': 12, 'start_offset': 59}]},
    '0374': {'term_freq': 1,
     'tokens': [{'end_offset': 69, 'position': 14, 'start_offset': 65}]},
    '0385': {'term_freq': 1,
     'tokens': [{'end_offset': 88, 'position': 18, 'start_offset': 84}]},
    '0957est': {'term_freq': 1,
     'tokens': [{'end_offset': 37, 'position': 7, 'start_offset': 30}]},
    '10': {'term_freq': 2,
     'tokens': [{'end_offset': 26, 'position': 5, 'start_offset': 24},
      {'end_offset': 64, 'position': 13, 'start_offset': 62}]},
    '11': {'term_freq': 1,
  