In [25]:
import sys
import math
import time
import lucene
 
from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene import document
from org.apache.lucene.document import Document, Field
from org.apache.lucene.index import IndexWriter, IndexWriterConfig, IndexOptions, DirectoryReader, Term, MultiFields, Fields, FieldInfos
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.search.similarities import TFIDFSimilarity
from org.apache.lucene.search.similarities import ClassicSimilarity
from org.apache.lucene.store import FSDirectory
from org.apache.lucene.util import Version
from org.apache.lucene.queryparser.classic import QueryParser
from helper import get_docs

# INDEXER

In [2]:
# Start the embedded JVM that PyLucene runs on; Lucene classes are used from the next cell onward.
lucene.initVM()

<jcc.JCCEnv at 0x7f7cf91ddb30>

In [3]:
# Open the on-disk index directory and configure a writer that analyses text
# with StandardAnalyzer.
indexPath = File("index/").toPath()  # java.io.File -> java.nio.file.Path
indexDir = FSDirectory.open(indexPath)
writerConfig = IndexWriterConfig(StandardAnalyzer())
# Rebuild the index from scratch on every run. With the default open mode
# (CREATE_OR_APPEND) re-executing this notebook would add the same documents
# again, inflating numDocs() and every document frequency used for idf later.
writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
writer = IndexWriter(indexDir, writerConfig)

In [4]:
# Load the corpus through the project helper. Judging by the printed result in
# the next cell, this is a mapping of integer document id -> list of tokens.
docIdToText = get_docs("time/test.all")

In [5]:
# Inspect the loaded corpus (the test collection shown here holds three documents).
print(docIdToText)

defaultdict(<class 'list'>, {1: ['The', 'dog', 'barked'], 2: ['The', 'dog', 'jumped'], 3: ['A', 'cat', 'jumped']})


In [6]:
# Add one Lucene document per corpus entry, with two fields:
#   myID    - the corpus document id
#   content - the document tokens joined by single spaces
#
# Both fields share one configuration, so the FieldType is built once instead
# of once per document: stored (so the text can be read back from the index),
# tokenized, with norms, and indexed with frequencies, positions and offsets.
fieldType = document.FieldType()
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
fieldType.setStored(True)
fieldType.setTokenized(True)
fieldType.setOmitNorms(False)

for k, v in docIdToText.items():
    doc = Document()

    # The corpus ids are ints, but a text Field takes a string value, so
    # convert explicitly (a no-op if the helper ever returns string ids).
    doc.add(Field("myID", str(k), fieldType))
    doc.add(Field("content", " ".join(v), fieldType))

    writer.addDocument(doc)


In [7]:
# Close the writer so the added documents are persisted before the retriever
# section opens the index (Lucene commits pending changes on close by default).
writer.close()

# RETRIEVER

In [None]:
# Start the JVM when the retriever section is run on its own (fresh kernel,
# indexer cells skipped).
# NOTE(review): if the indexer section already ran in this kernel the JVM is
# already up - confirm that a repeated initVM() is harmless in the installed
# PyLucene version.
lucene.initVM()

In [6]:
# Open the index in the "index" directory for reading and searching.
# NOTE(review): `analyzer` is not used by the cells visible in this notebook -
# confirm whether it is still needed.
analyzer = StandardAnalyzer()
indexPath = File("index").toPath()
indexDir = FSDirectory.open(indexPath)
reader = DirectoryReader.open(indexDir)
searcher = IndexSearcher(reader)
# Use Lucene's classic TF-IDF similarity for any scoring done via the searcher.
searcher.similarity = ClassicSimilarity()
# Reader handle obtained from the searcher; later cells use it to fetch stored documents.
indexReader = searcher.getIndexReader()

In [26]:
# Sanity check: number of documents the reader sees in the index.
print(reader.numDocs())

3


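Creating a document vector
- For each document:
    - Create a vector with one entry per vocabulary term (the entry's index identifies the term)
    - For each term in the document:
        - Set the value at the term's index to (1 + log10(tf)) * idf, where idf = log10(N / df), N is the number of documents and df is the number of documents containing the term
    - Entries for vocabulary terms that do not occur in the document stay 0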

In [43]:
# Build one TF-IDF vector per document over a shared vocabulary.
#
# Names produced for later cells:
#   collectionSize - number of documents in the index
#   vocab          - list of distinct whitespace-separated tokens, first-seen order
#   docTermFreqMap - {docId: {token: raw term frequency}}
#   docVectors     - {docId: [weight for each vocab entry]},
#                    weight = (1 + log10(tf)) * log10(N / df), 0 for absent terms
#
# NOTE(review): documents are addressed as 0..numDocs()-1, which assumes the
# index contains no deleted documents - confirm for indexes that are updated.
collectionSize = reader.numDocs()

# Pass 1: raw term frequencies per document, plus the vocabulary, e.g.
# docTermFreqMap = {0: {'The': 1, 'dog': 1, 'barked': 1},
#                   1: {'The': 1, 'dog': 1, 'jumped': 1},
#                   2: {'A': 1, 'cat': 1, 'jumped': 1}}
vocabSeen = {}  # dict used as an insertion-ordered set
docTermFreqMap = {}

for i in range(collectionSize):
    termFreqs = {}  # not named `map`: that would shadow the builtin for the whole kernel
    content = indexReader.document(i).get('content')
    # A document without a stored 'content' field yields None; treat it as
    # empty instead of indexing the literal token 'None'. split() without an
    # argument also drops the empty tokens that repeated spaces would produce.
    tokens = str(content).split() if content is not None else []
    for word in tokens:
        vocabSeen[word] = True
        termFreqs[word] = termFreqs.get(word, 0) + 1
    docTermFreqMap[i] = termFreqs

vocab = list(vocabSeen)

# Pass 2: inverse document frequency, computed once per vocabulary term.
# The analyzer lower-cased the indexed terms, hence word.lower() for the lookup.
idfByWord = {}
for word in vocab:
    totalTermDocs = reader.docFreq(Term('content', word.lower()))
    # A whitespace token may have no exact counterpart in the index (e.g. it
    # carries punctuation the analyzer stripped); give it zero weight rather
    # than dividing by zero.
    if totalTermDocs > 0:
        idfByWord[word] = math.log10(collectionSize / totalTermDocs)
    else:
        idfByWord[word] = 0.0

# Pass 3: assemble the document vectors.
docVectors = {}

for i in range(collectionSize):
    termFreqs = docTermFreqMap[i]
    curVector = []

    for word in vocab:
        docTermFreq = termFreqs.get(word)
        weight = 0
        if docTermFreq is not None:
            weight = 1 + math.log10(docTermFreq)

        curVector.append(idfByWord[word] * weight)

    docVectors[i] = curVector

# Print out the results
print('Vocab: ', vocab)
for dv in docVectors.keys():
    print('DocID: ', dv, ' Vector = ', docVectors[dv])

for doc in range(collectionSize):
    print('DocID: ', doc, indexReader.document(doc).get('content'))




Vocab:  dict_keys(['The', 'dog', 'barked', 'jumped', 'A', 'cat'])
DocID:  0  Vector =  [0.17609125905568124, 0.17609125905568124, 0.47712125471966244, 0.0, 0.0, 0.0]
DocID:  1  Vector =  [0.17609125905568124, 0.17609125905568124, 0.0, 0.17609125905568124, 0.0, 0.0]
DocID:  2  Vector =  [0.0, 0.0, 0.0, 0.17609125905568124, 0.47712125471966244, 0.47712125471966244]
DocID:  0 The dog barked
DocID:  1 The dog jumped
DocID:  2 A cat jumped


In [46]:
# Exact top-K retrieval: given K and a free-text query, score every document
# by the cosine similarity between its TF-IDF vector and the query's TF-IDF
# vector, then print the K best (docId, similarity) pairs.
k = int(input("Enter K:"))
# Clamp K to the range [0, collectionSize].
k = max(0, min(k, collectionSize))
query = input("Enter Query:")
timeStart = time.time_ns()

# Tokenise the query on whitespace and lower-case it, so that whole tokens are
# counted. Counting substrings of the raw query (str.count) would, for
# instance, find the vocabulary word 'a' inside "cat".
queryTokens = query.lower().split()

# Make the query vector (same weighting as the document vectors).
queryVector = []
for word in vocab:
    freq = queryTokens.count(word.lower())
    weight = 0
    if freq != 0:
        weight = 1 + math.log10(freq)

    totalTermDocs = reader.docFreq(Term('content', word.lower()))
    # Terms absent from the index get idf 0 instead of a division by zero.
    idf = math.log10(collectionSize / totalTermDocs) if totalTermDocs > 0 else 0.0

    queryVector.append(weight * idf)

print('Query: ', query)
print('Query vector: ', queryVector)

# Now, get Top K by ranking the document vectors via cosine similarity.
# The query magnitude does not depend on the document, so compute it once.
magnitudeQ = math.sqrt(sum(q * q for q in queryVector))

cosines = []
for i in range(collectionSize):
    curDocVec = docVectors[i]
    dotProduct = sum(q * d for q, d in zip(queryVector, curDocVec))
    magnitudeD = math.sqrt(sum(d * d for d in curDocVec))

    # An all-zero query (no vocabulary term matched) or an all-zero document
    # vector has no direction; score it 0 instead of dividing by zero.
    denominator = magnitudeQ * magnitudeD
    cosSim = dotProduct / denominator if denominator else 0.0
    cosines.append((i, cosSim))

listk = sorted(cosines, key=lambda x: x[1], reverse=True)
print('Results in Order of Cosine Similarity')
for result in listk[:k]:
    print(result)

timeEnd = time.time_ns()

print('Time taken to query: ', (timeEnd - timeStart) / (10**9))


Query:  cat jumped
Query vector:  [0.0, 0.0, 0.0, 0.17609125905568124, 0.0, 0.47712125471966244]
Results in Order of Cosine Similarity
(2, 0.7293023054525128)
(1, 0.19990265386264808)
(0, 0.0)
Time taken to query:  0.000857457
