In [1]:
import sys
import math
import lucene
 
from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene import document
from org.apache.lucene.document import Document, Field
from org.apache.lucene.index import IndexWriter, IndexWriterConfig, IndexOptions, DirectoryReader, Term, MultiFields, Fields, FieldInfos
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.search.similarities import TFIDFSimilarity
from org.apache.lucene.search.similarities import ClassicSimilarity
from org.apache.lucene.store import FSDirectory
from org.apache.lucene.util import Version
from org.apache.lucene.queryparser.classic import QueryParser
from helper import get_docs

# INDEXER

In [2]:
lucene.initVM()

<jcc.JCCEnv at 0x7f7cf91ddb30>

In [3]:
indexPath = File("index/").toPath() #from java. io import File
indexDir = FSDirectory.open(indexPath)
writerConfig = IndexWriterConfig(StandardAnalyzer())
writer = IndexWriter(indexDir, writerConfig)

In [4]:
docIdToText = get_docs("time/test.all")

In [5]:
print(docIdToText)

defaultdict(<class 'list'>, {1: ['The', 'dog', 'barked'], 2: ['The', 'dog', 'jumped'], 3: ['A', 'cat', 'jumped']})


In [6]:
for k, v in docIdToText.items():
    doc = Document()
    fieldType = document.FieldType()
    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
    fieldType.setStored(True)
    fieldType.setTokenized(True)
    fieldType.setOmitNorms(False)

    doc.add(Field("myID", k, fieldType))
    doc.add(Field("content", " ".join(v), fieldType))

    writer.addDocument(doc)


In [7]:
writer.close()

# RETRIEVER

In [11]:
lucene.initVM()

ValueError: JVM is already running and updating its classpath failed. Call initVM() instead just once but with a classpath keyword argument set to the module.CLASSPATH strings of all the JCC extension modules to be imported by this process

In [6]:
analyzer = StandardAnalyzer()
indexPath = File("index").toPath()
indexDir = FSDirectory.open(indexPath)
reader = DirectoryReader.open(indexDir)
searcher = IndexSearcher(reader)
searcher.similarity = ClassicSimilarity()
indexReader = searcher.getIndexReader()

In [24]:
t = Term("content", "dog")
df = indexReader.docFreq(t)
df

2

In [26]:
print(reader.numDocs())

3


In [35]:
d1 = indexReader.document(1)

In [40]:
print(d1)
print(d1.fields)

Document<stored<myID:2> stored,indexed,tokenized,indexOptions=DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS<content:The dog jumped>>
[stored<myID:2>, stored,indexed,tokenized,indexOptions=DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS<content:The dog jumped>]


In [49]:
fields = Fields.getFields(indexReader)


AttributeError: type object 'Fields' has no attribute 'getFields'

In [19]:
print(indexReader.maxDoc())

3


In [34]:
print(ClassicSimilarity.idfExplain())

TypeError: unbound method ClassicSimilarity.idfExplain() needs an argument

In [8]:
query = QueryParser("content", analyzer).parse("a cat jumped")

In [9]:
MAX = 1000
hits = searcher.search(query, MAX)

In [10]:
for hit in hits.scoreDocs:
    # print(hit)
    print(hit.score, hit.doc, hit.toString())
    doc = searcher.doc(hit.doc)
        

2.698521375656128 2 doc=2 score=2.6985214 shardIndex=-1
0.7434435486793518 1 doc=1 score=0.74344355 shardIndex=-1


Creating a document vector
- For each document
- Create a vector where each index is associated with a vocabulary term
    - For each term in the document
        - Set the value at its index in the vector to tf * idf 

In [17]:
collectionSize = reader.numDocs()

# Get docTermFreqMap = {0: {'The': 1, 'dog': 1, 'barked': 1}, 1: {'The': 1, 'dog': 2, 'jumped': 1}, 2: {'A': 1, 'cat': 1, 'jumped': 1}}
# and vocab
vocab = {}
docTermFreqMap = {}

for i in range (collectionSize):
    map = {}
    curDoc = str(indexReader.document(i).get('content')).split(' ')
    # Get TF of individual docs
    for word in curDoc:
        vocab[word] = True
        if word in map:
            map[word] = map.get(word) + 1
        else:
            map[word] = 1
    docTermFreqMap[i] = map

vocab = vocab.keys()

#Get Doc Vectors
docVectors = {} 

for i in range (collectionSize):
    curVector = []

    for word in vocab:
        docTermFreq = docTermFreqMap.get(i).get(word)
        weight = 0
        if docTermFreq is not None:
            weight = 1 + math.log10(docTermFreq)

        term = Term('content', word.lower())
        totalTermDocs = reader.docFreq(term)
        idf = math.log10(collectionSize/totalTermDocs)

        curVector.append(idf*weight)

    docVectors[i] = curVector

# Print out the results
print(vocab)
for dv in docVectors.keys():
    print(docVectors[dv])

for doc in range (collectionSize):
    print(indexReader.document(doc).get('content'))


# Now, To do Exact Top K, given K and a freetext Query
k = 2
query = 'A cat barked barked barked'

# Make the query vector
queryVector = []
for word in vocab:
    freq = query.count(word)
    weight = 0
    if freq != 0:
        weight = 1 + math.log10(freq)

    term = Term('content', word.lower())
    totalTermDocs = reader.docFreq(term)
    idf = math.log10(collectionSize/totalTermDocs)

    queryVector.append(weight*idf)

print(query)
print(queryVector)

# Now, Get Top K by ranking the results of document vectors via cosine sim

cosines = []
for i in range(collectionSize):
    curDocVec = docVectors[i]
    dotProduct = 0
    magnitudeQ = 0
    magnitudeD = 0
    for j in range(len(vocab)):
        dotProduct += (queryVector[j] * curDocVec[j])
        magnitudeQ += (queryVector[j] * queryVector[j])
        magnitudeD += (curDocVec[j] * curDocVec[j])
    
    cosSim = dotProduct / (math.sqrt(magnitudeQ) * math.sqrt(magnitudeD))
    cosines.append((i, cosSim))

listk = sorted(cosines, key = lambda x:x[1], reverse=True)
print(listk[:2])



dict_keys(['The', 'dog', 'barked', 'jumped', 'A', 'cat'])
[0.17609125905568124, 0.17609125905568124, 0.47712125471966244, 0.0, 0.0, 0.0]
[0.17609125905568124, 0.17609125905568124, 0.0, 0.17609125905568124, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.17609125905568124, 0.47712125471966244, 0.47712125471966244]
The dog barked
The dog jumped
A cat jumped
A cat barked barked barked
[0.0, 0.0, 0.7047659464249274, 0.0, 0.47712125471966244, 0.47712125471966244]
[(2, 0.6691470647505655), (0, 0.6403446349278143)]


In [64]:
vocab = set()

In [69]:
for doc in range (indexReader.numDocs()):
    curDoc = indexReader.document(doc)
    curDoc = str(curDoc)
    print(curDoc)
    

Document<stored<myID:1> stored,indexed,tokenized,indexOptions=DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS<content:The dog barked>>
Document<stored<myID:2> stored,indexed,tokenized,indexOptions=DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS<content:The dog jumped>>
Document<stored<myID:3> stored,indexed,tokenized,indexOptions=DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS<content:A cat jumped>>


AttributeError: 'int' object has no attribute 'fields'