In [1]:
file_path = 'wiki_movie_plots_deduped.csv'

# ESSENTIAL IMPORTS:

In [2]:
import pandas as pd
import lucene
# Indexer imports:
from org.apache.lucene.analysis.standard import StandardAnalyzer
# Porter Stemmer :- word with > 4 letters last letter y -> i
from org.apache.lucene.analysis.en import EnglishAnalyzer 
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
#from org.apache.lucene.store import SimpleFSDirectory, FSDirectory
from org.apache.lucene.store import FSDirectory
import org.apache.lucene.document as document

In [3]:
lucene.initVM()

<jcc.JCCEnv at 0x7f196b152970>

# Different Analyzers

In [4]:
from java.io import StringReader 
from org.apache.lucene.analysis.standard import StandardAnalyzer 
from org.apache.lucene.analysis.tokenattributes import CharTermAttribute
from org.apache.lucene.analysis.en import EnglishAnalyzer
from org.apache.lucene.analysis.core import WhitespaceAnalyzer, SimpleAnalyzer, StopAnalyzer
test = "The quick brown fox jumped over the lazy dog." 
print("Raw data - ",test)

print("EnglishAnalyzer")
analyzer = EnglishAnalyzer() 
# 1st parameter - fint parameter , StringReader : convert python string -> java
stream = analyzer.tokenStream("", StringReader(test)) 

stream.reset() 
tokens = [] 

while stream.incrementToken():
    tokens.append(stream.getAttribute(CharTermAttribute.class_).toString())
print(tokens)

# On presence of whitespace stripping is done
print("WhitespaceAnalyzer") 
analyzer = WhitespaceAnalyzer() 
# 1st parameter - fint parameter , StringReader : convert python string -> java
stream = analyzer.tokenStream("", StringReader(test)) 
stream.reset() 
tokens = [] 
while stream.incrementToken():
    tokens.append(stream.getAttribute(CharTermAttribute.class_).toString())
print(tokens)

# Converting in lowercase and removing special characters
print("SimpleAnalyzer")
analyzer = SimpleAnalyzer()
# 1st parameter - fint parameter , StringReader : convert python string -> java
stream = analyzer.tokenStream("", StringReader(test)) 
stream.reset() 
tokens = [] 
while stream.incrementToken():
    tokens.append(stream.getAttribute(CharTermAttribute.class_).toString())
print(tokens)

# print("StopAnalyzer") # User will have to initialize set of stopwords as per his choice
# analyzer = StopAnalyzer() 
# stream = analyzer.tokenStream("", StringReader(test)) # 1st parameter - fint parameter , StringReader : convert python string -> java
# stream.reset() 
# tokens = [] 
# while stream.incrementToken():
#     tokens.append(stream.getAttribute(CharTermAttribute.class_).toString())
# print(tokens)

Raw data -  The quick brown fox jumped over the lazy dog.
EnglishAnalyzer
['quick', 'brown', 'fox', 'jump', 'over', 'lazi', 'dog']
WhitespaceAnalyzer
['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog.']
SimpleAnalyzer
['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']


# INDEXING THE DOCUMENT

In [5]:
from java.io import File

In [6]:
!pwd

/home/sharanya/Codes/PyLucene


In [7]:
# Convert to path variable
indexPath = File("index/").toPath() 
# Storing in hard-disk -> file directory
indexDir = FSDirectory.open(indexPath)

In [8]:
# English Analyzer -> Remove all stopwords
writerConfig = IndexWriterConfig(EnglishAnalyzer()) 

# Responsible for writing lucene document in directory
writer = IndexWriter(indexDir, writerConfig) 

def indexMovie(title, plot):
    
    # Storing individual component of data in lucene understandable format (Convert text -> lucene document with many fields)
    doc = document.Document() 

    # Adding Fields
    # .add() -> 3 parameters -> name of the field (string) , actual thing to store , whether we want to store the data (Playing)
    # Telling interpreter to store the content in the variable in a field named as title and content will be stored
    doc.add(document.Field("TITLE", title, document.TextField.TYPE_STORED)) 
    doc.add(document.Field("PLOT", plot, document.TextField.TYPE_STORED))
    
    writer.addDocument(doc)

def closeWriter():
    writer.close()

def makeIndex(file_path):
    df = pd.read_csv(file_path)
    
    docid = 0
    for i in df.index:
        print(docid, "-", df['Title'][i])
        indexMovie(df['Title'][i], df['Plot'][i])
        docid += 1

makeIndex(file_path)
closeWriter()

0 - Kansas Saloon Smashers
1 - Love by the Light of the Moon
2 - The Martyred Presidents
3 - Terrible Teddy, the Grizzly King
4 - Jack and the Beanstalk
5 - Alice in Wonderland
6 - The Great Train Robbery
7 - The Suburbanite
8 - The Little Train Robbery
9 - The Night Before Christmas
10 - Dream of a Rarebit Fiend
11 - From Leadville to Aspen: A Hold-Up in the Rockies
12 - Kathleen Mavourneen
13 - Daniel Boone
14 - How Brown Saw the Baseball Game
15 - Laughing Gas
16 - The Adventures of Dollie
17 - The Black Viper
18 - A Calamitous Elopement
19 - The Call of the Wild
20 - A Christmas Carol
21 - The Fight for Freedom
22 - At the Altar
23 - A Drunkard's Reformation
24 - The Golden Louis
25 - The Lure of the Gown
26 - An Arcadian Maid
27 - A Christmas Carol
28 - Frankenstein
29 - Hemlock Hoax, the Detective
30 - The House with Closed Shutters
31 - A Lad from Old Ireland
32 - Pocahontas
33 - Ramona
34 - What the Daisy Said
35 - The Wonderful Wizard of Oz
36 - Baseball and Bloomers
37 - The 

# VIEWING INDEX FILES -
- To view the index file cant done by normal text editor -> binary file
- Install Lucene-9.7.0.tgz
- go to bin -> ./luke.sh
- open index folder
- view documents -> texts are not casefolded neither stopwords are removed (why ?)
- when retrieval is happening it is essentially being done by processed text but when document is presented to user -> raw document

# LUCENE FOR SEARCHING
- input : A query (string) <- same analyser that is used during indexing; index_path; search_field - title/plot ; top_size - how many top ranked documents to be retrieved
- output : list of documents 

# IMPORTING MODULE

In [9]:
# Retriever imports:
from org.apache.lucene.analysis.en import EnglishAnalyzer
#from org.apache.lucene.store import SimpleFSDirectory, FSDirectory
from org.apache.lucene.store import FSDirectory

from org.apache.lucene.index import IndexReader
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search.similarities import BM25Similarity

from java.io import File

In [10]:
indexPath = File("index/").toPath()

In [11]:
def search(index_path, q):

    print ("Searching for:", q)

    # Using same analyser that is used to create the index
    analyzer = EnglishAnalyzer() 
    
    # To open index file
    directory = FSDirectory.open(index_path)  

    # Initializing index-searcher -> Ready to search
    searcher = IndexSearcher(DirectoryReader.open(directory)) 
    
    # BM25Similarity -> 2 parameters -> Controlling document length smoothing (raw/smooth document)
    # Set the similarity function -> BM25Similarity implemented inside lucine
    searcher.setSimilarity(BM25Similarity(1.8, 0.3)) 

    # QueryParser -> 2 parameters -> field to be searched and analyser to analyse the query
    query = QueryParser("PLOT", analyzer).parse(q) # Query Object
    # query = QueryParser("TITLE", analyzer).parse(q)

    # Query is the parsed query returned from QueryParser ; 2nd parameter -> top document
    scoreDocs = searcher.search(query, 10).scoreDocs 
    print ("%s total matching documents" % (len(scoreDocs)))

    # Accessing individual documents & their corresponding field
    for scoreDoc in scoreDocs:
        
        # Returns content of document as lucene document type
        # scoreDoc.doc -> internal lucene docid set during indexing e.g. for 1000 documents -> 0 to 999
        doc = searcher.doc(scoreDoc.doc) 
        # print (doc.get("TITLE"))

        print("Title: ")
        # doc.get -> information stored in individual field
        print (doc.get("TITLE")) 
        print ("-------------------------------------------------")
        print ("########### PLOT ###############")
        print (doc.get("PLOT"))
        print(scoreDoc.score)
        # doc.get("PLOT"))


#search(indexPath, "murder")
search(indexPath, "crime in prison")
#search(indexPath, "mute")

#search(indexPath, "CRIME AND MURDER")

Searching for: crime in prison
10 total matching documents
Title: 
Dark Alibi
-------------------------------------------------
########### PLOT ###############
Thomas Harley, an ex-convict who served time in prison eight years ago, is wrongfully arrested for a bank robbery he didn't commit. The police have found fingerprints on the crime scene, incriminating Harley, even though he was present at the Carey Theatrical Warehouse at the time of the crime.
The policemen do not believe Harley's explanation, partly because he claims to have been called to the warehouse by a note from an old cell mate by the name of Dave Wyatt, a man that has been dead for eight years. Subsequently, Harley is sentenced to death for the robbery. He goes off to prison to wait for his execution.
Harley's daughter June asks private investigator Charlie Chan for help to prove her father's innocence. Hearing about the suspicious circumstances, Chan immediately agrees to take the case.
With only 9 days before Harley