# Notes: this Java notebook requires Ganymede 
* Ganymede (Java kernel for Jupyter): [Installation and documentation](https://github.com/allen-ball/ganymede)
* We need additional libraries for lucene

In [2]:
%%pom
dependencies:
- org.apache.lucene:lucene-core:9.7.0
- org.apache.lucene:lucene-analysis-common:9.7.0
- org.apache.lucene:lucene-queryparser:9.7.0

#### Common imports (java)

In [3]:
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.StringReader;
import java.nio.file.Files;
import java.nio.file.Paths;

import java.util.Arrays;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.List;

#### Common imports (lucene)

In [4]:
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.FilteringTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
import org.apache.lucene.analysis.en.KStemFilter;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.FloatField;
import org.apache.lucene.document.FloatPoint;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogDocMergePolicy;
import org.apache.lucene.index.MergePolicy;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.NoMergePolicy;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.StoredFields;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.TermVectors;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.util.BytesRef;

## Define paths to documents and index

In [6]:
var fileImdbDataset = "datasets/imdb_top_1000.csv";
var pathIndex = "lucene/index";

## Let's read in the data collection

In [7]:
List<Map<String, String>> readCollection(String name) throws IOException {
    List<Map<String, String>> docs = new ArrayList<Map<String, String>>();
    String splitter = ",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)";
    BufferedReader reader = new BufferedReader(new FileReader(name));
    String line, keys[] = reader.readLine().split(splitter);

    while ((line = reader.readLine()) != null) {
        String[] values = line.split(splitter);
        Map<String, String> dataMap = new HashMap<>();

        for (int i = 0; i < keys.length; i++) {
            // dataMap.put(keys[i], values[i]);
            switch (keys[i]) {
                case "Series_Title":
                    dataMap.put("title", values[i]);
                    break;
                case "Released_Year":
                    dataMap.put("year", values[i]);
                    break;
                case "Runtime":
                    dataMap.put("runtime", values[i].replace(" min", ""));
                    break;
                case "Genre":
                    dataMap.put("genre", values[i].replace(",", ""));
                    break;
                case "IMDB_Rating":
                    dataMap.put("rating", values[i]);
                    break;
                case "Overview":
                    dataMap.put("summary", values[i].replace("\"", ""));
                    break;
                case "Star1":
                    dataMap.put("actors", values[i]);
                    break;
                case "Star2":
                case "Star3":
                case "Star4":
                    dataMap.put("actors", dataMap.get("actors") + " " + values[i]);
                    break;
            }
        }
        docs.add(dataMap);
    }
    reader.close();

    // print summary
    System.out.println("Read " + docs.size() + " documents from " + name);
    return docs;
}

var collection = readCollection(fileImdbDataset);
System.out.println("\nfirst document:");
collection.get(42).forEach((key, value) -> System.out.println(String.format("%10s: %s", key, value)));

Read 1000 documents from datasets/imdb_top_1000.csv

first document:
   summary: Mathilda, a 12-year-old girl, is reluctantly taken in by Léon, a professional assassin, after her family is murdered. An unusual relationship forms as she becomes his protégée and learns the assassin's trade.
    actors: Jean Reno Gary Oldman Natalie Portman Danny Aiello
      year: 1994
     genre: "Action Crime Drama"
    rating: 8.5
   runtime: 110
     title: Leon


## Analyzer of Lucene

In [8]:
void print_tokens(Analyzer analyzer, String text) throws IOException {
    TokenStream ts = analyzer.tokenStream("text", new StringReader(text));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

    for(ts.reset(); ts.incrementToken();) 
        System.out.print(termAtt.toString() + ", ");
    ts.end();
    System.out.println();
}

class EnglishASCIIFoldedAnalyzer extends Analyzer {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new StandardTokenizer();
        TokenStream tokenStream = new LowerCaseFilter(tokenizer);
        tokenStream = new ASCIIFoldingFilter(tokenStream);
        return new TokenStreamComponents(tokenizer, tokenStream);
    }
}

class MyAnalyzer extends Analyzer {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
          final Tokenizer source = new StandardTokenizer();
          TokenStream tokenStream = new EnglishPossessiveFilter(source);
          // tokenStream = new LowerCaseFilter(tokenStream);
          tokenStream = new FilteringTokenFilter(tokenStream) {
              private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
              @Override
              protected boolean accept() throws IOException {
                  return termAtt.length() > 3;
              }
          };
          tokenStream = new KStemFilter(tokenStream);
          return new TokenStreamComponents(source, tokenStream);
    }
}

var text = "I think text's values' color goes here; WHAT happens with Léon? do we see IT again; I went there to be gone with houses";
var stopWords = new CharArraySet(Arrays.asList("i", "do"), false);

System.out.println("             text: "+ text);
System.out.println();

// standard analyzer
System.out.print("         standard: ");
print_tokens(new StandardAnalyzer(), text);

// english analyzer (with porter stemmer)
System.out.print("          english: ");
print_tokens(new EnglishAnalyzer(), text);

// english analyzer (with porter stemmer) and new set of stopwords
System.out.print("english/stopwords: ");
print_tokens(new EnglishAnalyzer(stopWords), text);

// english analyzer with ascii folding
System.out.print("english/folding: ");
print_tokens(new EnglishASCIIFoldedAnalyzer(), text);

// a custom analyzer, no lower case and kstemmer
System.out.print("      my analyzer: ");
print_tokens(new MyAnalyzer(), text);

// print standard stop word list
System.out.println("\nenglish stopword list:");
System.out.println(EnglishAnalyzer.getDefaultStopSet());

             text: I think text's values' color goes here; WHAT happens with L�on? do we see IT again; I went there to be gone with houses

         standard: i, think, text's, values, color, goes, here, what, happens, with, l�on, do, we, see, it, again, i, went, there, to, be, gone, with, houses, 
          english: i, think, text, valu, color, goe, here, what, happen, l�on, do, we, see, again, i, went, gone, hous, 
english/stopwords: think, text, valu, color, goe, here, what, happen, with, l�on, we, see, it, again, went, there, to, be, gone, with, hous, 
english/folding: i, think, text's, values, color, goes, here, what, happens, with, leon, do, we, see, it, again, i, went, there, to, be, gone, with, houses, 
      my analyzer: think, text, value, color, go, here, WHAT, happen, with, L�on, again, went, there, gone, with, house, 

english stopword list:
[but, be, with, such, then, for, no, will, not, are, and, their, if, this, on, into, a, or, there, in, that, they, was, is, it, an, t

## Building an index with Lucene

### Providing default analyzer, diretory, and index writer/searcher
We pick the English Analyzer. Make sure that you always use the *same* analyzer for indexing and searching. Lucene is not checking for this and your search performance can suffer. The directory is on the file system. Using standard configurations.

In [9]:
Analyzer getAnalyzer() {
    return new EnglishAnalyzer();
}

Directory getDirectory() throws IOException {
    return FSDirectory.open(Paths.get(pathIndex));
}

IndexWriter getIndexWriter() throws IOException {
    Directory directory = getDirectory();
    IndexWriterConfig config = new IndexWriterConfig(getAnalyzer());
    return new IndexWriter(directory, config);
}

IndexWriter getIndexWriter(boolean mergePolicy) throws IOException {
    Directory directory = getDirectory();
    IndexWriterConfig config = new IndexWriterConfig(getAnalyzer());
    MergePolicy policy = mergePolicy ? new LogDocMergePolicy() : NoMergePolicy.INSTANCE;
    if (mergePolicy) policy.setNoCFSRatio(1.0);
    config.setMergePolicy(policy);
    return new IndexWriter(directory, config);
}

### We delete the index first, to load all documents into a fresh index

In [13]:
void deleteIndex() throws IOException {
    IndexWriter writer = getIndexWriter();
    writer.deleteAll();
    writer.commit();
    writer.close();
}

deleteIndex();

### Lucene accepts documents with fields
`Field.Store.YES` stores the value of the field in the index. That mean when we print results, we have these attributes available form the Lucene index. On the other hand, `Field.Store.NO` does not dtore the values in the index. In our example below, we have to retrieve actors, genre, and summary from the original data file as they are not stored in the Lucene index. 

The field type decides whether its value is tokenized and available for full-text search:
* `TextField`: Reader or String indexed for full-text search
* `StringField`: String indexed verbatim as a single token
* `IntField`: int indexed for exact/range queries. 
* `IntPoint`: faster int indexed for exact/range queries. If you need to store the value, also use a `StoredField` 
* `FloatField`: float indexed for exact/range queries.
* `FloatPoint`: faster float indexed for exact/range queries. If you need to store the value, also use a `StoredField` 
* `StoredField`: Stored-only value for retrieving in summary results

In summary, a field can be 'stored' / 'not stored' in the index, and the contents of a field can be used for full-text search, exact/range queries, or not at all.

In [14]:
Document createDocument(Map<String, String> data) {
    Document doc = new Document();

    // we store everything we need for result presentation
    doc.add(new TextField("title", data.get("title"), Store.YES));
    doc.add(new TextField("year", data.get("year"), Store.YES));
//    doc.add(new IntField("year", Integer.parseInt(data.get("year")), Store.YES));
    doc.add(new IntField("runtime", Integer.parseInt(data.get("runtime")), Store.YES));
    doc.add(new FloatField("rating", Float.parseFloat(data.get("rating")), Store.YES));

    // we do not store these fields and can't print them in the results
    doc.add(new TextField("actors", data.get("actors"), Store.NO));
    doc.add(new TextField("genre", data.get("genre"), Store.NO));

    // lastly, we use a custom field to show the term vectors in summary
    FieldType ftSummary = new FieldType();
    ftSummary.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    ftSummary.setStoreTermVectors(true);
    ftSummary.setStoreTermVectorPositions(true);
    ftSummary.setTokenized(true);
    ftSummary.setStored(true);
    doc.add(new Field("summary", data.get("summary"), ftSummary));

    return doc;
}

### Load the index in batches to observe how segment creation works
Change the `batchSize` argument in the call to `loadImdbData` and observe the index folder at `./lucene/index`. Every time we create a new `IndexWriter`, Lucene creates a new segment and all documents added with the same index writer are kept in the same segments. 

Note: we deliberately turn off the merge policy to demonstrate sgements. Do not do in production!

In [None]:
void loadBatch(List<Map<String, String>> docs) throws IOException {
    IndexWriter writer = null;

    try {
        writer = getIndexWriter(false);
        for (Map<String, String> doc : docs)
            writer.addDocument(createDocument(doc));
    } finally {
        if (writer!=null) writer.close();
    }
}

void loadImdbData(int batchSize) throws IOException {
    List<Map<String, String>> collection = readCollection(fileImdbDataset);

    deleteIndex();
    // load collection in batches to show how segments work
    for (int i = 0; i < collection.size(); i += batchSize)
        loadBatch(collection.subList(i, Math.min(i + batchSize, collection.size())));
}

loadImdbData(100);

Read 1000 documents from datasets/imdb_top_1000.csv


### Merging segments
We can force Lucene to merge segments (normally, this is done less costly by the MergePolicy). Above we created 10 segments and you can observe 10 groups (compounded file format) of files in the index folder. Next, we turn on merge policy and force Lucene to produce a compact index with one segment only.

In [17]:
var writer = getIndexWriter(true);
writer.forceMerge(1);
writer.close();

### Inspecting the index

##### Documents in the index

In [18]:
void printDocuments(int num) throws IOException {
    IndexReader reader = DirectoryReader.open(getDirectory());
    StoredFields storedFields = reader.storedFields();
    for(int i = 0; i < num; i++) {
        Document doc = storedFields.document(i);
        System.out.println("Doc " + i + ": " + doc.toString());
    }
}

printDocuments(10);

Doc 0: Document<stored,indexed,tokenized<title:The Shawshank Redemption> stored,indexed,tokenized<year:1994> stored<runtime:142> stored<rating:9.3> stored,indexed,tokenized,termVector<summary:Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.>>
Doc 1: Document<stored,indexed,tokenized<title:The Godfather> stored,indexed,tokenized<year:1972> stored<runtime:175> stored<rating:9.2> stored,indexed,tokenized,termVector<summary:An organized crime dynasty's aging patriarch transfers control of his clandestine empire to his reluctant son.>>
Doc 2: Document<stored,indexed,tokenized<title:The Dark Knight> stored,indexed,tokenized<year:2008> stored<runtime:152> stored<rating:9.0> stored,indexed,tokenized,termVector<summary:When the menace known as the Joker wreaks havoc and chaos on the people of Gotham, Batman must accept one of the greatest psychological and physical tests of his ability to fight injustice.>>
Doc 3: Document<st

##### Terms used in a document
This next code is only possible if we store the term vectors (aka document vectors) in the index. We have created a custom field for summary to demonstrate this.

In [19]:
void printDocumentVector(int docNo, String field) throws IOException {
    IndexReader reader = DirectoryReader.open(getDirectory());
    TermVectors vectors = reader.termVectors();
    Terms terms = vectors.get(docNo, field);
        
    // iterate through terms
    System.out.println("index of doc " + docNo + ": \nterm             tf   postings");
    TermsEnum termsEnum = terms.iterator();
    PostingsEnum positions = null;
    BytesRef term = null;
    while((term = termsEnum.next()) != null) {
        String termstr = term.utf8ToString(); // Get the text string of the term.
        long freq = termsEnum.totalTermFreq(); // Get the frequency of the term in the document.
        
        System.out.printf("%-12s %6d  ", termstr, freq );
        positions = termsEnum.postings(positions, PostingsEnum.POSITIONS );
        positions.nextDoc(); // you still need to move the cursor
        for(int i = 0; i < freq; i++ )
            System.out.print(" " + positions.nextPosition());
        System.out.println();
    }
    reader.close();
}

// print document (to see what happened to it)
var docNo = 42;
System.out.println("\nsummary of doc " + docNo + ": " + collection.get(docNo).get("summary") + "\n");
printDocumentVector(docNo, "summary");


summary of doc 42: Mathilda, a 12-year-old girl, is reluctantly taken in by Léon, a professional assassin, after her family is murdered. An unusual relationship forms as she becomes his protégée and learns the assassin's trade.

index of doc 42: 
term             tf   postings
12                1   2
after             1   17
assassin          2   16 38
becom             1   28
e                 1   34
famili            1   19
form              1   25
girl              1   5
g�                1   32
her               1   18
hi                1   29
learn             1   36
l�                1   11
mathilda          1   0
murder            1   21
old               1   4
profession        1   15
prot�             1   30
relationship      1   24
reluctantli       1   7
she               1   27
taken             1   8
trade             1   39
unusu             1   23
year              1   3
�                 3   12 31 33


##### How does the Vocabulary look like?

In [20]:
void printVocabulary(String field) throws IOException {
    IndexReader reader = DirectoryReader.open(getDirectory());
    Terms vocabulary = MultiTerms.getTerms(reader, field);
    TermsEnum termsEnum = vocabulary.iterator();
    BytesRef term = null;
    Object output[][] = new Object[(int)vocabulary.size()][];
    int i = 0;

    // get all terms from vocbulary and produce the result
    while ((term = termsEnum.next()) != null) {
        // create an array list with the terms and their frequencies
        Object[] list = new Object[]{term.utf8ToString(),termsEnum.docFreq(), termsEnum.totalTermFreq()};
        output[i++] = list;
    }

    // print summary
    System.out.println(String.format("Total documents: %d", reader.numDocs()));
    System.out.println(String.format("    Total terms: %d", vocabulary.size()));

    System.out.println(String.format("  Total sum(tf): %d", reader.getSumTotalTermFreq(field)));
    System.out.println(String.format("     Average tf: %1.2f", (float)reader.getSumTotalTermFreq(field)/reader.getSumDocFreq(field)));
    System.out.println(String.format("Average doc len: %1.2f", (float)reader.getSumTotalTermFreq(field)/reader.numDocs()));
    
    System.out.println(String.format("     Average df: %1.2f", (float)reader.getSumTotalTermFreq(field)/vocabulary.size()));
    System.out.println("\nterm             df  sum(tf)");

    // show the objects with higest df
    Arrays.sort(output, (o1, o2) -> ((Integer)o2[1]).compareTo((Integer)o1[1]));
    for (i=0; i<20; i++)
        System.out.println(String.format("%-12s %6d %8d", output[i][0], output[i][1], output[i][2]));
    reader.close();
}

printVocabulary("summary");

Total documents: 1000
    Total terms: 4471
  Total sum(tf): 17405
     Average tf: 1.04
Average doc len: 17.41
     Average df: 3.89

term             df  sum(tf)
hi              375      516
who             158      165
from            139      148
he              138      155
young           127      132
her             124      164
man             116      120
after           112      112
find            103      106
life            103      111
when            103      107
two              98      104
world            82       85
becom            73       76
year             73       78
new              70       75
famili           68       71
stori            68       68
war              68       71
up               67       67


In [15]:
printVocabulary("title");

Total documents: 1000
    Total terms: 1624
  Total sum(tf): 2298
     Average tf: 1.01
Average doc len: 2.30
     Average df: 1.42

term             df  sum(tf)
la               23       24
man              20       20
de               18       18
le               18       19
2                14       14
dai              14       14
stori            10       10
star              9        9
harri             8        8
night             8        8
war               8        8
babi              7        7
dark              7        7
dead              7        7
king              7        7
�                 7        7
last              6        6
men               6        6
onc               6        6
potter            6        6


## Searching with Lucene

### We need an index searcher and query parser with the same settings as for index writers

In [21]:
IndexSearcher getIndexSearcher() throws IOException {
    return new IndexSearcher(DirectoryReader.open(getDirectory()));
}

QueryParser getQueryParser() throws IOException {
    return new MultiFieldQueryParser(new String[]{"title", "summary", "genre", "actors"}, getAnalyzer());
}

QueryParser getQueryParser(String field) throws IOException {
    return new QueryParser(field, getAnalyzer());
}

### This function takes a query parser and runs a set of queries

In [22]:
void printResults(String query, TopDocs results) throws IOException {
    int rank = 1;
    System.out.println("Query: " + query);
    System.out.printf("%3s %5s %6s %6s %7s %6s   %s\n", "#", "id", "Score", "Year", "Runtime", "Rating", "Title" );
    for(ScoreDoc doc: results.scoreDocs) {
        Document document = getIndexSearcher().doc(doc.doc);
        System.out.printf("%3d %5d %6.2f %6s %7s %6s   %s\n", rank++, doc.doc, doc.score, 
            document.get("year"), document.get("runtime"), document.get("rating"), document.get("title") );
    } 
    System.out.println();
}

void searchExamples(QueryParser parser, String[] queries) throws IOException, ParseException {
    IndexSearcher searcher = getIndexSearcher();

    for(String query: queries) {
        printResults(query, searcher.search(parser.parse(query), 10));
        System.out.println();
    }
}

void searchQuery(Query query) throws IOException {
    IndexSearcher searcher = getIndexSearcher();
    TopDocs results = searcher.search(query, 10);
    printResults(query.toString(), results);
}

##### First we perform some generic queries with keywords only

In [23]:
searchExamples(getQueryParser(), new String[]{
    "star wars", 
    "drama morgan freeman", 
    "comedy"
});

Query: star wars
  #    id  Score   Year Runtime Rating   Title
  1    29   4.53   1977     121    8.6   Star Wars
  2   109   4.00   1983     131    8.3   Star Wars: Episode VI - Return of the Jedi
  3    54   2.94   2017     125    8.4   Ayla: The Daughter of War
  4   746   2.94   2013     132    7.7   Star Trek Into Darkness
  5   182   2.93   1961     179    8.2   Judgment at Nuremberg
  6   747   2.89   2015     137    7.7   Beasts of No Nation
  7   955   2.85   1998     170    7.6   The Thin Red Line
  8   278   2.79   1978     183    8.1   The Deer Hunter
  9   461   2.79   1930     152    8.0   All Quiet on the Western Front
 10   542   2.71   1970     172    7.9   Patton


Query: drama morgan freeman
  #    id  Score   Year Runtime Rating   Title
  1     0   4.51   1994     142    9.3   The Shawshank Redemption
  2   167   4.47   1992     130    8.2   Unforgiven
  3   234   4.47   2004     132    8.1   Million Dollar Baby
  4   673   4.45   1989     122    7.8   Glory
  5   

In [24]:
searchExamples(getQueryParser(), new String[]{
    "genre:drama actors:morgan actors:freeman", 
    "genre:comedy",
});

Query: genre:drama actors:morgan actors:freeman
  #    id  Score   Year Runtime Rating   Title
  1     0   4.51   1994     142    9.3   The Shawshank Redemption
  2   167   4.47   1992     130    8.2   Unforgiven
  3   234   4.47   2004     132    8.1   Million Dollar Baby
  4   673   4.45   1989     122    7.8   Glory
  5   768   4.45   2006     110    7.7   Lucky Number Slevin
  6   922   4.45   2007     114    7.6   Gone Baby Gone
  7    27   4.25   1995     127    8.6   Se7en
  8   311   2.36   1940      99    8.1   The Shop Around the Corner
  9   609   2.08   2013     161    7.8   The Hobbit: The Desolation of Smaug
 10   618   2.08   2012     169    7.8   The Hobbit: An Unexpected Journey


Query: genre:comedy
  #    id  Score   Year Runtime Rating   Title
  1    78   0.89   1964      95    8.4   Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb
  2   277   0.89   1979      94    8.1   Life of Brian
  3   417   0.89   1974     106    8.0   Young Frankenstein
 

In [25]:
searchExamples(getQueryParser(), new String[]{
    "title:leon",
    "title:l?on",
    "title:leon~0.5",
});

Query: title:leon
  #    id  Score   Year Runtime Rating   Title
  1    42   3.84   1994     110    8.5   Leon


Query: title:l?on
  #    id  Score   Year Runtime Rating   Title
  1    42   1.00   1994     110    8.5   Leon
  2    43   1.00   1994      88    8.5   The Lion King
  3   328   1.00   2016     118    8.0   Lion
  4   545   1.00   1968     134    7.9   The Lion in Winter


Query: title:leon~0.5
  #    id  Score   Year Runtime Rating   Title
  1    42   2.36   1994     110    8.5   Leon
  2   328   1.77   2016     118    8.0   Lion
  3    43   1.44   1994      88    8.5   The Lion King
  4   545   1.44   1968     134    7.9   The Lion in Winter
  5   486   1.18   2009      97    7.9   Moon
  6   620   1.18   2007     123    7.8   Atonement
  7   284   0.96   1973     102    8.1   Paper Moon
  8   297   0.96   1960      89    8.1   Jungfrukällan
  9   389   0.96   1999      86    8.0   The Iron Giant
 10   448   0.96   1950      94    8.0   In a Lonely Place




In [26]:
searchExamples(getQueryParser(), new String[]{
    "title:{a TO b}",
    "year:{1990 TO 2000}",
    "year:1994",
    "runtime:142"
});

Query: title:{a TO b}
  #    id  Score   Year Runtime Rating   Title
  1     4   1.00   1957      96    9.0   12 Angry Men
  2    40   1.00   1998     119    8.5   American History X
  3    54   1.00   2017     125    8.4   Ayla: The Daughter of War
  4    59   1.00   2019     181    8.4   Avengers: Endgame
  5    60   1.00   2018     149    8.4   Avengers: Infinity War
  6    71   1.00   1984     229    8.4   Once Upon a Time in America
  7    72   1.00   1981     115    8.4   Raiders of the Lost Ark
  8    74   1.00   1979     147    8.4   Apocalypse Now
  9    75   1.00   1979     117    8.4   Alien
 10    76   1.00   1971     122    8.4   Anand


Query: year:{1990 TO 2000}
  #    id  Score   Year Runtime Rating   Title
  1     0   1.00   1994     142    9.3   The Shawshank Redemption
  2     6   1.00   1994     154    8.9   Pulp Fiction
  3     7   1.00   1993     195    8.9   Schindler's List
  4     9   1.00   1999     139    8.8   Fight Club
  5    11   1.00   1994     142    8.

In [27]:
var query = new BooleanQuery.Builder()
        // .add(IntField.newExactQuery("runtime", 142), Occur.MUST)
        // .add(IntField.newExactQuery("runtime", 142), Occur.SHOULD)
        .add(IntField.newExactQuery("runtime", 142), Occur.FILTER)
        .add(new TermQuery(new Term("actors", "morgan")), Occur.SHOULD)
        .build();

searchQuery(query);

Query: #runtime:[142 TO 142] actors:morgan
  #    id  Score   Year Runtime Rating   Title
  1     0   2.22   1994     142    9.3   The Shawshank Redemption
  2    11   0.00   1994     142    8.8   Forrest Gump
  3   105   0.00   1985     142    8.3   Idi i smotri
  4   169   0.00   1988     142    8.2   Dom za vesanje
  5   506   0.00   2004     142    7.9   Harry Potter and the Prisoner of Azkaban
  6   890   0.00   2015     142    7.6   Bridge of Spies



In [23]:
var query = new BooleanQuery.Builder()
        .add(IntField.newRangeQuery("runtime", 120, 180), Occur.FILTER)
        .add(TermRangeQuery.newStringRange("year", "1990", "2000", true, false), Occur.FILTER)
        .add(new TermQuery(new Term("actors", "morgan")), Occur.SHOULD)
        .build();

searchQuery(query);

Query: #runtime:[120 TO 180] #year:[1990 TO 2000} actors:morgan
  #    id  Score   Year Runtime Rating   Title
  1     0   2.22   1994     142    9.3   The Shawshank Redemption
  2   167   2.22   1992     130    8.2   Unforgiven
  3    27   2.12   1995     127    8.6   Se7en
  4     6   0.00   1994     154    8.9   Pulp Fiction
  5     9   0.00   1999     139    8.8   Fight Club
  6    11   0.00   1994     142    8.8   Forrest Gump
  7    14   0.00   1999     136    8.7   The Matrix
  8    15   0.00   1990     146    8.7   Goodfellas
  9    24   0.00   1998     169    8.6   Saving Private Ryan
 10    44   0.00   1991     137    8.5   Terminator 2: Judgment Day



In [24]:
var query = new BooleanQuery.Builder()
        .add(new TermQuery(new Term("title", "star")), Occur.MUST)
        .add(new TermQuery(new Term("genre", "action")), Occur.SHOULD)
        .build();

searchQuery(query);

Query: +title:star genre:action
  #    id  Score   Year Runtime Rating   Title
  1    29   2.95   1977     121    8.6   Star Wars
  2   493   2.86   2009     127    7.9   Star Trek
  3   746   2.51   2013     132    7.7   Star Trek Into Darkness
  4   903   2.24   2018     136    7.6   A Star Is Born
  5   839   2.05   1982     113    7.7   Star Trek II: The Wrath of Khan
  6   109   1.99   1983     131    8.3   Star Wars: Episode VI - Return of the Jedi
  7   477   1.90   2015     138    7.9   Star Wars: Episode VII - The Force Awakens
  8   731   1.88   2014     126    7.7   The Fault in Our Stars
  9    16   1.87   1980     124    8.7   Star Wars: Episode V - The Empire Strikes Back



In [25]:
var query = new BooleanQuery.Builder()
        .add(IntField.newRangeQuery("runtime", 120, 180), Occur.FILTER)
        .add(TermRangeQuery.newStringRange("year", "1990", "2000", true, false), Occur.SHOULD)
        .add(new TermQuery(new Term("title", "shawshank")), Occur.MUST)
        .add(new BoostQuery(new TermQuery(new Term("actors", "morgan")), 1.5f), Occur.SHOULD)
        .build();
        
IndexSearcher searcher = getIndexSearcher();
TopDocs results = searcher.search(query, 10);
printResults(query.toString(), results);
System.out.println(searcher.explain(query, results.scoreDocs[0].doc));

Query: #runtime:[120 TO 180] year:[1990 TO 2000} +title:shawshank (actors:morgan)^1.5
  #    id  Score   Year Runtime Rating   Title
  1     0   7.46   1994     142    9.3   The Shawshank Redemption

7.4595942 = sum of:
  0.0 = match on required clause, product of:
    0.0 = # clause
    1.0 = runtime:[120 TO 180]
  1.0 = year:[1990 TO 2000}
  3.1223383 = weight(title:shawshank in 0) [BM25Similarity], result of:
    3.1223383 = score(freq=1.0), computed as boost * idf * tf from:
      6.5022902 = idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:
        1 = n, number of documents containing term
        999 = N, total number of documents with field
      0.48019058 = tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:
        1.0 = freq, occurrences of term within document
        1.2 = k1, term saturation parameter
        0.75 = b, length normalization parameter
        2.0 = dl, length of field
        2.3003004 = avgdl, average length of field
  3.337256 = w

---