# Notes: this Java notebook requires Ganymede 
* Ganymede (Java kernel for Jupyter): [Installation and documentation](https://github.com/allen-ball/ganymede)
* We need additional libraries for lucene

In [5]:
%%pom
dependencies:
- org.apache.lucene:lucene-core:9.7.0
- org.apache.lucene:lucene-analysis-common:9.7.0

#### Common imports (java)

In [6]:
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import java.util.Arrays;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;

#### Common imports (lucene)

In [9]:
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;  
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
import org.apache.lucene.analysis.en.KStemFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.CharArraySet;

import org.apache.lucene.util.Version;

## Let's read in the data collection

In [10]:
ArrayList<Map<String, String>> read_collection(String name) {
    ArrayList<Map<String, String>> docs = new ArrayList<Map<String, String>>();
    String splitter = ",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)";
    
    try{
        BufferedReader reader = new BufferedReader(new FileReader(name));
        String line, keys[] = reader.readLine().split(splitter);

        while ((line = reader.readLine()) != null) {
            String[] values = line.split(splitter);
            Map<String, String> dataMap = new HashMap<>();

            for (int i = 0; i < keys.length; i++) {
                // dataMap.put(keys[i], values[i]);
                switch(keys[i]){
                    case "Series_Title":
                        dataMap.put("title", values[i]);
                        break;
                    case "Released_Year":
                        dataMap.put("year", values[i]);
                        break;
                    case "Runtime":
                        dataMap.put("runtime", values[i].replace(" min", ""));
                        break;
                    case "Genre":
                        dataMap.put("genre", values[i].replace(",",""));
                        break;
                    case "IMDB_Rating":
                        dataMap.put("rating", values[i]);
                        break;
                    case "Overview":
                        dataMap.put("summary", values[i]);
                        break;
                    case "Star1":
                        dataMap.put("actors", values[i]);
                        break;
                    case "Star2":
                    case "Star3":
                    case "Star4":
                        dataMap.put("actors", dataMap.get("actors") + " " + values[i]);
                        break;
                }
            }
            docs.add(dataMap);
        }
    } catch(IOException e) {
        e.printStackTrace();
    }
    System.out.println("Read " + docs.size() + " documents from " + name);
    return docs;
}

var collection = read_collection("datasets/imdb_top_1000.csv");

Read 1000 documents from datasets/imdb_top_1000.csv


## Let's start with the analyzer of Lucene

In [16]:
void print_tokens(Analyzer analyzer, String text) {
    try {
        TokenStream ts = analyzer.tokenStream("text", new StringReader(text));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

        try {
            ts.reset();
            // Print all tokens until stream is exhausted
            while (ts.incrementToken()) 
                System.out.print(termAtt.toString() + " ");
            ts.end();
            System.out.println();
        } finally {
            ts.close();
        }
    } catch(IOException e) {
        e.printStackTrace();
    }    
}

class MyAnalyzer extends Analyzer {
    MyAnalyzer(Version matchVersion) {
    }
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Version matchVersion = Version.LUCENE_CURRENT;
        final Tokenizer source = new StandardTokenizer(matchVersion, reader);
        TokenStream result = new EnglishPossessiveFilter(matchVersion, source);
        // result = new LowerCaseFilter(matchVersion, result);
        result = new KStemFilter(result);
        return new TokenStreamComponents(source, result);
    }
}

var text = "I think text's values' color goes here; WHAT happens with it? do we see IT again; I went there to be gone with houses";
var stopWords = new CharArraySet(Arrays.asList("i", "do"), false);

System.out.println("             text: "+ text);
System.out.println();

// standard analyzer
System.out.print("         standard: ");
print_tokens(new StandardAnalyzer(Version.LUCENE_CURRENT), text);

// english analyzer (with porter stemmer)
// System.out.print("          english: ");
// print_tokens(new EnglishAnalyzer(Version.LUCENE_CURRENT), text);

// english analyzer (with porter stemmer) and new set of stopwords
// System.out.print("english/stopwords: ");
// print_tokens(new EnglishAnalyzer(Version.LUCENE_CURRENT, stopWords), text);

// a custom analyzer, no lower case and kstemmer
// System.out.print("      my analyzer: ");
// print_tokens(new MyAnalyzer(Version.LUCENE_CURRENT), text);

EnglishAnalyzer.getDefaultStopSet()

             text: I think text's values' color goes here; WHAT happens with it? do we see IT again; I went there to be gone with houses

         standard: 

REJECTED ERRONEOUS

print_tokens(new StandardAnalyzer(Version.LUCENE_CURRENT), text);
no suitable constructor found for StandardAnalyzer(org.apache.lucene.util.Version)
    constructor org.apache.lucene.analysis.standard.StandardAnalyzer.StandardAnalyzer(org.apache.lucene.analysis.CharArraySet) is not applicable
      (argument mismatch; org.apache.lucene.util.Version cannot be converted to org.apache.lucene.analysis.CharArraySet)
    constructor org.apache.lucene.analysis.standard.StandardAnalyzer.StandardAnalyzer(java.io.Reader) is not applicable
      (argument mismatch; org.apache.lucene.util.Version cannot be converted to java.io.Reader)


## Building an index (in memory)

In [5]:
import org.knowm.xchart.XYChart;
import org.knowm.xchart.XYChartBuilder;

var xchart = new XYChartBuilder().title("Trig").build();

xchart.addSeries("sin", x, sinx);
xchart.addSeries("cos", x, cosx);

print(xchart)

REJECTED ERRONEOUS


print(xchart)
cannot find symbol
  symbol:   method print(org.knowm.xchart.XYChart)
  location: class 
