Skip to content

Commit

Permalink
Add some comments and fix test just to compile only
Browse files Browse the repository at this point in the history
  • Loading branch information
astathopoulos committed Apr 26, 2012
1 parent a561ac5 commit 78d49af
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.common.logging.ESLogger;

/**
* Generates singular/plural variants of a greek word based
* on a combination of predefined rules.
*
* @author Tasos Stathopoulos
*/
public class GreekReverseStemmer {

/**
Expand Down Expand Up @@ -44,7 +49,7 @@ public class GreekReverseStemmer {

/**
* This hash has as keys all the suffixes that we want to handle in order
* to generate singular/plurar greek words.
* to generate singular/plural greek words.
*/
private final Map<String, String[]> suffixes = new HashMap<String, String[]>();

Expand Down Expand Up @@ -77,13 +82,14 @@ public class GreekReverseStemmer {
};

/**
* The greek word buffer
* The greek word list
*/
private List<String> greekWords = new ArrayList<String>();

// Constructor
public GreekReverseStemmer() {

// initialize logger
this.logger = Loggers.getLogger("greeklish.greekReverseStemmer");

// populate suffixes
Expand All @@ -92,13 +98,25 @@ public GreekReverseStemmer() {
}
}

/**
* Generates the greek variants of the greek token that it
* receives.
*
* @param tokenString the greek word
* @return a list of the generated greek word variations
*/
public List<String> generateGreekVariants(String tokenString) {
// clear the list from variations of the previous greek token
greekWords.clear();

// add the initial greek token in the greek words
greekWords.add(tokenString);

// Find the first matching suffix and generate
// the variants of this word
for (String[] suffix : suffixStrings) {
if (tokenString.endsWith(suffix[0])) {
// Add to greekWords the tokens with the desired suffixes
generate_more_greek_words(tokenString, suffix[0]);
break;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@
import org.elasticsearch.common.logging.Loggers;

/**
* @author Tasos Stathopoulos </p> Generates tokens with latin characters from
* Greek tokens. It matches one or more latin characters for each Greek
* character of the token. A Greek character may have one or more latin
* counterparts. So, from a Greek token one or more latin tokens are
* @author Tasos Stathopoulos </p> Generates singular/plural variants of greek
* tokens and converts them to tokens with latin characters, which are
* matched to the corresponding greek characters.</p>
* A Greek character may have one or more latin counterparts. So,
* from a Greek token one or more latin tokens are
* generated. </p> Greek words have combination of vowels called
* digraphs. Because digraphs are special cases, they are treated in
* isolation.
* digraphs. Because digraphs are special cases, they are treated separately.
*/
public class GreeklishConverter {
/**
Expand All @@ -38,27 +38,43 @@ public class GreeklishConverter {
*/
private String tokenString;

/**
* Instance of the reverse stemmer that generates the word variants
* of the greek token.
*/
private final GreekReverseStemmer reverseStemmer;

/**
* Instance of the greeklish generator that generates the greeklish
* words from the words that are returned by the greek reverse stemmer.
*/
private final GreeklishGenerator greeklishGenerator;

/**
* Setting, set in the configuration file, that defines
* whether the user wants to generate greek variants.
*/
private final boolean generateGreekVariants;

/**
* Builds a converter together with the helpers it delegates to: the logger,
* the greek word list, the reverse stemmer and the greeklish generator.
*
* @param maxExpansions forwarded to the {@code GreeklishGenerator} constructor
* @param generateGreekVariants whether greek variants should be generated;
* stored in the field of the same name
*/
public GreeklishConverter(int maxExpansions, boolean generateGreekVariants) {

// Initialize the logger
this.logger = Loggers.getLogger("greeklish.converter");

// Initialize greekWords list
this.greekWords = new ArrayList<String>();

// Initialize reverse stemmer
this.reverseStemmer = new GreekReverseStemmer();

// Initialize greeklish generator
this.greeklishGenerator = new GreeklishGenerator(maxExpansions);

// Initialize setting for generating greek variants
this.generateGreekVariants = generateGreekVariants;

// NOTE(review): the next three debug statements report the same two values
// (two separate messages, then one combined message) - presumably the old
// and new side of a diff; confirm which form should remain.
logger.debug("Max expansions: [{}]", maxExpansions);
logger.debug("Generate Greek Variants: [{}]", generateGreekVariants);
logger.debug("Max expansions: [{}] Generate Greek Variants [{}]", maxExpansions, generateGreekVariants);
}

/**
Expand All @@ -71,21 +87,24 @@ public GreeklishConverter(int maxExpansions, boolean generateGreekVariants) {
* @return A list of the generated strings
*/
public final List<StringBuilder> convert(char[] inputToken, int tokenLength) {
// Convert to string in order to replace the digraphs with
// special characters.
// Convert to string in order to pass it to the reverse stemmer.
tokenString = new String(inputToken, 0, tokenLength);
// Is this a Greek word?
if (!identifyGreekWord(tokenString)) {
return null;
}

// if generating greek variants is on
if (generateGreekVariants) {
// generate them
greekWords = reverseStemmer.generateGreekVariants(tokenString);
} else {
greekWords.add(tokenString);
}

// if there are greek words
if (greekWords.size() > 0) {
// generate their greeklish version
return greeklishGenerator.generateGreeklishWords(greekWords);
} else {
return null;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
package org.elasticsearch.index.analysis;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
Expand All @@ -9,6 +10,12 @@
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.common.logging.ESLogger;

/**
* Generates greeklish tokens for each element of a list
* of greek tokens.
*
* @author Tasos Stathopoulos
*/
public class GreeklishGenerator {

/**
Expand Down Expand Up @@ -65,8 +72,14 @@ public class GreeklishGenerator {
{ "φ", "f", "ph" }, { "χ", "x", "h", "ch" }, { "ψ", "ps" },
{ "ω", "w", "o", "v" } };

/**
* The maximum greeklish expansions per greek token.
*/
private final int maxExpansions;

/**
* A list of greeklish tokens for each greek word.
*/
private final List<StringBuilder> perWordGreeklish;

/**
Expand Down Expand Up @@ -111,6 +124,12 @@ public GreeklishGenerator(int maxExpansions) {
}
}

/**
* Gets a list of greek words and generates the greeklish version of
* each word.
* @param greekWords a list of greek words
* @return a list of greeklish words
*/
public List<StringBuilder> generateGreeklishWords(final List<String> greekWords) {
greeklishList.clear();
for (String greekWord : greekWords) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ public class GreeklishConverterTest {

/**
* Assigns a new GreeklishConverter to the converter field. Annotated with
* BeforeMethod, so presumably invoked by the test framework before each
* test method - confirm against the framework in use.
*/
@BeforeMethod
public void setUp() {
// NOTE(review): this one-argument assignment is immediately overwritten by
// the two-argument one below - presumably the old side of a diff; confirm.
this.converter = new GreeklishConverter(MAX_EXPANSIONS);
this.converter = new GreeklishConverter(MAX_EXPANSIONS, true);
}

@Test
Expand Down Expand Up @@ -70,7 +70,7 @@ public void testGreekTokenConversionsForInvalidWords() {
@Test
public void testMaxGreeklishWordGenerations() {
int newMaxExpansions = 2;
converter = new GreeklishConverter(newMaxExpansions);
converter = new GreeklishConverter(newMaxExpansions, true);

greeklishWords = converter.convert(greekWords[0].toCharArray(),
greekWords[0].length());
Expand Down

0 comments on commit 78d49af

Please sign in to comment.