Experiment with OpenNLP Language detection #512

Closed
wants to merge 3 commits

2 changes: 1 addition & 1 deletion build.gradle
@@ -78,7 +78,7 @@ configure(allProjs) {
chillVersion = '0.9.3'
reflectionsVersion = '0.9.11'
collectionsVersion = '3.2.2'
optimaizeLangDetectorVersion = '0.0.1'
optimaizeLangDetectorVersion = '0.6'
tikaVersion = '1.22'
sparkTestingBaseVersion = '2.4.3_0.12.0'
sourceCodeVersion = '0.1.3'
3 changes: 1 addition & 2 deletions core/build.gradle
@@ -9,8 +9,7 @@ dependencies {
compile "com.googlecode.libphonenumber:carrier:$googleCarrierVersion"

// Optimaize language detection
compile "com.salesforce.transmogrifai:language-detector:$optimaizeLangDetectorVersion"

compile "com.optimaize.languagedetector:language-detector:$optimaizeLangDetectorVersion"
@tovbinm (Collaborator) Sep 12, 2020

There were some fixes I made to the Optimaize language detector, so we had to make our own version. I think it was related to Guava.

Contributor Author

I was hoping to get rid of this fork by shading if necessary.

Collaborator

Sure. No development is happening on that repo, so not a big deal imo.

// Tika library for detecting various binary data formats
compile "org.apache.tika:tika-core:$tikaVersion"

@@ -64,5 +64,5 @@ class LangDetector[T <: Text]
}

object LangDetector {
val DefaultDetector: LanguageDetector = new OptimaizeLanguageDetector()
val DefaultDetector: LanguageDetector = new OpenNLPLanguageDetector()
}
@@ -92,7 +92,7 @@ class NameEntityRecognizer[T <: Text]

object NameEntityRecognizer {
val Analyzer: TextAnalyzer = new OpenNLPAnalyzer()
val LanguageDetector: LanguageDetector = new OptimaizeLanguageDetector()
val LanguageDetector: LanguageDetector = new OpenNLPLanguageDetector()
val Tagger: NameEntityTagger[_ <: TaggerResult] = new OpenNLPNameEntityTagger()
val Splitter: SentenceSplitter = new OpenNLPSentenceSplitter()
val AutoDetectLanguage = false
@@ -35,6 +35,7 @@ import com.salesforce.op.features.types._
import com.salesforce.op.stages.base.unary.UnaryTransformer
import com.salesforce.op.stages.impl.feature.TextTokenizer.TextTokenizerResult
import com.salesforce.op.stages.{OpPipelineStageReaderWriter, ReaderWriter}
import com.salesforce.op.utils.reflection.ReflectionUtils
import com.salesforce.op.utils.text.{Language, _}
import org.apache.spark.ml.param._
import org.json4s.{JObject, JValue}
@@ -176,6 +177,7 @@ object TextTokenizer {
.collectFirst { case (lang, confidence) if confidence > autoDetectThreshold => lang }
.getOrElse(defaultLanguage)
}
println(s"GERA DEBUG detected language $language for $textString")
val lowerTxt = if (toLowercase) textString.toLowerCase else textString

val sentences = sentenceSplitter.map(_.getSentences(lowerTxt, language))
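
The hunk above only trusts auto-detection when the top confidence clears autoDetectThreshold, otherwise it falls back to the configured default language. A minimal standalone sketch of that pattern, assuming an illustrative input string and a 0.99 threshold (the detector and Language type come from this PR):

import com.salesforce.op.utils.text.{Language, OpenNLPLanguageDetector}

// Candidates arrive sorted by confidence, highest first; accept the top one
// only if it clears the threshold, otherwise no language is selected.
val detected: Seq[(Language, Double)] = new OpenNLPLanguageDetector().detectLanguages("Guten Morgen zusammen")
val autoDetectThreshold = 0.99
val language: Option[Language] = detected.collectFirst {
  case (lang, confidence) if confidence > autoDetectThreshold => lang
}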
@@ -248,7 +250,7 @@ class TextTokenizerReaderWriter[T <: Text] extends OpPipelineStageReaderWriter[T
*/
def read(stageClass: Class[TextTokenizer[T]], json: JValue): Try[TextTokenizer[T]] = Try {
val languageDetector = ((json \ "languageDetector").extract[JObject] \ "className").extract[String] match {
case c if c == classOf[OptimaizeLanguageDetector].getName => new OptimaizeLanguageDetector
case c => ReflectionUtils.newInstance[LanguageDetector](c)
}
val analyzerJson = (json \ "analyzer").extract[JObject]
val analyzer = (analyzerJson \ "className").extract[String] match {
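
With the hard-coded OptimaizeLanguageDetector case gone, the reader restores whichever LanguageDetector implementation was serialized by instantiating it from its class name. A rough sketch of that mechanism using the same ReflectionUtils helper the diff calls (the detector chosen here is just an example):

import com.salesforce.op.utils.reflection.ReflectionUtils
import com.salesforce.op.utils.text.{LanguageDetector, OpenNLPLanguageDetector}

// Re-create a detector purely from the class name stored in the stage JSON.
val className: String = classOf[OpenNLPLanguageDetector].getName
val languageDetector: LanguageDetector = ReflectionUtils.newInstance[LanguageDetector](className)
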
@@ -0,0 +1,83 @@
/*
* Copyright (c) 2017, Salesforce.com, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the name of the copyright holder nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

package com.salesforce.op.utils.text
import opennlp.tools.ml.model.MaxentModel
import opennlp.tools.langdetect.{LanguageDetectorContextGenerator, LanguageDetectorME, LanguageDetectorModel}
import org.slf4j.LoggerFactory

class OpenNLPLanguageDetector extends LanguageDetector {
/**
* Detect languages from a text
*
* @param s input text
* @return detected languages sorted by confidence score in descending order.
* Confidence score is in the range [0.0, 1.0], with higher values implying greater confidence.
*/
def detectLanguages(s: String): Seq[(Language, Double)] = {
OpenNLPLanguageDetector.detector.predict(s)
}
}


case class OpenNLPLanguageDetectorME(
languageDetectorModel: MaxentModel,
contextGenerator: LanguageDetectorContextGenerator
) {
def predict(str: String): Seq[(Language, Double)] = {
languageDetectorModel
.eval(contextGenerator.getContext(str))
.zipWithIndex
.sortBy { case (confidence, _) => confidence }
.reverse
.map { case (prob, index) =>
(Language.fromString(languageDetectorModel.getOutcome(index)), prob)
}
}
}


private[op] object OpenNLPLanguageDetector {

@transient private lazy val log = LoggerFactory.getLogger(this.getClass)

// This detector is a singleton to avoid reloading the ngrams for the detector
lazy val detector = {
val start = System.currentTimeMillis()
val ldm = OpenNLPModels.getLanguageDetection()
val model = ldm.getMaxentModel
val contextGenerator = ldm.getFactory.getContextGenerator
println(s"GERA DEBUG Loaded OpenNLP Language Model for ${model.getNumOutcomes} languages. " +
s"Time elapsed: ${System.currentTimeMillis() - start}ms")
OpenNLPLanguageDetectorME(model, contextGenerator)
}
}
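
Because OpenNLPLanguageDetector implements the existing LanguageDetector interface, it is a drop-in replacement for the Optimaize-based detector. A small usage sketch (the sample text is illustrative):

import com.salesforce.op.utils.text.{Language, LanguageDetector, OpenNLPLanguageDetector}

// The underlying OpenNLP model is a lazily initialized singleton, so constructing
// multiple detectors does not reload the ngram model from resources.
val detector: LanguageDetector = new OpenNLPLanguageDetector()
val ranked: Seq[(Language, Double)] = detector.detectLanguages("Ceci n'est pas une pipe")
// Highest-confidence language first, per the scaladoc contract above.
val best: Option[(Language, Double)] = ranked.headOption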


@@ -34,6 +34,8 @@ import java.io.InputStream

import com.salesforce.op.utils.text.Language._
import com.salesforce.op.utils.text.NameEntityType._
import com.salesforce.op.utils.text.OpenNLPLanguageDetector.getClass
import opennlp.tools.langdetect.LanguageDetectorModel
import opennlp.tools.namefind.TokenNameFinderModel
import opennlp.tools.sentdetect.SentenceModel
import opennlp.tools.tokenize.TokenizerModel
@@ -107,6 +109,11 @@ object OpenNLPModels {
def getTokenizerModel(language: Language): Option[TokenizerModel] =
tokenizerModels.get(language)


def getLanguageDetection(): LanguageDetectorModel = {
new LanguageDetectorModel(loadFromResource(s"$modelsPath/langdetect-183.bin"))
}

private def loadTokenNameFinderModel(resourcePath: String): TokenNameFinderModel = {
val modelStream = loadFromResource(resourcePath)
new TokenNameFinderModel(modelStream)
@@ -52,13 +52,14 @@ class OptimaizeLanguageDetector extends LanguageDetector {
*/
def detectLanguages(s: String): Seq[(Language, Double)] = {
OptimaizeLanguageDetector.detector.getProbabilities(s).asScala
.sortBy(_.getProbability)
.reverse
.map(r => makeLanguage(r.getLocale) -> r.getProbability)
.sortBy(-_._2)
}

private def makeLanguage(locale: LdLocale): Language = {
val maybeRegion = if (locale.getRegion.isPresent) s"-${locale.getRegion.get()}" else ""
Language.withNameInsensitive(s"${locale.getLanguage}$maybeRegion")
Language.fromString(s"${locale.getLanguage}$maybeRegion")
}

}
Binary file not shown.