Skip to content

Commit

Permalink
work in progress
Browse files Browse the repository at this point in the history
  • Loading branch information
plutext committed Jul 25, 2018
1 parent 1cc4a33 commit 73d07b4
Show file tree
Hide file tree
Showing 2 changed files with 152 additions and 0 deletions.
49 changes: 49 additions & 0 deletions src/main/java/org/docx4j/unicode/Script.java
@@ -0,0 +1,49 @@
package org.docx4j.unicode;

/**
 * Writing systems that can be tested for by {@code UnicodeScriptDetector}.
 *
 * <p>The constant names mirror Unicode script names and are used by the
 * detector to identify the script to look for, so they must not be renamed
 * casually. Constants are listed alphabetically; keep the declaration order
 * stable, since it determines {@link #ordinal()} and {@link #values()}.
 *
 * <p>NOTE(review): {@code TaiLe} is spelled without the underscore that the
 * JDK uses for this script ({@code Character.UnicodeScript.TAI_LE}) - confirm
 * that consumers of this name account for the difference.
 */
public enum Script {

    // A - B
    Arabic, Armenian, Bengali, Bopomofo, Braille, Buhid,

    // C - E
    Canadian_Aboriginal, Cherokee, Cyrillic, Devanagari, Ethiopic,

    // G - H
    Georgian, Greek, Gujarati, Gurmukhi,
    Han, Hangul, Hanunoo, Hebrew, Hiragana,

    // I - L
    Inherited, Kannada, Katakana, Khmer,
    Lao, Latin, Limbu,

    // M - S
    Malayalam, Mongolian, Myanmar,
    Ogham, Oriya, Runic, Sinhala, Syriac,

    // T - Y
    Tagalog, Tagbanwa, TaiLe, Tamil, Telugu, Thaana, Thai, Tibetan,
    Yi
}
103 changes: 103 additions & 0 deletions src/main/java/org/docx4j/unicode/UnicodeScriptDetector.java
@@ -0,0 +1,103 @@
package org.docx4j.unicode;


import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Static helpers for working out which Unicode script(s) a piece of text uses.
 *
 * <p>Script tests are done per Unicode code point (not per UTF-16 {@code char}),
 * so characters outside the Basic Multilingual Plane are classified correctly.
 */
public class UnicodeScriptDetector {

    /**
     * Matches a character in the Hiragana or Katakana <em>block</em>, or one
     * belonging to the Han or Hangul <em>script</em>. Compiled once, since the
     * expression is constant.
     */
    private static final Pattern CJK_PATTERN =
            Pattern.compile("\\p{InHiragana}|\\p{InKatakana}|\\p{IsHan}|\\p{IsHangul}");

    /**
     * Ad-hoc demo: prints the percentage of Hebrew characters in a short
     * Hebrew sample string.
     *
     * @param args ignored
     */
    public static void main(String[] args) {

        System.out.println(percentage("חץ", Script.Hebrew));

    }

    /**
     * Tells whether at least one code point of {@code text} belongs to
     * {@code script}.
     *
     * @param text   the text to inspect; must not be null
     * @param script the script to look for; must not be null
     * @return {@code true} if any code point is in the script; {@code false}
     *         otherwise (including for empty text)
     */
    public static boolean containsScript(String text, Script script) {

        final Character.UnicodeScript target = toUnicodeScript(script);

        for (int i = 0; i < text.length(); ) {
            final int codePoint = text.codePointAt(i);
            if (Character.UnicodeScript.of(codePoint) == target) {
                return true;
            }
            i += Character.charCount(codePoint);
        }
        return false;
    }

    /**
     * Tells whether every code point of {@code text} belongs to {@code script}.
     *
     * @param text   the text to inspect; must not be null
     * @param script the script every code point must belong to; must not be null
     * @return {@code true} if no code point is outside the script; note that
     *         this is vacuously {@code true} for empty text
     */
    public static boolean isAll(String text, Script script) {

        // TODO skip punctuation, spaces?

        final Character.UnicodeScript target = toUnicodeScript(script);

        for (int i = 0; i < text.length(); ) {
            final int codePoint = text.codePointAt(i);
            if (Character.UnicodeScript.of(codePoint) != target) {
                return false;
            }
            i += Character.charCount(codePoint);
        }
        return true;
    }

    /**
     * Computes which share of the code points in {@code text} belongs to
     * {@code script}, as a whole-number percentage.
     *
     * @param text   the text to inspect; must not be null
     * @param script the script to count; must not be null
     * @return the percentage (0-100), rounded to the nearest integer;
     *         0 for empty text
     */
    public static int percentage(String text, Script script) {

        // TODO skip punctuation, spaces?

        final Character.UnicodeScript target = toUnicodeScript(script);

        int matching = 0;
        int total = 0;
        for (int i = 0; i < text.length(); ) {
            // Step by code point so a surrogate pair counts as one character.
            final int codePoint = text.codePointAt(i);
            if (Character.UnicodeScript.of(codePoint) == target) {
                matching++;
            }
            total++;
            i += Character.charCount(codePoint);
        }

        if (total == 0) {
            // Avoid dividing by zero; nothing matched in an empty text.
            return 0;
        }

        // Floating-point division, so that Math.round actually rounds
        // (integer division would have truncated before rounding).
        return Math.round(100f * matching / total);
    }

    /**
     * Tells whether {@code text} contains at least one Chinese, Japanese or
     * Korean character, i.e. one in the Hiragana or Katakana block or of the
     * Han or Hangul script.
     *
     * @param text the text to inspect; must not be null
     * @return {@code true} if such a character is present
     */
    public static boolean containsCJK(String text) {

        Matcher m = CJK_PATTERN.matcher(text);
        return m.find();
    }

    /**
     * Tells whether the text contains a character from one of the Hangul
     * blocks (Hangul Jamo, Hangul Compatibility Jamo, Hangul Syllables).
     *
     * <p>Adapted from https://gist.github.com/TheFinestArtist/2fd1b4aa1d4824fcbaef
     *
     * @param charSequence the text to inspect; must not be null
     * @return {@code true} if a Hangul character is present
     */
    public static boolean hasKorean(CharSequence charSequence) {

        final String text = charSequence.toString();
        for (int i = 0; i < text.length(); i++) {
            // All blocks tested here lie in the BMP, so char-wise iteration suffices.
            final Character.UnicodeBlock block = Character.UnicodeBlock.of(text.charAt(i));
            if (block == Character.UnicodeBlock.HANGUL_JAMO
                    || block == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO
                    || block == Character.UnicodeBlock.HANGUL_SYLLABLES) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tells whether the text contains a character from one of the blocks used
     * for Japanese: CJK Unified Ideographs, Hiragana, Katakana, Halfwidth and
     * Fullwidth Forms, or CJK Symbols and Punctuation.
     *
     * <p>Note that CJK Unified Ideographs are shared with Chinese, so Chinese
     * text also yields {@code true}.
     *
     * @param charSequence the text to inspect; must not be null
     * @return {@code true} if a character from one of those blocks is present
     */
    public static boolean hasJapanese(CharSequence charSequence) {

        final String text = charSequence.toString();
        for (int i = 0; i < text.length(); i++) {
            // All blocks tested here lie in the BMP, so char-wise iteration suffices.
            final Character.UnicodeBlock block = Character.UnicodeBlock.of(text.charAt(i));
            if (block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                    || block == Character.UnicodeBlock.HIRAGANA
                    || block == Character.UnicodeBlock.KATAKANA
                    || block == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS
                    || block == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION) {
                return true;
            }
        }
        return false;
    }

    // TODO: public Script dominant(String text) - return the most frequent script.

    /**
     * Maps a {@link Script} constant onto the JDK's script enumeration.
     *
     * <p>The JDK lookup is case-insensitive but requires underscores between
     * words, so a camel-case name such as {@code TaiLe} is first rewritten to
     * {@code Tai_Le}. Names that already contain underscores are unaffected.
     */
    private static Character.UnicodeScript toUnicodeScript(Script script) {
        final String jdkName = script.name().replaceAll("(?<=[a-z])(?=[A-Z])", "_");
        return Character.UnicodeScript.forName(jdkName);
    }

}

0 comments on commit 73d07b4

Please sign in to comment.