convert Chinese characters into Wubi codes

pzhaonet · Oct 12, 2018 · 59aaf6a · 59aaf6a
1 parent cbdd753
commit 59aaf6a
Show file tree

Hide file tree

Showing 15 changed files with 201 additions and 22 deletions.
diff --git a/pinyin.Rproj → -pinyin.Rproj b/pinyin.Rproj → -pinyin.Rproj
@@ -1,8 +1,8 @@
 Version: 1.0
 
-RestoreWorkspace: Default
-SaveWorkspace: Default
-AlwaysSaveHistory: Default
+RestoreWorkspace: No
+SaveWorkspace: No
+AlwaysSaveHistory: No
 
 EnableCodeIndexing: Yes
 UseSpacesForTab: Yes

diff --git a/.Rbuildignore b/.Rbuildignore
@@ -1,3 +1,8 @@
-^.*\.Rproj$
+LICENCE
 ^\.Rproj\.user$
-readme*.*
+\.Rproj$
+^\.gitignore$
+^\.travis\.yml$
+showcase/
+vignettes/
+readme*.*
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,6 @@
+LICENSE
+.Rhistory
+
 .Rproj.user/
 pinyin.Rproj
 .Rbuildignore

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pinyin
 Version: 1.1.3
-Date: 2018-09-10
+Date: 2018-10-10
 Title: Convert Chinese Characters into Pinyin
 Author: Peng Zhao
 Authors@R: c(
@@ -10,7 +10,8 @@ Maintainer: Peng Zhao <pzhao@pzhao.net>
 Depends: R (>= 3.1.0)
 Imports: 
   splitstackshape,
-  data.table
+  data.table,
+  parallel
 Suggests:
 Description: Convert Chinese characters into Pinyin (the official romanization system for Standard Chinese in mainland China, Malaysia, Singapore, and Taiwan. See <https://en.wikipedia.org/wiki/Pinyin> for details).
 License: MIT + file LICENSE

diff --git a/LICENCE b/LICENCE
@@ -0,0 +1,21 @@
+The MIT License
+
+Copyright (c) 2018 Peng Zhao, https://pzhao.org
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/LICENSE b/LICENSE
diff --git a/NAMESPACE b/NAMESPACE
@@ -7,6 +7,8 @@ export(file2py)
 export(four_corner)
 export(pinyin)
 export(pylib)
+export(wubi)
 importFrom(data.table,as.data.table)
 importFrom(splitstackshape,cSplit)
 importFrom(stats,setNames)
+importFrom(utils,data)
diff --git a/R/pinyin.R b/R/pinyin.R
@@ -35,7 +35,6 @@ pinyin <- function(mychar = '',
   return(pinyin)
 }
 
-#############################################################
 #' A Pinyin library
 #'
 #' @param method character. The value can be:
@@ -108,10 +107,11 @@ pylib <- function(method = c('quanpin', 'tone', 'toneless'),
 }
 
 
-#############################################################
 #' Rename files with Chinese characters to pinyin
 #'
 #' @param folder character. The folder in which the files are to be renamed.
+#' @param py See `help(pinyin)`.
+#' @param dic See `help(pinyin)`.
 #'
 #' @return files with new names.
 #' @export
@@ -120,22 +120,27 @@ pylib <- function(method = c('quanpin', 'tone', 'toneless'),
 #' dir.create(mydir)
 #' file.create(paste0(mydir, '/test.txt'))
 #' file.rename2py(mydir)
-file.rename2py <- function(folder = 'py') {
+file.rename2py <- function(folder = 'py', py = NA, dic = c('zh', 'zh2')) {
   if (dir.exists(folder)) {
+    dic <- match.arg(dic)
+    if(class(py)!= 'environment')  py <- pylib(method = 'toneless',
+                                               only_first_letter = TRUE, dic = dic)
+
     oldname <- dir(folder, full.names = TRUE)
-    newname <- paste(folder, sapply(dir(folder), pinyin, method = 'toneless', sep = '', nonezh_replace = NULL, only_first_letter = TRUE), sep = '/')
+    newname <- paste(folder, sapply(dir(folder), pinyin,  sep = '', nonezh_replace = NULL, py = py), sep = '/')
     file.rename(oldname, newname)
   } else {message(paste('The directory', folder, 'does not exist!'))}
 
 }
 
-#############################################################
 #' Convert the Chinese headers of bookdown .Rmd files into Pinyin
 #'
 #' @param folder character. The folder in which the files are to be converted.
 #' @param remove_curly_bracket logical. Whether to remove existing curly brackets in the headers.
 #' @param nonezh_replace NULL or character. Define how to convert non-Chinese characters in mychar. NULL means 'let it be'.
 #' @param only_first_letter logical. Whether to keep only the first letter in pinyin.
+#' @param py See `help(pinyin)`.
+#' @param dic See `help(pinyin)`.
 #'
 #' @return new .Rmd files with Pinyin headers.
 #' @export
@@ -145,8 +150,18 @@ file.rename2py <- function(folder = 'py') {
 #' file.create(paste0(mydir, '/test.txt'))
 #' writeLines(text = '# test\n', paste0(mydir, '/test.txt'))
 #' bookdown2py(mydir)
-bookdown2py <- function(folder = 'py', remove_curly_bracket = TRUE, nonezh_replace = NULL, only_first_letter = TRUE) {
+bookdown2py <- function(folder = 'py',
+                        remove_curly_bracket = TRUE,
+                        nonezh_replace = NULL,
+                        only_first_letter = TRUE,
+                        py = NA,
+                        dic = c('zh', 'zh2')) {
   if (dir.exists(folder)) {
+    dic <- match.arg(dic)
+    if(class(py)!= 'environment')  py <- pylib(method = 'toneless',
+                                               only_first_letter = only_first_letter,
+                                               dic = dic)
+
     for (filename in dir(folder, full.names = TRUE)) {
       # filename <- dir(folder, full.names = TRUE)[1]
       file.copy(filename, to = paste0(filename, 'backup'))
@@ -157,7 +172,9 @@ bookdown2py <- function(folder = 'py', remove_curly_bracket = TRUE, nonezh_repla
       if (length(codeloc) > 0) headerloc <- headerloc[!sapply(headerloc, function(x) sum(x > codeloc[seq(1, length(codeloc), by = 2)] & x < codeloc[seq(2, length(codeloc), by = 2)])) == 1]
       if (remove_curly_bracket) md[headerloc] <- gsub(pattern = '\\{.*\\}', '', md[headerloc])
       for (i in headerloc){
-        headerpy <- pinyin(mychar = sub('^#* ', '', md[i]), method = 'toneless', sep = '', nonezh_replace = nonezh_replace, only_first_letter = only_first_letter)
+        headerpy <- pinyin(mychar = sub('^#* ', '', md[i]), py = py,
+                           sep = '',
+                           nonezh_replace = nonezh_replace)
         headerpy <- tolower(headerpy)
         headerpy <- gsub('[^a-z]', '_', headerpy)
         md[i] <- paste(md[i], ' {#', headerpy, '}', sep = '')
@@ -167,7 +184,6 @@ bookdown2py <- function(folder = 'py', remove_curly_bracket = TRUE, nonezh_repla
   } else {message(paste('The directory', folder, 'does not exist!'))}
 }
 
-#############################################################
 #' Convert entire files into Pinyin
 #'
 #' @param folder character. The folder in which the files are to be converted.
@@ -181,6 +197,8 @@ bookdown2py <- function(folder = 'py', remove_curly_bracket = TRUE, nonezh_repla
 #' @param only_first_letter logical. Whether to keep only the first letter in pinyin.
 #' @param multi logical. Whether to display multiple pronunciations of a Chinese character or only the first pronunciation.
 #' @param encoding character. The encoding of the input files. 'UTF-8' by default.
+#' @param py See `help(pinyin)`.
+#' @param dic See `help(pinyin)`.
 #'
 #' @return files converted to Pinyin.
 #' @export
@@ -190,9 +208,20 @@ bookdown2py <- function(folder = 'py', remove_curly_bracket = TRUE, nonezh_repla
 #' file.create(paste0(mydir, '/test.txt'))
 #' writeLines(text = 'test\n', paste0(mydir, '/test.txt'))
 #' file2py(mydir)
-file2py <- function(folder = 'py', backup = TRUE, method = c('quanpin', 'tone', 'toneless'), sep = ' ', nonezh_replace = NULL, only_first_letter = FALSE, multi = FALSE, encoding = 'UTF-8') {
+file2py <- function(folder = 'py',
+                    backup = TRUE,
+                    method = c('quanpin', 'tone', 'toneless'),
+                    sep = ' ',
+                    nonezh_replace = NULL,
+                    only_first_letter = FALSE,
+                    multi = FALSE,
+                    encoding = 'UTF-8',
+                    py = NA,
+                    dic = c('zh', 'zh2')) {
   if (dir.exists(folder)) {
     method <- match.arg(method)
+    dic <- match.arg(dic)
+    if(class(py)!= 'environment')  py <- pylib(method = method, multi = multi, only_first_letter = only_first_letter, dic = dic)
     i <- 0
     filedir <- dir(folder, full.names = TRUE)
     filenr <- length(filedir)
@@ -201,7 +230,7 @@ file2py <- function(folder = 'py', backup = TRUE, method = c('quanpin', 'tone',
       i <- i + 1
       if (backup) file.copy(filename, to = paste0(filename, 'backup'))
       oldfile <- readLines(filename, encoding = encoding)
-      newfile <- sapply(oldfile, pinyin, method = method, sep = sep, nonezh_replace = nonezh_replace, only_first_letter = only_first_letter, multi = multi)
+      newfile <- sapply(oldfile, pinyin, py = py, sep = sep, nonezh_replace = nonezh_replace)
       writeLines(text = newfile, filename, useBytes = TRUE)
       message(paste(filename, 'converted.',  i, '/', filenr))
     }

diff --git a/R/wubi.R b/R/wubi.R
@@ -0,0 +1,62 @@
+#' Wubi database
+#'
+#' A lookup table for the wubi code of Chinese characters
+#'
+#' @docType data
+#'
+#' @usage data(WBlib)
+#'
+#' @format An environment with Chinese characters as the names.
+#'
+#' @keywords datasets
+#'
+#'
+#' @source \href{https://github.com/erstern/98WuBi}{erstern/98WuBi}
+#'
+#' @examples
+#' data(WBlib)
+#'
+"WBlib"
+
+
+#' Convert Chinese strings to wubi code (based on radicals).
+#'
+#' @param Chin.strs Character vector of strings to be converted.
+#' @param sep Character used to separate the codes of different characters.
+#' @param parallel Whether or not to use parallel calculation.
+#'
+#' @return wubi code of \code{Chin.strs}. Characters that have no entry in
+#'   \code{WBlib} are kept unchanged.
+#' @importFrom utils data
+#' @export
+#' @examples
+#' data(WBlib)
+#' wubi()
+wubi <- function(Chin.strs = NA, sep = "_", parallel = FALSE){
+  # Vector-safe guard: `if (is.na(x))` fails when x does not have length 1.
+  if (length(Chin.strs) == 0 || all(is.na(Chin.strs))) {
+    return(print('Please give a valid string.'))
+  }
+  # Convert one string to wubi code
+  ChStr2wb <- function(Chin.str, WBlib){
+    # NOTE(review): this changes the process locale and never restores it;
+    # 'chs' looks like a Windows-specific locale name -- confirm it is needed.
+    Sys.setlocale(category = 'LC_ALL', locale = 'chs')
+    Chin.char <- unlist(strsplit(Chin.str, split = "")) # divide the string to characters
+
+    # convert a single character to wubi code; unknown characters pass through
+    ChChar2wb <- function(Chin.char){
+      ChCharwb <- WBlib[[Chin.char]]
+      if(is.null(ChCharwb)) ChCharwb <- Chin.char
+      return(ChCharwb)
+    }
+
+    paste(sapply(Chin.char, ChChar2wb), collapse = sep)
+  }
+
+  # Use parallel computing to convert strings if parallel is TRUE
+  if (parallel) {
+    # detectCores() may return NA or 1; always keep at least one worker.
+    no_cores <- max(1L, parallel::detectCores() - 1L, na.rm = TRUE)
+    cl <- parallel::makeCluster(no_cores)   # Initiate cluster
+    # Stop the workers even if the conversion fails part-way.
+    on.exit(parallel::stopCluster(cl), add = TRUE)
+    parallel::parSapply(cl, X = Chin.strs, FUN = ChStr2wb, WBlib)
+  } else {
+    sapply(Chin.strs, ChStr2wb, WBlib)
+  }
+}
diff --git a/data/WBlib.rda b/data/WBlib.rda
diff --git a/man/WBlib.Rd b/man/WBlib.Rd
diff --git a/man/bookdown2py.Rd b/man/bookdown2py.Rd
diff --git a/man/file.rename2py.Rd b/man/file.rename2py.Rd
diff --git a/man/file2py.Rd b/man/file2py.Rd
diff --git a/man/wubi.Rd b/man/wubi.Rd