Skip to content

Commit

Permalink
convert Chinese characters into Wubi codes
Browse files Browse the repository at this point in the history
  • Loading branch information
pzhaonet committed Oct 12, 2018
1 parent cbdd753 commit 59aaf6a
Show file tree
Hide file tree
Showing 15 changed files with 201 additions and 22 deletions.
6 changes: 3 additions & 3 deletions pinyin.Rproj → -pinyin.Rproj
@@ -1,8 +1,8 @@
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default
RestoreWorkspace: No
SaveWorkspace: No
AlwaysSaveHistory: No

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
Expand Down
9 changes: 7 additions & 2 deletions .Rbuildignore
@@ -1,3 +1,8 @@
^.*\.Rproj$
LICENCE
^\.Rproj\.user$
readme*.*
\.Rproj$
^\.gitignore$
^\.travis\.yml$
showcase/
vignettes/
readme*.*
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,3 +1,6 @@
LICENSE
.Rhistory

.Rproj.user/
pinyin.Rproj
.Rbuildignore
Expand Down
5 changes: 3 additions & 2 deletions DESCRIPTION
@@ -1,6 +1,6 @@
Package: pinyin
Version: 1.1.3
Date: 2018-09-10
Date: 2018-10-10
Title: Convert Chinese Characters into Pinyin
Author: Peng Zhao
Authors@R: c(
Expand All @@ -10,7 +10,8 @@ Maintainer: Peng Zhao <pzhao@pzhao.net>
Depends: R (>= 3.1.0)
Imports:
splitstackshape,
data.table
data.table,
parallel
Suggests:
Description: Convert Chinese characters into Pinyin (the official romanization system for Standard Chinese in mainland China, Malaysia, Singapore, and Taiwan. See <https://en.wikipedia.org/wiki/Pinyin> for details).
License: MIT + file LICENSE
Expand Down
21 changes: 21 additions & 0 deletions LICENCE
@@ -0,0 +1,21 @@
The MIT License

Copyright (c) 2018 Peng Zhao, https://pzhao.org

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
2 changes: 0 additions & 2 deletions LICENSE

This file was deleted.

2 changes: 2 additions & 0 deletions NAMESPACE
Expand Up @@ -7,6 +7,8 @@ export(file2py)
export(four_corner)
export(pinyin)
export(pylib)
export(wubi)
importFrom(data.table,as.data.table)
importFrom(splitstackshape,cSplit)
importFrom(stats,setNames)
importFrom(utils,data)
49 changes: 39 additions & 10 deletions R/pinyin.R
Expand Up @@ -35,7 +35,6 @@ pinyin <- function(mychar = '',
return(pinyin)
}

#############################################################
#' A Pinyin library
#'
#' @param method character. The value can be:
Expand Down Expand Up @@ -108,10 +107,11 @@ pylib <- function(method = c('quanpin', 'tone', 'toneless'),
}


#############################################################
#' Rename files with Chinese characters to pinyin
#'
#' @param folder character. The folder in which the files are to be renamed.
#' @param py See `help(pinyin)`.
#' @param dic See `help(pinyin)`.
#'
#' @return files with new names.
#' @export
Expand All @@ -120,22 +120,27 @@ pylib <- function(method = c('quanpin', 'tone', 'toneless'),
#' dir.create(mydir)
#' file.create(paste0(mydir, '/test.txt'))
#' file.rename2py(mydir)
file.rename2py <- function(folder = 'py') {
file.rename2py <- function(folder = 'py', py = NA, dic = c('zh', 'zh2')) {
if (dir.exists(folder)) {
dic <- match.arg(dic)
if(class(py)!= 'environment') py <- pylib(method = 'toneless',
only_first_letter = TRUE, dic = dic)

oldname <- dir(folder, full.names = TRUE)
newname <- paste(folder, sapply(dir(folder), pinyin, method = 'toneless', sep = '', nonezh_replace = NULL, only_first_letter = TRUE), sep = '/')
newname <- paste(folder, sapply(dir(folder), pinyin, sep = '', nonezh_replace = NULL, py = py), sep = '/')
file.rename(oldname, newname)
} else {message(paste('The directory', folder, 'does not exist!'))}

}

#############################################################
#' Convert the Chinese headers of bookdown .Rmd files into Pinyin
#'
#' @param folder character. The folder in which the files are to be converted.
#' @param remove_curly_bracket logical. Whether to remove existing curly brackets in the headers.
#' @param nonezh_replace NULL or character. Define how to convert non-Chinese characters in mychar. NULL means 'let it be'.
#' @param only_first_letter logical. Whether to use only the first letter of the pinyin.
#' @param py See `help(pinyin)`.
#' @param dic See `help(pinyin)`.
#'
#' @return new .Rmd files with Pinyin headers.
#' @export
Expand All @@ -145,8 +150,18 @@ file.rename2py <- function(folder = 'py') {
#' file.create(paste0(mydir, '/test.txt'))
#' writeLines(text = '# test\n', paste0(mydir, '/test.txt'))
#' bookdown2py(mydir)
bookdown2py <- function(folder = 'py', remove_curly_bracket = TRUE, nonezh_replace = NULL, only_first_letter = TRUE) {
bookdown2py <- function(folder = 'py',
remove_curly_bracket = TRUE,
nonezh_replace = NULL,
only_first_letter = TRUE,
py = NA,
dic = c('zh', 'zh2')) {
if (dir.exists(folder)) {
dic <- match.arg(dic)
if(class(py)!= 'environment') py <- pylib(method = 'toneless',
only_first_letter = only_first_letter,
dic = dic)

for (filename in dir(folder, full.names = TRUE)) {
# filename <- dir(folder, full.names = TRUE)[1]
file.copy(filename, to = paste0(filename, 'backup'))
Expand All @@ -157,7 +172,9 @@ bookdown2py <- function(folder = 'py', remove_curly_bracket = TRUE, nonezh_repla
if (length(codeloc) > 0) headerloc <- headerloc[!sapply(headerloc, function(x) sum(x > codeloc[seq(1, length(codeloc), by = 2)] & x < codeloc[seq(2, length(codeloc), by = 2)])) == 1]
if (remove_curly_bracket) md[headerloc] <- gsub(pattern = '\\{.*\\}', '', md[headerloc])
for (i in headerloc){
headerpy <- pinyin(mychar = sub('^#* ', '', md[i]), method = 'toneless', sep = '', nonezh_replace = nonezh_replace, only_first_letter = only_first_letter)
headerpy <- pinyin(mychar = sub('^#* ', '', md[i]), py = py,
sep = '',
nonezh_replace = nonezh_replace)
headerpy <- tolower(headerpy)
headerpy <- gsub('[^a-z]', '_', headerpy)
md[i] <- paste(md[i], ' {#', headerpy, '}', sep = '')
Expand All @@ -167,7 +184,6 @@ bookdown2py <- function(folder = 'py', remove_curly_bracket = TRUE, nonezh_repla
} else {message(paste('The directory', folder, 'does not exist!'))}
}

#############################################################
#' Convert entire files into Pinyin
#'
#' @param folder character. The folder in which the files are to be converted.
Expand All @@ -181,6 +197,8 @@ bookdown2py <- function(folder = 'py', remove_curly_bracket = TRUE, nonezh_repla
#' @param only_first_letter logical. Whether to use only the first letter of the pinyin.
#' @param multi logical. Whether to display multiple pronunciations of a Chinese character or only the first pronunciation.
#' @param encoding character. The encoding of the input files. 'UTF-8' by default.
#' @param py See `help(pinyin)`.
#' @param dic See `help(pinyin)`.
#'
#' @return files converted to Pinyin.
#' @export
Expand All @@ -190,9 +208,20 @@ bookdown2py <- function(folder = 'py', remove_curly_bracket = TRUE, nonezh_repla
#' file.create(paste0(mydir, '/test.txt'))
#' writeLines(text = 'test\n', paste0(mydir, '/test.txt'))
#' file2py(mydir)
file2py <- function(folder = 'py', backup = TRUE, method = c('quanpin', 'tone', 'toneless'), sep = ' ', nonezh_replace = NULL, only_first_letter = FALSE, multi = FALSE, encoding = 'UTF-8') {
file2py <- function(folder = 'py',
backup = TRUE,
method = c('quanpin', 'tone', 'toneless'),
sep = ' ',
nonezh_replace = NULL,
only_first_letter = FALSE,
multi = FALSE,
encoding = 'UTF-8',
py = NA,
dic = c('zh', 'zh2')) {
if (dir.exists(folder)) {
method <- match.arg(method)
dic <- match.arg(dic)
if(class(py)!= 'environment') py <- pylib(method = method, multi = multi, only_first_letter = only_first_letter, dic = dic)
i <- 0
filedir <- dir(folder, full.names = TRUE)
filenr <- length(filedir)
Expand All @@ -201,7 +230,7 @@ file2py <- function(folder = 'py', backup = TRUE, method = c('quanpin', 'tone',
i <- i + 1
if (backup) file.copy(filename, to = paste0(filename, 'backup'))
oldfile <- readLines(filename, encoding = encoding)
newfile <- sapply(oldfile, pinyin, method = method, sep = sep, nonezh_replace = nonezh_replace, only_first_letter = only_first_letter, multi = multi)
newfile <- sapply(oldfile, pinyin, py = py, sep = sep, nonezh_replace = nonezh_replace)
writeLines(text = newfile, filename, useBytes = TRUE)
message(paste(filename, 'converted.', i, '/', filenr))
}
Expand Down
62 changes: 62 additions & 0 deletions R/wubi.R
@@ -0,0 +1,62 @@
#' Wubi database
#'
#' A lookup table for the wubi code of Chinese characters
#'
#' @docType data
#'
#' @usage data(WBlib)
#'
#' @format An environment with Chinese characters as the names.
#'
#' @keywords datasets
#'
#'
#' @source \href{https://github.com/erstern/98WuBi}{erstern/98WuBi}
#'
#' @examples
#' data(WBlib)
#'
"WBlib"


#' Convert Chinese strings to wubi code (based on radicals).
#'
#' Every string in \code{Chin.strs} is split into single characters, each
#' character is looked up in the \code{WBlib} table, and the resulting codes
#' are joined with \code{sep}. Characters without an entry in \code{WBlib}
#' are kept unchanged.
#'
#' @param Chin.strs character. One or more strings to be converted.
#' @param sep character. The separator placed between the codes of
#'   neighbouring characters.
#' @param parallel logical. Whether to convert the strings on a local cluster
#'   created with the \pkg{parallel} package.
#'
#' @return The wubi codes of \code{Chin.strs}, one element per input string.
#'   If \code{Chin.strs} is empty or consists only of \code{NA}, a hint is
#'   printed and returned instead.
#' @importFrom utils data
#' @export
#' @examples
#' data(WBlib)
#' wubi()
wubi <- function(Chin.strs = NA, sep = "_", parallel = FALSE){
  # is.na() is vectorised, so it has to be reduced with all() before it can be
  # used as an if() condition; zero-length input is rejected the same way.
  if (length(Chin.strs) == 0 || all(is.na(Chin.strs))) {
    return(print('Please give a valid string.'))
  }

  # Convert one string to wubi code
  ChStr2wb <- function(Chin.str, WBlib){
    # NOTE(review): this changes the locale of the whole R process and is not
    # restored afterwards. 'chs' looks like a Windows-style locale name; on
    # other platforms the call presumably only raises a warning -- confirm
    # whether it is still required.
    Sys.setlocale(category = 'LC_ALL', locale = 'chs')
    # divide the string into single characters
    Chin.char <- unlist(strsplit(Chin.str, split = ""))

    # convert a single character to wubi code; characters that are not in
    # the table are passed through unchanged
    ChChar2wb <- function(Chin.char){
      ChCharwb <- WBlib[[Chin.char]]
      if (is.null(ChCharwb)) ChCharwb <- Chin.char
      ChCharwb
    }

    paste(sapply(Chin.char, ChChar2wb), collapse = sep)
  }

  # Use parallel computing to convert strings if parallel is TRUE
  if (parallel) {
    # Leave one core free, but never ask for fewer than one worker:
    # detectCores() may return 1 or NA.
    no_cores <- max(1L, parallel::detectCores() - 1L, na.rm = TRUE)
    cl <- parallel::makeCluster(no_cores) # Initiate cluster
    # Shut the workers down even if the conversion fails.
    on.exit(parallel::stopCluster(cl), add = TRUE)
    wbcode <- parallel::parSapply(cl, X = Chin.strs, FUN = ChStr2wb, WBlib)
    return(wbcode)
  }

  sapply(Chin.strs, ChStr2wb, WBlib)
}
Binary file added data/WBlib.rda
Binary file not shown.
21 changes: 21 additions & 0 deletions man/WBlib.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 5 additions & 1 deletion man/bookdown2py.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 5 additions & 1 deletion man/file.rename2py.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 5 additions & 1 deletion man/file2py.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

25 changes: 25 additions & 0 deletions man/wubi.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 59aaf6a

Please sign in to comment.