Permalink
Browse files

Unidecode. unidecode-chars.el still requires optimization

  • Loading branch information...
1 parent 5147e84 commit 55344fe1714da390df5403fd960d13d9b1576e8c @sindikat committed Jan 7, 2013
Showing with 86 additions and 68 deletions.
  1. +44 −0 README.org
  2. +0 −68 sanitize.el
  3. BIN sanitize-chars.el → unidecode-chars.el
  4. +42 −0 unidecode.el
View
@@ -0,0 +1,44 @@
+* How this package was made
+
+The following is the explanation of the process of converting the
+Python package to Emacs Lisp.
+
+The following Python script was used to export Unidecode data to JSON.
+
+#+BEGIN_SRC python
+import os, sys
+
+xs = os.listdir('.')
+xs.remove('__init__.py')
+xs.sort()
+
+final_data = []
+for filename in xs:
+ module = __import__(filename[:-3])
+ module_data = list(module.data)
+
+ # some modules have data of 255 entries, fill them up to 256
+ if len(module_data) < 256:
+ module_data += [''] * (256 - len(module_data))
+
+ final_data.extend(module_data)
+#+END_SRC
+
+The following command was used to dump =final_data= to a JSON file.
+
+#+BEGIN_SRC python
+import json
+
+with open('unidecode.json', 'w') as filename:
+ json.dump(final_data, filename)
+#+END_SRC
+
+The following command was used to load JSON data as Emacs Lisp vector,
+after installing =json= package in Emacs:
+
+#+BEGIN_SRC emacs-lisp
+(json-read-file "unidecode.json")
+#+END_SRC
+
+After that the resulting vector was just stored verbatim in
+"unidecode-chars.el".
View
@@ -1,68 +0,0 @@
-;;; sanitize.el --- convert Unicode text into safe ASCII strings
-;; Copyright (C) 2013 sindikat
-;;
-;; Author: sindikat <sindikat at mail36 dot net>
-;; Version: 0.1
-;;
-;; This file is not part of GNU Emacs.
-;;
-;; This file is put into public domain to the extend possible by law.
-;;
-;;; Commentary:
-
-;; This package is Python Unidecode implementation in Emacs Lisp.
-;;
-;; Python Unidecode can be found here:
-;; http://pypi.python.org/pypi/Unidecode/
-;;
-;; The following is the explanation of the process of converting the
-;; Python package to Emacs Lisp.
-;;
-;; The following Python script was used
-;; to export Unidecode data to JSON.
-
-;; import os, sys
-;;
-;; xs = os.listdir('.')
-;; xs.remove('__init__.py')
-;; xs.sort()
-;;
-;; final_data = []
-;; for filename in xs:
-;; module = __import__(filename[:-3])
-;; module_data = list(module.data)
-;;
-;; # some modules have data of 255 entries, fill them up to 256
-;; if len(module_data) < 256:
-;; module_data += [''] * (256 - len(module_data))
-;;
-;; final_data.extend(module_data)
-;;
-;; import json
-;;
-;; with open('unidecode.json', 'w') as filename:
-;; json.dump(final_data, filename)
-
-;; The following command was used to load JSON data as Emacs Lisp
-;; vector, after installing `json` package in Emacs:
-;;
-;; (json-read-file "unidecode.json")
-;;
-;; After that the resulting vector was just stored verbatim in
-;; "sanitize-chars.el".
-
-;;; Code:
-
-(setq sanitize/unidecode-chars (read (find-file-noselect "sanitize-chars.el")))
-
-(defun sanitize/unidecode (s)
- (apply #'concat (mapcar (lambda (ch) (elt sanitize/unidecode-chars ch)) s)))
-
-(defun sanitize/sanitize (s)
- "Strip all chars from string that are not alphanumeric or
-hyphen, convert space to hyphen"
- (let ((s (replace-regexp-in-string " " "-" (sanitize/unidecode (downcase s))))
- (valid "abcdefghijklmnopqrstuvwxyz1234567890-"))
- (remove-if-not (lambda (ch) (find ch valid)) s)))
-
-;;; sanitize.el ends here
File renamed without changes.
View
@@ -0,0 +1,42 @@
+;;; unidecode.el --- convert Unicode text into safe ASCII strings
+;; Copyright (C) 2013 sindikat
+;;
+;; Author: sindikat <sindikat at mail36 dot net>
+;; Version: 0.1
+;;
+;; This file is not part of GNU Emacs.
+;;
+;; This file is put into the public domain to the extent possible by law.
+;;
+;;; Commentary:
+
+;; Transliterate Unicode text into strings of plain ASCII characters.
+;; This package is an Emacs Lisp port of the Python Unidecode package.
+;;
+;; Python Unidecode can be found here:
+;; http://pypi.python.org/pypi/Unidecode/
+;;
+;; * TODO unidecode-chars.el should be heavily optimized, probably the
+;; same way as in Python Unidecode.
+;;
+;; More information in file README.org
+
+;;; Code:
+
+(defvar unidecode-chars
+  ;; Look for the data file next to this library instead of resolving
+  ;; the bare name against whatever `default-directory' happens to be
+  ;; current, and read it through a temporary buffer so that no
+  ;; file-visiting buffer is left behind and reading always starts at
+  ;; the beginning of the file.
+  (let ((data-file (expand-file-name
+                    "unidecode-chars.el"
+                    (file-name-directory
+                     (or load-file-name buffer-file-name default-directory)))))
+    (with-temp-buffer
+      (insert-file-contents data-file)
+      (goto-char (point-min))
+      (read (current-buffer))))
+  "Vector of ASCII replacement strings read from \"unidecode-chars.el\".
+The element at index N is the replacement used for the character
+whose code is N.")
+
+(defun unidecode-unidecode (s)
+  "Return string S with every character replaced by its ASCII transliteration.
+Each character code is used as an index into `unidecode-chars'.
+Characters whose code lies past the end of that table are dropped
+instead of signalling an `args-out-of-range' error."
+  (let ((table-size (length unidecode-chars)))
+    (mapconcat (lambda (ch)
+                 ;; Guard the lookup: `elt' signals an error for an
+                 ;; index outside the vector.
+                 (or (and (< ch table-size) (elt unidecode-chars ch))
+                     ""))
+               s "")))
+
+(defun unidecode-sanitize (s)
+  "Return a lowercase ASCII slug built from string S.
+S is lowercased and transliterated with `unidecode-unidecode', the
+result is lowercased again, spaces are converted to hyphens, and
+every character other than a-z, 0-9 and hyphen is removed."
+  (let ((case-fold-search nil)
+        ;; Lowercase a second time after transliterating, in case a
+        ;; replacement string in the table contains capital letters;
+        ;; they would otherwise be stripped by the filter below.
+        (ascii (downcase (unidecode-unidecode (downcase s)))))
+    ;; Plain regexp replacement avoids `remove-if-not' and `find',
+    ;; which belong to the `cl' library that this file never requires.
+    (replace-regexp-in-string
+     "[^a-z0-9-]" ""
+     (replace-regexp-in-string " " "-" ascii))))
+
+(provide 'unidecode)
+;;; unidecode.el ends here

0 comments on commit 55344fe

Please sign in to comment.