From 8fe5bb6a8581581fdcad57a04911957528d4cb3c Mon Sep 17 00:00:00 2001
From: Santhosh Thottingal <santhosh.thottingal@gmail.com>
Date: Sat, 6 Oct 2012 18:25:09 +0530
Subject: [PATCH] Introduce optional language selector for spellcheck

Also includes assorted Python cleanup.
---
 .../modules/spellchecker/spellchecker.html    | 237 ++++++++++--------
 .../modules/spellchecker/spellchecker.py      | 137 +++++-----
 2 files changed, 199 insertions(+), 175 deletions(-)
diff --git a/src/silpa/modules/spellchecker/spellchecker.html b/src/silpa/modules/spellchecker/spellchecker.html
index 50850ef..681b65d 100644
--- a/src/silpa/modules/spellchecker/spellchecker.html
+++ b/src/silpa/modules/spellchecker/spellchecker.html
@@ -1,112 +1,137 @@
 <html>
-  <head>
-    <title></title>
-    <script type="text/javascript">
-    function docheck(form)
-    {
-    $('#progress') .html("Checking. Please Wait..");
-    $('#result').html("");
-    var word= form.word.value;
-    var jsonRequest = {
-        "method" :  "modules.Spellchecker.check",
-         "params" : [word], 
-         "id" : ""
-    };
-    $.ajax({
-        type: "POST",
-        contentType: "application/json; charset=utf-8",
-        url: "JSONRPC",
-        data: $.toJSON(jsonRequest), 
-        dataType: "json",
-        success: function(msg) {
-          var resultobj =  msg.result;
-          $('#progress').html("");
-          if(resultobj)   {
-                 $('#result').html("<b>  Correct Spelling.</b>"); 
-          }
-          else {
-                  $('#result').html("<b> Wrong Spelling</b>");
-                  getSuggestions(word);
-          }
-        },
-        error: function(msg) { alert(msg); }
-    });
-    return false;
-    }
-    function getSuggestions(word)
-    {
-    $('#progress').html("Fetching suggestions. Please wait...");
-    $('#errormessage').html("");
-    var jsonRequest = {
-        "method" :  "modules.Spellchecker.suggest",
-         "params" : [word], 
-         "id" : ""
-    };
-    $.ajax({
-        type: "POST",
-        contentType: "application/json; charset=utf-8",
-        url: "JSONRPC",
-        data: $.toJSON(jsonRequest), 
-        dataType: "json",
-        success: function(msg) {
-          var resultobj=eval(msg.result);
-          $('#progress').html("");
-          $('#result').html("<b>Wrong Spelling. Suggestions :</b><ul id='list'></ul>");
-	  if(resultobj.length != 0)
-	  {
-            $.each(resultobj,function(index, item)
-            {
-               $('#list').append($( "<li>" + item +"</li>" ));
-            });
-	  }
-	  else {
-	  	$('#list').append($( " <li> No suggestions available </li>" ));
-	  }
-        },
-        error: function(msg) { alert(msg); }
-    });
-    return false;
-    }
-    </script>
-  </head>
-  <body>
-        <h2>Spellchecker</h2></hr>
-        <p>This is a demo of spellcheck service of silpa. In this page, you can enter a word for checking the spelling and just get the result.
-        </p>
-        <p>For using Silpa spellcheck service in your web or desktop applications, read the <a href="apis.html#spellcheck">API documentation</a>. This page itself is an example for using silpa spellcheck APIs in a webpage.
-        </p>
-        <form action="" method="post" onsubmit="return docheck(this); " >
-        <p align="center">
-        Word : <input type="text"  id="word" style="width:12em;"/>
-        <input  type="submit" id="spellcheck" value="Spellcheck"  style="width:12em;"/>
-        </br>
-        </p>
-        </form>
-        <hr/>
-        <div id="progress"></div>
-        <div id="successmessage"></div>
-        <div id="errormessage"></div>
-        <div id="result"></div>
-<h3>Python Spell Check and Spell Suggest  API</h3>
-<ul>
-<li>Method: modules.Spellchecker.check
+<head>
+<title></title>
+<script type="text/javascript">
+	// Spell-check the entered word over JSON-RPC; on a miss, go on to fetch suggestions.
+	function docheck ( form ) {
+		$( '#progress' ).html( "Checking. Please Wait.." );
+		$( '#result' ).html( "" );
+		var word = form.word.value;
+		var language = form.lang.value || null; // "" = Auto detect: send null so the server detects
+		var jsonRequest = {
+			"method": "modules.Spellchecker.check",
+			"params": [ word, language ],
+			"id": ""
+		};
+		$.ajax( {
+			type: "POST",
+			contentType: "application/json; charset=utf-8",
+			url: "JSONRPC",
+			data: $.toJSON( jsonRequest ),
+			dataType: "json",
+			success: function ( msg ) {
+				$( '#progress' ).html( "" );
+				if ( msg.result ) {
+					$( '#result' ).html( "<b>  Correct Spelling.</b>" );
+				} else {
+					$( '#result' ).html( "<b> Wrong Spelling</b>" );
+					getSuggestions( word );
+				}
+			},
+			error: function ( xhr, ajaxOptions, thrownError ) {
+				$( '#progress' ).html( "" ); // do not leave the "Checking" banner up after a failure
+				alert( thrownError );
+			}
+		} );
+		return false;
+	}
+	// Fetch suggestions for a misspelled word and list them under #result.
+	function getSuggestions ( word ) {
+		$( '#progress' ).html( "Fetching suggestions. Please wait..." );
+		$( '#errormessage' ).html( "" );
+		var language = $( '#lang' ).val() || null; // no form in scope here; "" = Auto detect
+		var jsonRequest = {
+			"method": "modules.Spellchecker.suggest",
+			"params": [ word, language ],
+			"id": ""
+		};
+		$.ajax( {
+			type: "POST",
+			contentType: "application/json; charset=utf-8",
+			url: "JSONRPC",
+			data: $.toJSON( jsonRequest ),
+			dataType: "json",
+			success: function ( msg ) {
+				// The reply is already parsed JSON: never eval() it. Normalise null/string to an array.
+				var resultobj = $.makeArray( msg.result );
+				var items = resultobj.length ? resultobj : [ "No suggestions available" ];
+				$( '#progress' ).html( "" );
+				$( '#result' ).html( "<b>Wrong Spelling. Suggestions :</b><ul id='list'></ul>" );
+				$.each( items, function ( index, item ) {
+					$( '#list' ).append( $( "<li></li>" ).text( item ) ); // text() escapes markup
+				} );
+			},
+			error: function ( xhr, ajaxOptions, thrownError ) {
+				$( '#progress' ).html( "" );
+				alert( thrownError );
+			}
+		} );
+		return false;
+	}
+</script>
+</head>
+<body>
+	<h2>Spellchecker</h2>
+	<hr />
+	<p>This is a demo of spellcheck service of silpa. In this page, you
+		can enter a word for checking the spelling and just get the result.</p>
+	<p>
+		For using Silpa spellcheck service in your web or desktop
+		applications, read the <a href="apis.html#spellcheck">API
+			documentation</a>. This page itself is an example for using silpa
+		spellcheck APIs in a webpage.
+	</p>
+	<form action="" method="post" onsubmit="return docheck(this); ">
+		<p align="center">
+			<span>Word : </span> <input type="text" id="word"
+				style="width: 12em;" /> <select id="lang" name="lang"
+				style="width: 12em;">
+				<option value=''>Auto detect</option>
+				<option value="hi_IN">Hindi</option>
+				<option value="mr_IN">Marathi</option>
+				<option value="ml_IN">Malayalam</option>
+				<option value="bn_IN">Bengali</option>
+				<option value="ta_IN">Tamil</option>
+				<option value="te_IN">Telugu</option>
+				<option value="or_IN">Oriya</option>
+				<option value="gu_IN">Gujarati</option>
+				<option value="pa_IN">Punjabi</option>
+				<option value="kn_IN">Kannada</option>
+				<option value="en_US">English</option>
+				<option value="ISO15919">ISO 15919:2001</option>
+				<option value="IPA">International Phonetic Alphabet (IPA)</option>
+			</select> <input type="submit" id="spellcheck" value="Spellcheck"
+				style="width: 12em;" /> <br />
+		</p>
+	</form>
+	<hr />
+	<div id="progress"></div>
+	<div id="successmessage"></div>
+	<div id="errormessage"></div>
+	<div id="result"></div>
+	<h3>Python Spell Check and Spell Suggest API</h3>
+	<ul>
+		<li>Method: modules.Spellchecker.check
 
-<ul>
-	<li>arg1 : the word</li>
-	<li>arg2 : the language for the word(optional)</li>
-	<li>Return : True or False. True means the word is with correct spelling. Otherwise false.</li>
-</ul></li>
+			<ul>
+				<li>arg1 : the word</li>
+				<li>arg2 : the language for the word(optional)</li>
+				<li>Return : True or False. True means the word is with correct
+					spelling. Otherwise false.</li>
+			</ul>
+		</li>
 
 
-<li>Method: modules.Spellchecker.suggest
-<ul>
-	<li>arg1 : the word</li>
-	<li>arg2 : the language for the word(optional)</li>
-	<li>Return : List of string containing spelling suggestions</li>
-</ul></li>
-</ul>
-Sample usage is given below.
-<pre class="code">
+		<li>Method: modules.Spellchecker.suggest
+			<ul>
+				<li>arg1 : the word</li>
+				<li>arg2 : the language for the word(optional)</li>
+				<li>Return : List of string containing spelling suggestions</li>
+			</ul>
+		</li>
+	</ul>
+	Sample usage is given below.
+	<pre class="code">
 # -*- coding: utf-8 -*-
 >>>from jsonrpc import ServiceProxy
 >>>silpaService = ServiceProxy("http://smc.org.in/silpa/JSONRPC")
@@ -117,6 +142,6 @@ <h3>Python Spell Check and Spell Suggest  API</h3>
 >>>print silpaService.modules.Spellchecker.suggest("speling")
 ["spelling","spieling","spewing"]
 </pre>
-  </body>
+</body>
 </html>
 
diff --git a/src/silpa/modules/spellchecker/spellchecker.py b/src/silpa/modules/spellchecker/spellchecker.py
index 6d8a68f..6021d1f 100644
--- a/src/silpa/modules/spellchecker/spellchecker.py
+++ b/src/silpa/modules/spellchecker/spellchecker.py
@@ -2,26 +2,26 @@
 # Spellchecker
 # Copyright 2008-2010 Santhosh Thottingal <santhosh.thottingal@gmail.com>
 # http://www.smc.org.in
-#
+# 
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
 # the Free Software Foundation; either version 3 of the License, or
 # (at your option) any later version.
-#
+# 
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU Lesser General Public License for more details.
-#
+# 
 # You should have received a copy of the GNU General Public License
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-#
+# 
 import sys
 import os
 import string
 import codecs
-from common import SilpaModule,ServiceMethod
+from common import SilpaModule, ServiceMethod
 from utils import detect_lang, silpautils, silpalogger
 from modules.inexactsearch import inexactsearch
 from indexer import DictionaryIndex
@@ -29,43 +29,42 @@
 import urllib
 
 class Spellchecker(SilpaModule):
-    
+
     def __init__(self):
-        self.template=os.path.join(os.path.dirname(__file__), 'spellchecker.html')
+        self.template = os.path.join(os.path.dirname(__file__), 'spellchecker.html')
         self.NWORDS = None
         self.lang = None
         self.dictionaries = {}
         self.response = SilpaResponse(self.template)
-        
-    def words(self,text): 
-        #no need to check for punctuation since we are loading a proof read wordlist
-        #for punct in string.punctuation:
+
+    def words(self, text):
+        # no need to check for punctuation since we are loading a proof read wordlist
+        # for punct in string.punctuation:
         #    text = text.replace(punct,"")
         words = text.split()
         return set(words)
 
-    def train(self,features=None):
+    def train(self, features=None):
         if not self.dictionaries.has_key(self.lang):
             index = DictionaryIndex()
-            self.dictionaries[self.lang] = index.load_index(self.lang+".dic")
-    
-    
-    def get_wordlist(self,word=""):
-        index = self.dictionaries.get(self.lang,None)
+            self.dictionaries[self.lang] = index.load_index(self.lang + ".dic")
+
+    def get_wordlist(self, word=""):
+        index = self.dictionaries.get(self.lang, None)
         if index == None:
             self.train()
-            index = self.dictionaries.get(self.lang,None)
+            index = self.dictionaries.get(self.lang, None)
 
         words = []
         if word == "":
             return words
 
-        byte_offset = index.get(word[0],None)
+        byte_offset = index.get(word[0], None)
         if byte_offset == None:
             return words
 
-        path = os.path.join(os.path.dirname(__file__),"dicts/"+self.lang+".dic")
-        fp = codecs.open(path,"r",encoding="utf-8",errors="ignore")
+        path = os.path.join(os.path.dirname(__file__), "dicts/" + self.lang + ".dic")
+        fp = codecs.open(path, "r", encoding="utf-8", errors="ignore")
         fp.seek(int(byte_offset))
 
         while True:
@@ -76,8 +75,8 @@ def get_wordlist(self,word=""):
 
         return words
 
-        
-    def levenshtein(self,s1, s2):
+
+    def levenshtein(self, s1, s2):
         """
         Return the levenshtein distance between two string
         """
@@ -85,96 +84,96 @@ def levenshtein(self,s1, s2):
             return self.levenshtein(s2, s1)
         if not s1:
             return len(s2)
-        
+
         previous_row = xrange(len(s2) + 1)
         for i, c1 in enumerate(s1):
             current_row = [i + 1]
             for j, c2 in enumerate(s2):
                 insertions = previous_row[j + 1] + 1
                 deletions = current_row[j] + 1
-                substitutions = previous_row[j] + (c1 != c2)
+                substitutions = previous_row[j] + (1 if c1 != c2 else 0)  # parenthesised: if/else binds looser than +
                 current_row.append(min(insertions, deletions, substitutions))
             previous_row = current_row
         return previous_row[-1]
-    
+
     @ServiceMethod
-    def suggest(self,word, language=None, distance=2):
-        word=word.strip()
-        if word=="": 
+    def suggest(self, word, language=None, distance=2):
+        word = word.strip()
+        if word == "":
             return None
         if self.lang != language:
             self.NWORDS = None
-        if language==None :
+        if language == None :
             self.lang = detect_lang(word)[word]
         else :
             self.lang = language
         if self.NWORDS == None:
-            self.NWORDS = self.get_wordlist(word) 
+            self.NWORDS = self.get_wordlist(word)
         if word in self.NWORDS:
-            return word        
+            return word
         candidates = []
         for candidate in self.NWORDS:
-            #skip if the first letter is different
-            #if candidate[0] != word[0]:
+            # skip if the first letter is different
+            # if candidate[0] != word[0]:
             #    continue
-            #if the length difference is greater than the threshold distance, skip
-            if len(candidate) - len(word)  > distance or len(word) - len(candidate)  >    distance :
+            # if the length difference is greater than the threshold distance, skip
+            if len(candidate) - len(word) > distance or len(word) - len(candidate) > distance :
                 continue
             if not self.levenshtein(candidate, word) > distance :
                 candidates.append(candidate)
         candidates = self.filter_candidates(word, candidates)
-        if len(candidates)==0:
-            #try inserting spaces in between the letters to see if the word got merged
+        if len(candidates) == 0:
+            # try inserting spaces in between the letters to see if the word got merged
             pos = 2;
-            while pos < len(word)-2:
-                if self.check(word[:pos],self.lang) and self.check(word[pos:],self.lang):
-                    candidates.append(word[:pos]+" "+word[pos:])
-                    candidates.append(word[:pos]+"-"+word[pos:])
-                pos+=1    
+            while pos < len(word) - 2:
+                if self.check(word[:pos], self.lang) and self.check(word[pos:], self.lang):
+                    candidates.append(word[:pos] + " " + word[pos:])
+                    candidates.append(word[:pos] + "-" + word[pos:])
+                pos += 1
         return candidates
-        
+
     def filter_candidates(self, word, candidates):
-        filtered_candidates=[]
-        isearch = inexactsearch.getInstance() 
-        #TODO sort by score
+        filtered_candidates = []
+        isearch = inexactsearch.getInstance()
+        # TODO sort by score
         for candidate in candidates:
-            if isearch.compare(word,candidate) >= 0.6:  #if both words sounds alike - almost
+            if isearch.compare(word, candidate) >= 0.6:  # if both words sounds alike - almost
                 filtered_candidates.append(candidate)
         return filtered_candidates
 
-            
-    @ServiceMethod                  
+
+    @ServiceMethod
     def check(self, word, language=None):
-        word=word.strip()
-        if word == "": 
+        word = word.strip()
+        if word == "":
             return None
-        #If it is a number, don't do spelcheck
-        if silpautils.is_number(word): 
-            return True            
+        # If it is a number, don't do spelcheck
+        if silpautils.is_number(word):
+            return True
         if self.lang != language:
             self.NWORDS = None
         if language == None :
             self.lang = detect_lang(word)[word]
         else :
             self.lang = language
-        if word=="": return True
-        
-        if self.NWORDS == None: 
-            self.NWORDS = self.get_wordlist(word)  
-        if self.NWORDS == None:           
+        if word == "": return True
+
+        if self.NWORDS == None:
+            self.NWORDS = self.get_wordlist(word)
+        if self.NWORDS == None:
             # Dictionary not found
             return False
         result = word in self.NWORDS
-        #if it is english word, try converting the first letter to lower case.
-        #This will happen if the word is first word of a sentence
+        # if it is english word, try converting the first letter to lower case.
+        # This will happen if the word is first word of a sentence
         if result == False and word.upper() != word.lower():
-            newword = word[0].lower()+word[1:]
-            self.NWORDS = self.get_wordlist(newword)  
+            newword = word[0].lower() + word[1:]
+            self.NWORDS = self.get_wordlist(newword)
             return newword in self.NWORDS
         else:
-            return result    
-            
-    def strip_punctuations(self,s):
+            return result
+
+    def strip_punctuations(self, s):
         """
         Remove all the punctuation characters from the string and return the resulting string
         """
@@ -191,14 +190,14 @@ def check_batch(self, text, language=None):
        words = words.split()
        misspelled_words = []
        for word in words:
-           tempword = self.strip_punctuations(word) 
+           tempword = self.strip_punctuations(word)
            if not self.check(tempword, language):
                misspelled_words.append(word)
        return misspelled_words
 
     def get_module_name(self):
         return "Spellchecker"
-        
+
     def get_info(self):
         return "Indic Spellchecker"