Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #22 from van51/master
Added lang_detect backend for the demo
- Loading branch information
Showing 5 changed files with 103 additions and 1 deletion.
There are no files selected for viewing
Submodule data
updated
23 files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
from modshogun import MulticlassLibLinear | ||
from numpy import array | ||
|
||
import json | ||
import sys | ||
import gzip as gz | ||
import pickle as pkl | ||
|
||
# Path of the classifier file that LanguageClassifier.load_classifier() opens
# with gzip and unpickles.
default_filepath = "data/lang_detection/default.svm.gz"

# Maps the numeric label predicted by the classifier to a language name;
# the position in the list is the label id.
id_to_lang = dict(enumerate(
    ["English", "Greek", "German", "Spanish", "Italian"]))
|
||
class LanguageClassifier:
    """Detect the language of a document with a multiclass liblinear SVM.

    The model is stored in ``self.svm`` and stays ``None`` until either
    ``load_classifier`` or ``load_svm`` has been called.
    """

    def __init__(self):
        self.svm = None

    def load_classifier(self, filepath=None):
        """Load a gzip-compressed, pickled classifier into ``self.svm``.

        filepath -- path of the gzip file to read. When omitted (or None)
                    the module-level ``default_filepath`` is used, which is
                    what callers passing no argument have always got.
        """
        if filepath is None:
            filepath = default_filepath

        # NOTE(security): unpickling can execute arbitrary code. Only point
        # this at model files that come from a trusted source.
        # The ``with`` block closes the stream even when unpickling fails.
        with gz.open(filepath, 'rb') as gz_stream:
            self.svm = pkl.load(gz_stream)

    def load_svm(self, filepath):
        """Load a shogun ASCII-serialized MulticlassLibLinear into ``self.svm``.

        filepath -- path of the serialized model file.
        """
        from modshogun import SerializableAsciiFile

        print("Attempting to load a multiclass liblinear svm from \"" +
              filepath + "\"")
        self.svm = MulticlassLibLinear()
        loader = SerializableAsciiFile(filepath, "r")
        self.svm.load_serializable(loader)
        print("Svm succesfully loaded")

    def classify_doc(self, doc):
        """Return the language name predicted for the string ``doc``.

        The document is tokenized into n-grams, hashed into a feature vector
        and passed to the loaded model; the first predicted label is mapped
        to a name through ``id_to_lang``. A classifier must have been loaded
        beforehand.
        """
        from modshogun import StringCharFeatures, RAWBYTE
        from modshogun import HashedDocDotFeatures
        from modshogun import NGramTokenizer

        docs = [doc]
        string_feats = StringCharFeatures(docs, RAWBYTE)
        tokenizer = NGramTokenizer(4)
        normalize = True
        num_bits = 18

        # NOTE(review): the trailing 3 and 2 are presumably the n-gram and
        # skip counts -- confirm against the shogun API. They most likely
        # have to match the settings the model was trained with.
        hashed_doc_feats = HashedDocDotFeatures(num_bits, string_feats,
                                                tokenizer, normalize, 3, 2)

        labels = self.svm.apply(hashed_doc_feats).get_labels()

        # Make the key an explicit integer: id_to_lang is keyed by ints and
        # the label array may hold floating point values.
        return id_to_lang[int(labels[0])]
|
||
if __name__=='__main__':
    # Interactive demo: load a classifier, then classify sentences read from
    # standard input until the user quits.
    lc = LanguageClassifier()
    if len(sys.argv)==1:
        # No argument: use the bundled pickled classifier.
        lc.load_classifier()
    else:
        # First argument: path of a shogun ASCII-serialized svm.
        lc.load_svm(sys.argv[1])

    while True:
        print("Enter a sentence to classify or type \"!quit\" to quit")
        try:
            sentence = raw_input()
        except EOFError:
            # stdin was closed (Ctrl-D or end of piped input); leave the
            # loop the same way "!quit" does instead of crashing.
            break
        if sentence=='!quit':
            break

        lang = lc.classify_doc(sentence)
        print("Your sentence \"" + sentence +"\" was classified as : " + lang)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,8 @@ | ||
from Ai import Ai | ||
from LanguageClassifier import LanguageClassifier | ||
|
||
# Module-level Ai instance; its classifier is loaded once, at import time.
ai = Ai()
ai.load_classifier()

# Module-level language classifier; its default model is loaded at import
# time as well.
lc = LanguageClassifier()
lc.load_classifier()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
from django.http import HttpResponse, Http404 | ||
from django.template import RequestContext | ||
from django.shortcuts import render_to_response | ||
|
||
def handler(request):
    """Dispatch a request: GET goes to ``entrance``, anything else to ``recognize``."""
    view = entrance if request.method == 'GET' else recognize
    return view(request)
|
||
def entrance(request):
    """Render the landing page of the language detection demo."""
    # NOTE: 'template' and 'panels' entries were left commented out by the
    # original author; only the page title is handed to the template.
    page_properties = {'title': 'Language Detection Demo'}
    context = RequestContext(request)
    return render_to_response("application/lang_detect.html",
                              page_properties,
                              context_instance=context)
|
||
def recognize(request):
    """Classify the language of the posted text and answer with JSON.

    Reads the JSON-encoded ``text`` field of ``request.POST``, classifies it
    and returns an HttpResponse whose body is ``{"predict": <language>}``.

    Raises Http404 when anything goes wrong; the traceback is printed first
    so the cause is not lost.
    """
    # ``json`` is not imported at the top of this module, so without this
    # import every request failed with NameError and was turned into a 404.
    import json
    import traceback

    # NOTE(review): ``lc`` is not defined or imported in this module either.
    # It presumably refers to the LanguageClassifier instance created at
    # package import time -- add the matching import at the top of the file.
    try:
        text = json.loads(request.POST['text'])
        lang = lc.classify_doc(text)
        return HttpResponse(json.dumps({'predict': lang}))
    except Exception:
        # ``except Exception`` keeps SystemExit/KeyboardInterrupt propagating.
        # The single-argument print call is valid on Python 2 and 3.
        print(traceback.format_exc())
        raise Http404
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters