diff --git a/data b/data
index a4c963e..a98cfe7 160000
--- a/data
+++ b/data
@@ -1 +1 @@
-Subproject commit a4c963e96e51453c33fef9a90ac0f0b247614d13
+Subproject commit a98cfe7bf22330abe0a4b4cf8af49e5682f35d26
diff --git a/demos/application/LanguageClassifier.py b/demos/application/LanguageClassifier.py
new file mode 100644
index 0000000..4700df1
--- /dev/null
+++ b/demos/application/LanguageClassifier.py
@@ -0,0 +1,67 @@
+from modshogun import MulticlassLibLinear
+from numpy import array
+
+import json
+import sys
+import gzip as gz
+import pickle as pkl
+
+default_filepath = "data/lang_detection/default.svm.gz"
+
+id_to_lang = {0 : "English", 1 : "Greek", 2 : "German",
+              3 : "Spanish", 4 : "Italian"}
+
+class LanguageClassifier:
+    def __init__(self):
+        self.svm = None
+
+    def load_classifier(self):
+        gz_stream = gz.open(default_filepath, 'rb')
+        self.svm = pkl.load(gz_stream)
+        gz_stream.close()
+
+    def load_svm(self, filepath):
+        from modshogun import SerializableAsciiFile
+
+        print("Attempting to load a multiclass liblinear svm from \"" +
+              filepath + "\"")
+        self.svm = MulticlassLibLinear()
+        loader = SerializableAsciiFile(filepath, "r")
+        self.svm.load_serializable(loader)
+        print("SVM successfully loaded")
+
+
+    def classify_doc(self, doc):
+        from modshogun import StringCharFeatures, RAWBYTE
+        from modshogun import HashedDocDotFeatures
+        from modshogun import NGramTokenizer
+        from modshogun import MulticlassLabels
+
+        docs = [doc]
+        string_feats = StringCharFeatures(docs, RAWBYTE)
+        tokenizer = NGramTokenizer(4)
+        normalize = True
+        num_bits = 18
+
+        hashed_doc_feats = HashedDocDotFeatures(num_bits, string_feats,
+                                                tokenizer, normalize, 3, 2)
+
+        labels = self.svm.apply(hashed_doc_feats).get_labels()
+
+        return id_to_lang[labels[0]]
+
+if __name__ == '__main__':
+    lc = LanguageClassifier()
+    if len(sys.argv) == 1:
+        lc.load_classifier()
+    else:
+        lc.load_svm(sys.argv[1])
+
+    while True:
+        print("Enter a sentence to classify or type \"!quit\" to quit")
+        sentence = raw_input()
+        if sentence == '!quit':
+            break
+
+        lang = lc.classify_doc(sentence)
+        print("Your sentence \"" + sentence + "\" was classified as: " + lang)
diff --git a/demos/application/__init__.py b/demos/application/__init__.py
index 89f047e..46f7e7f 100644
--- a/demos/application/__init__.py
+++ b/demos/application/__init__.py
@@ -1,4 +1,8 @@
 from Ai import Ai
+from LanguageClassifier import LanguageClassifier
 
 ai = Ai()
 ai.load_classifier()
+
+lc = LanguageClassifier()
+lc.load_classifier()
diff --git a/demos/application/lang_detect.py b/demos/application/lang_detect.py
new file mode 100644
index 0000000..b030e5d
--- /dev/null
+++ b/demos/application/lang_detect.py
@@ -0,0 +1,32 @@
+from django.http import HttpResponse, Http404
+from django.template import RequestContext
+from django.shortcuts import render_to_response
+import json
+from application import lc  # classifier instance created in application/__init__.py
+
+def handler(request):
+    if request.method == 'GET':
+        return entrance(request)
+    else:
+        return recognize(request)
+
+def entrance(request):
+    properties = { 'title' : 'Language Detection Demo' }
+    #'template': {'type': 'drawing'},
+    #'panels': [
+    #    {
+    #        'panel_name': 'preview',
+    #        'panel_label': 'Preview'}]}
+    return render_to_response("application/lang_detect.html",
+                              properties,
+                              context_instance = RequestContext(request))
+
+def recognize(request):
+    try:
+        text = json.loads(request.POST['text'])
+        lang = lc.classify_doc(text)
+        return HttpResponse(json.dumps({'predict': lang}))
+    except:
+        import traceback
+        print traceback.format_exc()
+        raise Http404
diff --git a/shogun_demo/urls.py b/shogun_demo/urls.py
index 5ff07d5..05fa037 100644
--- a/shogun_demo/urls.py
+++ b/shogun_demo/urls.py
@@ -15,6 +15,7 @@
     url(r'^dimred/tapkee/promoters.json', 'dimred.tapkee.promoters'),
     url(r'^application/ocr/', 'application.ocr.handler'),
+    url(r'^application/ld/', 'application.lang_detect.handler'),
     url(r'^misc/kernel_matrix/', 'misc.kernel_matrix.handler'),
     url(r'^misc/tree/', 'misc.tree.handlers'),
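
The patch only loads a pre-trained model from data/lang_detection/default.svm.gz; for reference, the sketch below shows how such a model could be trained and pickled with the same hashed 4-gram feature setup that classify_doc uses. The training documents, label values, C constant and output path are illustrative placeholders, not part of this change:

import gzip as gz
import pickle as pkl
from numpy import array
from modshogun import StringCharFeatures, RAWBYTE, NGramTokenizer
from modshogun import HashedDocDotFeatures, MulticlassLabels, MulticlassLibLinear

# Placeholder corpus; real training data would hold many documents per language.
documents = ["first placeholder document", "second placeholder document"]
label_values = array([0.0, 1.0])  # class ids follow id_to_lang (0=English, 1=Greek, ...)

# Same feature pipeline as LanguageClassifier.classify_doc: raw bytes -> 4-grams
# hashed into 2^18 dimensions.
string_feats = StringCharFeatures(documents, RAWBYTE)
feats = HashedDocDotFeatures(18, string_feats, NGramTokenizer(4), True, 3, 2)
labels = MulticlassLabels(label_values)

svm = MulticlassLibLinear(0.1, feats, labels)  # C=0.1 is an arbitrary choice
svm.train()

# Write the trained machine the way load_classifier() expects to read it back.
out = gz.open("default.svm.gz", "wb")
pkl.dump(svm, out)
out.close()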