In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import numpy as np
import pandas as pd
from tests import en_test_inputs, fr_test_ins, it_test_ins, lv_test_ins
import re
%matplotlib inline

languages = [
    'sv', 'da', 'de', 'nl', 'en', 'fr', 'es', 'pt', 'it', 'ro', 'et',
    'fi','lt', 'lv', 'pl', 'sk', 'cs', 'sl', 'hu', 'bg',  'el'
]

files = [
    "train/europarl-v7.{lang}-en.{lang}".format(lang=x)
    for x in languages
]

corpus_raw = [
    open(x).read(1500000) for x in files
]

corpus = [
    re.sub(r'[?”_"%()!--+,:;./\]\[\xad\n0-9\=<>]', '', x) for x in corpus_raw
]

zeros = np.zeros(len(languages))

len(corpus)

21

In [None]:
count_vectorizer = CountVectorizer(ngram_range=(4, 4), analyzer='char_wb')
analyze = count_vectorizer.build_analyzer()
counts = count_vectorizer.fit_transform(corpus)

a = np.array(counts.todense().tolist())
a = a/a.sum(axis=0)
counts = np.cbrt(a)
transformed_weights = a/a.sum(axis=1)

#counts.data = counts.data ** 0.4
#tfidf_transformer = TfidfTransformer(smooth_idf=False, sublinear_tf=False, use_idf=False)
#transformed_weights = tfidf_transformer.fit_transform(counts)

three_gram_to_index = dict(zip(count_vectorizer.get_feature_names(), range(len(count_vectorizer.get_feature_names()))))

transformed_weights

In [115]:
def score(text, lang):
    weight_indexes = [three_gram_to_index.get(three_gram, -1) for three_gram in analyze(text)]
    weight_indexes = list(filter(lambda x: x != -1, weight_indexes))
    return np.sum(transformed_weights[lang,weight_indexes])

adjustment = np.asarray([
    score(corpus_raw[i], i)/len(analyze(corpus_raw[i]))
    for i,lang in enumerate(languages)
])


In [4]:
def scoresV2(text):
    weight_indexes_all = [three_gram_to_index.get(three_gram, -1) for three_gram in analyze(text)]
    weight_indexes_filtered = list(filter(lambda x: x != -1, weight_indexes_all))
    out = np.sum(transformed_weights[:, weight_indexes_filtered], axis=1).T/adjustment
    lang = languages[np.argmax(out)]
    return lang, out

In [116]:
right = 1
wrong = []
def run_tests():
    global right, wrong
    tests = open('europarl.test')
    for x in range(100):
        try:
            for y in range(3000):
                line = tests.readline()
                [lang, text] = line.split('\t')
                if scoresV2(text)[0] == lang:
                    right = right + 1
                else:
                    wrong.append((line, scoresV2(text), lang))
        except:
            print("Final er: {er}%".format(er=100*len(wrong)/(right+len(wrong))))
            break
        print(
            "right: {right}, wrong: {wrong}, er: {er}%".format(
                right=right, wrong=len(wrong),er = 100*len(wrong)/(right+len(wrong))
            )
        )
run_tests()

right: 2999, wrong: 2, er: 0.06664445184938354%
right: 5999, wrong: 2, er: 0.03332777870354941%
right: 8999, wrong: 2, er: 0.022219753360737695%
right: 11999, wrong: 2, er: 0.016665277893508874%
right: 14998, wrong: 3, er: 0.01999866675554963%
right: 17997, wrong: 4, er: 0.022220987722904283%
right: 20995, wrong: 6, er: 0.02857006809199562%
Final er: 0.02857006809199562%


In [117]:
wrong

[('cs\tDnes myslím na nás, tedy na vás a na sebe.\n',
  ('sk', matrix([[  0.94641063,   1.68830936,   1.69163511,   1.7416031 ,
              3.41941703,   2.21683045,   2.64472688,   3.46579837,
              1.94570638,   1.5250961 ,   1.65197466,   0.51903424,
              1.6638313 ,   1.06472754,   4.30951664,  20.45483702,
             20.14926049,   7.08688228,   3.7683017 ,   0.35626362,
              0.13906134]])),
  'cs'),
 ('da\tEn nulsats er et specifikt tal.\n',
  ('sv', matrix([[ 11.89867942,  11.22638711,   2.9597506 ,   7.49570511,
              6.30079046,   6.45321667,   6.49040505,   5.33730654,
              6.39321838,   5.69665483,   4.98277098,   1.57433674,
              5.37249249,   4.93929535,   1.98375557,   4.61579446,
              4.36026103,   3.05217109,   4.42623671,   0.90414241,
              0.18904887]])),
  'da'),
 ('lv\tEs runāju par Banco Português de Negócios un Banco Privado Português.\n',
  ('pt', matrix([[  9.75346054,  10.1594535 ,   8.56

In [52]:
scoresV2("Avem")

('ro',
 matrix([[ 1.05385201,  0.72462437,  0.14895756,  0.18375937,  0.65581083,
           1.41197319,  0.6840087 ,  1.80449495,  1.2374858 ,  2.42291788,
           0.53790615,  0.12335426,  0.18238501,  0.15614746,  0.15169113,
           0.38689419,  0.79227191,  1.44413119,  0.40337961,  0.08271628,
           0.07205493]]))

In [84]:
a.sum(axis=0)

array([ 4.15873984,  1.31119353,  5.37525219, ...,  1.        ,
        1.        ,  1.        ])