In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import numpy as np
import pandas as pd
from tests import en_test_inputs, fr_test_ins, it_test_ins, lv_test_ins
import re
%matplotlib inline

languages = [
    'sv', 'da', 'de', 'nl', 'en', 'fr', 'es', 'pt', 'it', 'ro', 'et',
    'fi','lt', 'lv', 'pl', 'sk', 'cs', 'sl', 'hu', 'bg',  'el'
]

files = [
    "train/europarl-v7.{lang}-en.{lang}".format(lang=x)
    for x in languages
]

corpus_raw = [
    open(x).read(1500000) for x in files
]

corpus = [
    re.sub(r'[?”_"%()!--+,:;./\]\[\xad\n0-9\=<>]', '', x) for x in corpus_raw
]

zeros = np.zeros(len(languages))

len(corpus)

21

In [9]:
count_vectorizer = CountVectorizer(ngram_range=(4, 4), analyzer='char_wb')
analyze = count_vectorizer.build_analyzer()
counts = count_vectorizer.fit_transform(corpus)

a = np.array(counts.todense().tolist())
a = a/a.sum(axis=0)
counts = a ** 0.4

#counts.data = counts.data ** 0.4
tfidf_transformer = TfidfTransformer(smooth_idf=False, sublinear_tf=False, use_idf=False)
transformed_weights = tfidf_transformer.fit_transform(counts)

three_gram_to_index = dict(zip(count_vectorizer.get_feature_names(), range(len(count_vectorizer.get_feature_names()))))

transformed_weights

<21x272923 sparse matrix of type '<class 'numpy.float64'>'
	with 576205 stored elements in Compressed Sparse Row format>

In [None]:
def score(text, lang):
    weight_indexes = [three_gram_to_index.get(three_gram, -1) for three_gram in analyze(text)]
    weight_indexes = list(filter(lambda x: x != -1, weight_indexes))
    return np.sum(transformed_weights[lang,weight_indexes])

adjustment = np.asarray([
    score(corpus_raw[i], i)/len(analyze(corpus_raw[i]))
    for i,lang in enumerate(languages)
])


In [4]:
def scoresV2(text):
    weight_indexes_all = [three_gram_to_index.get(three_gram, -1) for three_gram in analyze(text)]
    weight_indexes_filtered = list(filter(lambda x: x != -1, weight_indexes_all))
    out = np.sum(transformed_weights[:, weight_indexes_filtered], axis=1).T/adjustment
    lang = languages[np.argmax(out)]
    return lang, out

In [None]:
right = 1
wrong = []
def run_tests():
    global right, wrong
    tests = open('europarl.test')
    for x in range(100):
        try:
            for y in range(3000):
                line = tests.readline()
                [lang, text] = line.split('\t')
                if scoresV2(text)[0] == lang:
                    right = right + 1
                else:
                    wrong.append((line, scoresV2(text), lang))
        except:
            print("Final er: {er}%".format(er=100*len(wrong)/(right+len(wrong))))
            break
        print(
            "right: {right}, wrong: {wrong}, er: {er}%".format(
                right=right, wrong=len(wrong),er = 100*len(wrong)/(right+len(wrong))
            )
        )
run_tests()

right: 2999, wrong: 2, er: 0.06664445184938354%
right: 5999, wrong: 2, er: 0.03332777870354941%
right: 8999, wrong: 2, er: 0.022219753360737695%
right: 11999, wrong: 2, er: 0.016665277893508874%
right: 14998, wrong: 3, er: 0.01999866675554963%


In [6]:
wrong

[('cs\t"Diktátorství a manipulace"?\n',
  ('sk', matrix([[  7.88983226,   7.46212799,   5.46535162,   6.95790038,
              9.09432677,   7.05300993,   8.47774249,   7.13755017,
              7.57636287,  10.16240306,   7.07980896,   5.39128196,
              7.40713674,   8.01884967,   7.01600614,  13.5747822 ,
             12.84442173,   7.9328793 ,   9.64807961,   0.45101119,
              0.40036148]])),
  'cs'),
 ('cs\tDnes myslím na nás, tedy na vás a na sebe.\n',
  ('sk', matrix([[  6.91232029,   5.24424095,   6.00262021,   5.07096292,
              7.5526454 ,   5.95803294,  12.16340129,   9.88997226,
              6.084623  ,   5.77402052,   7.54883319,   7.30112752,
              7.09976171,   5.99280088,  13.82502898,  29.10978335,
             27.97806266,  15.00538717,  11.25898849,   0.61771173,
              0.24975509]])),
  'cs'),
 ('cs\tJmenovali se William Meyer, Bernard Starie, Reginald Pike, Thomas Shaw, James McLeish, Archibald Barrowman a Albert Roberts a vši

In [7]:
scoresV2("Avem")

('sl',
 matrix([[ 2.47808185,  1.83907101,  1.23182241,  0.73041704,  2.78289419,
           2.84485251,  0.84658683,  3.50204499,  1.50820706,  2.3809649 ,
           1.42913665,  0.6408911 ,  0.42223341,  1.72162964,  0.94999773,
           1.49515652,  2.19258887,  3.55769923,  1.40086265,  0.09029498,
           0.08152254]]))

In [8]:
a

array([[  4.47154472e-03,   3.37154417e-04,   4.90921318e-02, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  3.04878049e-03,   0.00000000e+00,   4.16946873e-02, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  4.33604336e-03,   3.37154417e-04,   1.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       ..., 
       [  1.00000000e+00,   0.00000000e+00,   1.26429052e-01, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  3.72628726e-03,   0.00000000e+00,   1.27774042e-02, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  1.08401084e-03,   0.00000000e+00,   6.72494956e-04, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00]])