In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import numpy as np
import pandas as pd
from tests import en_test_inputs, fr_test_ins, it_test_ins, lv_test_ins
import re
%matplotlib inline

languages = [
    'sv', 'da', 'de', 'nl', 'en', 'fr', 'es', 'pt', 'it', 'ro', 'et',
    'fi','lt', 'lv', 'pl', 'sk', 'cs', 'sl', 'hu', 'bg',  'el'
]

files = [
    "train/europarl-v7.{lang}-en.{lang}".format(lang=x)
    for x in languages
]

corpus_raw = [
    open(x).read(1500000) for x in files
]

corpus = [
    re.sub(r'[?”_"%()!--+,:;./\]\[\xad\n0-9\=<>]', '', x) for x in corpus_raw
]

zeros = np.zeros(len(languages))

len(corpus)

21

In [8]:
count_vectorizer = CountVectorizer(ngram_range=(4, 4), analyzer='char_wb')
analyze = count_vectorizer.build_analyzer()
counts = count_vectorizer.fit_transform(corpus)
counts.data = np.cbrt(counts.data)
tfidf_transformer = TfidfTransformer(smooth_idf=False, sublinear_tf=False, use_idf=True)
transformed_weights = tfidf_transformer.fit_transform(counts)

three_gram_to_index = dict(zip(count_vectorizer.get_feature_names(), range(len(count_vectorizer.get_feature_names()))))

transformed_weights

<21x272923 sparse matrix of type '<class 'numpy.float64'>'
	with 576205 stored elements in Compressed Sparse Row format>

In [3]:
def score(text, lang):
    weight_indexes = [three_gram_to_index.get(three_gram, -1) for three_gram in analyze(text)]
    weight_indexes = list(filter(lambda x: x != -1, weight_indexes))
    return np.sum(transformed_weights[lang,weight_indexes])

adjustment = np.asarray([
    score(corpus_raw[i], i)/len(analyze(corpus_raw[i]))
    for i,lang in enumerate(languages)
])


In [4]:
def scoresV2(text):
    weight_indexes_all = [three_gram_to_index.get(three_gram, -1) for three_gram in analyze(text)]
    weight_indexes_filtered = list(filter(lambda x: x != -1, weight_indexes_all))
    out = np.sum(transformed_weights[:, weight_indexes_filtered], axis=1).T/adjustment
    lang = languages[np.argmax(out)]
    return lang, out

In [9]:
right = 1
wrong = []
def run_tests():
    global right, wrong
    tests = open('europarl.test')
    for x in range(100):
        try:
            for y in range(3000):
                line = tests.readline()
                [lang, text] = line.split('\t')
                if scoresV2(text)[0] == lang:
                    right = right + 1
                else:
                    wrong.append((line, scoresV2(text), lang))
        except:
            print("Final er: {er}%".format(er=100*len(wrong)/(right+len(wrong))))
            break
        print(
            "right: {right}, wrong: {wrong}, er: {er}%".format(
                right=right, wrong=len(wrong),er = 100*len(wrong)/(right+len(wrong))
            )
        )
run_tests()

right: 3000, wrong: 1, er: 0.03332222592469177%
right: 6000, wrong: 1, er: 0.016663889351774704%
right: 9000, wrong: 1, er: 0.011109876680368847%
right: 12000, wrong: 1, er: 0.008332638946754437%
right: 14999, wrong: 2, er: 0.013332444503699753%
right: 17996, wrong: 5, er: 0.027776234653630355%
right: 20994, wrong: 7, er: 0.033331746107328225%
Final er: 0.033331746107328225%


In [10]:
wrong

[('cs\tDnes myslím na nás, tedy na vás a na sebe.\n',
  ('sk', matrix([[  0.683384  ,   1.24785897,   1.35840468,   2.51819827,
              3.55319866,   4.1406514 ,   2.89778428,   6.72194545,
              2.12262584,   1.91063768,   1.62275797,   0.46892035,
              1.70793502,   0.89267997,   6.28315326,  22.32270818,
             21.17796105,   9.28481207,   3.16875322,   0.24476406,   0.1327154 ]])),
  'cs'),
 ('lv\tEs runāju par Banco Português de Negócios un Banco Privado Português.\n',
  ('pt', matrix([[  7.41159175,   7.95841401,   7.42933033,   7.78514701,
             11.40700656,  14.12762714,  22.69856082,  24.0382442 ,
             15.10199618,  13.36089251,   8.27083764,   4.91088823,
             10.97285119,  18.26736297,   6.04278578,   7.80980646,
              7.11745664,   9.41453624,   5.70509533,   1.15648353,
              0.91633373]])),
  'lv'),
 ('pl\tRegiony te to belgijski region Limburg, holenderski region Limburg i region Aachen.\n',
  ('en', mat

In [11]:
scoresV2("Avem")

('ro',
 matrix([[ 0.63491786,  0.49594623,  0.07402902,  0.09650742,  0.35586495,
           1.06591   ,  0.38154455,  1.24450211,  0.87784369,  1.93439186,
           0.3252581 ,  0.06481595,  0.11347094,  0.09579412,  0.09186615,
           0.23186923,  0.61289126,  1.06524151,  0.24238341,  0.04389155,
           0.0362359 ]]))