In [499]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import numpy as np
import pandas as pd
from tests import en_test_inputs, fr_test_ins, it_test_ins, lv_test_ins
import re
%matplotlib inline

languages = [
    'sv', 'da', 'de', 'nl', 'en', 'fr', 'es', 'pt', 'it', 'ro', 'et',
    'fi','lt', 'lv', 'pl', 'sk', 'cs', 'sl', 'hu', 'bg',  'el'
]

files = [
    "train/europarl-v7.{lang}-en.{lang}".format(lang=x)
    for x in languages
]

corpus_raw = [
    open(x).read(2000000) for x in files
]

corpus = [
    re.sub(r'[?”_"%()!--+,:;./\]\[\xad\n0-9\=<>]', '', x) for x in corpus_raw
]

zeros = np.zeros(len(languages))

len(corpus)

21

In [502]:
count_vectorizer = CountVectorizer(ngram_range=(4, 4), analyzer='char_wb')
analyze = count_vectorizer.build_analyzer()
counts = count_vectorizer.fit_transform(corpus)
counts_n = counts/(counts.sum(axis=1)/counts.sum(axis=1).mean())
a = np.array(counts_n)
a = a/a.sum(axis=0)
transformed_weights = a ** 0.3 # np.log10(a*1000 + 10)

three_gram_to_index = dict(zip(count_vectorizer.get_feature_names(), range(len(count_vectorizer.get_feature_names()))))

transformed_weights

array([[ 0.13188392,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.1162476 ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.12535025,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.64746846,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.11415311,  0.        ,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       [ 0.08043347,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [429]:
def scoresV2(text):
    weight_indexes_all = [three_gram_to_index.get(three_gram, -1) for three_gram in analyze(text)]
    weight_indexes_filtered = list(filter(lambda x: x != -1, weight_indexes_all))
    out = np.sum(transformed_weights[:, weight_indexes_filtered], axis=1)
    lang = languages[np.argmax(out)]
    return lang, out

In [503]:
right = 1
wrong = []
def run_tests():
    global right, wrong
    tests = open('europarl.test')
    for x in range(100):
        try:
            for y in range(3000):
                line = tests.readline()
                [lang, text] = line.split('\t')
                if scoresV2(text)[0] == lang:
                    right = right + 1
                else:
                    wrong.append((line, scoresV2(text), lang))
        except:
            print("Final er: {er}%".format(er=100*len(wrong)/(right+len(wrong))))
            break
        print(
            "right: {right}, wrong: {wrong}, er: {er}%".format(
                right=right, wrong=len(wrong),er = 100*len(wrong)/(right+len(wrong))
            )
        )
run_tests()

right: 3000, wrong: 1, er: 0.03332222592469177%
right: 6000, wrong: 1, er: 0.016663889351774704%
right: 9000, wrong: 1, er: 0.011109876680368847%
right: 12000, wrong: 1, er: 0.008332638946754437%
right: 14999, wrong: 2, er: 0.013332444503699753%
right: 17998, wrong: 3, er: 0.016665740792178214%
right: 20997, wrong: 4, er: 0.019046712061330413%
Final er: 0.019046712061330413%


In [268]:
a.T[5000]

array([ 0.12240418,  0.12473875,  0.11860804,  0.06184592,  0.12841689,
        2.45844542,  0.        ,  0.        ,  0.        ,  0.06098547,
        5.52284562,  7.1514297 ,  1.34130856,  1.36493535,  0.40374146,
        0.53479568,  0.53865676,  0.60692587,  0.45991633,  0.        ,  0.        ])

In [337]:
np.array(counts.T[0].todense())

array([[   66,    45,    64,    37,  4203,  1380,  5280,  9252,  3011,
         6005,    42,    57,    77,    55,   917,  7936,  8058,    67,
        14760,    55,    16]])

In [392]:
counts_n.T[0]

matrix([[    67.1517283 ,     46.65851558,     63.09734755,     38.04168867,
           4486.40556352,   1446.17900868,   5505.42599424,   9439.21391492,
           3032.37023353,   6088.16473113,     40.16871742,     52.53213159,
             74.65150897,     54.2617678 ,    879.26878032,   7839.60249878,
           8017.59047311,     67.60167513,  14106.59514931,     55.36571143,
             16.17098813]])

In [391]:
a.T[0]

array([ 0.00109338,  0.00075971,  0.00102737,  0.0006194 ,  0.07304884,
        0.02354707,  0.0896408 ,  0.15369178,  0.04937385,  0.09912911,
        0.00065404,  0.00085534,  0.0012155 ,  0.0008835 ,  0.01431649,
        0.12764648,  0.13054453,  0.00110071,  0.22968732,  0.00090148,
        0.0002633 ])

In [371]:
transformed_weights.T[0]

array([ 0.02270139,  0.01582791,  0.02134528,  0.01292363,  0.92980922,
        0.40178398,  1.05864301,  1.44161727,  0.71140496,  1.12548506,
        0.01364132,  0.01780277,  0.02520507,  0.01838358,  0.26286124,
        1.3030693 ,  1.31946926,  0.02285177,  1.76189006,  0.0187541 ,
        0.00551408])

In [325]:
counts_n.T.mean(axis=1)

matrix([[  2.92459610e+03],
        [  4.85826175e-02],
        [  3.28796384e-01],
        ..., 
        [  1.91742724e-01],
        [  9.39597711e-02],
        [  4.69798855e-02]])

In [470]:
wrong

[('da\tEn nulsats er et specifikt tal.\n',
  ('sv',
   array([ 5.59978674,  4.71585507, -0.57239618,  2.37870027,  1.19123826,
           1.4555499 ,  1.36576091,  0.75005963,  1.01127301,  1.14922529,
           0.67732482, -1.80348669,  1.19320005,  1.05557466, -1.26137387,
           0.35209451,  0.2162597 , -0.5240922 ,  1.07933476, -2.20365404,
          -2.95636023])),
  'da'),
 ('lv\tEs runāju par Banco Português de Negócios un Banco Privado Português.\n',
  ('pt', array([ -0.52666994,  -0.33682476,  -1.14248473,  -0.60082783,
            2.50346097,   0.90493576,  10.14873636,  18.78446024,
            4.23116872,   4.24179612,  -0.16618399,  -2.19759067,
            1.81695478,   8.30846792,  -0.74183124,  -0.33810728,
            0.20554241,   0.62568987,  -1.19808444,  -4.9264644 ,  -5.30775391])),
  'lv'),
 ('pl\tRegiony te to belgijski region Limburg, holenderski region Limburg i region Aachen.\n',
  ('en', array([  8.67096972,   8.277402  ,   8.55025511,   8.44024233,
   

In [426]:
count_vectorizer.get_feature_names()[30000:]

['bulv',
 'bulâ',
 'bulā',
 'bulė',
 'bulī',
 'bulų',
 'buma',
 'bumb',
 'bume',
 'bumi',
 'bumo',
 'bump',
 'bums',
 'bumu',
 'bumā',
 'bumą',
 'bumē',
 'bun ',
 'buna',
 'bunc',
 'bund',
 'bune',
 'bunf',
 'bung',
 'buni',
 'bunk',
 'bunl',
 'bunm',
 'buno',
 'bunp',
 'bunt',
 'bunu',
 'bunv',
 'buná',
 'bunî',
 'bunā',
 'bună',
 'buně',
 'buo ',
 'buoi',
 'buoj',
 'buol',
 'buon',
 'buop',
 'buor',
 'buos',
 'buot',
 'buov',
 'bup ',
 'bupa',
 'bupi',
 'bupl',
 'bupn',
 'bupo',
 'bupr',
 'bups',
 'bupē',
 'buqu',
 'bura',
 'burb',
 'burc',
 'burd',
 'bure',
 'burg',
 'buri',
 'burj',
 'burk',
 'burl',
 'burm',
 'burn',
 'buro',
 'burs',
 'burt',
 'buru',
 'burv',
 'bury',
 'burz',
 'buró',
 'bură',
 'burš',
 'bus ',
 'busa',
 'busb',
 'busc',
 'busd',
 'buse',
 'busf',
 'bush',
 'busi',
 'busj',
 'busk',
 'busl',
 'busm',
 'busn',
 'buso',
 'busp',
 'busq',
 'buss',
 'bust',
 'busy',
 'busā',
 'but ',
 'buta',
 'butc',
 'bute',
 'buti',
 'butl',
 'buto',
 'butr',
 'buts',
 'butt',
 

In [417]:
' ' in 'sadf '

True

In [440]:
counts_n = counts/(counts.sum(axis=1)/counts.sum(axis=1).mean())
a = np.array(counts_n)
a = a/a.sum(axis=0)

In [444]:
a - ((a == 0)*0.2)

array([[  1.09338221e-03,  -2.00000000e-01,  -2.00000000e-01, ...,
         -2.00000000e-01,  -2.00000000e-01,  -2.00000000e-01],
       [  7.59706297e-04,  -2.00000000e-01,  -2.00000000e-01, ...,
         -2.00000000e-01,  -2.00000000e-01,  -2.00000000e-01],
       [  1.02736771e-03,  -2.00000000e-01,  -2.00000000e-01, ...,
         -2.00000000e-01,  -2.00000000e-01,  -2.00000000e-01],
       ..., 
       [  2.29687315e-01,  -2.00000000e-01,  -2.00000000e-01, ...,
         -2.00000000e-01,  -2.00000000e-01,  -2.00000000e-01],
       [  9.01479164e-04,  -2.00000000e-01,  -2.00000000e-01, ...,
          1.00000000e+00,  -2.00000000e-01,  -2.00000000e-01],
       [  2.63300308e-04,  -2.00000000e-01,  -2.00000000e-01, ...,
         -2.00000000e-01,  -2.00000000e-01,  -2.00000000e-01]])

In [448]:
-2**0.5

-1.4142135623730951

In [462]:
sum(transformed_weights.T[0:50])

array([-42.719935  , -41.66585488, -13.09059339, -37.42871454,
       -27.54235471, -34.13190019, -29.09501215, -19.46561397,
       -34.99224726, -18.60438891, -29.77867705, -34.75852247,
       -39.2686406 , -37.82205849, -40.46514671, -36.30628362,
       -37.9944546 , -40.99319765, -40.19294259, -43.29372501, -47.68318695])