In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import numpy as np
import pandas as pd
import re

# Maximum amount of training text to read per language. The files are opened
# in text mode, so this limit counts characters rather than bytes.
TRAINING_DATA_SIZE = 2000000

# Language codes. The position of a code in this list is also the position of
# that language's document in `corpus`.
languages = [
    'sv', 'da', 'de', 'nl', 'en', 'fr', 'es', 'pt', 'it', 'ro', 'et',
    'fi','lt', 'lv', 'pl', 'sk', 'cs', 'sl', 'hu', 'bg',  'el'
]

# One training file per language, in the same order as `languages`.
# NOTE(review): for 'en' this expands to "train/europarl-v7.en-en.en" --
# confirm that such a file actually exists in the training directory.
files = [
    "train/europarl-v7.{lang}-en.{lang}".format(lang=code)
    for code in languages
]


def _read_head(path, size):
    """Return at most *size* characters from the start of text file *path*.

    The file is decoded explicitly as UTF-8 (assumed to be the encoding of
    the training data -- TODO confirm) so the result does not depend on the
    platform's default encoding, and the handle is closed before returning.
    """
    with open(path, encoding="utf-8") as handle:
        return handle.read(size)


# Raw (uncleaned) training text, one string per language.
corpus_raw = [_read_head(path, TRAINING_DATA_SIZE) for path in files]

# Remove punctuation, digits, newlines and soft hyphens (\xad) from the
# training text.
# NOTE(review): inside the character class `!--` is a *range* from '!' to
# '-', so characters such as # $ & ' * are stripped as well. The pattern is
# kept unchanged because altering it changes the trained model; confirm
# whether that range was intended.
corpus = [
    re.sub(r'[?”_"%()!--+,:;./\]\[\xad\n0-9\=<>]', '', text)
    for text in corpus_raw
]

# Features are character 4-grams produced by the 'char_wb' analyzer.
count_vectorizer = CountVectorizer(ngram_range=(4, 4), analyzer='char_wb')
# Callable that splits a string into n-grams exactly as the vectorizer does;
# `scores` reuses it so prediction-time features match the training features.
analyze = count_vectorizer.build_analyzer()
# Count matrix: one row per document in `corpus`, one column per n-gram.
counts = count_vectorizer.fit_transform(corpus)

# Divide each count by (mean count of its row) x (total count of its column).
# NOTE(review): this relies on `counts` being a scipy sparse matrix, whose
# mean()/sum() return np.matrix objects so that `*` is a matrix (outer)
# product and the results below are dense np.matrix values -- confirm before
# changing any of these expressions.
normalized_counts = counts/(counts.mean(axis=1)*counts.sum(axis=0))
# Rescale so that the overall mean of the normalized matrix is 1.
normalized_counts = normalized_counts/normalized_counts.mean()
# Compress the dynamic range; the +1 keeps zero counts at a weight of 0.
transformed_weights = np.log10(normalized_counts + 1)

# n-gram -> column index into `transformed_weights`.
# The fitted `vocabulary_` attribute already holds this mapping. It is used
# instead of enumerating `get_feature_names()`, which was deprecated in
# scikit-learn 1.0 and removed in 1.2. A copy is taken so later changes to
# this dict cannot affect the vectorizer.
word_to_weights = dict(count_vectorizer.vocabulary_)

def scores(text):
    """Guess the language of *text* from its character n-grams.

    The text is split with the module-level ``analyze`` callable, every
    n-gram found in ``word_to_weights`` is mapped to its column of
    ``transformed_weights``, and the selected columns are summed per row.
    An n-gram occurring several times contributes several times.

    Returns a ``(lang, weights)`` pair: ``lang`` is the entry of
    ``languages`` at the row with the largest sum, ``weights`` holds the
    per-row sums. If no n-gram is known, all sums are zero and the first
    language is returned.
    """
    known_columns = []
    for ngram in analyze(text):
        column = word_to_weights.get(ngram, -1)
        # -1 marks an n-gram that is absent from the vocabulary.
        if column != -1:
            known_columns.append(column)

    selected = transformed_weights[:, known_columns]
    weights = np.sum(selected, axis=1)
    best_row = np.argmax(weights)
    return languages[best_row], weights

def run_tests(path='europarl.test'):
    """Classify every labelled line of *path* and print the error rate.

    Each line is expected to hold a language code and a text separated by a
    tab. The text (everything after the first tab, including the trailing
    newline) is passed to ``scores`` and the predicted language is compared
    with the label.

    Args:
        path: Test file to read; defaults to ``'europarl.test'``.

    Prints ``Final er: <percentage>%`` with the share of misclassified
    lines, or a notice when the file contains no usable line. Returns None.
    """
    right = 0
    wrong = 0
    # The context manager closes the file even if `scores` raises. Iterating
    # the handle reads to end-of-file, so the report is always printed no
    # matter how many lines the file has.
    with open(path, encoding='utf-8') as tests:
        for line in tests:
            # Split on the first tab only, so tabs inside the text are kept.
            lang, separator, text = line.partition('\t')
            if not separator:
                # Blank or unlabelled line: nothing to evaluate.
                continue
            if scores(text)[0] == lang:
                right += 1
            else:
                wrong += 1

    total = right + wrong
    if total == 0:
        # Avoid dividing by zero when there was nothing to evaluate.
        print("No test samples found in {path}".format(path=path))
        return
    print("Final er: {er}%".format(er=100*wrong/total))
# Run the evaluation as soon as this code is executed.
run_tests()