In [72]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import numpy as np
import pandas as pd
from tests import en_test_inputs, fr_test_ins, it_test_ins, lv_test_ins
import re

# Two-letter codes of the languages to model. Order matters: `files`,
# `corpus_raw` and `corpus` below are all aligned with this list.
languages = [
    'sl', 'en', 'sv', 'da', 'de', 'nl', 'fr', 'es', 'pt', 'it', 'ro',
    'et', 'lt', 'lv', 'pl', 'sk', 'cs', 'el', 'fi', 'hu', 'bg'
]

# One Europarl file per language, looked up relative to the working directory.
# Note that the 'en' entry resolves to "europarl-v7.en-en.en", so a file with
# that exact name has to be present locally.
files = [
    "europarl-v7.{lang}-en.{lang}".format(lang=x)
    for x in languages
]


def _read_head(path, n_chars=20000):
    """Return the first `n_chars` characters of the text file at `path`.

    The file is opened in a `with` block so the handle is closed immediately
    (the previous bare `open(x).read(...)` left 21 handles to the garbage
    collector), and decoded explicitly as UTF-8 so the result does not depend
    on the platform's default locale encoding.
    """
    with open(path, encoding='utf-8') as handle:
        return handle.read(n_chars)


# Raw (unstripped) sample of each language, in `languages` order.
corpus_raw = [_read_head(x) for x in files]

# Strip punctuation, digits, soft hyphens and newlines before vectorising.
# NOTE(review): inside the character class `!--` is a *range* from '!' to '-',
# so # $ & ' * are removed as well, not only ! - +. Newlines are deleted rather
# than replaced by a space, which glues the last word of a line to the first
# word of the next. The pattern is kept unchanged here because altering it
# changes the learned n-gram table; confirm the intent before touching it.
corpus = [
    re.sub(r'[?"%()!--+,:;./\]\[\xad\n0-9\=<>]', '', x) for x in corpus_raw
]

len(corpus)

21

In [73]:
# Character n-gram counter restricted to word boundaries (`char_wb`).
# Despite the variable name, ngram_range=(3, 3) produces 3-grams only.
bigram_vectorizer = CountVectorizer(ngram_range=(3, 3), analyzer='char_wb')

# `analyze` turns a string into its list of 3-grams using the vectoriser's own
# preprocessing; `score` relies on it to tokenise unseen text the same way.
analyze = bigram_vectorizer.build_analyzer()
counts = bigram_vectorizer.fit_transform(corpus)

# With use_idf=False no IDF re-weighting is applied (smooth_idf is then
# irrelevant); the transformer only L1-normalises each language's row.
transformed_weights = TfidfTransformer(smooth_idf=True,use_idf=False, norm='l1').fit_transform(counts)

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer the replacement and fall back only on older installs. The lookup is
# done once instead of once per language.
if hasattr(bigram_vectorizer, 'get_feature_names_out'):
    feature_names = bigram_vectorizer.get_feature_names_out()
else:
    feature_names = bigram_vectorizer.get_feature_names()

# weights[lang][three_gram] -> normalised weight of that 3-gram in `lang`.
# Rows of the matrix follow the order of `corpus`, i.e. the order of `languages`.
weights = {
    lang: dict(zip(feature_names, row))
    for lang, row in zip(languages, transformed_weights.toarray())
}
transformed_weights

<21x16252 sparse matrix of type '<class 'numpy.float64'>'
	with 43745 stored elements in Compressed Sparse Row format>

In [74]:
def scores(text):
    """Score `text` against every language and pick the best match.

    Parameters
    ----------
    text : str
        The text to classify; it is passed unchanged to `score`.

    Returns
    -------
    tuple
        `(best_language, scores_dict)`, where `scores_dict` maps each code in
        `languages` to `score(text, code)` and `best_language` is the code
        with the highest score. When several languages share the highest
        score, the one that comes first in `languages` is returned.

    Raises
    ------
    ValueError
        If `languages` is empty (there is nothing to take the maximum of).
    """
    # Key each score by the very language it was computed for. Pairing the
    # scores with the keys of `weights` instead is only correct when that
    # dict happens to iterate in the same order as `languages`.
    scores_dict = {lang: score(text, lang) for lang in languages}
    # Select the arg-max directly; inverting the dict (score -> language)
    # silently drops languages whenever two scores are equal.
    return max(scores_dict, key=scores_dict.get), scores_dict

def score(text, lang):
    """Return the summed weight of the 3-grams of `text` under language `lang`.

    `text` is split with the module-level `analyze` callable; each resulting
    3-gram contributes `weights[lang][three_gram]`, or 0 when the 3-gram is
    not in that language's table. Repeated 3-grams are counted every time
    they occur.
    """
    grams = analyze(text)
    return sum(weights[lang].get(gram, 0) for gram in grams)

In [75]:
# Sanity check: score each language's own raw sample against its own table,
# divided by the number of 3-grams in the sample (a mean weight per 3-gram).
# This uses `corpus_raw`, i.e. the text before punctuation/digit stripping.
[
    score(raw_text, lang) / len(analyze(raw_text))
    for raw_text, lang in zip(corpus_raw, languages)
]

[0.0029197170652436807,
 0.0035188091760417345,
 0.0024387017380211352,
 0.0031188767188491131,
 0.0030630144377589807,
 0.0035428439385828733,
 0.0029396805633141289,
 0.0033409206147969766,
 0.0029504896930561881,
 0.0026065841816880556,
 0.0034381407207649799,
 0.0025448595405165972,
 0.0026665798372641656,
 0.0025572560354571341,
 0.0027876627871760836,
 0.0030297960361699754,
 0.002694132907296773,
 0.0019344651086368909,
 0.0020117912835934094,
 0.0023685498334847606,
 0.0031366605569831648]

In [None]:
# NOTE(review): unfinished cell. No path has been filled in, and an empty path
# cannot be opened, so this line raises FileNotFoundError on Restart & Run All.
# TODO: supply the intended test file (opened in a `with` block) or delete the cell.
tests = open('')