In [62]:
import re

In [63]:
def LoadCorpus(language, encoding="utf-8"):
    """Read the corpus file ``Data/<language>`` and return it as one cleaned string.

    Parameters
    ----------
    language : name of the corpus file inside the ``Data/`` directory
        (converted with ``str()`` before being appended to the path).
    encoding : text encoding used to decode the file. Defaults to UTF-8 so
        that accented characters are decoded the same way on every platform
        instead of depending on the locale's default encoding.

    Returns
    -------
    str : the file content with every run of digits removed, every "|"
        removed, and every newline replaced by a single space.

    Raises
    ------
    OSError if the file cannot be opened; UnicodeDecodeError if the file is
    not valid in the given encoding.
    """
    with open("Data/" + str(language), encoding=encoding) as f:
        text = f.read()
    # Text mode already translates "\r\n" to "\n" on read, so only "\n"
    # needs to be handled below.
    text = re.sub(r"\d+", "", text)
    text = text.replace("|", "")
    text = text.replace("\n", " ")
    return text

In [64]:
def ngram(string, n):
    """Return every contiguous substring of length ``n`` in ``string``.

    The substrings are returned left to right, one per start position, so
    duplicates are kept. An empty list is returned when ``n`` is smaller
    than 1 or larger than ``len(string)``.
    """
    if n < 1:
        return []
    # range() is empty whenever n > len(string); when n == len(string) it
    # yields exactly one start position, i.e. the whole string.
    return [string[p:p + n] for p in range(len(string) - n + 1)]

In [65]:
def xgram(string, max_n=3):
    """Return all character n-grams of sizes 1..``max_n`` of the lower-cased string.

    The result is one flat list: first every 1-gram, then every 2-gram, and
    so on up to ``max_n`` (default 3, i.e. uni-, bi- and trigrams).
    """
    # Lower-case once up front rather than once per n-gram size.
    lowered = string.lower()
    return [w for n in range(1, max_n + 1) for w in ngram(lowered, n)]

In [66]:
def buildmodel(text):
    """Build a relative-frequency model of the n-grams found in ``text``.

    Returns a dict mapping each distinct n-gram produced by ``xgram(text)``
    to its count divided by the total number of n-grams produced.
    """
    grams = xgram(text)
    total = len(grams)

    # First pass: raw occurrence counts.
    counts = {}
    for gram in grams:
        counts[gram] = counts.get(gram, 0) + 1

    # Second pass: turn counts into fractions of all extracted n-grams.
    return {gram: float(count) / float(total) for gram, count in counts.items()}

In [67]:
# Build one n-gram frequency model per language; each corpus is read by
# LoadCorpus using the language name as the file name.
languages = [
    "English",
    "Dutch",
    "French",
    "German",
    "Italian",
    "Romanian",
    "Spanish",
]
modelDict = {}

for language in languages:
    text = LoadCorpus(language)
    modelDict[language] = model = buildmodel(text)

In [68]:
import math

def cosine(a,b):
    """Cosine similarity of two sparse vectors stored as dicts (key -> number).

    Only keys present in both dicts contribute to the dot product; each norm
    is taken over all values of the respective dict.

    Returns 0.0 when either vector has zero norm (for example an empty dict),
    instead of raising ZeroDivisionError.
    """
    dot = sum(a[k] * b[k] for k in a if k in b)
    norm_a = math.sqrt(sum(v ** 2 for v in a.values()))
    norm_b = math.sqrt(sum(v ** 2 for v in b.values()))
    denominator = norm_a * norm_b
    if denominator == 0:
        # No direction to compare against: treat as "no similarity".
        return 0.0
    return dot / denominator

In [69]:
"""English"""
text = "The quick brown fox jumps over lazy dog"
print(text)
# Profile the sample sentence, then score it against every language model.
textmodel = buildmodel(text)
for m, langmodel in modelDict.items():
    print(m, cosine(langmodel, textmodel))

The quick brown fox jumps over lazy dog
Italian 0.6629813072710241
German 0.6104604119869661
French 0.6650084852508219
Romanian 0.6320522641170209
Spanish 0.6835991141779255
English 0.6891341846992005
Dutch 0.617581722106847


In [70]:
"""German"""
text = "Der schnelle braune Fuchs springt über faulen Hund"
print(text)
# Profile the sample sentence, then score it against every language model.
textmodel = buildmodel(text)
for m, langmodel in modelDict.items():
    print(m, cosine(langmodel, textmodel))

Der schnelle braune Fuchs springt über faulen Hund
Italian 0.6645986950583889
German 0.7821955567518508
French 0.7225809611686217
Romanian 0.6793393193689158
Spanish 0.6839905971316019
English 0.6861257534040425
Dutch 0.7050858366842623


In [71]:
"""Dutch"""
text = "De vlugge bruine vos springt over luie hond"
print(text)
# Profile the sample sentence, then score it against every language model.
textmodel = buildmodel(text)
for m, langmodel in modelDict.items():
    print(m, cosine(langmodel, textmodel))

De vlugge bruine vos springt over luie hond
Italian 0.6992118873019266
German 0.7124269350982289
French 0.7278369879429704
Romanian 0.6996429848813718
Spanish 0.6994821826990496
English 0.6858960651369385
Dutch 0.7061978904353363


In [72]:
"""French"""
text = "Le renard brun rapide saute par-dessus un chien paresseux"
print(text)
# Profile the sample sentence, then score it against every language model.
textmodel = buildmodel(text)
for m, langmodel in modelDict.items():
    print(m, cosine(langmodel, textmodel))

Le renard brun rapide saute par-dessus un chien paresseux
Italian 0.6896178531670681
German 0.7485611237052727
French 0.7693235728544291
Romanian 0.7144022789783291
Spanish 0.7458075151416639
English 0.6763513544250095
Dutch 0.7006106286760567


In [73]:
"""Italian"""
text = "La volpe marrone veloce salta sul cane pigro"
print(text)
# Profile the sample sentence, then score it against every language model.
textmodel = buildmodel(text)
for m, langmodel in modelDict.items():
    print(m, cosine(langmodel, textmodel))

La volpe marrone veloce salta sul cane pigro
Italian 0.7640391770463243
German 0.6309346023539755
French 0.7257505953303935
Romanian 0.7207123837023954
Spanish 0.7412753464566461
English 0.6802059241508205
Dutch 0.6442671155840797


In [None]:
"""Romanian"""
# NOTE(review): "sărată" appears to mean "salted"; the intended verb may be
# "sare" ("jumps") — confirm the sample sentence before relying on this result.
text = "Vulpea brună sărată peste câinele leneș"
print(text)
# Profile the sample sentence, then score it against every language model.
textmodel = buildmodel(text)
for m, langmodel in modelDict.items():
    print(m, cosine(langmodel, textmodel))