In [1]:
# -*- coding: utf-8 -*-

from in2110.oblig1b import visualize_word_vectors
from in2110.corpora import aviskorpus_10_nn
import urllib.request, pandas, re, random, numpy as np, scipy, sklearn

In [2]:
# Word-list files from the open-dict-data/ipa-dict repository, keyed by the
# (Norwegian) language name used as class label throughout the notebook.
# Every URL is built from the same template so that all entries share one
# well-formed host/path/query layout.
_IPA_DICT_URL = "https://raw.githubusercontent.com/open-dict-data/ipa-dict/master/data/%s.txt?raw=true"

ORDFILER = {lang: _IPA_DICT_URL % filename for lang, filename in [
        ("norsk", "nb"),
        ("arabisk", "ar"),
        ("finsk", "fi"),
        ("patwa", "jam"),
        ("farsi", "fa"),
        ("tysk", "de"),
        ("engelsk", "en_UK"),
        ("rumensk", "ro"),
        ("khmer", "km"),
        ("fransk", "fr_FR"),
        ("japansk", "ja"),
        ("spansk", "es_ES"),
        ("svensk", "sv"),
        ("koreansk", "ko"),
        ("swahilisk", "sw"),
        ("vietnamesisk", "vi_C"),
        ("mandarin", "zh_hans"),
        ("malayisk", "ma"),
        ("kantonesisk", "yue"),
        ("islandsk", "is")]}

In [3]:
class LanguageIdentifier:
    """Logistic-regression model that takes IPA transcriptions of words as input
    and predicts which language each word belongs to.

    Every transcription is encoded as a binary bag-of-symbols vector: one column
    per entry of ``self.symbols``, set to 1 when that symbol occurs in the word.
    """

    def __init__(self):
        """Initialise the (untrained) model."""
        # The regression model itself (n_jobs=-1: use all CPUs of the machine for training).
        # NOTE(review): `multi_class` is reportedly deprecated in recent scikit-learn
        # releases -- confirm against the installed version before upgrading.
        self.model = sklearn.linear_model.LogisticRegression(solver="lbfgs", multi_class='ovr', n_jobs=-1, max_iter=300)

        # If training is too slow, this can be used instead:
        #self.model = sklearn.linear_model.SGDClassifier(loss="log", n_jobs=-1)

        self.symbols = None    # list of IPA symbols used as features (column order)
        self.matrix = None     # sparse feature matrix produced by the last _extract_feats call
        self.lang_map = None   # list of language names; the list index is the class label

    def train(self, transcriptions, languages):
        """Train the logistic-regression model.

        Args:
            transcriptions: sequence of IPA transcriptions (strings).
            languages: sequence of language names (keys of ORDFILER), one per
                transcription.

        Raises:
            ValueError: if the two sequences differ in length or a language
                name is not a key of ORDFILER.
        """
        if len(transcriptions) != len(languages):
            raise ValueError('transcriptions and languages must have the same length (%i != %i)'
                             % (len(transcriptions), len(languages)))

        # Always rebuild the feature matrix: a matrix cached by an earlier call
        # describes a different data set and must not be reused here.
        # The symbol inventory, once extracted, is still kept (see _extract_feats).
        self._extract_feats(transcriptions)

        self.lang_map = list(ORDFILER.keys())
        label_of = {lang: label for label, lang in enumerate(self.lang_map)}
        try:
            labels = [label_of[lang] for lang in languages]
        except KeyError as err:
            raise ValueError('Unknown language: %r' % (err.args[0],)) from None
        self.model.fit(self.matrix, labels)

    def predict(self, transcriptions):
        """Return the most likely language name for every transcription.

        Args:
            transcriptions: sequence of IPA transcriptions (strings).

        Returns:
            A list of language names with the same length as the input.

        Raises:
            RuntimeError: if the model has not been trained yet.
        """
        if self.lang_map is None or self.symbols is None:
            raise RuntimeError('Model has to be trained first')

        # Built with the shared helper so that training and prediction use the
        # exact same encoding; self.matrix is left untouched on purpose.
        predict_matrix = self._build_feature_matrix(transcriptions)
        return [self.lang_map[n] for n in self.model.predict(predict_matrix)]

    def _extract_unique_symbols(self, transcriptions, min_nb_occurrences=10):
        """Collect the IPA symbols that occur at least ``min_nb_occurrences`` times.

        The result is stored in ``self.symbols`` (in order of first appearance)
        and also returned.
        """
        counts = {}
        for word in transcriptions:
            for char in word:
                counts[char] = counts.get(char, 0) + 1
        self.symbols = [char for char, total in counts.items() if total >= min_nb_occurrences]
        return self.symbols

    def _build_feature_matrix(self, transcriptions):
        """Encode transcriptions as a sparse |T|x|F| binary matrix over ``self.symbols``.

        Symbols that are not in ``self.symbols`` are ignored. The matrix is
        assembled directly in sparse form, so no dense |T|x|F| array is allocated.
        """
        # symbol -> column; setdefault keeps the first column should a symbol be listed twice
        column_of = {}
        for column, symbol in enumerate(self.symbols):
            column_of.setdefault(symbol, column)

        rows, columns = [], []
        for row, word in enumerate(transcriptions):
            # dict.fromkeys removes repeated symbols (the feature is binary, and
            # duplicate coordinates would be summed) while keeping a stable order.
            for symbol in dict.fromkeys(word):
                column = column_of.get(symbol)
                if column is not None:
                    rows.append(row)
                    columns.append(column)

        coordinates = (np.asarray(rows, dtype=np.intp), np.asarray(columns, dtype=np.intp))
        return scipy.sparse.coo_matrix((np.ones(len(rows)), coordinates),
                                       shape=(len(transcriptions), len(self.symbols)))

    def _extract_feats(self, transcriptions):
        """Extract a sparse matrix of size |T|x|F| from the transcriptions, where |T|
        is the number of transcriptions and |F| the number of features (symbols).

        The symbol inventory is extracted from ``transcriptions`` only if none
        exists yet. The matrix is stored in ``self.matrix`` and returned.
        """
        if self.symbols is None:
            self._extract_unique_symbols(transcriptions)
        self.matrix = self._build_feature_matrix(transcriptions)
        return self.matrix

    def evaluate(self, transcriptions, languages):
        """Print evaluation scores for the model on the given data:
        1) accuracy
        2) precision, recall and F1 for each language
        3) micro- and macro-averaged F1.

        Args:
            transcriptions: sequence of IPA transcriptions (strings).
            languages: sequence with the true language name of each transcription.
        """
        predictions = self.predict(transcriptions)
        print('Accuracy score: %.3f' %  sklearn.metrics.accuracy_score(languages, predictions))

        report = sklearn.metrics.classification_report(languages, predictions, output_dict=True)
        print('\t\tPrecision\tRecall\t\tF1-score')
        for key in self.lang_map:
            # A language that occurs neither in the gold labels nor in the
            # predictions has no row in the report; skip it instead of failing.
            if key not in report:
                continue
            scores = report[key]
            # Short names need an extra tab to line up with the column headers.
            lang = key + '\t\t' if len(key) < 8 else key + '\t'
            print(lang + '%.3f' % scores['precision'] + '\t\t' + '%.3f' % scores['recall'] + '\t\t' + '%.3f' % scores['f1-score'])

        print('Micro-averaged F1: %.3f' % sklearn.metrics.f1_score(languages, predictions, average='micro'))
        print('Macro-averaged F1: %.3f' % sklearn.metrics.f1_score(languages, predictions, average='macro'))

In [4]:
def _parse_wordlist_line(raw_line, lang):
    """Turn one raw ``word<TAB>/transcription/`` line into a record dict, or None if it is unusable."""
    linje = raw_line.decode("utf8").rstrip("\r\n")
    word, tab, transcription = linje.partition("\t")
    if not tab:
        # No tab separator: not a word/transcription pair.
        return None

    # Some transcriptions use the wrong character for "primary stress"
    transcription = transcription.replace("\'", "ˈ")

    # Take the first transcription (if there are several) and strip the
    # surrounding slashes
    match = re.match("/(.+?)/", transcription)
    if not match:
        return None
    return {"ord":word, "IPA":match.group(1), "språk":lang}


def extract_wordlist(max_nb_words_per_language=20000, seed=None):
    """
    Download word lists with phonetic transcriptions for several languages from GitHub,
    combine them into a pandas DataFrame and split it into a training set and a test set.

    Args:
        max_nb_words_per_language: upper bound on the number of (randomly chosen)
            words kept per language.
        seed: optional integer. When given, it seeds the per-language shuffle, the
            DataFrame shuffle and the train/test split, so the split can be
            reproduced. With the default None, the global random state is used.

    Returns:
        (wordlist_train, wordlist_test): two DataFrames with the columns
        "ord", "IPA" and "språk"; the test set holds 10% of the rows.
    """

    # Use the module-level generator when no seed is requested, so that an
    # earlier random.seed() call by the user still takes effect.
    rng = random if seed is None else random.Random(seed)

    full_wordlist = []
    for lang, wordfile in ORDFILER.items():

        # The context manager closes the connection even if parsing fails.
        with urllib.request.urlopen(wordfile) as data:
            parsed = (_parse_wordlist_line(linje, lang) for linje in data)
            wordlist_for_language = [entry for entry in parsed if entry is not None]

        # Shuffle the words, and cut the list if it is too long
        rng.shuffle(wordlist_for_language)
        full_wordlist += wordlist_for_language[:max_nb_words_per_language]

    # Build a DataFrame with all the words
    full_wordlist = pandas.DataFrame.from_records(full_wordlist)

    # And shuffle the rows into random order
    full_wordlist = full_wordlist.sample(frac=1, random_state=seed)

    # Create a training set and a test set (with 10% of the data)
    wordlist_train, wordlist_test = sklearn.model_selection.train_test_split(
        full_wordlist, test_size=0.1, random_state=seed)
    print("Treningsett: %i eksempler, testsett: %i eksempler"%(len(wordlist_train), len(wordlist_test)))

    return wordlist_train, wordlist_test

In [5]:
# Download the word lists and split them into a training and a test DataFrame.
train_data, test_data = extract_wordlist()

Treningsett: 309074 eksempler, testsett: 34342 eksempler


In [6]:
# Fit the classifier on the training split, then print its scores on the test split.
li = LanguageIdentifier()
li.train(train_data["IPA"].tolist(), train_data["språk"].tolist())
li.evaluate(test_data["IPA"].tolist(), test_data["språk"].tolist())

Accuracy score: 0.927
		Precision	Recall		F1-score
norsk		0.872		0.812		0.841
arabisk		0.963		0.951		0.957
finsk		0.988		0.996		0.992
patwa		0.937		0.366		0.527
farsi		0.939		0.955		0.947
tysk		0.958		0.939		0.948
engelsk		0.959		0.962		0.961
rumensk		0.750		0.807		0.777
khmer		0.901		0.668		0.767
fransk		0.955		0.912		0.933
japansk		0.985		0.936		0.960
spansk		0.934		0.917		0.925
svensk		0.964		0.956		0.960
koreansk	1.000		0.994		0.997
swahilisk	0.815		0.916		0.863
vietnamesisk	0.974		0.975		0.974
mandarin	0.955		0.978		0.966
malayisk	0.765		0.836		0.799
kantonesisk	0.997		0.974		0.985
islandsk	0.950		0.930		0.940
Micro-averaged F1: 0.927
Macro-averaged F1: 0.901


In [7]:
# Coefficients of the classifier row for Norwegian; position i belongs to li.symbols[i].
nor_coef = list(li.model.coef_[li.lang_map.index('norsk')])
most_weighted = li.symbols[int(np.argmax(nor_coef))]
least_weighted = li.symbols[int(np.argmin(nor_coef))]
print('Most weighted symbol in Norwegian:', most_weighted)

# Count the occurrences of the symbol in Norwegian and non-Norwegian words
count_nor = nor_words = 0
count_not = not_words = 0
for word, lang in zip(train_data.IPA, train_data.språk):
    count = word.count(most_weighted)
    if lang != 'norsk':
        not_words += 1
        count_not += count
    else:
        nor_words += 1
        count_nor += count

avg_nor = count_nor / nor_words
avg_not = count_not / not_words
print('Average occurrence of %s in Norwegian words: %f' % (most_weighted, avg_nor))
print('Average occurrence of %s in other words: %f' % (most_weighted, avg_not))
print('Least weighted symbol in Norwegian: %s' % least_weighted)

Most weighted symbol in Norwegian: ʋ
Average occurrence of ʋ in Norwegian words: 0.173192
Average occurrence of ʋ in other words: 0.000000
Least weighted symbol in Norwegian: a
