In [1]:
import time
from typing import List, Tuple, Optional, Dict  # noqa # pylint: disable=unused-import
import random
import json
import csv

## The Text Classifier Model

In [2]:
from typing import Dict, Tuple, List, Optional, Union

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

import numpy as np


class TextClassifierNB(object):
    """
    Multinomial naive Bayes text classifier over word and character n-gram counts.

    The features are the union of word 1-2 gram counts and word-boundary character 2-, 4- and 6-gram counts.
    """

    def __init__(self) -> None:
        self._labels = set()  # type: Set[str]
        # Label to idx mapping. Maintain during training.
        self._label_to_idx = {}  # type: Dict[str, int]

        self._next_label_idx = 0
        self._union_vectorizer = None  # type: Optional[FeatureUnion]
        self._clsfr = None  # type: Optional[MultinomialNB]

    def train(self, training_list: List[Tuple[str, str]]) -> bool:
        """
        Reset and train the classifier with expression-label pairs

        All previous state is discarded first, so after an unsuccessful call the classifier is untrained
        (classify() returns [] and get_informative_features() returns {}).

        :param training_list: List of labelled samples (text, label).
        :return: Boolean indicating whether or not the training was successful.
        """
        # Reset everything up front. Previously only the vectorizer was replaced, which left a fresh unfitted
        # vectorizer paired with the old classifier whenever training_list was empty.
        self._label_to_idx.clear()
        self._next_label_idx = 0
        self._labels.clear()
        self._union_vectorizer = None
        self._clsfr = None

        if len(training_list) == 0:
            return False

        max_features = 5000000
        union_vectorizer = FeatureUnion([
            ('w', CountVectorizer(analyzer='word', ngram_range=(1, 2), max_features=max_features,
                                  lowercase=True)),
            ('c2', CountVectorizer(analyzer='char_wb', ngram_range=(2, 2), max_features=max_features,
                                   lowercase=True)),
            ('c4', CountVectorizer(analyzer='char_wb', ngram_range=(4, 4), max_features=max_features,
                                   lowercase=True)),
            ('c6', CountVectorizer(analyzer='char_wb', ngram_range=(6, 6), max_features=max_features,
                                   lowercase=True)),
        ])

        text_inputs = [training_sample[0] for training_sample in training_list]  # type: List[str]
        labels = [training_sample[1] for training_sample in training_list]  # type: List[str]

        class_size_dict = {}  # type: Dict[str, int]
        for label in labels:
            class_size_dict[label] = class_size_dict.get(label, 0) + 1
        min_class_size = min(class_size_dict.values())

        feature_counts = union_vectorizer.fit_transform(text_inputs)
        cross_validation_folds = 5

        if min_class_size >= (cross_validation_folds * 3):
            # Do grid search with cross validation if the smallest class has enough samples for cross validation.
            grid_search_cv = GridSearchCV(estimator=MultinomialNB(),
                                          param_grid={'alpha': [0.01]},
                                          cv=cross_validation_folds,
                                          n_jobs=2,
                                          verbose=0)
            grid_search_cv.fit(feature_counts, labels)

            print("best_params_ =", grid_search_cv.best_params_, flush=True)
            print("cv_results_ =", grid_search_cv.cv_results_, flush=True)
            clsfr = grid_search_cv.best_estimator_
        else:
            clsfr = MultinomialNB(alpha=0.01)
            clsfr.fit(feature_counts, labels)

        # Publish the fitted pair only once both fits succeeded, so the two attributes always match.
        self._union_vectorizer = union_vectorizer
        self._clsfr = clsfr

        # === Update the _labels set and the _label_to_idx mapping ===
        for label in labels:
            self._labels.add(label)
            if label not in self._label_to_idx:
                self._label_to_idx[label] = self._next_label_idx
                self._next_label_idx += 1

        return True

    def get_informative_features(self, top_n: int) -> Dict[str, List[Tuple[str, float]]]:
        """
        Get the list of most informative features per class.

        With more than two classes there is one entry per class label. With one or two classes there is a
        single entry under the key 'success' (kept for backward compatibility), holding the weights of the
        last class in the classifier's classes_.

        :param top_n: The number of top features per class to list; values <= 0 give empty lists.
        :return: A dictionary of features per class, each list sorted by ascending weight.
        """
        informative_feature_dict = {}  # type: Dict[str, List[Tuple[str, float]]]

        if (self._clsfr is None) or (self._union_vectorizer is None):
            return informative_feature_dict

        # get_feature_names() was removed from scikit-learn; prefer its replacement, fall back for old releases.
        get_names = getattr(self._union_vectorizer, 'get_feature_names_out', None)
        if get_names is None:
            get_names = self._union_vectorizer.get_feature_names
        feature_names = get_names()

        class_labels = self._clsfr.classes_
        # MultinomialNB.coef_ was removed from scikit-learn. It used to mirror feature_log_prob_, except that
        # with exactly two classes it only exposed the row of the second class; that is reproduced here.
        log_probs = self._clsfr.feature_log_prob_

        if len(class_labels) > 2:
            weights_per_key = [(class_label, log_probs[i]) for i, class_label in enumerate(class_labels)]
        else:
            weights_per_key = [('success', log_probs[-1])]

        for key, weights in weights_per_key:
            argsorted_feature_indexes = np.argsort(weights)
            # Clamp to [0, number of features]; a plain [-top_n:] slice returns everything for top_n == 0.
            num_features = max(0, min(top_n, len(argsorted_feature_indexes)))
            first_kept = len(argsorted_feature_indexes) - num_features
            informative_feature_dict[key] = [(str(feature_names[j]), weights[j])
                                             for j in argsorted_feature_indexes[first_kept:]]

        return informative_feature_dict

    def classify(self, input_text: str,
                 weak_match_threshold: float,
                 top_n: int) -> List[Tuple[str, float]]:
        """ Find the n best matches to the input text.

        :param input_text: The expression to look for in the expression set.
        :param weak_match_threshold: Labels are kept when their probability is >= (1.0 - weak_match_threshold),
                                     so 0.7 culls much; 1.1 culls less, etc.
        :param top_n: The max number (i.e. k) of sorted matches to return.
        :return: Sorted list of scored labels [(str, float)]; empty if the classifier is untrained.
        """
        if (self._clsfr is None) or (self._union_vectorizer is None):
            return []

        feature_counts = self._union_vectorizer.transform([input_text])
        y_proba_list = self._clsfr.predict_proba(feature_counts)
        min_prob = 1.0 - weak_match_threshold

        scored_labels = []  # type: List[Tuple[str, float]]

        for i, class_label in enumerate(self._clsfr.classes_):
            prob = y_proba_list[0, i]  # type: float

            if prob >= min_prob:
                scored_labels.append((class_label, prob))

        scored_labels.sort(key=lambda scored_label: scored_label[1], reverse=True)

        return scored_labels[:top_n]


In [3]:
# Registry of the successfully trained classifiers, keyed by classifier name.
_text_clsfr_dict = {}  # type: Dict[str, TextClassifierNB]


def train_text_clsfr(name: str, training_list: List[Tuple[str, str]]) -> bool:
    """
    Train a new TextClassifierNB and register it under the given name.

    The registry entry is only written when training succeeds; on failure an existing entry for the
    name is left as it was.

    :param name: The name to register the classifier under.
    :param training_list: List of labelled samples (text, label).
    :return: True if the classifier was trained and registered, else False.
    """
    candidate = TextClassifierNB()

    if not candidate.train(training_list):
        return False

    _text_clsfr_dict[name] = candidate
    return True


def retrieve_text_class(name: str,
                        input_text: str,
                        weak_match_threshold: float,
                        top_n: int) -> List[Tuple[str, float]]:
    """
    Classify a text with the classifier registered under the given name.

    :param name: The name the classifier was registered under.
    :param input_text: The text to classify.
    :param weak_match_threshold: Passed through unchanged to the classifier's classify().
    :param top_n: The max number of scored labels to return.
    :return: The list of (label, score) tuples from classify(), or [] if no classifier has that name.
    """
    registered = _text_clsfr_dict.get(name)

    if registered is None:
        return []

    return registered.classify(input_text, weak_match_threshold, top_n)


## Helper Functions

In [4]:
import sklearn 
import sklearn.metrics

def _print_normalised_matrix(title, confusion_dict, label_list, total_for) -> None:
    """Print one normalised confusion matrix followed by the average of its countable diagonal cells.

    :param title: The heading printed above the matrix.
    :param confusion_dict: The sparse confusion matrix stored as a dict of dicts.
    :param label_list: The order and subset of class labels to print.
    :param total_for: Callable (row_label, column_label) -> the total the cell count is divided by.
    """
    print("===")
    print(title)
    for column_label in label_list:
        print(column_label[:5], ".\t", end='')
    print()

    diag_total = 0.0
    diag_count = 0

    for row_label in label_list:
        row_dict = confusion_dict.get(row_label, {})

        for column_label in label_list:
            total = total_for(row_label, column_label)

            if total > 0:
                fraction = len(row_dict.get(column_label, [])) / total
                # ANSI foreground colour 30..37 chosen from the size of the fraction.
                print('\033[%dm' % int(37.0 - round(fraction * 7)), end='')
                print(round(fraction, 3), "\t", end='')

                # The '_nc' (not classified) label is not a real class and is left out of the average.
                if (column_label != '_nc') and (column_label == row_label):
                    diag_total += fraction
                    diag_count += 1
            else:
                print('\033[0m', end='')
                print('--- \t', end='')

        print('\033[0m')

    # Without any countable diagonal cell the average is undefined; report nan instead of dividing by zero.
    average = (diag_total / diag_count) if diag_count > 0 else float('nan')
    print(f"AVRG = {average}")
    print("===")
    print()


def print_confusion_matrix(confusion_dict: Dict[str, Dict[str, List[Tuple[str, Optional[str]]]]],
                           proposed_label_list: Optional[List[str]] = None) -> None:
    """
    Print the confusion matrix. This method also serves as an example of how to use the sparse confusion matrix.

    Two views are printed: the recall matrix (each cell divided by its row total) and the precision matrix
    (each cell divided by its column total). Each is followed by the average of its diagonal, or nan when
    no diagonal cell could be computed.

    :param confusion_dict: The sparse confusion matrix stored as a dict of dicts.
    :param proposed_label_list: The proposed order and subset of class labels to use.
    """
    print()
    print("label_list:", proposed_label_list)
    print()

    # Generate the proposed label list if none provided.
    if proposed_label_list is None:
        label_set = set(confusion_dict)  # the row labels ...
        for row_dict in confusion_dict.values():
            label_set.update(row_dict)  # ... and the column labels.

        proposed_label_list = sorted(label_set)

    # Set for O(1) membership tests while totalling.
    selected_labels = set(proposed_label_list)

    # Calculate the row and column totals.
    row_total_dict = {}  # type: Dict[str, int]
    column_total_dict = {}  # type: Dict[str, int]

    for row_label, row_dict in confusion_dict.items():
        if row_label not in selected_labels:
            continue

        row_total = 0

        for column_label, cell in row_dict.items():
            if column_label in selected_labels:
                cell_len = len(cell)
                row_total += cell_len
                column_total_dict[column_label] = column_total_dict.get(column_label, 0) + cell_len

        row_total_dict[row_label] = row_total

    # Print the recall confusion matrix
    _print_normalised_matrix("Recall Confusion Matrix:", confusion_dict, proposed_label_list,
                             lambda row_label, column_label: row_total_dict.get(row_label, 0))

    # Print the precision confusion matrix
    _print_normalised_matrix("Precision Confusion Matrix:", confusion_dict, proposed_label_list,
                             lambda row_label, column_label: column_total_dict.get(column_label, 0))


def analyse_clsfr_results(result_list: List[Tuple[str, str, List[str], Optional[str]]]) -> \
                          Tuple[float, float, Dict[str, Dict[str, List[Tuple[str, Optional[str]]]]]]:
    """
    Score a list of classification results and build the sparse confusion matrix.

    A sample counts towards the accuracy when its true label appears anywhere in its predicted labels.
    The f1 score and the confusion matrix use only the first predicted label, or '_nc' when there is none.

    :param result_list: List of result tuples (sample, true_label, predicted_labels/top-n-labels, sample_uuid).
    :return: The classifier accuracy, the weighted f1 score and the confusion matrix
             Dict[true_label, Dict[predicted_label, List[(sample, sample_uuid)]]]; (0.0, 0.0, {}) if empty.
    """
    if len(result_list) == 0:
        return 0.0, 0.0, {}

    # Sparse confusion matrix: rows are the true labels, columns the predicted labels.
    confusion_dict = {}  # type: Dict[str, Dict[str, List[Tuple[str, Optional[str]]]]]
    labels_true = []  # type: List[str]
    labels_predicted = []  # type: List[str]
    num_matched = 0

    for result_sample in result_list:
        true_label = result_sample[1]
        top_labels = result_sample[2]
        predicted_label = top_labels[0] if len(top_labels) > 0 else '_nc'

        row_dict = confusion_dict.setdefault(true_label, {})
        row_dict.setdefault(predicted_label, []).append((result_sample[0], result_sample[3]))

        # === Update the global scoring ===
        if true_label in top_labels:
            num_matched += 1

        labels_true.append(true_label)
        labels_predicted.append(predicted_label)

    accuracy = num_matched / len(result_list)
    f1 = sklearn.metrics.f1_score(labels_true, labels_predicted, average='weighted')

    return accuracy, f1, confusion_dict


## Lexicon LID

In [5]:
def create_lang_token_dict(sent_list: List[Tuple[str, str]], drop_prob: float = 0.0) -> Dict[str, Dict[str, int]]:
    """
    Build a token count lexicon per language from labelled sentences.

    Sentences are lower-cased and split on whitespace. One random number is drawn per token and the token
    is only counted when that number is >= drop_prob.

    :param sent_list: The list of labelled sentences (text, label).
    :param drop_prob: The threshold the per-token random number is compared against.
    :return: The lexicons as Dict[lang, Dict[token, count]].
    """
    lexicons = {}  # type: Dict[str, Dict[str, int]]

    for sentence, lang in sent_list:
        lexicon = lexicons.setdefault(lang, {})

        for word in sentence.lower().split():
            if random.random() >= drop_prob:
                lexicon[word] = lexicon.get(word, 0) + 1

    return lexicons


def pred_language_lex(lang_token_dict: Dict[str, Dict[str, int]],
                      input_text: str) -> List[Tuple[str, float]]:
    """
    Use the language lexicons to predict which language the text is written in.

    The score of a language is the number of input tokens (lower-cased, whitespace split) that occur in
    its lexicon.

    :param lang_token_dict: The token vs. count lexicon for each language Dict[lang, Dict[token, count]]
    :param input_text: The input text to LID.
    :return: The scored language labels sorted by descending score, or an empty list if the best score does
             not strictly beat the runner-up (a score of 0 when there is only one language).
    """
    input_tokens = input_text.lower().split()

    scored_labels = []  # type: List[Tuple[str, float]]

    for lang_code, word_dict in lang_token_dict.items():
        # Count the number of tokens from language lang_code that appear in input_text
        score = float(sum(1 for token in input_tokens if word_dict.get(token, 0) > 0))
        scored_labels.append((lang_code, score))

    if not scored_labels:
        # No lexicons at all, so nothing can be predicted.
        return []

    scored_labels.sort(key=lambda scored_label: scored_label[1], reverse=True)

    # With a single lexicon there is no second entry to index; treat the missing runner-up as scoring 0.
    runner_up_score = scored_labels[1][1] if len(scored_labels) > 1 else 0.0

    # Return an empty list if the LID result had low confidence.
    if scored_labels[0][1] <= runner_up_score:
        return []

    return scored_labels


def add_pred_labels_lex(lang_token_dict: Dict[str, Dict[str, int]],
                        sent_list: List[Tuple[str, str]]) -> List[Tuple[str, str, List[str]]]:
    """
    Add the predicted language labels to the sentences.

    Misclassified sentences are printed, and a progress dot is printed roughly every 10% of the list.

    :param lang_token_dict: The token vs. count lexicon for each language Dict[lang, Dict[token, count]]
    :param sent_list: The list of sentences labelled with only the truth.
    :return: The list of sentences labelled with the truth and the predicted label (an empty label list
             when the lexicon LID returned no prediction).
    """
    sent_list_pred = []  # type: List[Tuple[str, str, List[str]]]
    # At least 1: for fewer than 10 sentences the integer division gives 0 and the modulo below would raise.
    progress_step = max(1, len(sent_list) // 10)

    for sentence_num, (sentence, truth) in enumerate(sent_list, start=1):
        prediction = pred_language_lex(lang_token_dict, sentence)
        top_labels = [prediction[0][0]] if len(prediction) > 0 else []

        sent_list_pred.append((sentence, truth, top_labels))

        if top_labels and (truth != top_labels[0]):
            print(truth, prediction[:3], sentence)
            print()

        if (sentence_num % progress_step) == 0:
            print(".", end="", flush=True)

    return sent_list_pred


## Naive Bayesian LID

In [6]:
def pred_language(text_clsfr_name,
                  text: str, 
                  threshold: float) -> List[Tuple[str, float]]:
    """
    Predict the language of a text with a named naive Bayes classifier.

    :param text_clsfr_name: The name of the classifier to use.
    :param text: The input text to LID.
    :param threshold: Converted to the weak match threshold (1.0 - threshold) expected by retrieve_text_class.
    :return: At most 10 scored language labels as returned by retrieve_text_class.
    """
    weak_match_threshold = 1.0 - threshold
    return retrieve_text_class(text_clsfr_name, text, weak_match_threshold, 10)


def add_pred_labels(text_clsfr_name, 
                    sent_list: List[Tuple[str, str]], 
                    threshold: float) -> List[Tuple[str, str, List[str]]]:
    """
    Add the predicted language labels to the sentences.

    A progress dot is printed roughly every 10% of the list.

    :param text_clsfr_name: The name of the classifier to use.
    :param sent_list: The list of sentences labelled with only the truth.
    :param threshold: The threshold passed on to pred_language for every sentence.
    :return: The list of sentences labelled with the truth and the predicted label (an empty label list
             when the classifier returned no prediction).
    """
    sent_list_pred = []  # type: List[Tuple[str, str, List[str]]]
    # At least 1: for fewer than 10 sentences the integer division gives 0 and the modulo below would raise.
    progress_step = max(1, len(sent_list) // 10)

    for sentence_num, (sentence, truth) in enumerate(sent_list, start=1):
        prediction = pred_language(text_clsfr_name, sentence, threshold)

        sent_list_pred.append((sentence, truth, [prediction[0][0]] if len(prediction) > 0 else []))

        if (sentence_num % progress_step) == 0:
            print(".", end="", flush=True)

    return sent_list_pred

In [7]:
## Data Loading Functions

In [8]:
import string 

def cleanup_text(input_text: str) -> str:
    """
    Normalise a line of text. NOTE: Only used by the LID_ZA model.

    The text is lower-cased, digits and all ASCII punctuation except '-' are turned into spaces, three
    garbled character sequences are repaired or removed, and runs of whitespace are collapsed to one space.

    :param input_text: The input text.
    :return: The cleaned input text
    """
    unwanted_chars = string.punctuation.replace('-', '') + '0123456789'
    to_spaces = str.maketrans(unwanted_chars, ' ' * len(unwanted_chars))

    cleaned = input_text.lower().translate(to_spaces)

    # (garbled sequence, replacement); applied in this order. All other special characters are kept.
    for garbled, replacement in (('ã…â¡', 'š'), ('ï¿½', ''), ('ª', '')):
        cleaned = cleaned.replace(garbled, replacement)

    # split()/join also removes any leading and trailing whitespace.
    return " ".join(cleaned.split())


def shorten_sentences(sentences: List[Tuple[str, str]], min_length: int) -> List[Tuple[str, str]]:
    """
    Truncate each sentence at the first space at or after min_length characters.

    Sentences shorter than min_length are dropped; sentences without a space at or after min_length are
    kept whole.

    :param sentences: The list of labelled sentences (text, label).
    :param min_length: The minimum number of characters to keep of each sentence.
    :return: The list of shortened labelled sentences.
    """
    shortened = []  # type: List[Tuple[str, str]]

    for sentence, lang in sentences:
        if len(sentence) < min_length:
            continue

        for cut in range(min_length, len(sentence)):
            if sentence[cut] == ' ':
                break
        else:
            # No space found (or nothing to scan): keep the whole sentence.
            cut = len(sentence)

        shortened.append((sentence[:cut], lang))

    return shortened


def load_sentences_nchlt(filename: str, label: str) -> List[Tuple[str, str]]:
    """
    Load the sentences/lines of text from a text corpus file.

    Lines starting with "<fn" are skipped. Every other line is stripped and passed through cleanup_text,
    and kept only if the cleaned text is longer than 200 and shorter than 300 characters.

    :param filename: Name of the file to load.
    :param label: The label to assign to each sentence.
    :return: List of labelled sentence strings.
    """
    sent_list = []  # type: List[Tuple[str, str]]

    with open(filename, 'rt') as corpus_file:
        print("Loading sentences from", filename)

        for raw_line in corpus_file:
            if raw_line.startswith("<fn"):
                continue

            cleaned = cleanup_text(raw_line.strip())

            if 200 < len(cleaned) < 300:
                sent_list.append((cleaned, label))

    return sent_list


def save_sentences_nchlt(filename: str, label_to_save: str, labelled_sentences: List[Tuple[str, str]]) -> None:
    """
    Save the sentences of one label to a text corpus file, one sentence per line.

    :param filename: Name of the file to write.
    :param label_to_save: Only sentences with exactly this label are written.
    :param labelled_sentences: The list of labelled sentences (text, label).
    """
    with open(filename, 'wt') as corpus_file:
        print("Saving sentences to", filename)
        corpus_file.writelines(sentence + '\n'
                               for sentence, label in labelled_sentences
                               if label == label_to_save)


def load_sentences_dsl(filename: str) -> List[Tuple[str, str]]:
    """
    Load labelled sentences from a tab separated file with one "text<TAB>label" pair per line.

    The file is read without any quote handling. Rows that do not have exactly two fields are skipped.

    :param filename: Name of the file to load.
    :return: List of labelled sentence strings.
    """
    with open(filename, newline='') as csvfile:
        rows = csv.reader(csvfile, delimiter='\t', quoting=csv.QUOTE_NONE)
        return [(row[0], row[1]) for row in rows if len(row) == 2]


def save_sentences_lanidenn(filename: str, labelled_sentences: List[Tuple[str, str]]) -> None:
    """
    Save the labelled sentences as lines of the form "<label>.txt <sentence>".

    :param filename: Name of the file to write.
    :param labelled_sentences: The list of labelled sentences (text, label).
    """
    with open(filename, 'wt') as out_file:
        print("Saving sentences to", filename)
        out_file.writelines(f"{label}.txt {sentence}\n" for sentence, label in labelled_sentences)


def save_sentences_fasttext(filename: str, labelled_sentences: List[Tuple[str, str]]) -> None:
    """
    Save the labelled sentences as lines of the form "__label__<label> , <sentence>".

    :param filename: Name of the file to write.
    :param labelled_sentences: The list of labelled sentences (text, label).
    """
    with open(filename, 'wt') as out_file:
        print("Saving sentences to", filename)
        out_file.writelines(f"__label__{label} , {sentence}\n" for sentence, label in labelled_sentences)


## Load either the NCHLT Data _OR_ the DSL 2015 Data _OR_ the DSL 2017 Data

In [None]:
# Checkout the NCHLT data.
# NCHLT part of the repo in data folder.


In [None]:
# Download the DSL 2015 data.
!wget https://github.com/Simdiva/DSL-Task/raw/master/data/DSLCC-v2.0/train-dev/train.txt -O train.txt
!wget https://github.com/Simdiva/DSL-Task/raw/master/data/DSLCC-v2.0/gold/test-gold.txt -O test-gold.txt
# Will be loaded like:
#  sent_list_train = load_sentences_dsl("train.txt")  # DSL2015
#  sent_list_test = load_sentences_dsl("test-gold.txt")  # DSL2015


In [None]:
# Download the DSL 2017 data.
!wget http://scholar.harvard.edu/files/malmasi/files/dslcc4.zip -O dslcc4.zip
!unzip dslcc4.zip -d dslcc4
# Will be loaded like:
#  sent_list_train = load_sentences_dsl("dslcc4/DSL-TRAIN.txt")  # DSL2017
#  sent_list_test = load_sentences_dsl("dslcc4/DSL-TEST-GOLD.txt")  # DSL2017


### Load NCHLT data.

In [None]:
### Load either the original NCHLT data ...
# Maps each language code to its corpus file. The corpus version is part of the file name and the
# English corpus has a different version from the others.
language_set = {code: f'../data/{code}/original_NCHLT_{code}_CLEAN.{version}.txt'
                for code, version in (('afr', '2.0'),
                                      ('eng', '1.0.0'),
                                      ('nbl', '2.0'),
                                      ('xho', '2.0'),
                                      ('zul', '2.0'),
                                      ('nso', '2.0'),
                                      ('sot', '2.0'),
                                      ('tsn', '2.0'),
                                      ('ssw', '2.0'),
                                      ('ven', '2.0'),
                                      ('tso', '2.0'))}


In [9]:
### OR the improved NCHLT data that has been checked by some humans. See https://arxiv.org/abs/1711.00247
# Maps each language code to its corpus file.
language_set = {code: f'../data/{code}/improved_{code}.txt'
                for code in ('afr', 'eng', 'nbl', 'xho', 'zul', 'nso', 'sot', 'tsn', 'ssw', 'ven', 'tso')}


In [10]:
## List the languages and language families.
# Each entry maps a classifier name to the set of language codes that classifier has to tell apart.
text_clsfr_lang_dict = {}  # type: Dict[str, Set[str]]
text_clsfr_lang_dict['all'] = {'zul', 'xho', 'nbl', 'ssw', 'nso', 'sot', 'tsn', 'ven', 'tso', 'afr', 'eng'}
text_clsfr_lang_dict['nguni'] = {'zul', 'xho', 'nbl', 'ssw'}
text_clsfr_lang_dict['sotho'] = {'nso', 'sot', 'tsn'}

# All of the datasets will be loaded, but only the ones selected above will be used to build the classifiers!!
num_training_samples_pl = 3500  # samples per language! Adjust to fit within the number of samples available per language.
num_testing_samples_pl = 600  # samples per language! Adjust to fit within the number of samples available per language.

# Note: if the values above are too big for the data, no exception is raised. The slices below are clamped
# to the available sentences, so that language silently contributes fewer training and/or testing samples.
# Check the per language counts printed below.

sent_list_train = []
sent_list_test = []

for label, path in language_set.items():
    sent_list_pl = load_sentences_nchlt(path, label)
    # Shuffle before slicing so that the train/test split is a random sample of the file.
    random.shuffle(sent_list_pl)
    print(f"len(sent_list_pl) for {label} =", len(sent_list_pl), flush=True)
    # The first num_training_samples_pl sentences are for training, the next num_testing_samples_pl for testing.
    sent_list_train.extend(sent_list_pl[:num_training_samples_pl])
    sent_list_test.extend(sent_list_pl[num_training_samples_pl:(num_training_samples_pl + num_testing_samples_pl)])

print("len(training_samples):", len(sent_list_train))
print("len(testing_samples):", len(sent_list_test))

# Mix the languages, which were appended one after the other above.
random.shuffle(sent_list_train)
random.shuffle(sent_list_test)

# Shortens the test sentences to make the problem harder ...
sent_list_test_shortened = shorten_sentences(sent_list_test, 15)
# OR make the short test sentences exactly the test sentences.
# sent_list_test_shortened = sent_list_test

Loading sentences from ../data/afr/improved_afr.txt
len(sent_list_pl) for afr = 11310
Loading sentences from ../data/eng/improved_eng.txt
len(sent_list_pl) for eng = 25255
Loading sentences from ../data/nbl/improved_nbl.txt
len(sent_list_pl) for nbl = 5422
Loading sentences from ../data/xho/improved_xho.txt
len(sent_list_pl) for xho = 7951
Loading sentences from ../data/zul/improved_zul.txt
len(sent_list_pl) for zul = 10485
Loading sentences from ../data/nso/improved_nso.txt
len(sent_list_pl) for nso = 7208
Loading sentences from ../data/sot/improved_sot.txt
len(sent_list_pl) for sot = 6719
Loading sentences from ../data/tsn/improved_tsn.txt
len(sent_list_pl) for tsn = 5056
Loading sentences from ../data/ssw/improved_ssw.txt
len(sent_list_pl) for ssw = 5113
Loading sentences from ../data/ven/improved_ven.txt
len(sent_list_pl) for ven = 4139
Loading sentences from ../data/tso/improved_tso.txt
len(sent_list_pl) for tso = 5117
len(training_samples): 38500
len(testing_samples): 6600


### OR load DSL2015 data.

In [None]:
# List the languages and language families.
# Each entry maps a classifier name to the set of language codes that classifier has to tell apart.
text_clsfr_lang_dict = {
    'all': {'bs', 'hr', 'sr', 'id', 'my', 'cz', 'sk', 'pt-BR', 'pt-PT', 'es-AR', 'es-ES', 'bg', 'mk', 'xx'},
    'group_a': {'bs', 'hr', 'sr'},
    'spanish': {'es-AR', 'es-ES'},
    'portuguese': {'pt-BR', 'pt-PT'},
    'group_b': {'id', 'my'},
    'group_c': {'cz', 'sk'},
    'group_f': {'bg', 'mk'},
}  # type: Dict[str, Set[str]]

sent_list_train = load_sentences_dsl("train.txt")  # DSL2015
sent_list_test = load_sentences_dsl("test-gold.txt")  # DSL2015

print("training_samples:", len(sent_list_train))
print("testing_samples:", len(sent_list_test))

random.shuffle(sent_list_train)
random.shuffle(sent_list_test)

# The DSL test sentences are used as they are; shorten_sentences(sent_list_test, 15) is the alternative.
sent_list_test_shortened = sent_list_test

### OR load DSL2017 data.

In [None]:
# List the languages and language families.
# Each entry maps a classifier name to the set of language codes that classifier has to tell apart.
text_clsfr_lang_dict = {
    'all': {'bs', 'hr', 'sr', 'id', 'my', 'pt-BR', 'pt-PT', 'es-AR', 'es-ES', 'es-PE', 'fr-CA', 'fr-FR', 'fa-IR', 'fa-AF'},
    'group_a': {'bs', 'hr', 'sr'},
    'group_b': {'id', 'my'},
    'portuguese': {'pt-BR', 'pt-PT'},
    'spanish': {'es-AR', 'es-ES', 'es-PE'},
    'french': {'fr-CA', 'fr-FR'},
    'group_f': {'fa-IR', 'fa-AF'},
}  # type: Dict[str, Set[str]]

sent_list_train = load_sentences_dsl("dslcc4/DSL-TRAIN.txt")  # DSL2017
sent_list_test = load_sentences_dsl("dslcc4/DSL-TEST-GOLD.txt")  # DSL2017

print("training_samples:", len(sent_list_train))
print("testing_samples:", len(sent_list_test))

random.shuffle(sent_list_train)
random.shuffle(sent_list_test)

# The DSL test sentences are used as they are; shorten_sentences(sent_list_test, 15) is the alternative.
sent_list_test_shortened = sent_list_test

## Train the LID Model
Each group in `text_clsfr_lang_dict` gets a model.

`text_clsfr_lang_dict` was defined when the data was loaded.

This is typically one 'all' model + one model per language group/family.

In [11]:
## Group the training and the (shortened) testing samples per classifier in text_clsfr_lang_dict.
## A sentence goes to every classifier whose language set contains the sentence's language, e.g. to the
## 'all' classifier and to the classifier of its language group/family.

clsfr_sent_list_train = {}  # type: Dict[str, List[Tuple[str, str]]]
clsfr_sent_list_test = {}  # type: Dict[str, List[Tuple[str, str]]]
# Dict[clsfr_name, List[Tuple[text, lang_code]]]

# Add training samples to the various classifiers.
for sent_text, sent_lang in sent_list_train:
    for clsfr_name, clsfr_lang_set in text_clsfr_lang_dict.items():
        if sent_lang not in clsfr_lang_set:
            continue
        sent_list = clsfr_sent_list_train.setdefault(clsfr_name, [])
        sent_list.append((sent_text, sent_lang))

# Add shortened testing samples to the various classifiers.
for sent_text, sent_lang in sent_list_test_shortened:
    for clsfr_name, clsfr_lang_set in text_clsfr_lang_dict.items():
        if sent_lang not in clsfr_lang_set:
            continue
        sent_list = clsfr_sent_list_test.setdefault(clsfr_name, [])
        sent_list.append((sent_text, sent_lang))

# print(clsfr_sent_list_train['all'][0])
# print(clsfr_sent_list_test['all'][0])

In [12]:
## Generate token dict for LEXICON LID.
# The support (coverage) of the lexicon significantly impacts the results.
# NOTE(review): the active line below builds the lexicon from the training AND the testing sentences, so
# the lexicon has already seen the test data. The last commented-out alternative uses only the training
# sentences. Confirm which one is intended before comparing the lexicon results to the NB results.

lang_token_dict = create_lang_token_dict(sent_list_train + sent_list_test, drop_prob = 0.0)
# lang_token_dict = create_lang_token_dict(sent_list_train + sent_list_test, drop_prob = 0.5)
# lang_token_dict = create_lang_token_dict(sent_list_train, drop_prob = 0.0)

# Report the number of distinct tokens in the lexicon of each language.
for lang_code, token_dict in lang_token_dict.items():
    print(f"Size of '{lang_code}' lexicon is {len(token_dict)} words.")

Size of 'ven' lexicon is 9982 words.
Size of 'xho' lexicon is 30763 words.
Size of 'sot' lexicon is 11263 words.
Size of 'eng' lexicon is 11929 words.
Size of 'nbl' lexicon is 24535 words.
Size of 'afr' lexicon is 14625 words.
Size of 'ssw' lexicon is 28330 words.
Size of 'zul' lexicon is 33736 words.
Size of 'nso' lexicon is 12712 words.
Size of 'tsn' lexicon is 12907 words.
Size of 'tso' lexicon is 10585 words.


In [13]:
## Test LEXICON based LID
print("Running Lexicon LID on test data ... ")
start_time = time.time()

sent_list_pred = add_pred_labels_lex(lang_token_dict, sent_list_test_shortened)

end_time = time.time()
# The rate is based on the sentences that were actually classified (sent_list_test_shortened), which can
# be fewer than sent_list_test because shortening drops sentences that are too short.
print('done. Testing time = ' + str(end_time - start_time) + 's OR ' +
      str(round(len(sent_list_pred) / (end_time - start_time), 2)) + " LIDs/sec")
print()

print("Analysing LID test results ...")
start_time = time.time()

# analyse_clsfr_results expects a 4th tuple element (a sample id); there are none here, so use None.
result_list = []  # type: List[Tuple[str, str, List[str], Optional[str]]]

for sentence, truth, pred_list in sent_list_pred:
    result_list.append((sentence, truth, pred_list, None))

# Note: despite the '_nb' suffix these variables hold the LEXICON LID results in this cell.
acc_nb, f1_nb, confusion_dict_nb = analyse_clsfr_results(result_list)

print("acc_lex, f1_lex", acc_nb, f1_nb)
print_confusion_matrix(confusion_dict_nb)


Running Lexicon LID on test data ... 
..........done. Testing time = 0.15487885475158691s OR 42613.95 LIDs/sec

Analysing LID test results ...
acc_lex, f1_lex 0.8409090909090909 0.9090042726142343

label_list: None

===
Recall Confusion Matrix:
_nc .	afr .	eng .	nbl .	nso .	sot .	ssw .	tsn .	tso .	ven .	xho .	zul .	
[0m--- 	[0m--- 	[0m--- 	[0m--- 	[0m--- 	[0m--- 	[0m--- 	[0m--- 	[0m--- 	[0m--- 	[0m--- 	[0m--- 	[0m
[37m0.023 	[30m0.977 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.0 	[0m
[36m0.18 	[37m0.0 	[31m0.82 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.0 	[0m
[36m0.145 	[37m0.0 	[37m0.0 	[31m0.855 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.0 	[0m
[35m0.292 	[37m0.0 	[37m0.0 	[37m0.0 	[32m0.708 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.0 	[0m
[35m0.285 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.0 	[3

  'recall', 'true', average, warn_for)


In [14]:
## Train the NB classifiers.

print("Training the NB text classifiers ...")
start_time = time.time()

# Train one classifier per entry in text_clsfr_lang_dict, each on its own training list.
for clsfr_name, clsfr_lang_set in text_clsfr_lang_dict.items():
    print(clsfr_name, clsfr_lang_set, "...")
    # Hand the trainer a fresh list of (text, label) pairs.
    training_pairs = [(text, label) for text, label in clsfr_sent_list_train[clsfr_name]]
    train_text_clsfr(clsfr_name, training_pairs)

end_time = time.time()

print(f"done. Training time = {end_time - start_time}s.")


Training the NB text classifiers ...
all {'tso', 'nso', 'xho', 'sot', 'eng', 'zul', 'tsn', 'ven', 'afr', 'nbl', 'ssw'} ...
best_params_ = {'alpha': 0.01}
cv_results_ = {'mean_fit_time': array([1.56876283]), 'std_fit_time': array([0.39141409]), 'mean_score_time': array([0.27869906]), 'std_score_time': array([0.09435877]), 'param_alpha': masked_array(data=[0.01],
             mask=[False],
       fill_value='?',
            dtype=object), 'params': [{'alpha': 0.01}], 'split0_test_score': array([0.99961039]), 'split1_test_score': array([0.99948052]), 'split2_test_score': array([0.99974026]), 'split3_test_score': array([0.99987013]), 'split4_test_score': array([0.99974026]), 'mean_test_score': array([0.99968831]), 'std_test_score': array([0.00013244]), 'rank_test_score': array([1], dtype=int32)}
nguni {'ssw', 'xho', 'nbl', 'zul'} ...
best_params_ = {'alpha': 0.01}
cv_results_ = {'mean_fit_time': array([0.22843504]), 'std_fit_time': array([0.03145271]), 'mean_score_time': array([0.03035741]

## Test the LID Model

In [15]:
## Test the NB classifiers' accuracies in isolation.

for clsfr_name, clsfr_lang_set in text_clsfr_lang_dict.items():
    print(clsfr_name, clsfr_lang_set, "...")

    # Classify this classifier's own test sentences (threshold argument 0.0).
    sent_list_pred = add_pred_labels(clsfr_name, clsfr_sent_list_test[clsfr_name], 0.0)

    # analyse_clsfr_results is given 4-tuples; the trailing optional field is left as None.
    result_list = [(sentence, truth, pred_list, None) for sentence, truth, pred_list in sent_list_pred]

    acc, f1, confusion_dict = analyse_clsfr_results(result_list)

    print("  acc, f1", acc, f1)
    print_confusion_matrix(confusion_dict)
    print()

all {'tso', 'nso', 'xho', 'sot', 'eng', 'zul', 'tsn', 'ven', 'afr', 'nbl', 'ssw'} ...
..........  acc, f1 0.9431818181818182 0.9431654645889058

label_list: None

===
Recall Confusion Matrix:
afr .	eng .	nbl .	nso .	sot .	ssw .	tsn .	tso .	ven .	xho .	zul .	
[30m0.988 	[37m0.003 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.007 	[37m0.0 	[37m0.0 	[37m0.002 	[37m0.0 	[0m
[37m0.003 	[30m0.99 	[37m0.0 	[37m0.002 	[37m0.0 	[37m0.0 	[37m0.002 	[37m0.003 	[37m0.0 	[37m0.0 	[37m0.0 	[0m
[37m0.0 	[37m0.002 	[30m0.947 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.008 	[37m0.043 	[0m
[37m0.0 	[37m0.003 	[37m0.0 	[31m0.903 	[37m0.03 	[37m0.0 	[37m0.062 	[37m0.0 	[37m0.0 	[37m0.002 	[37m0.0 	[0m
[37m0.0 	[37m0.007 	[37m0.0 	[37m0.025 	[31m0.907 	[37m0.002 	[37m0.06 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.0 	[0m
[37m0.0 	[37m0.003 	[37m0.01 	[37m0.0 	[37m0.0 	[30m0.967 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.003 	[37m0.017 	[0m

In [16]:
## Stacked NB + NB LID function.

def add_pred_labels_cmb_nb(sent_list: List[Tuple[str, str]]) -> Tuple[List[Tuple[str, str, List[str]]],
                                                                      List[Tuple[str, str, List[str]]]]:
    """
    Add the predicted language labels to the sentences using stacked NB + NB LID.

    Each sentence is first classified with the 'all' classifier. The first classifier in
    ``text_clsfr_lang_dict`` (other than 'all') whose language set contains the top 'all' label is
    then queried as well. If it returns a non-empty prediction, that becomes the combined
    prediction; otherwise the 'all' prediction is kept.

    Sentences for which the two predictions differ, or for which the combined prediction is wrong,
    are printed. Running accuracies are printed after every 100 sentences.

    NOTE: the 'all' prediction is indexed with ``[0][0]`` without a length check, so this assumes
    ``pred_language('all', ...)`` returns at least one candidate - TODO confirm.

    :param sent_list: List of (text, true label) pairs.
    :return: Tuple of (base predictions, combined predictions). Each is a list of
             (text, true label, [top predicted label]) in input order.
    """
    sent_list_len = len(sent_list)

    sent_list_pred_all = []  # type: List[Tuple[str, str, List[str]]]
    sent_list_pred_cmb = []  # type: List[Tuple[str, str, List[str]]]

    correct_all = 0
    correct_cmb = 0

    for sentence_num, (sentence, truth) in enumerate(sent_list, start=1):
        # Get the base naive Bayes LID prediction.
        prediction_all = pred_language('all', sentence, 0.0)
        label_all = prediction_all[0][0]

        # Default to the base prediction; only a non-empty family result overrides it.
        prediction_cmb = prediction_all

        for clsfr_name, clsfr_lang_set in text_clsfr_lang_dict.items():
            if clsfr_name == 'all' or label_all not in clsfr_lang_set:
                continue

            prediction_fam = pred_language(clsfr_name, sentence, 0.0)

            if prediction_fam:
                prediction_cmb = prediction_fam

            # Only the first matching family classifier is consulted.
            break

        label_cmb = prediction_cmb[0][0]

        # ========================================
        # === Record the predictions and stats ===
        sent_list_pred_all.append((sentence, truth, [label_all]))
        sent_list_pred_cmb.append((sentence, truth, [label_cmb]))

        if label_all == truth:
            correct_all += 1

        if label_cmb == truth:
            correct_cmb += 1

        # Show every sentence where stacking changed the label or the final label is wrong.
        if (label_all != label_cmb) or (truth != label_cmb):
            print(truth, sentence, flush=True)
            print("prediction_all[:3] =", prediction_all[:3])
            print("prediction_cmb =", prediction_cmb)
            print()

        if (sentence_num % 100) == 0:
            print(f"{round(sentence_num / sent_list_len * 100.0, 2)}%", flush=True)
            print("  acc =", correct_all / sentence_num)
            print("  acc_cmb =", correct_cmb / sentence_num)
            print()

    return sent_list_pred_all, sent_list_pred_cmb


In [17]:
## Stacked NB + Lexicon-based LID function.

def add_pred_labels_cmb_lexicon(sent_list: List[Tuple[str, str]]) -> Tuple[List[Tuple[str, str, List[str]]],
                                                                      List[Tuple[str, str, List[str]]]]:
    """
    Add the predicted language labels to the sentences using stacked NB + lexicon LID.

    Each sentence is first classified with the 'all' classifier. The first group in
    ``text_clsfr_lang_dict`` (other than 'all') whose language set contains the top 'all' label
    selects the candidate languages: the lexicon prediction for the sentence is filtered down to
    that group's languages. If any candidates remain, they become the combined prediction;
    otherwise the 'all' prediction is kept.

    Sentences for which the two predictions differ, or for which the combined prediction is wrong,
    are printed. Running accuracies are printed after every 100 sentences.

    NOTE: the 'all' prediction is indexed with ``[0][0]`` without a length check, so this assumes
    ``pred_language('all', ...)`` returns at least one candidate - TODO confirm.

    :param sent_list: List of (text, true label) pairs.
    :return: Tuple of (base predictions, combined predictions). Each is a list of
             (text, true label, [top predicted label]) in input order.
    """
    sent_list_len = len(sent_list)

    sent_list_pred_all = []  # type: List[Tuple[str, str, List[str]]]
    sent_list_pred_cmb = []  # type: List[Tuple[str, str, List[str]]]

    correct_all = 0
    correct_cmb = 0

    for sentence_num, (sentence, truth) in enumerate(sent_list, start=1):
        # Get the base naive Bayes LID prediction.
        prediction_all = pred_language('all', sentence, 0.0)
        label_all = prediction_all[0][0]

        # Default to the base prediction; only a non-empty lexicon result overrides it.
        prediction_cmb = prediction_all

        for clsfr_name, clsfr_lang_set in text_clsfr_lang_dict.items():
            if clsfr_name == 'all' or label_all not in clsfr_lang_set:
                continue

            # Keep only the lexicon candidates that belong to the matched group.
            prediction_group = [(lang_code, score)
                                for lang_code, score in pred_language_lex(lang_token_dict, sentence)
                                if lang_code in clsfr_lang_set]

            if prediction_group:
                prediction_cmb = prediction_group

            # Only the first matching group is consulted.
            break

        label_cmb = prediction_cmb[0][0]

        # ========================================
        # === Record the predictions and stats ===
        sent_list_pred_all.append((sentence, truth, [label_all]))
        sent_list_pred_cmb.append((sentence, truth, [label_cmb]))

        if label_all == truth:
            correct_all += 1

        if label_cmb == truth:
            correct_cmb += 1

        # Show every sentence where stacking changed the label or the final label is wrong.
        if (label_all != label_cmb) or (truth != label_cmb):
            print(truth, sentence, flush=True)
            print("  prediction_all[:3] =", prediction_all[:3])
            print("  prediction_cmb =", prediction_cmb)
            print()

        if (sentence_num % 100) == 0:
            print(f"Progress: {round(sentence_num / sent_list_len * 100.0, 2)}%", flush=True)
            print("  acc =", correct_all / sentence_num)
            print("  acc_cmb =", correct_cmb / sentence_num)
            print()

    return sent_list_pred_all, sent_list_pred_cmb



In [18]:
print("Running LID on test data ... ")
start_time = time.time()


# Choose the stacked LID variant to evaluate:
# sent_list_pred_all, sent_list_pred_cmb = add_pred_labels_cmb_nb(sent_list_test_shortened)
# OR
sent_list_pred_all, sent_list_pred_cmb = add_pred_labels_cmb_lexicon(sent_list_test_shortened)


end_time = time.time()
# Throughput is reported against the list that was actually classified above.
testing_time = end_time - start_time
print(f"done. Testing time = {testing_time}s OR "
      f"{round(len(sent_list_test_shortened) / testing_time, 2)} LIDs/sec")
print()

print("Analysing LID test results ...")
start_time = time.time()

# Base ('all' classifier) results.
# analyse_clsfr_results is given 4-tuples; the trailing optional field is left as None.
result_list = [(sentence, truth, pred_list, None) for sentence, truth, pred_list in sent_list_pred_all]

acc_nb, f1_nb, confusion_dict_nb = analyse_clsfr_results(result_list)

print("acc_nb, f1_nb", acc_nb, f1_nb)
print_confusion_matrix(confusion_dict_nb)

# Combined (stacked) results, built as a fresh list rather than reusing the previous one.
result_list = [(sentence, truth, pred_list, None) for sentence, truth, pred_list in sent_list_pred_cmb]

acc_cmb, f1_cmb, confusion_dict_cmb = analyse_clsfr_results(result_list)

print("acc_cmb, f1_cmb", acc_cmb, f1_cmb)
print_confusion_matrix(confusion_dict_cmb)

end_time = time.time()
print(f"done. Analysis time = {end_time - start_time}s.")
print()


Running LID on test data ... 
zul kubasebenzi bemisebenzi
  prediction_all[:3] = [('nbl', 0.9994315518077075), ('zul', 0.0005684481922705018), ('xho', 1.2748067137392315e-14)]
  prediction_cmb = [('nbl', 0.9994315518077075), ('zul', 0.0005684481922705018), ('xho', 1.2748067137392315e-14), ('ssw', 1.2714070189571497e-75), ('sot', 6.665679304372445e-123), ('afr', 9.362935967385874e-155), ('ven', 2.8019768615276932e-158), ('nso', 1.0837499983919093e-161), ('tsn', 4.621697089211585e-165), ('tso', 1.1763578245132924e-165)]

zul izakhiwo zomphakathi
  prediction_all[:3] = [('nbl', 0.6712169467669256), ('zul', 0.32878305323309154), ('xho', 2.792820135709824e-21)]
  prediction_cmb = [('nbl', 0.6712169467669256), ('zul', 0.32878305323309154), ('xho', 2.792820135709824e-21), ('ssw', 2.7958047981162513e-76), ('ven', 5.796264712230272e-101), ('tso', 1.8710465109740508e-104), ('nso', 1.162551408426634e-113), ('tsn', 1.791840379291441e-117), ('sot', 2.5708713276892165e-125), ('eng', 7.31743707177705

  prediction_all[:3] = [('xho', 0.98755564556432), ('zul', 0.01244435443303589), ('nbl', 2.654926749877162e-12)]
  prediction_cmb = [('zul', 2.0), ('xho', 1.0), ('nbl', 1.0), ('ssw', 1.0)]

sot e ne e le ngwana
  prediction_all[:3] = [('tsn', 0.9998145426612534), ('sot', 0.0001854573387335882), ('nso', 5.461637480670491e-16)]
  prediction_cmb = [('tsn', 0.9998145426612534), ('sot', 0.0001854573387335882), ('nso', 5.461637480670491e-16), ('nbl', 1.5499314647260894e-36), ('ssw', 2.514351365390573e-41), ('zul', 1.4893989240200538e-47), ('ven', 8.246286106018631e-49), ('xho', 3.9850975017814443e-50), ('tso', 1.794776804541462e-51), ('afr', 2.585868354138164e-75)]

tso loko regional manager
  prediction_all[:3] = [('eng', 0.9999999999573106), ('tso', 4.270003589148227e-11), ('zul', 2.8835708898729087e-27)]
  prediction_cmb = [('eng', 0.9999999999573106), ('tso', 4.270003589148227e-11), ('zul', 2.8835708898729087e-27), ('tsn', 1.5151138663695513e-42), ('nso', 5.045494104085552e-45), ('sot', 

zul nginesiqiniseko
  prediction_all[:3] = [('nbl', 0.9972994130603305), ('zul', 0.0027005869396721344), ('xho', 5.53112394299491e-16)]
  prediction_cmb = [('nbl', 0.9972994130603305), ('zul', 0.0027005869396721344), ('xho', 5.53112394299491e-16), ('ssw', 3.625648418726692e-50), ('tso', 1.8188831600391504e-71), ('eng', 1.5705381804411706e-83), ('afr', 2.81606554536735e-85), ('tsn', 4.249133149030309e-87), ('ven', 1.689248990458223e-89), ('sot', 7.992177817506577e-90)]

nso setifikeiti se se
  prediction_all[:3] = [('tsn', 0.7288116315781198), ('nso', 0.2711301735615391), ('sot', 5.8194860334975094e-05)]
  prediction_cmb = [('tsn', 0.7288116315781198), ('nso', 0.2711301735615391), ('sot', 5.8194860334975094e-05), ('tso', 1.1820576262091557e-57), ('afr', 1.1915051318973546e-60), ('ssw', 7.454355464557445e-63), ('nbl', 1.1631515276797749e-80), ('zul', 2.5157108735209275e-90), ('xho', 1.0744739958828987e-92), ('eng', 1.4562604466074985e-106)]

xho ilungu ngalinye
  prediction_all[:3] = [('

zul masisukume sakhe
  prediction_all[:3] = [('ssw', 0.7593963199146028), ('zul', 0.24060367991324438), ('xho', 1.721502964511276e-10)]
  prediction_cmb = [('zul', 2.0), ('xho', 1.0), ('nbl', 1.0), ('ssw', 1.0)]

Progress: 13.64%
  acc = 0.9344444444444444
  acc_cmb = 0.9477777777777778

tsn ee ente tsa mmoko
  prediction_all[:3] = [('sot', 0.9191575618339202), ('tsn', 0.08084243816609529), ('nso', 3.226819161777476e-21)]
  prediction_cmb = [('tsn', 4.0), ('sot', 3.0), ('nso', 2.0)]

xho sebenzisa incwadi
  prediction_all[:3] = [('nbl', 0.9995161053964766), ('xho', 0.0004338460041001127), ('zul', 5.004859941137243e-05)]
  prediction_cmb = [('nbl', 0.9995161053964766), ('xho', 0.0004338460041001127), ('zul', 5.004859941137243e-05), ('ssw', 1.8349728615082818e-59), ('sot', 1.099641949465199e-93), ('nso', 3.472905188809963e-100), ('tsn', 2.4435247970919214e-104), ('ven', 2.3439515242585965e-107), ('tso', 6.903473642951697e-108), ('afr', 8.537824743415182e-112)]

tsn ditokomane di tla
  pr

xho ungakubonisa ke
  prediction_all[:3] = [('ssw', 0.8455168141670591), ('xho', 0.15433306275886852), ('nbl', 0.00014333467081252942)]
  prediction_cmb = [('xho', 2.0), ('nbl', 1.0), ('ssw', 1.0), ('zul', 1.0)]

zul a ukubeka iso ekulawulweni
  prediction_all[:3] = [('nbl', 0.9989479737325054), ('zul', 0.001026118889811247), ('xho', 2.5907376311922037e-05)]
  prediction_cmb = [('zul', 4.0), ('nbl', 3.0), ('xho', 2.0), ('ssw', 2.0)]

sot moruti ray mabhena
  prediction_all[:3] = [('tsn', 0.9988365075135556), ('nso', 0.0011575638500698766), ('sot', 5.928636266777272e-06)]
  prediction_cmb = [('sot', 3.0), ('nso', 1.0), ('tsn', 1.0)]

sot di-royalty kapa
  prediction_all[:3] = [('nso', 0.9998887034200642), ('sot', 0.00011129647962549625), ('eng', 9.614549821292288e-11)]
  prediction_cmb = [('sot', 2.0), ('nso', 1.0), ('tsn', 1.0)]

sot molao wa taolo ya
  prediction_all[:3] = [('nso', 0.9866536399970558), ('tsn', 0.012764600140735502), ('sot', 0.0005817598621828438)]
  prediction_cmb = [

  prediction_all[:3] = [('tsn', 0.9997561034867412), ('sot', 0.00024229449771062084), ('nso', 1.6020155388516674e-06)]
  prediction_cmb = [('sot', 4.0), ('nso', 3.0), ('tsn', 3.0)]

Progress: 25.76%
  acc = 0.94
  acc_cmb = 0.9570588235294117

nbl henry into engcono
  prediction_all[:3] = [('zul', 0.9976499628703432), ('nbl', 0.002349451523948405), ('ssw', 5.856056029477257e-07)]
  prediction_cmb = [('nbl', 3.0), ('xho', 2.0), ('zul', 2.0), ('ssw', 1.0)]

sot thutong re bone
  prediction_all[:3] = [('tsn', 0.9074703771345275), ('sot', 0.09252962283591826), ('nso', 2.95792066188179e-11)]
  prediction_cmb = [('tsn', 0.9074703771345275), ('sot', 0.09252962283591826), ('nso', 2.95792066188179e-11), ('ven', 6.991823689428864e-51), ('eng', 3.393416923083507e-56), ('nbl', 1.110099406559769e-63), ('tso', 3.8084456213265635e-64), ('afr', 3.3473660867192823e-65), ('xho', 5.927953781648298e-70), ('ssw', 1.0544208384885645e-70)]

sot re na le rekoto
  prediction_all[:3] = [('tsn', 0.79854823588770

xho utshepo usebenze
  prediction_all[:3] = [('zul', 1.0), ('xho', 1.3449815676331019e-15), ('nbl', 2.496023191780976e-21)]
  prediction_cmb = [('xho', 2.0), ('nbl', 1.0), ('zul', 1.0), ('ssw', 0.0)]

zul lapho khona umhlaba
  prediction_all[:3] = [('ssw', 0.8314943821327492), ('zul', 0.16850561786726237), ('xho', 1.3174333858168199e-15)]
  prediction_cmb = [('ssw', 0.8314943821327492), ('zul', 0.16850561786726237), ('xho', 1.3174333858168199e-15), ('nbl', 2.9886892791598374e-17), ('nso', 1.4761588284556919e-65), ('sot', 7.185786653377569e-68), ('ven', 9.761629998095392e-83), ('tso', 8.360790547679756e-83), ('tsn', 1.2519475277602006e-85), ('eng', 5.656105539864059e-117)]

nso mang le mang a ka
  prediction_all[:3] = [('sot', 0.9999997897795304), ('nso', 2.1000501281466845e-07), ('tsn', 2.154551004128249e-10)]
  prediction_cmb = [('sot', 0.9999997897795304), ('nso', 2.1000501281466845e-07), ('tsn', 2.154551004128249e-10), ('tso', 1.622062318619916e-44), ('xho', 2.7639703594857717e-61),

sot foromo ya kopo bi-
  prediction_all[:3] = [('tsn', 0.9132847880482085), ('sot', 0.08671521195143081), ('nso', 3.696502916750422e-13)]
  prediction_cmb = [('tsn', 0.9132847880482085), ('sot', 0.08671521195143081), ('nso', 3.696502916750422e-13), ('ven', 3.6020562547901197e-42), ('tso', 6.38411656985725e-52), ('nbl', 1.367608968407695e-54), ('ssw', 4.981143236720187e-69), ('xho', 5.458280987067404e-73), ('zul', 2.6497013596076042e-73), ('afr', 1.745405773523623e-83)]

zul u- senior supt joseph
  prediction_all[:3] = [('eng', 0.9999999999993747), ('zul', 6.249612947914744e-13), ('afr', 1.4993967194268277e-28)]
  prediction_cmb = [('eng', 0.9999999999993747), ('zul', 6.249612947914744e-13), ('afr', 1.4993967194268277e-28), ('nbl', 1.0546173247970774e-28), ('tsn', 1.0731217608169363e-32), ('ven', 6.664378169508329e-48), ('xho', 9.549235725369017e-49), ('sot', 4.5588857991217655e-56), ('ssw', 9.359847140906019e-57), ('nso', 2.5963617110207124e-58)]

xho i-sagnc ifumana
  prediction_all[:

  prediction_all[:3] = [('ssw', 0.9106130256453784), ('zul', 0.08938697435461967), ('nbl', 1.20298734391067e-15)]
  prediction_cmb = [('zul', 2.0), ('ssw', 1.0), ('xho', 0.0), ('nbl', 0.0)]

nso go fela ga kua ntlheng
  prediction_all[:3] = [('tsn', 0.9991879208617472), ('nso', 0.0008120791382796054), ('sot', 1.0354584581374977e-54)]
  prediction_cmb = [('nso', 5.0), ('tsn', 4.0), ('sot', 2.0)]

Progress: 43.94%
  acc = 0.9427586206896552
  acc_cmb = 0.96

xho nokuqhuba nokusebenzela
  prediction_all[:3] = [('zul', 0.9004567039462141), ('xho', 0.09954329605379263), ('nbl', 5.277507801830562e-27)]
  prediction_cmb = [('zul', 0.9004567039462141), ('xho', 0.09954329605379263), ('nbl', 5.277507801830562e-27), ('ssw', 2.3151817660942656e-112), ('sot', 8.254672434524069e-150), ('ven', 8.003481599069103e-154), ('nso', 5.822924950488952e-155), ('tso', 1.1659070227681539e-161), ('tsn', 1.3723541631969548e-163), ('afr', 1.6434731191752063e-178)]

zul isibalo esiphezulu
  prediction_all[:3] = [('

  acc = 0.9428125
  acc_cmb = 0.9590625

tsn yo ke sarah ndlovu
  prediction_all[:3] = [('tso', 0.9999999998014459), ('ssw', 1.98561737576488e-10), ('zul', 8.27342236976856e-15)]
  prediction_cmb = [('tso', 0.9999999998014459), ('ssw', 1.98561737576488e-10), ('zul', 8.27342236976856e-15), ('sot', 1.3693827443968917e-15), ('xho', 3.73142548475144e-17), ('tsn', 7.757679802217954e-24), ('nso', 1.2439026045786016e-24), ('ven', 2.958854817958539e-26), ('eng', 6.297582660858926e-34), ('nbl', 6.043247226328718e-36)]

nbl ibhonasi yomnyaka
  prediction_all[:3] = [('xho', 0.9763444326448216), ('zul', 0.013919228782802285), ('nbl', 0.009736338572393454)]
  prediction_cmb = [('nbl', 2.0), ('xho', 1.0), ('ssw', 0.0), ('zul', 0.0)]

zul umthethomgomo wolimi
  prediction_all[:3] = [('nbl', 0.9894801963516382), ('zul', 0.010519803648358875), ('xho', 6.828291219769539e-45)]
  prediction_cmb = [('zul', 2.0), ('nbl', 1.0), ('xho', 0.0), ('ssw', 0.0)]

tsn ikonomi - basadi
  prediction_all[:3] = [('nso',

sot solomon mahlangu
  prediction_all[:3] = [('ssw', 0.9997141478276967), ('tso', 0.00028585212651893597), ('nso', 4.263962771189567e-11)]
  prediction_cmb = [('ssw', 0.9997141478276967), ('tso', 0.00028585212651893597), ('nso', 4.263962771189567e-11), ('sot', 3.1572176092545153e-12), ('nbl', 3.2952097916995036e-15), ('ven', 2.1105406072620058e-16), ('zul', 5.432052581293967e-18), ('afr', 4.659515105972098e-23), ('xho', 1.0566225481011248e-26), ('eng', 2.781419682089473e-46)]

sot lapeng mma ngwana
  prediction_all[:3] = [('tsn', 0.732462798837663), ('nso', 0.2633288219196088), ('sot', 0.004208379242749733)]
  prediction_cmb = [('tsn', 0.732462798837663), ('nso', 0.2633288219196088), ('sot', 0.004208379242749733), ('nbl', 1.2115888797146381e-32), ('ssw', 3.2740625794197995e-47), ('ven', 2.2332056282329997e-48), ('tso', 5.873187000284455e-57), ('zul', 5.943579368285775e-58), ('xho', 1.3011503082505786e-62), ('eng', 1.0535466440835846e-69)]

zul i-akhawunti yakho
  prediction_all[:3] = [

tsn oa ke boemo bo bo
  prediction_all[:3] = [('sot', 0.9998453823816036), ('nso', 0.00015427001260164417), ('tsn', 3.4760579371780495e-07)]
  prediction_cmb = [('sot', 0.9998453823816036), ('nso', 0.00015427001260164417), ('tsn', 3.4760579371780495e-07), ('zul', 1.5543859037926045e-60), ('ssw', 1.1203622600753441e-60), ('xho', 3.87774188656128e-61), ('ven', 3.126799296893795e-64), ('afr', 1.2352464263252277e-68), ('nbl', 3.510414273464744e-79), ('tso', 2.586672331586201e-81)]

sot forormo ya kopo
  prediction_all[:3] = [('tsn', 0.9999949920335839), ('sot', 5.007966411862512e-06), ('nso', 2.672904711060095e-19)]
  prediction_cmb = [('sot', 3.0), ('tsn', 2.0), ('nso', 1.0)]

tsn tumelelo ya kgwebo
  prediction_all[:3] = [('nso', 0.9998943822537444), ('tsn', 0.00010561774625357127), ('sot', 5.625325115101424e-27)]
  prediction_cmb = [('nso', 0.9998943822537444), ('tsn', 0.00010561774625357127), ('sot', 5.625325115101424e-27), ('xho', 1.9329236587492805e-68), ('tso', 1.7586408216010336e-6

xho esinye sezifundo
  prediction_all[:3] = [('zul', 0.999919909947199), ('xho', 8.008122712390361e-05), ('nbl', 8.825665320540177e-09)]
  prediction_cmb = [('xho', 2.0), ('nbl', 1.0), ('zul', 1.0), ('ssw', 0.0)]

xho umnu jan botha wasethaba
  prediction_all[:3] = [('tsn', 0.9996143717794252), ('nso', 0.00020451678952008502), ('zul', 0.00018104656746666906)]
  prediction_cmb = [('nso', 2.0), ('tsn', 2.0), ('sot', 1.0)]

xho umphathiswa wezimali
  prediction_all[:3] = [('zul', 0.9072996295933845), ('xho', 0.0927003704066044), ('nbl', 3.327478101332513e-14)]
  prediction_cmb = [('xho', 2.0), ('zul', 1.0), ('nbl', 0.0), ('ssw', 0.0)]

tsn molaetsa thuto molaetsa
  prediction_all[:3] = [('sot', 0.5506939889826112), ('tsn', 0.4493060110174097), ('nso', 1.1695295052241771e-39)]
  prediction_cmb = [('sot', 0.5506939889826112), ('tsn', 0.4493060110174097), ('nso', 1.1695295052241771e-39), ('ven', 9.642867989192325e-119), ('tso', 6.135232323888798e-122), ('ssw', 4.458714728436454e-123), ('xho'

  acc = 0.9438297872340425
  acc_cmb = 0.9593617021276596

xho dlamini ebuchasela
  prediction_all[:3] = [('ssw', 0.5453529812063814), ('xho', 0.4546470187936412), ('zul', 6.282084426243034e-16)]
  prediction_cmb = [('xho', 2.0), ('ssw', 1.0), ('nbl', 0.0), ('zul', 0.0)]

tsn molao wa khansele
  prediction_all[:3] = [('nso', 0.6964436330325596), ('tsn', 0.3035563655224265), ('sot', 1.4450229815861276e-09)]
  prediction_cmb = [('nso', 0.6964436330325596), ('tsn', 0.3035563655224265), ('sot', 1.4450229815861276e-09), ('tso', 1.2494979559166268e-47), ('zul', 4.322925348874409e-67), ('ssw', 1.048516805919369e-72), ('nbl', 5.957114052982877e-73), ('ven', 3.470603898623087e-75), ('xho', 2.156009535340802e-87), ('afr', 8.343768607290665e-98)]

sot hlahloba level ya
  prediction_all[:3] = [('nso', 0.9999999937890607), ('sot', 6.21094642674388e-09), ('ssw', 3.133136058650767e-34)]
  prediction_cmb = [('nso', 0.9999999937890607), ('sot', 6.21094642674388e-09), ('ssw', 3.133136058650767e-34), ('t

nso matlakala ohle a
  prediction_all[:3] = [('sot', 0.9682010398350184), ('nso', 0.03179896016498911), ('tsn', 3.5749228646243615e-23)]
  prediction_cmb = [('sot', 0.9682010398350184), ('nso', 0.03179896016498911), ('tsn', 3.5749228646243615e-23), ('tso', 1.873951166204655e-31), ('zul', 1.3237955106304827e-32), ('nbl', 2.460017297804607e-38), ('xho', 1.1564766498282197e-46), ('ssw', 7.733395989989772e-50), ('ven', 3.1867141564246997e-63), ('afr', 1.3815386448925127e-63)]

ssw umbhalo kungaba
  prediction_all[:3] = [('zul', 0.533959212727913), ('ssw', 0.46604078727162357), ('xho', 4.620646733156478e-13)]
  prediction_cmb = [('zul', 0.533959212727913), ('ssw', 0.46604078727162357), ('xho', 4.620646733156478e-13), ('nbl', 7.485155052114475e-22), ('tso', 6.275188605066722e-64), ('ven', 1.123185637703894e-68), ('tsn', 3.310602841370779e-79), ('nso', 8.724483560994574e-83), ('sot', 7.568599156240117e-83), ('eng', 2.582557305434195e-98)]

afr die afdeling authority
  prediction_all[:3] = [('

zul siyabonga dokotela
  prediction_all[:3] = [('ssw', 0.9999994915422277), ('zul', 5.084577958126145e-07), ('ven', 4.807185441475948e-41)]
  prediction_cmb = [('ssw', 0.9999994915422277), ('zul', 5.084577958126145e-07), ('ven', 4.807185441475948e-41), ('nbl', 2.778301932010519e-43), ('xho', 2.077933652079841e-43), ('tso', 1.8886539878645947e-43), ('nso', 8.312990971623442e-57), ('sot', 3.7020504252656295e-57), ('tsn', 2.1510162872121137e-58), ('afr', 1.3595942322386587e-88)]

Progress: 81.82%
  acc = 0.9438888888888889
  acc_cmb = 0.9585185185185185

nso meritimebedi meriti
  prediction_all[:3] = [('sot', 0.9999996758966702), ('nso', 3.237519301446809e-07), ('tsn', 3.5140793528504797e-10)]
  prediction_cmb = [('nso', 2.0), ('sot', 1.0), ('tsn', 0.0)]

xho khona ukuze umsebenzi
  prediction_all[:3] = [('zul', 0.999158468287792), ('nbl', 0.0008400694987722032), ('xho', 1.4622134244621248e-06)]
  prediction_cmb = [('zul', 0.999158468287792), ('nbl', 0.0008400694987722032), ('xho', 1.4622

xho c sibeke izizathu
  prediction_all[:3] = [('zul', 0.9998747953843455), ('xho', 0.00012520461563068604), ('nbl', 1.4009869961706213e-21)]
  prediction_cmb = [('zul', 0.9998747953843455), ('xho', 0.00012520461563068604), ('nbl', 1.4009869961706213e-21), ('ssw', 2.5513606141989183e-45), ('ven', 1.14169949790415e-67), ('sot', 2.4679236299598575e-71), ('tsn', 1.2042271696013705e-73), ('tso', 5.952319666908427e-76), ('nso', 7.187517769169286e-78), ('eng', 1.292398424956008e-82)]

Progress: 87.88%
  acc = 0.9441379310344827
  acc_cmb = 0.9587931034482758

tsn maloko a setšhaba
  prediction_all[:3] = [('nso', 0.9999998912210858), ('tsn', 1.0877890658309753e-07), ('sot', 5.485543397919319e-73)]
  prediction_cmb = [('nso', 0.9999998912210858), ('tsn', 1.0877890658309753e-07), ('sot', 5.485543397919319e-73), ('tso', 3.992776755359791e-87), ('ssw', 3.1973367729412944e-87), ('zul', 1.054989151714499e-90), ('nbl', 6.553710762916582e-92), ('xho', 9.326358894148752e-93), ('ven', 5.156307812380344e

  prediction_all[:3] = [('nso', 0.9942803531339578), ('tsn', 0.005714794997981095), ('sot', 4.851868079681422e-06)]
  prediction_cmb = [('nso', 0.9942803531339578), ('tsn', 0.005714794997981095), ('sot', 4.851868079681422e-06), ('nbl', 5.876918245049438e-50), ('zul', 1.855589470571622e-57), ('ven', 5.644738865302989e-59), ('tso', 1.926507453843757e-61), ('xho', 2.0412987841679813e-63), ('ssw', 5.407763390692683e-68), ('afr', 5.664501408102825e-82)]

nbl ukusaba lokho nokungavikeleki
  prediction_all[:3] = [('zul', 0.9999872163179755), ('nbl', 1.2783682009020601e-05), ('ssw', 1.6497856927987106e-33)]
  prediction_cmb = [('nbl', 3.0), ('zul', 2.0), ('xho', 0.0), ('ssw', 0.0)]

tsn a e e leng ya boleng
  prediction_all[:3] = [('sot', 0.7740766459518758), ('tsn', 0.2259233540481246), ('nso', 6.389640798629354e-19)]
  prediction_cmb = [('sot', 0.7740766459518758), ('tsn', 0.2259233540481246), ('nso', 6.389640798629354e-19), ('tso', 2.7560247222451067e-69), ('ven', 1.104084522713974e-71), ('

Progress: 96.97%
  acc = 0.94328125
  acc_cmb = 0.9578125

xho umthetho wezamanzi
  prediction_all[:3] = [('nbl', 0.9999998919565259), ('xho', 8.705823857176415e-08), ('zul', 2.0985245305928476e-08)]
  prediction_cmb = [('nbl', 0.9999998919565259), ('xho', 8.705823857176415e-08), ('zul', 2.0985245305928476e-08), ('ven', 3.252765308248834e-92), ('ssw', 1.0591306930599789e-102), ('nso', 3.403481539605054e-111), ('sot', 1.2950293259063398e-115), ('eng', 8.066083193933448e-125), ('tso', 2.932791887323909e-128), ('tsn', 1.8013774276126893e-128)]

sot e thusa boto ya
  prediction_all[:3] = [('tsn', 0.9817035586695991), ('sot', 0.01829644133040639), ('nso', 2.5606728615133075e-24)]
  prediction_cmb = [('tsn', 0.9817035586695991), ('sot', 0.01829644133040639), ('nso', 2.5606728615133075e-24), ('ven', 5.204721798992578e-25), ('tso', 6.014463583194522e-48), ('zul', 4.582908538349672e-55), ('ssw', 8.605285074289815e-57), ('xho', 2.0456479196777392e-60), ('nbl', 3.6157044949997982e-68), ('eng', 8.

[37m0.0 	[37m0.003 	[37m0.002 	[37m0.0 	[37m0.0 	[30m0.99 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.005 	[0m
[37m0.005 	[37m0.002 	[37m0.0 	[37m0.055 	[37m0.028 	[37m0.002 	[31m0.905 	[37m0.002 	[37m0.0 	[37m0.0 	[37m0.002 	[0m
[37m0.0 	[37m0.002 	[37m0.0 	[37m0.0 	[37m0.002 	[37m0.002 	[37m0.0 	[30m0.992 	[37m0.002 	[37m0.002 	[37m0.0 	[0m
[37m0.0 	[37m0.005 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.0 	[30m0.995 	[37m0.0 	[37m0.0 	[0m
[37m0.002 	[37m0.003 	[37m0.013 	[37m0.002 	[37m0.002 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.0 	[30m0.948 	[37m0.03 	[0m
[37m0.0 	[37m0.007 	[37m0.043 	[37m0.0 	[37m0.0 	[37m0.008 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.023 	[31m0.918 	[0m
AVRG = 0.9578787878787879
===

===
Precision Confusion Matrix:
afr .	eng .	nbl .	nso .	sot .	ssw .	tsn .	tso .	ven .	xho .	zul .	
[30m0.99 	[37m0.003 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.0 	[37m0.007 	[37m0.0 	[37m0.0 	[37m0.002 	[37m0.0 	