In [1]:
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
import re
import numpy as np
import pandas as pd

In [65]:
# Load the labelled training corpus.
train_data = pd.read_csv("../data/train_full_3k.csv")

# Language tag -> integer class id used as the training target.
language_code = {
    'afr': 1, 'nbl': 2, 'nso': 3, 'sot': 4, 'ssw': 5, 'tso': 6,
    'tsn': 7, 'ven': 8, 'xho': 9, 'zul': 10, 'eng': 11,
}

# Replace the string tags in "lang_id" with their integer ids.
train_data["lang_id"] = train_data["lang_id"].map(language_code).astype(int)

# Reverse lookup (integer id -> language tag), used when printing predictions.
code = {class_id: tag for tag, class_id in language_code.items()}

In [71]:
# Pattern used by clean_text to strip digits.
# NOTE(review): the original definition of `replace_numbers` is not present in
# this notebook (presumably lost with a deleted cell), so clean_text raised a
# NameError on a fresh kernel. ASCII digits are assumed here, based on the
# '0123456789' hint in the old commented-out code — confirm.
replace_numbers = re.compile(r'[0-9]+')


def clean_text(input_text):
    """
    Normalise a single text sample before vectorisation.

    Steps, in order:
      1. lower-case the text;
      2. remove every match of ``replace_numbers`` (digits);
      3. repair/remove a few known mis-encoded character sequences;
      4. strip trailing double quotes, then leading spaces and double quotes.

    All other special characters are kept.

    Parameters
    ----------
    input_text : str
        Raw text sample.

    Returns
    -------
    str
        The cleaned text.
    """
    text = input_text.lower()
    text = replace_numbers.sub('', text)

    # Known encoding artefacts seen in the data (already lower-cased above).
    text = text.replace('ã…â¡', 'š')
    text = text.replace('ï¿½', '')
    text = text.replace('ª', '')

    # Only quotes are stripped on the right; spaces and quotes on the left.
    text = text.rstrip('"')
    text = text.lstrip(' "')

    return text

In [50]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split — and therefore the metrics reported below — reproducible across
# kernel restarts.
# NOTE(review): the text column is addressed as " text" (with a leading
# space), presumably matching the CSV header — confirm.
docs_train, docs_test, y_train, y_test = train_test_split(
    train_data[" text"], train_data["lang_id"], test_size=0.2, random_state=42)

In [72]:
# Run clean_text over every document of both splits before vectorising.
docs_train = list(map(clean_text, docs_train))
docs_test = list(map(clean_text, docs_test))

In [53]:
# TF-IDF over character n-grams of length 1 to 6.
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 6))

# Vectoriser feeding a multinomial naive Bayes classifier.
pipe = Pipeline(steps=[('vectorizer', vectorizer), ('clf', MultinomialNB())])

# Fit on the cleaned training documents; the fitted pipeline is the cell output.
pipe.fit(docs_train, y_train)

Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 6), norm='l2', preprocessor=None, smooth_idf=Tr...rue,
        vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [54]:
# Predict the language id of every held-out document with the fitted pipeline.
y_predicted = pipe.predict(docs_test)

In [69]:
# Sanity check: classify a hand-written sentence and print its language tag.
sentences = ['Ndzi khense ngopfu']
# Apply the same cleaning as for the training data (the function is named
# clean_text; the previous call to `cleanup_text` referenced a name that is
# not defined in this notebook).
cleaned_sentence = [clean_text(text) for text in sentences]
predicted_languages = pipe.predict(cleaned_sentence)
for sentence, lang in zip(sentences, predicted_languages):
    # `code` maps the predicted integer id back to its language tag.
    print(u'{} ----> {}'.format(sentence, code[lang]))



Ndzi khense ngopfu ----> tso


In [87]:
# Read an input CSV, predict a language id for each row, and write the result to a CSV file
def generate_output(input_path, output_path):
    """
    Write language predictions for an external CSV file.

    Reads ``input_path`` with pandas, cleans each entry of its "Text" column
    with ``clean_text``, predicts with the module-level ``pipe`` and writes a
    CSV with the columns "ID" (taken from the input) and "Class" (the
    predictions) to ``output_path``, passing ``index=None`` to ``to_csv``.
    """
    submission_input = pd.read_csv(input_path)
    cleaned_texts = list(map(clean_text, submission_input["Text"]))
    result = pd.DataFrame({
        "ID": submission_input["ID"].values,
        "Class": pipe.predict(cleaned_texts),
    })
    result.to_csv(output_path, index=None)

In [88]:
# Produce predictions for the sample input file and write them to the output CSV.
generate_output("../data/sample_input.csv","../language_output/test_output.csv")

In [57]:
# Per-language precision/recall/F1 on the held-out split.
# Labels and display names are both derived from `code` (id -> tag), so each
# row is guaranteed to carry the tag that belongs to its integer id instead of
# relying on the key order of `language_code` happening to match.
print(metrics.classification_report(y_test, y_predicted,
                                    labels=sorted(code),
                                    target_names=[code[i] for i in sorted(code)],
                                    digits=5))

             precision    recall  f1-score   support

        afr    1.00000   1.00000   1.00000       613
        nbl    0.99835   1.00000   0.99918       606
        nso    1.00000   0.99838   0.99919       617
        sot    1.00000   1.00000   1.00000       577
        ssw    1.00000   1.00000   1.00000       549
        tso    1.00000   1.00000   1.00000       598
        tsn    0.99844   1.00000   0.99922       641
        ven    1.00000   1.00000   1.00000       590
        xho    0.99834   0.99834   0.99834       603
        zul    1.00000   0.99842   0.99921       632
        eng    0.99826   0.99826   0.99826       574

avg / total    0.99939   0.99939   0.99939      6600



In [103]:
# Confusion matrix of true vs. predicted language ids on the held-out split.
cm = metrics.confusion_matrix(y_test, y_predicted)

In [104]:
# Display the confusion matrix computed above.
cm

array([[613,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0, 606,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0, 616,   0,   0,   0,   1,   0,   0,   0,   0],
       [  0,   0,   0, 577,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0, 549,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0, 598,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0, 641,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0, 590,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0, 602,   0,   1],
       [  0,   1,   0,   0,   0,   0,   0,   0,   0, 631,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   1,   0, 573]])

In [106]:
# NOTE(review): leftover IPython introspection (`??` shows the source of
# confusion_matrix); development aid only, not part of the analysis.
metrics.confusion_matrix??

In [100]:
# NOTE(review): leftover inspection of the language tags; not used by any other cell shown here.
language_code.keys()

dict_keys(['afr', 'nbl', 'nso', 'sot', 'ssw', 'tso', 'tsn', 'ven', 'xho', 'zul', 'eng'])