In [1]:
import numpy as np
import orchest
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix

In [2]:
sentence_data = orchest.get_inputs()["sentence_data"]

### Language labels

0 = nl<br>
1 = de

### Concatenate languages

In [3]:
X_nl = sentence_data["nl"]
Y_nl = np.zeros(len(X_nl)).astype(int)

X_de = sentence_data["de"]
Y_de = (np.zeros(len(X_de)) + 1).astype(int)

In [4]:
X, y = [X_nl + X_de, np.concatenate((Y_nl, Y_de))]

### Split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [6]:
print("Dataset class ratio\t %.3f" % (y.sum() / y.shape[0]))
print("Train class ratio\t %.3f" % (y_train.sum() / y_train.shape[0]))
print("Test class ratio\t %.3f" % (y_test.sum() / y_test.shape[0]))

Dataset class ratio	 0.500
Train class ratio	 0.508
Test class ratio	 0.483


In [7]:
print(X[0])
print(X[-1])

Dat recht bepaalt dat niemand zomaar eigendom van iemand anders mag afpakken, ook de staat niet. Als de overheid met terugwerkende kracht dingen belast, dan moeten bedrijven als Unilever dat wel weten, zegt Van de Streek.
#FEED: http://tools.wmflabs.org/persondata/beacon/dewiki.txt


In [8]:
print(y[0])
print(y[-1])

0
1


### Fit

In [9]:
count_vect = CountVectorizer()

In [10]:
X_train_counts = count_vect.fit_transform(X_train)

In [11]:
X_train_counts.shape

(1086, 8429)

In [12]:
list(count_vect.vocabulary_.keys())[0:10]

['de',
 'herbeleggingsreserve',
 'die',
 'is',
 'gevormd',
 'ingevolge',
 'nadere',
 'regelen',
 'zijn',
 'gegeven']

In [13]:
list(count_vect.vocabulary_.keys())[-10:]

['herum',
 'geflogen',
 'ausgegast',
 'wahrscheinlicher',
 'oben',
 'beschrieben',
 'entstehungszeit',
 'planetensystems',
 'flüchtiges',
 'kometare']

In [14]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(1086, 8429)

In [15]:
clf = SGDClassifier(
    loss="hinge", penalty="l2", alpha=1e-3, random_state=42, max_iter=5, tol=None
).fit(X_train_tfidf, y_train)

In [16]:
(X_train_tfidf.shape, y_train.shape)

((1086, 8429), (1086,))

In [17]:
docs_new = ["poort vier verdieping we niet steen", "dieser auf"]
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, language in zip(docs_new, predicted):
    print("%r => %s" % (doc, language))

'poort vier verdieping we niet steen' => 1
'dieser auf' => 1


### Evaluate test

In [18]:
#X_test

In [19]:
X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [20]:
predicted = clf.predict(X_test_tfidf)
print("Test set accuracy %f" % np.mean(predicted == y_test))

Test set accuracy 0.985075


In [21]:
conf_df = pd.DataFrame(confusion_matrix(y_test, predicted)); conf_df

Unnamed: 0,0,1
0,269,8
1,0,259


In [22]:
confusion_path = "/data/confusion-scrape-pipeline.csv"
examples_path = "/data/wrong-examples-scrape-pipeline.csv"

In [23]:
conf_df.to_csv(confusion_path)

In [24]:
[errors] = np.where(predicted != y_test)
errors_df = pd.DataFrame([[X_test[error], y_test[error], predicted[error]] for error in errors], columns=["Sentence", "Predicted", "Actual"]); errors_df

Unnamed: 0,Sentence,Predicted,Actual
0,"view as: RDF, Turtle, N-Triples, JSON-LD",0,1
1,Prijs exclusief verzendkosten. Betaling contan...,0,1
2,Prijs exclusief verzendkosten. Betaling contan...,0,1
3,"Premier Rutte, september 2018, Algemene Bescho...",0,1
4,E info@lecturis.nl T +31 (0)40 281 45 45,0,1
5,====================================,0,1
6,�{�0�m��?���\�`n;v�K��.��\?����\��� �k�...,0,1
7,Schelphoek 2 1813 SE Alkmaar Postadres: Postbu...,0,1


In [25]:
errors_df.to_csv(examples_path)