In [86]:
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report

import cltrier_nlp as nlp

In [87]:
CORPUS_FILE: str = './data/corpus.txt'
SAMPLE_SIZE: int = 2000

TYPE: str = 'tfidf' # 'tfidf', 'encoder'

In [88]:
corpus = nlp.corpus.Corpus.from_txt(CORPUS_FILE)

In [89]:
X_train, X_test, y_train, y_test = train_test_split(
    [sent.raw for sent in corpus.sentences[:SAMPLE_SIZE]], 
    [sent.language for sent in corpus.sentences[:SAMPLE_SIZE]], 
)

In [90]:
vectorizer = TfidfVectorizer()
# ---
encoder = nlp.encoder.Encoder()
pooler = nlp.encoder.EncoderPooler()



In [91]:
if TYPE == 'tfidf':
    X_train_embed = vectorizer.fit_transform(X_train)
    X_test_embed = vectorizer.transform(X_test)

else:
    X_train_embed = np.stack([embed.detach().numpy() for embed in pooler(encoder(X_train), form="sent_cls")])
    X_test_embed = np.stack([embed.detach().numpy() for embed in pooler(encoder(X_test), form="sent_cls")])

In [92]:
classifier = SVC(kernel='linear', C=1, probability=True)

In [93]:
classifier.fit(X_train_embed, y_train)

In [94]:
print(TYPE)
print(classification_report(y_test, classifier.predict(X_test_embed), zero_division=1.))

something
              precision    recall  f1-score   support

   afrikaans       0.00      1.00      0.00         0
     catalan       1.00      0.00      0.00         1
       dutch       1.00      0.50      0.67         4
     english       0.95      1.00      0.97        38
      german       0.98      0.99      0.99       109
     italian       1.00      0.00      0.00         1
     swedish       1.00      1.00      1.00         2

    accuracy                           0.97       155
   macro avg       0.85      0.64      0.52       155
weighted avg       0.97      0.97      0.96       155

