# Testing classic models with different vectorizers

In [1]:
import os
import sys

def adding_module_path(levels=3):
    """Append an ancestor of the working directory to ``sys.path``.

    The directory ``levels`` parents above the current working directory is
    resolved to an absolute path and appended to ``sys.path`` unless it is
    already listed, so calling this repeatedly does not add duplicates.
    Presumably that directory is the project root that makes the ``src.*``
    imports in the following cells resolvable -- confirm for your checkout.

    Args:
        levels: How many parent directories to climb from the current working
            directory. Defaults to 3, the depth originally hard-coded here.

    Raises:
        ValueError: If ``levels`` is negative.
    """
    if levels < 0:
        raise ValueError("levels must be non-negative")

    # Starting from os.curdir keeps os.path.join valid even when levels == 0.
    module_path = os.path.abspath(os.path.join(os.curdir, *[os.pardir] * levels))

    if module_path not in sys.path:
        sys.path.append(module_path)

adding_module_path()

### Vectorizers

In [2]:
from src.vectorizers.classic.bow_vectorizer import BoWVectorizer
from src.vectorizers.classic.tfidf_vectorizer import TfidfVectorizer
from src.vectorizers.transformer.bert_base_vectorizer import BertBaseUncasedVectorizer
from src.vectorizers.transformer.distil_bert_base_vectorizer import DistilBertBaseUncasedVectorizer
from src.vectorizers.transformer.electra_small_vectorizer import ElectraSmallVectorizer
from src.vectorizers.embedding.glove_vectorizer import GloveVectorizer
from src.vectorizers.embedding.word2vec_vectorizer import Word2VecVectorizer

### Models

In [3]:
from src.models.classic.linear import LinearClassifier
from src.models.classic.naive_bayes import NaiveBayes
from src.models.classic.random_forest import RandomForest

### Preprocessing factory

In [4]:
from src.preprocessing.preprocessing_factory import PreprocessingFactory, PreprocessingType

In [5]:
# Build the preprocessing factory; a later cell calls factory.create(...) to
# obtain the preprocessing passed to get_datasets. The recorded output shows
# that construction logs the methods it sets up.
factory = PreprocessingFactory()

Creating shorting method with min = 3
Creating lemma method with instance <WordNetLemmatizer>


### Load data

In [6]:
from src.utils.create_path_to_gutenberg import get_path_to_gutenberg_sets
from src.data_loading.get_dataset_object_from import get_datasets
from src.config.config import PATH_TO_DATASET_FOLDER_TEST
import numpy as np

In [7]:
# Tiny hand-written corpus with dummy all-zero labels.
# NOTE(review): neither `test_corpus` nor `y` is referenced by any later
# visible cell -- candidates for removal.
test_corpus = np.array(
    [
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        'Is this the first document?',
    ]
)
y = np.zeros(4, dtype=int)

# Locate the Gutenberg test split; the logged paths below suggest the
# arguments 10 and 3 select the "10Authors" / "Sentence3" folder.
path_data, path_authors = get_path_to_gutenberg_sets(
    10, 3, PATH_TO_DATASET_FOLDER_TEST
)

# Load the ';'-separated train/valid/test sets with the default preprocessing.
default_preprocessing = factory.create(PreprocessingType.Default)
train, valid, test = get_datasets(path_data, ';', default_preprocessing)

Loading dataset from=C:\Users\Vojta\Desktop\diploma\data_test\gutenberg\10Authors\Sentence3\train.csv
Loading dataset from=C:\Users\Vojta\Desktop\diploma\data_test\gutenberg\10Authors\Sentence3\valid.csv
Loading dataset from=C:\Users\Vojta\Desktop\diploma\data_test\gutenberg\10Authors\Sentence3\test.csv


#### Transform data

In [14]:
from src.utils.from_dataset_arrays import from_dataset_dataframe
from src.config.config import TEXT_COLUMN, LABEL_COLUMN

In [11]:
# Convert the loaded train and test dataset objects into tabular form; the
# displayed outputs below show `text` and `label` columns.
# NOTE(review): `valid` is loaded above but never converted or used in the
# visible cells.
train_df = from_dataset_dataframe(train)
test_df = from_dataset_dataframe(test)

In [10]:
# Display the training frame to sanity-check the preprocessed text and labels.
train_df

Unnamed: 0,text,label
0,miss nugent received information civil bow ren...,1865
1,hour familiar place instinct method true trave...,1285
2,twin accepted invitation reception progress vo...,53
3,cried sheriff like father master ask boy like man,3840
4,wonder said offended kybird banns week murmure...,1865
...,...,...
695,picked remains laid palm said poor little anty...,53
696,mean mean seventy naturally ninety,53
697,illustration try said little john whereabouts ...,3840
698,mark telford john gladney thing pleasant consi...,1285


In [12]:
# Display the test frame to sanity-check the preprocessed text and labels.
test_df

Unnamed: 0,text,label
0,looked laughed recognized strange guest approa...,1800
1,awoke morning joyful eager start home company ...,3840
2,forest haunt run prime repast paying blow yoke...,520
3,quid lent said prout pained voice said carter ...,1865
4,came day clipped thread touch lip lay cold ear...,520
...,...,...
145,look fellow toe make comfortable heritage assa...,1800
146,mark telford ambition child ride horse man lik...,1285
147,superficial listener blame music discord heard...,1800
148,historical novel georg ebers index edited davi...,1800


## Testing classifiers

#### Embedding vectorizer with a classic model


In [36]:
# Word2Vec vectorizer from the project's embedding package, built with its
# default constructor arguments.
vectorizer = Word2VecVectorizer()

In [49]:
# Vectorize the training texts, then show the vectorizer's state.
# The recorded output is a (missed, counter, accuracy) tuple.
# NOTE(review): "accuracy" equals missed / counter * 100 (418 / 18102 ~ 2.31),
# so it reads like a miss rate in percent -- confirm the intended meaning.
X_train = vectorizer.fit_transform(train_df[TEXT_COLUMN])
vectorizer.get_state()

Missed=418, counter=18102, accuracy=2.3091371119213346


(418, 18102, 2.3091371119213346)

In [50]:
# Vectorize the test texts with the same vectorizer instance, then show its
# state (the recorded numbers differ from the training call).
# NOTE(review): this calls fit_transform on the test split too. If fitting
# learns anything from the data, the test features are not produced by the
# training-time fit -- prefer a transform-only call if the vectorizer has one.
X_test = vectorizer.fit_transform(test_df[TEXT_COLUMN])
vectorizer.get_state()

Missed=103, counter=3553, accuracy=2.898958626512806


(103, 3553, 2.898958626512806)

In [51]:
# Training labels taken from the label column of the training frame.
# NOTE(review): the transformer cell below reassigns y_train before any model
# is fitted, so this value is never consumed in the visible cells.
y_train = train_df[LABEL_COLUMN]

In [52]:

# Test labels taken from the label column of the test frame.
# NOTE(review): the transformer cell below reassigns y_test before evaluation,
# so this value is never consumed in the visible cells.
y_test = test_df[LABEL_COLUMN]

#### Transformer vectorizer with a classic model


In [60]:
# BERT-base-uncased vectorizer from the project's transformer package, built
# with its default constructor arguments.
bert = BertBaseUncasedVectorizer()

In [61]:
# Unlike the Word2Vec vectorizer above, this one is given the dataset objects
# directly and returns a (features, labels) pair for each split.
# These assignments replace the X_*/y_* values from the embedding section.
# NOTE(review): fit_transform is also called on the test split -- confirm that
# fitting is stateless here, otherwise use a transform-only call for `test`.
X_train, y_train = bert.fit_transform(train)
X_test, y_test = bert.fit_transform(test)

In [62]:
# Fit the project's LinearClassifier on the transformer features; the recorded
# output `SGDClassifier()` is the repr of the value returned by fit.
model = LinearClassifier()
model.fit(X_train, y_train)

SGDClassifier()

In [63]:
# Predict labels for the test features; compared against y_test below.
y_pred = model.predict(X_test)

### Result

In [64]:
from src.experiments.experiment_evaluate import ExperimentEvaluate

In [65]:
# Create the evaluation helper; "test" is its only constructor argument
# (presumably an experiment name -- confirm against ExperimentEvaluate).
evaluate = ExperimentEvaluate("test") 

In [66]:
# Compute the metrics from true labels (first argument) and predictions
# (second argument); the results are read from evaluate.state in the next cell.
evaluate.calc(y_test, y_pred)

In [67]:
# Show the computed metrics: Accuracy, F1, Precision, Recall and a 10x10
# confusion matrix. In the recorded run the matrix diagonal sums to 45 of 150
# samples, matching the reported accuracy of 0.3.
evaluate.state

{'Accuracy': 0.3,
 'F1': 0.3,
 'Precision': 0.3,
 'Recall': 0.3,
 'ConsfusionMatrix': array([[11,  0,  0,  2,  0,  0,  0,  2,  0,  0],
        [13,  0,  0,  0,  0,  0,  0,  2,  0,  0],
        [10,  0,  0,  3,  0,  0,  0,  2,  0,  0],
        [ 8,  0,  0,  5,  0,  0,  0,  1,  1,  0],
        [ 8,  0,  0,  6,  0,  0,  0,  0,  0,  1],
        [ 4,  0,  0,  1,  0, 10,  0,  0,  0,  0],
        [13,  0,  0,  2,  0,  0,  0,  0,  0,  0],
        [ 6,  0,  0,  0,  0,  0,  0,  7,  1,  1],
        [ 3,  0,  0,  1,  0,  0,  0,  1, 10,  0],
        [ 9,  0,  0,  4,  0,  0,  0,  0,  0,  2]], dtype=int64)}