## Imports

In [2]:
import mltlk
print(mltlk.__version__)
from mltlk import *
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, Dropout
from tensorflow.keras.optimizers import Adam

0.1.33


## Load data
Load the data, clean the text, and preprocess it into padded word-index sequences for a Keras embedding layer.

In [3]:
# Inputs and preprocessing knobs, named so they are easy to find and tune.
DATA_FILE = "data/wikipedia_300.csv.gz"
# A named stopword list plus a CSV of custom stopwords.
STOPWORD_SOURCES = ["english", "stopwords/custom.csv"]
# Width of each embedding vector and the length every sequence is padded/cut to.
EMBEDDINGS_SIZE = 75
EMBEDDINGS_MAX_LENGTH = 7500

# Load the CSV and turn each text into a fixed-length sequence for an
# embedding layer. `clean_text="letters digits"` keeps only letters and digits.
# NOTE(review): the exact semantics of each option live in mltlk - confirm
# against its documentation before changing them.
session = load_data(
    DATA_FILE,
    preprocess="embeddings",
    stopwords=STOPWORD_SOURCES,
    clean_text="letters digits",
    encode_categories=True,
    embeddings_size=EMBEDDINGS_SIZE,
    embeddings_max_length=EMBEDDINGS_MAX_LENGTH,
    shuffle_data=True,
)

[1m[33mInfo: [0mClean texts keeping letters and digits
[1m[33mInfo: [0mCategories encoded
[1m[33mInfo: [0mLoad 180 stopwords from [36menglish, stopwords/custom.csv[0m
[1m[33mInfo: [0mVocabulary size is [34m53284[0m
[1m[33mInfo: [0m[34m91.67%[0m of sequences covered by max length [34m7500[0m
[1m[33mInfo: [0mLoaded [34m300[0m examples in [34m2[0m categories


#### Show data stats

In [4]:
# Summarise the loaded session: examples per category plus the overall
# example, feature and category counts.
data_stats(session)

0,1,2,3,4,5,6,7,8,9,10,11
Category,No,%,Σ%,Category,No,%,Σ%,Category,No,%,Σ%
Programming (1),150,50.0%,50.0%,Games (0),150,50.0%,100.0%,,,,
Examples:,300,,,Features:,7500,,,Categories:,2,,


## Define Keras model
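Define a factory function that builds one of two Keras architectures on top of an embedding layer: a dense feed-forward network (`nn`) or a 1D convolutional network (`convnet`).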

In [5]:
def get_model(session, mode, n_categories=2):
    """Build an (uncompiled) Keras text classifier on top of an embedding layer.

    Two architectures are available:

    * ``"nn"``      - Embedding -> Flatten -> Dense(128) -> Dropout -> softmax
    * ``"convnet"`` - Embedding -> Conv1D(64, 8) -> MaxPooling1D -> Flatten
                      -> Dense(128) -> Dropout -> softmax

    Args:
        session: Mapping that provides the ``"vocab_size"``,
            ``"embeddings_size"`` and ``"max_length"`` entries used to size
            the embedding layer.
        mode: Architecture selector, ``"nn"`` or ``"convnet"``.
        n_categories: Number of units in the softmax output layer. Defaults
            to 2, the value that was previously hard-coded.

    Returns:
        A ``Sequential`` model, or ``None`` (after printing a message) when
        ``mode`` is not recognised.
    """
    if mode not in ("nn", "convnet"):
        # An f-string formats any value; the previous "..." + mode raised
        # TypeError for non-string modes instead of reporting them.
        print(f"Unknown mode {mode}")
        return None

    model = Sequential()
    # Both architectures share the same embedding front end.
    model.add(Embedding(input_dim=session["vocab_size"], output_dim=session["embeddings_size"], input_length=session["max_length"]))

    if mode == "convnet":
        # Only the convnet inserts a convolution + pooling stage before flattening.
        model.add(Conv1D(filters=64, kernel_size=8, padding="same", activation="relu", kernel_initializer="he_uniform"))
        model.add(MaxPooling1D())

    # Shared classifier head.
    model.add(Flatten())
    model.add(Dense(128, activation="relu", kernel_initializer="he_uniform"))
    model.add(Dropout(0.2))
    model.add(Dense(n_categories, activation="softmax"))
    return model

In [6]:
# Hold out 10% of the examples as a test set, stratified by category.
# The seed is passed explicitly (presumably so the split is repeatable -
# confirm against mltlk's documentation).
TEST_FRACTION = 0.1
SPLIT_SEED = 4

split_data(session, test_size=TEST_FRACTION, seed=SPLIT_SEED, stratify=True)

[1m[33mInfo: [0mSplit data using [34m90%[0m training data ([34m270[0m samples) and [34m10%[0m test data ([34m30[0m samples) with seed [34m4[0m and stratify


## Neural Network model

In [7]:
# Training settings for the dense network, grouped so they read as one unit.
nn_fit_settings = {
    "epochs": 8,
    "batch_size": 32,
    "loss": "categorical_crossentropy",
}

# Build the dense ("nn") model and evaluate it on the train/test split,
# including the per-category breakdown.
# NOTE(review): the meaning of `reload=False` is defined by mltlk - confirm.
evaluate_model(
    get_model(session, "nn"),
    session,
    reload=False,
    mode="split",
    categories=True,
    **nn_fit_settings,
    optimizer=Adam(learning_rate=0.01),
)

[1m[33mInfo: [0mBuilding and evaluating model using train-test split took [34m45.74[0m sec



0,1
Results,
Accuracy:,90.00%
F1-score:,89.90%
Precision:,91.67%
Recall:,90.00%





0,1,2,3
Category,Accuracy,Correct,n
Programming (1),100.00%,15,15
Games (0),80.00%,12,15
Programming (1),25.00%,3,





## ConvNet model

In [9]:
# Training settings for the convolutional network, grouped so they read as one unit.
convnet_fit_settings = {
    "epochs": 8,
    "batch_size": 32,
    "loss": "categorical_crossentropy",
}

# Build the "convnet" model and evaluate it on the train/test split,
# including the per-category breakdown.
# NOTE(review): the meaning of `reload=True` is defined by mltlk - confirm.
evaluate_model(
    get_model(session, "convnet"),
    session,
    reload=True,
    mode="split",
    categories=True,
    **convnet_fit_settings,
    optimizer=Adam(learning_rate=0.01),
)

[1m[33mInfo: [0mBuilding and evaluating model using train-test split took [34m48.37[0m sec



0,1
Results,
Accuracy:,90.00%
F1-score:,89.99%
Precision:,90.18%
Recall:,90.00%





0,1,2,3
Category,Accuracy,Correct,n
Programming (1),93.33%,14,15
Games (0),7.14%,1,
Games (0),86.67%,13,15
Programming (1),15.38%,2,



