# Setup

#### Dependencies

In [1]:
pip install pydatasci

Collecting pydatasci
  Using cached https://files.pythonhosted.org/packages/18/5f/4e07fc0aae654eac617ad887efe1676e87dac4570c9d7ebfa19462f79a9a/pydatasci-0.0.55-py3-none-any.whl
Installing collected packages: pydatasci
Successfully installed pydatasci-0.0.55
You should consider upgrading via the 'pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.


In [5]:
#! jupyter labextension install jupyterlab-plotly

In [1]:
%%capture
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import *

from keras.metrics import *
from keras.models import Sequential
from keras.callbacks import History
from keras.layers import Dense, Dropout

#### Create the database.

In [8]:
# One-time local setup for PyDataSci. The recorded output below shows each
# step reporting the path it created.
import pydatasci as pds
# Creates the application folder (reported as ".../pydatasci/" in the output).
pds.create_folder()
# Creates the settings file config.json inside that folder (per the output).
pds.create_config()

# NOTE(review): imported here rather than with the other imports; presumably
# aidb expects the config file to exist already -- confirm before moving it.
from pydatasci import aidb
# Creates the database file aidb.sqlite3 and its tables (per the output).
aidb.create_db()


=> Welcome to PyDataSci.
To get started, run `pds.create_folder()` followed by `pds.create_config()` in Python shell.


=> Info - it appears the following folder does not exist on your system:
/Users/layne/Library/Application Support/pydatasci/


=> Fix - you can attempt to fix this by running `pds.create_folder()`.


=> Success - created folder at file path:
/Users/layne/Library/Application Support/pydatasci/


=> Fix - now try running `pds.create_config()` again.


=> Success - the following file path already exists on your system:
/Users/layne/Library/Application Support/pydatasci/


=> Success - created config file for settings at path:
/Users/layne/Library/Application Support/pydatasci/config.json


=> Success - created database file for machine learning metrics at path:
/Users/layne/Library/Application Support/pydatasci/aidb.sqlite3


=> Success - created the following tables within database:
['algorithm', 'batch', 'dataset', 'featureset', 'fold', 'foldset', 'hyperparamcombo', '

---

# Data

#### Ingest file, dataframe, or array.

In [4]:
import os
# Resolve the Desktop folder from the current user's home directory instead of
# a hard-coded /Users/<name> path, so the relative 'pydatasci/data/iris.csv'
# path read afterwards also resolves on other machines.
os.chdir(os.path.expanduser('~/Desktop'))

In [11]:
# Load the iris CSV (path is relative to the current working directory) and
# preview its first five rows.
df = pd.read_csv(filepath_or_buffer='pydatasci/data/iris.csv')
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [13]:
# Register the dataframe with aidb, requesting 'tsv' format with gzip enabled.
# NOTE(review): the recorded output of this cell is a TypeError ("object of
# type 'NoneType' has no len()"); its cause is not visible in this notebook.
dataset = aidb.Dataset.from_pandas(
    dataframe=df,
    file_format='tsv',
    name='tab-separated plants',
    perform_gzip=True,
)

TypeError: object of type 'NoneType' has no len()

#### Select features and labels.

In [None]:
# Column of the dataset that holds the value to predict.
label_name = 'target'

# The label is built from that single column ...
label = dataset.make_label(columns=[label_name])

# ... and the featureset from every column except it.
featureset = dataset.make_featureset(exclude_columns=[label_name])

#### Assign sample IDs to training, validation, and test splits.

In [None]:
# Tie the featureset to its label and request test / validation splits of
# size 0.20 and 0.12 (presumably fractions of the samples -- confirm).
splitset = featureset.make_splitset(
    label_id=label.id,
    size_test=0.20,
    size_validation=0.12,
)

In [None]:
# Request 6 folds from the splitset.
# NOTE(review): the Batch cell further down passes foldset_id=None, so this
# foldset is not used there as the notebook currently stands.
foldset = splitset.make_foldset(fold_count=6)

In [None]:
# Scaler for the feature columns; handed to Preprocess.from_splitset below as
# `encoder_features`.
encoder_features = StandardScaler()

In [None]:
# One-hot encoder for the labels; handed to Preprocess.from_splitset below as
# `encoder_labels`.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`
# -- confirm against the installed version.
encoder_labels = OneHotEncoder(sparse=False)

In [None]:
# Attach the feature scaler and the label encoder to the splitset.
preprocess = aidb.Preprocess.from_splitset(
    splitset_id=splitset.id,
    description="Scale features and OHE labels.",
    encoder_features=encoder_features,
    encoder_labels=encoder_labels,
)

---

# Algorithm

### Define model to be trained.

In [None]:
def function_model_build(**hyperparameters):
    """Build and compile the dense classifier for one hyperparameter combination.

    Reads ``hyperparameters['l2_neuron_count']`` (width of the second layer)
    and ``hyperparameters['optimizer']`` (passed to ``compile``).

    Returns:
        The compiled ``Sequential`` model: 4 inputs -> 13 relu units ->
        ``l2_neuron_count`` relu units -> 3-way softmax layer named 'output'.
    """
    second_layer_units = hyperparameters['l2_neuron_count']
    stack = (
        Dense(13, input_shape=(4,), activation='relu', kernel_initializer='he_uniform'),
        Dense(second_layer_units, activation='relu', kernel_initializer='he_uniform'),
        Dense(3, activation='softmax', name='output'),
    )

    model = Sequential()
    for layer in stack:
        model.add(layer)

    compile_options = {
        'loss': 'categorical_crossentropy',
        'optimizer': hyperparameters['optimizer'],
        'metrics': ['accuracy'],
    }
    model.compile(**compile_options)
    return model

In [None]:
def function_model_train(model, samples_train, samples_evaluate, **hyperparameters):
    """Fit ``model`` on the training samples and return the same model object.

    Args:
        model: Object exposing ``fit``.
        samples_train: Mapping with ``"features"`` and ``"labels"`` to fit on.
        samples_evaluate: Mapping with ``"features"`` and ``"labels"`` passed
            to ``fit`` as ``validation_data``.
        **hyperparameters: Must contain ``'epochs'``.

    Returns:
        ``model`` after ``fit`` has been called on it.
    """
    validation_pair = (samples_evaluate["features"], samples_evaluate["labels"])
    fit_options = {
        'validation_data': validation_pair,
        'verbose': 0,
        'batch_size': 3,
        'epochs': hyperparameters['epochs'],
        'callbacks': [History()],
    }
    model.fit(samples_train["features"], samples_train["labels"], **fit_options)
    return model

In [None]:
def function_model_predict(model, samples_predict):
    """Predict on a set of samples, returning class indices and raw scores.

    Args:
        model: Object exposing ``predict(features)``.
        samples_predict: Mapping with a ``'features'`` entry to predict on.

    Returns:
        ``(predictions, probabilities)``: ``probabilities`` is whatever
        ``model.predict`` returned, and ``predictions`` is the index of its
        largest value along the last axis.
    """
    class_scores = model.predict(samples_predict['features'])
    return np.argmax(class_scores, axis=-1), class_scores

In [None]:
def function_model_loss(model, samples_evaluate):
    """Return the loss of ``model`` on the given samples.

    ``model.evaluate`` is called with ``verbose=0``. Its result is treated as
    either a bare scalar loss or a sequence whose first element is the loss,
    so the function works whether the model was compiled with no metrics,
    one metric (as in this notebook), or several.

    Args:
        model: Object exposing ``evaluate(features, labels, verbose=...)``.
        samples_evaluate: Mapping with ``'features'`` and ``'labels'``.

    Returns:
        The loss value reported by ``model.evaluate``.
    """
    results = model.evaluate(
        samples_evaluate['features'], samples_evaluate['labels'], verbose=0
    )
    # A 0-dimensional result is the loss itself; otherwise the loss comes first.
    if np.ndim(results) == 0:
        return results
    return results[0]

In [None]:
# Candidate values per hyperparameter; the keys match the names read by
# function_model_build ('l2_neuron_count', 'optimizer') and
# function_model_train ('epochs').
hyperparameters = dict(
    l2_neuron_count=[13, 9],
    optimizer=["adamax", "adam"],
    epochs=[60, 30],
)

### Stage the model.

In [None]:
# Register the four model functions defined above as one Keras
# multi-class classification algorithm.
algorithm = aidb.Algorithm.create(
    library="Keras",
    analysis_type="classification_multi",
    description="dense, 2 layers, medium height",
    function_model_build=function_model_build,
    function_model_train=function_model_train,
    function_model_predict=function_model_predict,
    function_model_loss=function_model_loss,
)

In [None]:
# Attach the candidate hyperparameter values to the algorithm and the
# preprocessing step.
hyperparamset = aidb.Hyperparamset.from_algorithm(
    algorithm_id=algorithm.id,
    preprocess_id=preprocess.id,
    description="experimenting with number of epochs",
    hyperparameters=hyperparameters,
)

---

# Hypertune

In [None]:
# Combine algorithm, splitset and hyperparameter set into a batch.
# No foldset is attached here; foldset.id is the value to pass to use one.
batch = aidb.Batch.from_algorithm(
    algorithm_id=algorithm.id,
    splitset_id=splitset.id,
    hyperparamset_id=hyperparamset.id,
    foldset_id=None,  # foldset.id
    only_folded_training=False,
)

In [None]:
# Start the batch's jobs with verbose output turned off.
batch.run_jobs(verbose=False)

In [None]:
# Stop the batch's jobs.
# NOTE(review): under "Run All" this executes right after run_jobs; presumably
# meant to be run by hand only -- confirm.
batch.stop_jobs()

In [None]:
# Display the statuses reported for the batch.
batch.get_statuses()

In [None]:
# Rebind `batch` to the stored Batch record with id 1.
# NOTE(review): the id is hard-coded and replaces the batch created above --
# confirm that id 1 is the intended batch.
batch = aidb.Batch.get_by_id(1)

In [None]:
# Preview the first rows of the batch metrics (presumably a pandas DataFrame,
# going by the method name -- confirm).
batch.metrics_to_pandas().head()

In [None]:
# Learning-curve plot for the first result of the first job in the batch.
batch.jobs[0].results[0].plot_learning_curve()

In [None]:
# ROC-curve plot for the first result of the first job in the batch.
batch.jobs[0].results[0].plot_roc_curve()

In [None]:
# Precision-recall plot for the first result of the first job in the batch.
batch.jobs[0].results[0].plot_precision_recall()

In [None]:
# NOTE(review): identical to the preceding cell -- looks like a leftover
# duplicate; confirm whether a different plot was intended here.
batch.jobs[0].results[0].plot_precision_recall()

In [None]:
# Confusion-matrix plot for the first result of the first job in the batch.
batch.jobs[0].results[0].plot_confusion_matrix()

In [None]:
# Batch-level performance plot with thresholds max_loss=0.40, min_metric_2=0.85.
# NOTE(review): presumably these filter which results are drawn, and which
# metric "metric_2" refers to is not visible here -- confirm against pydatasci.
batch.plot_performance(max_loss=0.40, min_metric_2=0.85)