In [None]:
# parameters that will be injected by machine learning platform
# Every name defined here is read as a plain global by the cells that follow.

# Identifiers used only to build metrics_feedback_url below.
project_id = "457e13bc-d47e-4a2e-be13-bcd47eba2ea5"
training_id = "c81fbe3a-a06a-4704-9fbe-3aa06a07047c"
# List of data set paths; the data loading cell opens only the first entry.
data_set_source = [ "iris.csv" ]
# Feature descriptors keyed by column name. The feature importance cell iterates
# over the keys; the "type" and "description" values are not read by any cell
# visible in this notebook.
# NOTE(review): every feature is declared as "string" although the loader parses
# the CSV as numbers -- confirm what the platform expects here.
feature_set = {
  "sepal width (cm)": {
    "type": "string",
    "description": None
  },
  "target_iris_type": {
    "type": "string",
    "description": None
  },
  "petal width (cm)": {
    "type": "string",
    "description": None
  },
  "petal length (cm)": {
    "type": "string",
    "description": None
  },
  "sepal length (cm)": {
    "type": "string",
    "description": None
  }
}
# Directory that receives the extra "testing.json" output written by the last cell.
output_dir = "out"
# Output file names; the cells below open them exactly as given (relative paths).
training_metrics_file = "training.metrics"
cross_validation_metrics_file = "cross_validation.metrics"
# NOTE(review): testing_metrics_file is only printed by the visible cells, never written.
testing_metrics_file = "testing.metrics"
feature_importance_file = "feature.importance"
model_file = "model.pkl"
# NOTE(review): this URL is only printed by the visible cells; no request is sent to it here.
metrics_feedback_url = f"http://localhost:8080/projects/{project_id}/trainings/{training_id}/metrics"
# Passed as test_size to train_test_split (fraction of rows held out).
test_data_proportion = 0.4
# Passed as n_neighbors to KNeighborsClassifier.
num_neighbors = 1

In [None]:
# Echo the injected parameters so they show up in the executed notebook's output.
# Labels are kept exactly as they were (some carry a trailing " =", some do not).
for label, value in (
    ("project_id =", project_id),
    ("training_id =", training_id),
    ("data_set_source =", data_set_source),
    ("output_dir", output_dir),
    ("training_metrics_file", training_metrics_file),
    ("cross_validation_metrics_file", cross_validation_metrics_file),
    ("testing_metrics_file", testing_metrics_file),
    ("model_file", model_file),
    ("metrics_feedback_url =", metrics_feedback_url),
    ("test_data_proportion =", test_data_proportion),
    ("num_neighbors =", num_neighbors),
):
    print(label, value)

In [None]:
# define a function that prints the iris' classification based on the algorithm's output
def classifyiris(z):
    """Print the iris species named by the first element of ``z``.

    Args:
        z: Indexable sequence of predicted class labels; only ``z[0]`` is read.
            A label equal to 0 prints setosa, a label equal to 1 prints
            versicolor, and any other value prints virginica.

    Returns:
        None. The message is written to standard output.
    """
    messages = (
        "The iris is setosa.\n",
        "The iris is versicolor.\n",
        "The iris is virginica.\n",
    )
    label = z[0]
    # Anything that is neither 0 nor 1 falls through to the last message.
    position = 0 if label == 0 else (1 if label == 1 else 2)
    print(messages[position])

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Load the data set: the first line of the CSV is a header, every remaining
# line is parsed as comma-separated numbers.
# The context manager closes the file handle even when parsing fails; a bare
# open() would leave the handle open for the lifetime of the kernel.
with open(data_set_source[0]) as f:
    f.readline()  # skip the header
    iris = np.loadtxt(fname=f, delimiter=',')

# Columns 0-3 are used as the features, column 4 as the target label.
iris_data = iris[:, 0:4]
iris_target = iris[:, 4:5].reshape(-1)  # flatten the (n, 1) slice to shape (n,)

# Hold out `test_data_proportion` of the rows; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    iris_data,
    iris_target,
    test_size=test_data_proportion,
    random_state=0,
)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# using the K Nearest Neighbor Algorithm
knn = KNeighborsClassifier(n_neighbors=num_neighbors)
knn.fit(X_train, y_train)

# Classify one hand-written sample as a quick sanity check of the fitted model.
# The four values follow the order of the first four CSV columns used as features.
sample_measurements = [[3, 5, 4, 2]]
# Predict once and reuse the result for both the printout and the species message
# (calling predict a second time on the same literal only repeated the work).
z = knn.predict(sample_measurements)
print("Using the k nearest neighbor algorithm =", z)
classifyiris(z)


In [None]:
from sklearn import metrics

# get training metrics
# Predictions on the same rows the model was fitted on.
z = knn.predict(X_train)
# scikit-learn metric functions take (y_true, y_pred) in that order. Accuracy is
# symmetric, but precision is not: with the arguments swapped the reported
# "precision" is actually the recall, so the true labels must come first.
training_accuracy = metrics.accuracy_score(y_train, z)
training_precision_macro = metrics.precision_score(y_train, z, average='macro')
training_recall_macro = metrics.recall_score(y_train, z, average='macro')
training_f1_score_macro = metrics.f1_score(y_train, z, average='macro')
# Rows are true labels, columns are predicted labels.
training_confusion_matrix = metrics.confusion_matrix(y_train, z)
print("training accuracy =", training_accuracy)
print("training precision =", training_precision_macro)
print("training recall =", training_recall_macro)
print("training f1 score =", training_f1_score_macro)
print("training confusion matrix = \n", training_confusion_matrix)

In [None]:
import numpy as np
import json

# saving training metrics to machine learning platform
# The confusion matrix is flattened to its comma-separated text form so it can be
# stored as a plain JSON string.
training_confusion_matrix_string = np.array2string(
    training_confusion_matrix,
    separator=',',
)

# Assemble the payload field by field; insertion order defines the key order in the file.
metrics_payload = {'type': 'TRAINING'}
metrics_payload['accuracy'] = training_accuracy
metrics_payload['precisionMacro'] = training_precision_macro
metrics_payload['recallMacro'] = training_recall_macro
metrics_payload['f1Macro'] = training_f1_score_macro
metrics_payload['confusionMatrix'] = training_confusion_matrix_string

# Write the payload as indented JSON to the training metrics file.
with open(training_metrics_file, 'w') as metrics_out:
    metrics_out.write(json.dumps(metrics_payload, ensure_ascii=False, indent=4))

In [None]:
# get cross validation metrics
# These are computed on the held-out rows (X_test / y_test) from the earlier split.
z = knn.predict(X_test)
# scikit-learn metric functions take (y_true, y_pred) in that order. Accuracy is
# symmetric, but precision is not: with the arguments swapped the reported
# "precision" is actually the recall, so the true labels must come first.
cv_accuracy = metrics.accuracy_score(y_test, z)
cv_precision_macro = metrics.precision_score(y_test, z, average='macro')
cv_recall_macro = metrics.recall_score(y_test, z, average='macro')
cv_f1_score_macro = metrics.f1_score(y_test, z, average='macro')
# Rows are true labels, columns are predicted labels.
cv_confusion_matrix = metrics.confusion_matrix(y_test, z)
print("cross validation accuracy", cv_accuracy)
print("cross validation precision =", cv_precision_macro)
print("cross validation recall", cv_recall_macro)
print("cross validation f1 score =", cv_f1_score_macro)
# This matrix belongs to the held-out data, so it must not be labelled "training".
print("cross validation confusion matrix = \n", cv_confusion_matrix)

In [None]:
# saving cross validation metrics to machine learning platform
# The confusion matrix is flattened to its comma-separated text form so it can be
# stored as a plain JSON string.
cv_confusion_matrix_string = np.array2string(
    cv_confusion_matrix,
    separator=',',
)

# Assemble the payload field by field; insertion order defines the key order in the file.
# This rebinds metrics_payload, replacing the payload built for the training metrics.
metrics_payload = {'type': 'CROSS_VALIDATION'}
metrics_payload['accuracy'] = cv_accuracy
metrics_payload['precisionMacro'] = cv_precision_macro
metrics_payload['recallMacro'] = cv_recall_macro
metrics_payload['f1Macro'] = cv_f1_score_macro
metrics_payload['confusionMatrix'] = cv_confusion_matrix_string

# Write the payload as indented JSON to the cross validation metrics file.
with open(cross_validation_metrics_file, 'w') as metrics_out:
    metrics_out.write(json.dumps(metrics_payload, ensure_ascii=False, indent=4))

In [None]:
import random

# saving feature importance
# NOTE(review): the values are uniform random numbers in [0.0, 1.0], one per key of
# feature_set. They are not derived from the fitted model, so they look like
# placeholders -- confirm before anyone interprets them.
feature_importance = {
    feature_name: random.uniform(0.0, 1.0) for feature_name in feature_set
}

# ensure_ascii=False writes non-ASCII feature names verbatim, so the file encoding
# is pinned to UTF-8 instead of depending on the platform default.
with open(feature_importance_file, 'w', encoding='utf-8') as feature_importance_out:
    json.dump(feature_importance, feature_importance_out, ensure_ascii=False, indent=4)

In [None]:
# saving the model
from joblib import dump

# Serialize the fitted k-nearest-neighbour classifier to the configured model file.
dump(value=knn, filename=model_file)

In [None]:
# saving arbitrary output files.
import os

# Create the output directory first: open() does not create missing parent
# directories, so a fresh run would fail with FileNotFoundError otherwise.
os.makedirs(output_dir, exist_ok=True)

# metrics_payload is whatever payload was assigned last; in the cells above that
# is the cross validation payload.
with open(f'{output_dir}/testing.json', 'w') as testing_json:
    json.dump(metrics_payload, testing_json, ensure_ascii=False, indent=4)