In [None]:
import datetime
import os
import pickle

import mlflow
from catboost import CatBoostClassifier
from joblib import dump
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

: 

In [2]:

# pickle.dump(rcv1.data, open('../data/data.pickle', 'wb'))
# pickle.dump(rcv1.target, open('../data/target.pickle', 'wb'))

In [None]:
# Load the breast cancer dataset and persist features / labels as pickles
# under ../data/ so they can be read back without going through sklearn.
data = load_breast_cancer()
X, y = data.data, data.target

# exist_ok=True creates the directory when it is missing and does nothing
# when it is already there, so a single write path covers both cases.
os.makedirs('../data/', exist_ok=True)
with open('../data/data.pickle', 'wb') as data_file:
    pickle.dump(X, data_file)
with open('../data/target.pickle', 'wb') as target_file:
    pickle.dump(y, target_file)

In [None]:
# Train a CatBoost classifier on X / y and record the run in a local MLflow store.
mlflow.set_tracking_uri("./mlruns")
dataset_name = "Breast Cancer Wisconsin"
# The timestamp suffix (second resolution) gives every execution of this cell
# its own experiment name.
current_time = datetime.datetime.now().strftime("%y%m%d_%H%M%S")
experiment_name = f"{dataset_name}_{current_time}"
experiment_id = mlflow.create_experiment(experiment_name)

# One seed shared by the split and the model so the logged Accuracy / F1 are
# reproducible across runs instead of depending on a random train/test split.
random_seed = 0

with mlflow.start_run(experiment_id=experiment_id, run_name=dataset_name):
    # Dataset description logged as run parameters.
    params = {
        "dataset_name": dataset_name,
        "number of datapoint": X.shape[0],
        "number of dimensions": X.shape[1]
    }
    mlflow.log_params(params)

    # Hold out 20% of the samples for evaluation.
    train_X, test_X, train_y, test_y = train_test_split(
        X, y, test_size=0.2, shuffle=True, random_state=random_seed
    )

    model = CatBoostClassifier(verbose=0, random_state=random_seed)
    model.fit(train_X, train_y)

    # Evaluate on the held-out split and log both metrics to the run.
    y_predict = model.predict(test_X)
    mlflow.log_metrics({'Accuracy': accuracy_score(test_y, y_predict),
                        'F1 Score': f1_score(test_y, y_predict)})

    # Persist the fitted model, named after the experiment id.
    os.makedirs('../model/', exist_ok=True)
    dump(model, f'../model/{experiment_id}.joblib')