In [73]:
import os
import pandas as pd
from pathlib import Path
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from getpass import getpass
import neptune.new as neptune
import neptune.new.integrations.sklearn as npt_utils

In [74]:
# Ask for the Neptune API token interactively only when the environment does
# not already hold a non-empty value, so the secret is never written into the
# notebook itself.
if not os.environ.get("NEPTUNE_API_TOKEN"):
    os.environ["NEPTUNE_API_TOKEN"] = getpass("Enter your Neptune API token: ")

In [75]:
# Current user's home directory as a plain string; used to build input paths.
home_dir = os.fspath(Path.home())

In [76]:
# Load the training dataset from the parquet file in the home directory.
training_df = pd.read_parquet(path=f"{home_dir}/loan_table_training.parquet")

In [77]:
# Hyperparameters passed to the decision tree and logged to the Neptune run.
params = dict(max_depth=10, max_leaf_nodes=10)

In [78]:
# Start a Neptune run in the target project and attach this notebook file as
# the run's source snapshot.
run = neptune.init_run(
    project="zsotnich/mltest",
    source_files=["neptune_experiment.ipynb"],
)

https://app.neptune.ai/zsotnich/mltest/e/MLTES-26
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api/run#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


In [79]:
# Store the hyperparameter dict on the run under the "model/parameters" field.
run["model/parameters"] = params

In [112]:
# Build the decision tree from the hyperparameters in `params`.
# random_state is pinned (same seed as the train/test split) because the tree
# permutes features at each split, so an unseeded estimator is not guaranteed
# to reproduce the same model between runs. A "random_state" entry in `params`
# still takes precedence over this default.
classifier = tree.DecisionTreeClassifier(**{"random_state": 1234, **params})

In [93]:
# Name of the label column.
target = "loan_status"

# Feature frame: every column except the label and the columns listed below.
x = training_df.drop(
    columns=[
        target,
        "event_timestamp",
        "created_timestamp",
        "loan_id",
        "zipcode",
        "dob_ssn",
    ]
)

# Put the feature columns in sorted (alphabetical) order so the column layout
# is deterministic regardless of how the source table was written.
X = x.reindex(columns=sorted(x.columns))

# Label vector.
y = training_df[target]

In [82]:
# Hold out 40% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1234,
    test_size=0.4,
)

In [83]:
# Save the train and test datasets and register their versions with Neptune.
#
# Features and target are concatenated column-wise so each file holds the
# complete split. Passing the target as the second positional argument of
# pd.DataFrame would instead be interpreted as `index`, reindexing the
# feature rows by label value and producing a file that is not the dataset.
pd.concat([X_train, y_train], axis=1).to_parquet("train_df.parquet")
pd.concat([X_test, y_test], axis=1).to_parquet("test_df.parquet")

# track_files records the files as versioned artifacts under the run.
run["datasets/train"].track_files("train_df.parquet")
run["datasets/test"].track_files("test_df.parquet")

In [84]:
# Fit on the training split with the feature columns in sorted order.
classifier.fit(X_train[sorted(X_train.columns)], y_train)

In [85]:
# Per-class probabilities for both splits; column i corresponds to
# classifier.classes_[i].
y_train_pred = classifier.predict_proba(X_train)
y_test_pred = classifier.predict_proba(X_test)

# argmax gives the *position* of the most likely class, not the label itself.
# Map positions back through classes_ so the F1 score stays correct even when
# the labels are not exactly 0..k-1 (e.g. strings or {1, 2}).
y_train_labels = classifier.classes_[y_train_pred.argmax(axis=1)]
y_test_labels = classifier.classes_[y_test_pred.argmax(axis=1)]

# Macro-averaged F1: unweighted mean of the per-class F1 scores.
train_f1 = f1_score(y_train, y_train_labels, average="macro")
test_f1 = f1_score(y_test, y_test_labels, average="macro")

# Log both metrics to the Neptune run.
run["train/f1"] = train_f1
run["test/f1"] = test_f1

In [None]:
# Estimator's built-in score on the held-out split, logged to the run.
score = classifier.score(X=X_test, y=y_test)
run["test/score"] = score

In [86]:
# Build the Neptune sklearn-integration classifier summary from the fitted
# model and both splits, and attach it to the run under "cls_summary".
run["cls_summary"] = npt_utils.create_classifier_summary(
    classifier,
    X_train,
    X_test,
    y_train,
    y_test,
)

In [87]:
# Stop the run explicitly: in a notebook it is otherwise only stopped when the
# kernel is terminated. Nothing can be logged to `run` after this cell.
run.stop()

Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 42 operations to synchronize with Neptune. Do not kill this process.
All 42 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/zsotnich/mltest/e/MLTES-26


In [118]:
from sklearn_genetic import GASearchCV
from sklearn.model_selection import train_test_split, StratifiedKFold

In [120]:
# 3-fold stratified cross-validation for the hyperparameter search.
# shuffle=True draws a random permutation, so random_state is pinned (same
# seed as the train/test split) to make the folds reproducible.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=1234)

In [None]:
# Search space handed to GASearchCV.
# NOTE(review): the grid is empty, so the search has no hyperparameters to
# evolve — presumably this still needs to be filled in; confirm before running.
param_grid = dict()

In [None]:
# Configure the genetic-algorithm hyperparameter search around the decision
# tree: accuracy scoring, the stratified CV splitter, all available cores.
evolved_estimator = GASearchCV(
    estimator=classifier,
    param_grid=param_grid,
    scoring="accuracy",
    cv=cv,
    n_jobs=-1,
    verbose=True,
)

In [119]:
! pip install sklearn-genetic-opt

