In [None]:
# res = compare_experiments_barplot(
#     experiment_paths=[experiment_output_dir],
#     title="TARS eval.",
# )


## WANDB dev

In [42]:
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
import wandb
from skmultilearn.model_selection import iterative_train_test_split


def label_dictionary_to_label_mat(label_dictionary_list, thresh=0.75):
    """Turn a sequence of ``{label: score}`` dicts into a 0/1 indicator frame.

    Args:
        label_dictionary_list: iterable of dictionaries mapping label name to
            a numeric score.
        thresh: scores greater than or equal to this value become 1, all
            others (including missing entries) become 0.

    Returns:
        pd.DataFrame with one column per label and one row per dictionary,
        holding integer 0/1 values.
    """
    score_frame = pd.DataFrame.from_records(list(label_dictionary_list))
    meets_threshold = score_frame.ge(thresh)
    return meets_threshold.astype(int)


def label_mat_to_label_dictionary(label_mat):
    """Convert a label indicator frame back into a list of per-row dicts.

    Args:
        label_mat: DataFrame with one column per label.

    Returns:
        A list with one ``{column: value}`` dictionary per row, in row order.
    """
    rows_by_index = label_mat.to_dict(orient="index")
    return [row for row in rows_by_index.values()]


def create_multi_label_train_test_splits(
    df: pd.core.frame.DataFrame,
    label_col: str,
    test_size=0.25,
):
    """Split a multi-label frame into train/test parts with iterative stratification.

    The label column holds ``{label: score}`` dictionaries (or their string
    representation). Scores are thresholded into a 0/1 matrix, the rows are
    split with ``iterative_train_test_split`` and the label matrix is then
    converted back to dictionaries, so the returned label values are 0/1
    rather than the original scores.

    Args:
        df: input frame; it is not modified.
        label_col: name of the column holding the label dictionaries.
        test_size: proportion passed through to ``iterative_train_test_split``.

    Returns:
        ``(train, test)`` DataFrames with the same columns as ``df``, the
        label column placed last.
    """
    # NOTE(review): eval() executes arbitrary code found in the column. It is
    # kept to preserve parsing behaviour; consider ast.literal_eval if the
    # dataset is not fully trusted.
    parsed_labels = df[label_col].apply(
        lambda x: eval(x) if isinstance(x, str) else x
    )  # string > dict
    # Work on a new frame so the caller's DataFrame is left untouched.
    df = df.assign(**{label_col: parsed_labels})

    # threshold, iteratively split
    y_df = label_dictionary_to_label_mat(df[label_col])
    y_cols = list(y_df.columns)
    x_df = df.drop(label_col, axis=1)
    x_cols = list(x_df.columns)

    x_train, y_train, x_test, y_test = iterative_train_test_split(
        x_df.values, y_df.astype(int).values, test_size=test_size
    )

    # convert back to label object form
    y_train = label_mat_to_label_dictionary(
        pd.DataFrame(y_train, columns=y_cols))
    y_test = label_mat_to_label_dictionary(
        pd.DataFrame(y_test, columns=y_cols))

    # re-stack x/y
    train = _restack_features_and_labels(x_train, y_train, x_cols, label_col)
    test = _restack_features_and_labels(x_test, y_test, x_cols, label_col)
    return train, test


def _restack_features_and_labels(features, label_dicts, feature_cols, label_col):
    """Rebuild a split frame from a feature array and a list of label dicts."""
    # Column names go straight to the constructor; the set_axis(inplace=...)
    # keyword used previously no longer exists in pandas 2.x.
    return pd.DataFrame(
        np.column_stack((features, label_dicts)),
        columns=feature_cols + [label_col],
    )


def log_dataframe(run, df, name, description):
    """Log a DataFrame to a wandb run as a ``dataset`` artifact.

    Args:
        run: run object exposing ``log_artifact``.
        df: DataFrame to upload; it is wrapped in a ``wandb.Table``.
        name: used both as the artifact name and as the table's entry name.
        description: free-text artifact description.
    """
    table = wandb.Table(dataframe=df)
    artifact = wandb.Artifact(name, type="dataset", description=description)
    artifact.add(table, name=name)
    run.log_artifact(artifact)


def create_classification_report(test, test_pred):
    """Build a per-label classification report as a tidy DataFrame.

    Args:
        test: frame whose ``CONFIG["label_col"]`` column holds the true label
            dictionaries.
        test_pred: frame whose ``pred`` column holds the predicted label
            dictionaries.

    Returns:
        DataFrame with one row per report entry (labels plus the aggregate
        rows produced by ``classification_report``), the entry name in a
        ``label`` column and the metrics as the remaining columns.

    Raises:
        ValueError: if the true and predicted label sets differ.
    """
    y_pred = label_dictionary_to_label_mat(test_pred.pred)
    label_names = y_pred.columns.tolist()
    y_true = label_dictionary_to_label_mat(test[CONFIG["label_col"]])

    # classification_report compares the two matrices column by column, so
    # both must contain the same labels in the same order; dictionary key
    # order alone does not guarantee that.
    if set(y_true.columns) != set(label_names):
        raise ValueError(
            "True and predicted label sets differ: "
            f"{sorted(map(str, y_true.columns))} vs {sorted(map(str, label_names))}"
        )
    y_true = y_true[label_names]

    class_report_dict = classification_report(
        y_true,
        y_pred,
        target_names=label_names,
        output_dict=True,
    )
    return (
        pd.DataFrame(class_report_dict)
        .T
        .reset_index()
        .rename(columns={"index": "label"})
    )


def create_slim_class_report(class_report, label_names=None):
    """Reduce a classification report to per-label F1 and support.

    Args:
        class_report: frame with a ``label`` column plus ``f1-score`` and
            ``support`` columns, as returned by
            ``create_classification_report``.
        label_names: optional collection of labels to keep. When omitted,
            every row except the aggregate rows that
            ``classification_report`` adds (accuracy and the averages) is
            kept.

    Returns:
        ``{label: {"f1-score": ..., "support": ...}}``.
    """
    # The previous query referenced ``@label_names``, a name that is not
    # defined in this function, so it only worked when a global of that name
    # happened to exist in the kernel.
    if label_names is None:
        aggregate_rows = {
            "accuracy",
            "micro avg",
            "macro avg",
            "weighted avg",
            "samples avg",
        }
        keep = ~class_report["label"].isin(aggregate_rows)
    else:
        keep = class_report["label"].isin(list(label_names))

    return (
        class_report.loc[keep, ["label", "f1-score", "support"]]
        .set_index("label")
        .to_dict(orient="index")
    )


In [6]:
from pathlib import Path

import pandas as pd
import yaml

# Load the training configuration from its (hard-coded, machine-specific) path.
config_path = Path(
    "/Users/samhardyhey/Desktop/blog/blog-multi-label/training_config.yaml"
)
CONFIG = yaml.safe_load(config_path.read_bytes())

# 1.1 create splits
df = pd.read_csv(CONFIG["dataset"])
split_options = {
    "label_col": CONFIG["label_col"],
    "test_size": CONFIG["test_size"],
}
# First pass: full dataset -> train / held-out.
train, test = create_multi_label_train_test_splits(df, **split_options)
# Second pass: the held-out part is split again, with the same test_size,
# into the final test and dev sets.
test, dev = create_multi_label_train_test_splits(test, **split_options)

# # 1.2 log splits
# with wandb.init(
#     project=CONFIG["wandb_project"],
#     name="reddit_aus_finance",
#     group=CONFIG["wandb_group"],
#     entity="cool_stonebreaker",
# ) as run:
#     log_dataframe(run, train, "train_split", "Train split")
#     log_dataframe(run, dev, "dev_split", "Dev split")
#     log_dataframe(run, test, "test_split", "Test split")

## Dictionary classifier

In [52]:
# Show the 'model' entry of every model listed in the config.
# NOTE(review): the bare expression below is only echoed when the notebook is
# set to display every expression result (presumably
# InteractiveShell.ast_node_interactivity = "all") - confirm.
for model in CONFIG['models']:
    model['model']

'dictionary_classifier'

'sklearn_svm'

'flair_tars'

In [55]:
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from clear_bow.classifier import DictionaryClassifier


def fit_and_log_dictionary_classifier(train, dev, test, model_config):
    """Evaluate a dictionary classifier on the test split and log it to wandb.

    Args:
        train: training split; unused here, kept so every ``fit_and_log_*``
            function shares the same signature.
        dev: dev split; unused here for the same reason.
        test: test split with the ``CONFIG["text_col"]`` and
            ``CONFIG["label_col"]`` columns.
        model_config: config entry providing ``model``, ``classifier_type``
            and ``label_dictionary``.

    Side effects:
        Opens a wandb run, logs the test predictions as an artifact, logs the
        slim per-label report and writes the weighted-average F1 and support
        to the run summary.
    """
    with wandb.init(
        project=CONFIG["wandb_project"],
        name=model_config['model'],
        group=CONFIG["wandb_group"],
        entity=CONFIG["wandb_entity"],
    ) as run:
        wandb.config.model = model_config['model']
        wandb.config.group = CONFIG["wandb_group"]

        # instantiate (no fit call is made for this classifier)
        dc = DictionaryClassifier(
            classifier_type=model_config["classifier_type"],
            label_dictionary=model_config["label_dictionary"],
        )

        # predict/evaluate
        test_preds = test.assign(
            pred=lambda x: x[CONFIG["text_col"]].apply(dc.predict_single))
        class_report = create_classification_report(test, test_preds)
        # The slim report is derived from the full report; calling
        # create_classification_report again here (as before) fails because
        # it needs two arguments.
        class_report_slim = create_slim_class_report(class_report)

        # log
        log_dataframe(run, test_preds, "test_preds", "Test predictions")
        run.log(class_report_slim)
        weighted_avg = class_report.query('label == "weighted avg"')
        run.summary["test_f1"] = weighted_avg['f1-score'].iloc[0]
        run.summary["test_support"] = weighted_avg['support'].iloc[0]


def fit_and_log_linear_svc(train, dev, test, model_config):
    """Fit a TF-IDF + linear SVC multi-label pipeline and log it to wandb.

    The pipeline is fitted on train and dev combined, then evaluated on the
    test split.

    Args:
        train: training split.
        dev: dev split; concatenated with ``train`` for fitting.
        test: test split with the ``CONFIG["text_col"]`` and
            ``CONFIG["label_col"]`` columns.
        model_config: config entry providing ``model``.

    Side effects:
        Opens a wandb run, logs the test predictions as an artifact, logs the
        slim per-label report and writes the weighted-average F1 and support
        to the run summary.
    """
    with wandb.init(
        project=CONFIG["wandb_project"],
        name=model_config['model'],
        group=CONFIG["wandb_group"],
        entity=CONFIG["wandb_entity"],
    ) as run:
        wandb.config.model = model_config['model']
        wandb.config.group = CONFIG["wandb_group"]

        # define a basic pipeline
        # NOTE(review): an unused parameter grid was removed from here; no
        # hyper-parameter search is performed, the default LinearSVC settings
        # are used.
        pipeline = Pipeline(
            [
                ("tfidf", TfidfVectorizer()),
                ("vt", VarianceThreshold()),
                ("linear_svc", BinaryRelevance(LinearSVC())),
            ]
        )

        # fit
        train_dev = pd.concat([train, dev], sort=True)
        train_dev_labels = label_dictionary_to_label_mat(
            train_dev[CONFIG['label_col']])
        # Column order of the fitted label matrix defines the order of the
        # predicted indicator columns.
        label_names = train_dev_labels.columns.tolist()
        pipeline.fit(train_dev[CONFIG['text_col']], train_dev_labels)

        # predict/evaluate: one batch prediction, then one {label: 0/1} dict
        # per row. Keys are the label names, not the test frame's columns.
        pred_mat = pipeline.predict(
            test[CONFIG['text_col']].tolist()).toarray()
        pred_dicts = [
            dict(zip(label_names, (int(v) for v in row))) for row in pred_mat
        ]
        # Downstream helpers expect a DataFrame with a ``pred`` column.
        test_preds = test.assign(pred=pd.Series(pred_dicts, index=test.index))

        class_report = create_classification_report(test, test_preds)
        class_report_slim = create_slim_class_report(class_report)

        # log
        log_dataframe(run, test_preds, "test_preds", "Test predictions")
        run.log(class_report_slim)
        weighted_avg = class_report.query('label == "weighted avg"')
        run.summary["test_f1"] = weighted_avg['f1-score'].iloc[0]
        run.summary["test_support"] = weighted_avg['support'].iloc[0]


In [54]:
# Run every configured model through its matching fit/log routine.
for model in CONFIG['models']:
    # Config entries identify the model under the 'model' key (the same key
    # the fit_and_log_* functions read); 'name' is accepted as a fallback.
    model_name = model.get('model', model.get('name'))

    if model_name == 'dictionary_classifier':
        fit_and_log_dictionary_classifier(train, dev, test, model)

    # The config lists this model as 'sklearn_svm'; the earlier identifier is
    # still accepted.
    elif model_name in ('sklearn_svm', 'sklearn_linear_svc'):
        fit_and_log_linear_svc(train, dev, test, model)

    else:
        print(f"Unsupported model: {model_name} found")

{'model': 'dictionary_classifier',
 'classifier_type': 'multi_label',
 'label_dictionary': {'regulation': ['asic', 'government', 'federal', 'tax'],
  'contribution': ['contribution',
   'concession',
   'personal',
   'after tax',
   '10%',
   '10.5%'],
  'covid': ['covid', 'lockdown', 'downturn', 'effect'],
  'retirement': ['retire', 'house', 'annuity', 'age'],
  'fund': ['unisuper',
   'aus super',
   'australian super',
   'sun super',
   'qsuper',
   'rest',
   'cbus']}}

{'model': 'sklearn_svm'}

{'model': 'flair_tars'}

# Multi-label SVC

In [229]:
# separate models as separate runs
# NOTE(review): dev_pred, test_pred, slim_class_report and class_report are
# not defined in the visible cells; they must exist in the kernel before this
# cell is run.
with wandb.init(
    project=CONFIG["wandb_project"],
    name="flair_tars",
    group=CONFIG["wandb_group"],
    entity="cool_stonebreaker",
) as run:
    # `model_config` only exists as a parameter of the fit_and_log_*
    # functions, so the model name is given literally, matching the run name.
    wandb.config.model = "flair_tars"

    # log dev/pred preds
    log_dataframe(run, dev_pred, "dev_preds", "Dev predictions")
    log_dataframe(run, test_pred, "test_preds", "Test predictions")

    run.log(slim_class_report)
    weighted_avg = class_report.query('label == "weighted avg"')
    run.summary["test_f1"] = weighted_avg['f1-score'].iloc[0]
    run.summary["test_support"] = weighted_avg['support'].iloc[0]


# # separate models as separate runs
# with wandb.init(
#     project=CONFIG["wandb_project"],
#     name="flair_tars",
#     group=CONFIG["wandb_group"],
#     entity="cool_stonebreaker",
# ) as run:
#     wandb.config.model = "flair_tars"

#     # log dev/pred preds
#     log_dataframe(run, dev_pred, "dev_preds", "Dev predictions")
#     log_dataframe(run, test_pred, "test_preds", "Test predictions")

#     run.log(slim_class_report)
#     run.summary["test_f1"] = class_report.query('label == "weighted avg"')['f1-score'].iloc[0]
#     run.summary["test_support"] = class_report.query('label == "weighted avg"')['support'].iloc[0]


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.03335033257802327, max=1.0)…

VBox(children=(Label(value='0.008 MB of 0.008 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
test_f1,0.21818
test_support,11.0


In [233]:
# clear out for dev purposes
import wandb

# Public API client used to enumerate and delete runs.
api = wandb.Api()

# WARNING: destructive - deletes every run listed under the given
# entity/project path.
for run in api.runs(path="cool_stonebreaker/tyre_kick"):
    # Each listed run is fetched again by id before deletion; note this
    # rebinds the loop variable `run`.
    run = api.run(f"cool_stonebreaker/tyre_kick/{run.id}")
    run.delete()


In [235]:
!pip install plotly

Collecting plotly
  Downloading plotly-5.10.0-py2.py3-none-any.whl (15.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.2/15.2 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting tenacity>=6.2.0
  Downloading tenacity-8.0.1-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, plotly
Successfully installed plotly-5.10.0 tenacity-8.0.1


In [1]:
import wandb
import matplotlib.pyplot as plt
fibonacci = [0, 1, 1, 2, 3, 5, 8, 13, 21, 34]

# Build the figure explicitly so the object that gets logged does not depend
# on pyplot's implicit "current figure".
fig, ax = plt.subplots()
ax.plot(fibonacci)
ax.set_ylabel('some interesting numbers')

# Initialize run
# NOTE(review): CONFIG must already be loaded by an earlier cell.
with wandb.init(
        project=CONFIG["wandb_project"],
        name="flair_tars",
        group=CONFIG["wandb_group"],
        entity="cool_stonebreaker",
    ) as run:

    # Log plot object through the run handle opened above
    run.log({"plot": fig})

[<matplotlib.lines.Line2D at 0x7fdd487bf160>]

Text(0, 0.5, 'some interesting numbers')

[34m[1mwandb[0m: Currently logged in as: [33msamhardyhey[0m ([33mcool_stonebreaker[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='0.008 MB of 0.008 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…