In [1]:
# Enable IPython's autoreload extension in mode 2 (re-import modules before
# running each cell) and render matplotlib figures inline in the notebook.
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import sys
# Add the directory two levels above the working directory to the import path
# (presumably so the project-local `mlsql` package resolves — confirm).
sys.path.append("../..")

In [3]:
from json import dumps, loads
from time import time
from typing import Any, NamedTuple

import chardet
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import py_stringmatching as sm
import tensorflow as tf
from sklearn.linear_model import LogisticRegression as skLogisticRegression
from sklearn.metrics import (classification_report, f1_score, precision_score,
                             recall_score)
from sklearn.model_selection import cross_validate, train_test_split
from tensorflow.losses import sigmoid_cross_entropy
from tensorflow.python.ops.parallel_for import jacobian

from mlsql.archiver import Record
from mlsql.experiments.duti import create_gradient_model, create_dataset, create_influence_model, DogCatConfig
from mlsql.tensorflow.problem import LogisticRegression as tfLogisticRegression
from mlsql.tensorflow.utils import EarlyStopTrainer

from tqdm import tnrange

# Single Experiment

In [4]:
# Fixed seed: passed to create_dataset and used as the sklearn random_state.
seed = 1558545695

In [9]:
# Configuration of the single-run experiment.
EXPERIMENT_NAME = "DUTI-GD"
corrupt_type = DogCatConfig.Duti
query = "SELECT COUNT(*) WHERE clf() = 1"

# Description of this run; later handed to Record.input() when archiving.
EXPERIMENT_DESC = dict(
    seed=seed,
    query=query,
    dataset="duti",
    corrupt_type=str(corrupt_type),
)

In [10]:
# Build the dataset for this seed and corruption scheme: train split, test split,
# plus `ycrptd` and `sel_crpt` (presumably the corrupted training labels and a
# per-sample corruption indicator — confirm against create_dataset).
(Xtrain, ytrain), (Xtest, ytest), ycrptd, sel_crpt = create_dataset(seed, corrupt_type)

In [11]:
# The query is evaluated over the test features.
Xquery = Xtest

In [14]:
def query_loss_func(Xquery, theta):
    """Loss comparing the model's soft COUNT over the query rows with ytest.sum().

    Sums sigmoid(Xquery @ theta) over all query rows, takes the norm of the
    difference from ytest.sum(), and scales it by 1 / (number of query rows).
    Reads the notebook-global `ytest`.
    """
    n_rows = Xquery.shape[0].value
    soft_count = tf.reduce_sum(tf.sigmoid(Xquery @ theta))
    return 1 / n_rows * tf.norm(ytest.sum() - soft_count)


# Build the model: the first tuple holds the tensors, the second the train ops.
(
    (theta, delta, logistic_objective, total_loss),
    (train_delta_op, train_theta_op),
) = create_gradient_model(Xtrain, Xquery, ycrptd, query_loss_func)

In [20]:
# Baseline: fit a scikit-learn logistic regression on the corrupted labels and
# score it on the test split.
clf = skLogisticRegression(solver="lbfgs", max_iter=500, random_state=seed).fit(Xtrain, ycrptd)
ytest_pred = clf.predict(Xtest)  # predict once and reuse for every metric
clff1 = f1_score(ytest, ytest_pred)
# Fix: recall and precision were both computed with f1_score, so the archived
# "model_perf" triple contained the F1 score three times.
clfrecall = recall_score(ytest, ytest_pred)
clfprecision = precision_score(ytest, ytest_pred)

# Snapshots of `delta`, one every 10 outer epochs.
vdeltas = []

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # Outer loop: one delta update per epoch, monitored on total_loss.
    # (Stopping semantics of tol / tol_range are defined by EarlyStopTrainer
    # in mlsql.tensorflow.utils — not visible here.)
    for epoch in EarlyStopTrainer(sess, total_loss, max_iter=200000, tol=1e-6, tol_range=100, progress="Train"):
        # Inner loop: update theta, monitored on the logistic objective,
        # before each delta update.
        for _ in EarlyStopTrainer(sess, logistic_objective, max_iter=1000, tol=1e-4):
            sess.run(train_theta_op)
        sess.run(train_delta_op)

        if (epoch + 1) % 10 == 0:
            vdelta, vtheta = sess.run([delta, theta])
            vdeltas.append(vdelta)

# Stack the snapshots into one array; the first axis indexes the snapshot.
vdeltas = np.asarray(vdeltas)

HBox(children=(IntProgress(value=0, description='Train', max=200000, style=ProgressStyle(description_width='in…

In [21]:
# Ground-truth delta: 1 - sel_crpt (presumably sel_crpt is a 0/1 corruption
# mask, making this 1 for clean samples — confirm).
tdelta = (1 - sel_crpt)

In [24]:
# Compare the last recorded delta snapshot, thresholded at 0.5, against tdelta.
print(classification_report(tdelta, vdeltas[-1,:] > 0.5))

              precision    recall  f1-score   support

           0       0.50      0.25      0.33        36
           1       0.67      0.86      0.75        64

   micro avg       0.64      0.64      0.64       100
   macro avg       0.59      0.55      0.54       100
weighted avg       0.61      0.64      0.60       100



In [37]:
# Archive the run: the experiment description as input; the baseline metrics,
# the delta snapshots and the ground-truth delta as output.
r = Record(EXPERIMENT_NAME)
r.input(EXPERIMENT_DESC)
r.output(
    dict(
        model_perf=[clff1, clfprecision, clfrecall],
        deltas=vdeltas.tolist(),
        tdelta=tdelta.tolist(),
    )
)
r.insert()

# Multi Runner

In [40]:
seed = 1558545695
EXPERIMENT_NAME = "DUTI-Influence-Retrain"
query = "SELECT COUNT(*) WHERE clf() = 1"

# Run the influence-and-retrain experiment once per member of DogCatConfig and
# archive one Record per run.
# NOTE(review): every iteration adds a new model (and new assign ops) to the same
# default TF graph; consider tf.reset_default_graph() per iteration — confirm that
# create_influence_model does not rely on the shared graph first.
for corrupt_type in DogCatConfig:
    EXPERIMENT_DESC = {
        "seed": seed,
        "query": query,
        "dataset": "duti",
        "corrupt_type": str(corrupt_type),
    }

    # Progress-bar label for this corruption scheme.
    desc = f"[{corrupt_type}]"

    (Xtrain, ytrain), (Xtest, ytest), ycrptd, sel_crpt = create_dataset(seed, corrupt_type)

    # Expected query result: the sum of the test labels.
    user_expectation = ytest.sum()

    def query_loss_func(Xquery, theta):
        """Loss comparing the model's soft COUNT over the query rows with user_expectation.

        Sums sigmoid(Xquery @ theta) over all query rows, takes the norm of the
        difference from `user_expectation` (read from the enclosing scope), and
        scales it by 1 / (number of query rows).
        """
        nquery = Xquery.shape[0].value
        # Named `soft_count` rather than `sum` to avoid shadowing the builtin.
        soft_count = tf.reduce_sum(tf.sigmoid(Xquery @ theta))
        return 1 / nquery * tf.norm(user_expectation - soft_count)

    # Baseline: scikit-learn logistic regression fitted on the corrupted labels.
    clf = skLogisticRegression(solver="lbfgs", max_iter=500, random_state=seed).fit(Xtrain, ycrptd)
    ytest_pred = clf.predict(Xtest)  # predict once and reuse for every metric
    clff1 = f1_score(ytest, ytest_pred)
    # Fix: recall and precision were both computed with f1_score, so the archived
    # "model_perf" triple contained the F1 score three times.
    clfrecall = recall_score(ytest, ytest_pred)
    clfprecision = precision_score(ytest, ytest_pred)

    Xquery = Xtest

    (theta, delta, logistic_objective), train_logistic_op, influence = create_influence_model(Xtrain, Xquery, ycrptd, query_loss_func)

    # Number of removal steps: twice the sum of sel_crpt.
    K = int(sel_crpt.sum() * 2)
    # Row k holds the value of `delta` after the k-th removal step.
    vdeltas = np.empty((K, len(Xtrain)))
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for k in tnrange(K, desc=desc):
            # Retrain with the current delta before computing influence.
            for _ in EarlyStopTrainer(sess, logistic_objective, max_iter=5000, tol=1e-8):
                sess.run(train_logistic_op)

            vinfluence, vtheta = sess.run([influence, theta])

            # Select the entry with the largest influence value ...
            i = np.argmax(vinfluence)

            # ... and set its delta entry to 0.
            sess.run(delta[i].assign([0]))
            vdelta = sess.run(delta)

            vdeltas[k,:] = vdelta.squeeze()

    # Ground-truth delta: 1 - sel_crpt.
    tdelta = (1 - sel_crpt)

    r = Record(EXPERIMENT_NAME)
    r.input(EXPERIMENT_DESC)
    r.output({"model_perf": [clff1, clfprecision, clfrecall], "deltas": vdeltas.tolist(), "tdelta": tdelta.tolist()})
    r.insert()

HBox(children=(IntProgress(value=0, description='[DogCatConfig.Duti]', max=72, style=ProgressStyle(description…

HBox(children=(IntProgress(value=0, description='[DogCatConfig.FarLine]', max=20, style=ProgressStyle(descript…

HBox(children=(IntProgress(value=0, description='[DogCatConfig.CloseBoundary]', max=20, style=ProgressStyle(de…