In [5]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
sys.path.append("../")
#sys.path.append("../../extra-package")

import gurobipy
from json import dumps, loads
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression as skLogisticRegression
from sklearn.metrics import (classification_report, f1_score, precision_score, recall_score)
from tqdm import tnrange, trange
import tensorflow as tf

from mlsql import InfluenceRanker, SelfLossInfluenceRanker, AutoFixer, ModelManagerLM, LossRanker, TiresiasRanker, multi_ambiguity_count
# from mlsql.models import SimpleCNN, LogReg
from mlsql.models.nn import SimpleCNN1D, SimpleCNN1D_Linear, MLP, MLP_Linear

from mlsql.utils import setdiff1d
from processors.adultNoCorr import AdultNoCorrProcessor

from itertools import groupby
from functools import partial

import logging
logging.getLogger("tensorflow").setLevel(logging.CRITICAL)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# NOTE: the first cell runs `from time import time`, which binds the name `time`
# to the function. This import rebinds `time` to the module, which the experiment
# loops below rely on (they call `time.time()`). Re-running the first cell after
# this one rebinds `time` to the function again and breaks those calls.
import time
import altair as alt
# Lift Altair's default row limit so the full result frames can be charted.
alt.data_transformers.disable_max_rows()

@tf.function
def rank_fix(ranker, fixer, n):
    """Rank with ``ranker`` and immediately pass the ranking to ``fixer``.

    Args:
        ranker: object exposing ``predict()``; its result is used as the ranking.
        fixer: object exposing ``fix(rank, n)``.
        n: forwarded unchanged as the second argument of ``fixer.fix``.

    Returns:
        The value returned by ``ranker.predict()``.
    """
    ranking = ranker.predict()
    fixer.fix(ranking, n)
    return ranking


@tf.function
def rankit(ranker):
    """Ranking step only: return the result of ``ranker.predict()``."""
    return ranker.predict()


@tf.function
def fixit(fixer, rank, n):
    """Fix step only: forward ``rank`` and ``n`` to ``fixer.fix``.

    Nothing is returned; any effect comes from ``fixer.fix`` itself.
    """
    fixer.fix(rank, n)


@tf.function
def train(manager):
    """Call ``manager.fit()`` with its default arguments; returns nothing."""
    manager.fit()

In [7]:
# Build the data processor from a fixed seed and show the training label/feature shapes.
seed = 2987429
proc = AdultNoCorrProcessor(seed)
for split_tensor in (proc.ytrain, proc.Xtrain):
    print(split_tensor.shape)

# Baseline model: an MLP fitted on the training split.
# Alternative architecture kept for reference:
# model = SimpleCNN1D(proc, 1, input_shape=[proc.Xtrain.shape[1], 1])
model = MLP(proc, 1)
manager0 = ModelManagerLM(proc.X_Atrain, proc.X_Btrain, proc.ytrain, model)
manager0.fit(print_value=True)
# NOTE(review): the recorded output of this cell shows a NaN tensor and reports in
# which only the -1.0 class is predicted; presumably the NaN is the value printed by
# fit(print_value=True). Confirm that training actually converged.

# print("SimpleCNN1D")
print("MLP")

# Per-class precision/recall/F1 on the training split, then on the test split.
train_report = classification_report(
    proc.ytrain.numpy(),
    manager0.predict(proc.X_Atrain, proc.X_Btrain).numpy(),
)
print("On Training\n", train_report)

test_report = classification_report(
    proc.ytest.numpy(),
    manager0.predict(proc.X_Atest, proc.X_Btest).numpy(),
)
print("On Testing\n", test_report)

(26048,)
(26048, 17)
tf.Tensor(nan, shape=(), dtype=float32)
17
MLP
On Training
               precision    recall  f1-score   support

        -1.0       0.76      1.00      0.86     19788
         1.0       0.00      0.00      0.00      6260

    accuracy                           0.76     26048
   macro avg       0.38      0.50      0.43     26048
weighted avg       0.58      0.76      0.66     26048

On Testing
               precision    recall  f1-score   support

        -1.0       0.76      1.00      0.86      2468
         1.0       0.00      0.00      0.00       788

    accuracy                           0.76      3256
   macro avg       0.38      0.50      0.43      3256
weighted avg       0.57      0.76      0.65      3256



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# K is handed to AutoFixer and is also the total number of fixes requested by the
# experiment loops (their per-step counts add up to K).
K = 2600
# All-True boolean mask with one entry per training label; passed to AutoFixer.
n_train = proc.ytrain.shape[0]
corrsel = tf.cast(tf.ones(n_train), dtype=tf.bool)

In [None]:
from tqdm.notebook import tnrange, trange

# --- Rain experiment (MLP) ---------------------------------------------------
# Build a fresh MLP manager that starts from the weights and delta of the fitted
# `manager0`, then repeatedly rank + fix `step_size` points and refit, recording
# the complaint value and the weighted test F1 after every step.
# Alternative architecture kept for reference:
# model_plus = SimpleCNN1D(proc, 1, input_shape=[proc.Xtrain.shape[1], 1])
model_plus = MLP(proc, 1)
manager = ModelManagerLM(proc.X_Atrain, proc.X_Btrain, proc.ytrain, model_plus)
manager.model.set_weights(manager0.model.get_weights())
manager.delta = tf.Variable(manager0.delta.value(), name="delta")
ranker = InfluenceRanker(manager=manager, on=proc.complain)
fixer = AutoFixer(manager, corrsel, K)

AQs = []
weighted_f1 = []
rank_list = []
# Cumulative number of fixes applied at each measurement. The first measurement
# is taken before any fix, so it belongs at 0.
fix_counts = [0]
rank_time_rain = 0
model_time_rain = 0

# Baseline measurement (no fixes applied yet). AQ is the second element of the
# tuple returned by proc.complain.
_, AQ, _, _ = proc.complain(manager)
f1 = f1_score(proc.ytest.numpy(), manager.predict(proc.X_Atest, proc.X_Btest).numpy(), average='weighted')
AQs.append(float(AQ))
weighted_f1.append(f1)

step_size = 10
rain_k = int(np.ceil(K / step_size))
for k in trange(0, rain_k):
    # The last step may be shorter when K is not a multiple of step_size.
    nfix = min(step_size, K - step_size * k)
    assert nfix > 0

    # perf_counter is monotonic, so the measured intervals cannot be distorted
    # by system clock adjustments the way time.time() differences can.
    start = time.perf_counter()
    rank = rank_fix(ranker, fixer, nfix)
    middle = time.perf_counter()
    manager.fit()
    end = time.perf_counter()

    rank_list.append(rank.numpy())
    rank_time_rain += middle - start
    model_time_rain += end - middle
    fix_counts.append(fix_counts[-1] + nfix)

    _, AQ, _, _ = proc.complain(manager)
    f1 = f1_score(proc.ytest.numpy(), manager.predict(proc.X_Atest, proc.X_Btest).numpy(), average='weighted')
    AQs.append(float(AQ))
    weighted_f1.append(f1)

print("Rank_time:", rank_time_rain)
print("Model_time:", model_time_rain)

# One row per measurement; "K" is the number of fixes actually applied so far,
# so the curve starts at 0 and ends exactly at K.
df_rain = pd.DataFrame({
    "Complain": np.array(AQs),
    "F1": np.array(weighted_f1),
    "K": fix_counts,
    "Method": np.repeat("Rain", len(AQs)),
})
alt.Chart(df_rain).mark_line().encode(
    x = "K",
    y = "Complain",
    color = "Method"
)

In [None]:
# Snapshot the entries of the Rain run's fixer.deletions as a set; `fixer` is
# rebound by the linear-combination experiment further down.
rain_del = {deleted for deleted in fixer.deletions.numpy()}

In [None]:
# LinearComb baseline: an MLP_Linear model fitted on the training split.
# Note that this rebinds `model`, which previously held the plain MLP.
# Alternative architecture kept for reference:
# model = SimpleCNN1D_Linear(proc, 1, input_shape_a=[proc.X_Atrain.shape[1], 1], input_shape_b=[proc.X_Btrain.shape[1], 1])
model = MLP_Linear(proc, 1)
manager1 = ModelManagerLM(proc.X_Atrain, proc.X_Btrain, proc.ytrain, model)
manager1.fit(print_value=True, max_iter=2000, tol=1e-6)

# print("SimpleCNN1D_Linear")
print("MLP_Linear")

# Per-class precision/recall/F1 on the training split, then on the test split.
train_report_linear = classification_report(
    proc.ytrain.numpy(),
    manager1.predict(proc.X_Atrain, proc.X_Btrain).numpy(),
)
print("On Training\n", train_report_linear)

test_report_linear = classification_report(
    proc.ytest.numpy(),
    manager1.predict(proc.X_Atest, proc.X_Btest).numpy(),
)
print("On Testing\n", test_report_linear)

In [None]:
# Re-assigns K to the same value (2600) that an earlier cell already set; the
# next cell passes it to AutoFixer and uses it to size the fix loop.
K = 2600

In [None]:
# --- Linear-combination experiment (MLP_Linear) -------------------------------
# Build a fresh MLP_Linear manager that starts from the weights and delta of the
# fitted `manager1`, then repeatedly rank + fix `step_size` points and refit,
# recording the complaint value and the weighted test F1 after every step.
# Alternative architecture kept for reference:
# model_plus = SimpleCNN1D_Linear(proc, 1, input_shape_a=[proc.X_Atrain.shape[1], 1], input_shape_b=[proc.X_Btrain.shape[1], 1])
model_plus = MLP_Linear(proc, 1)
manager = ModelManagerLM(proc.X_Atrain, proc.X_Btrain, proc.ytrain, model_plus)
manager.model.set_weights(manager1.model.get_weights())
manager.delta = tf.Variable(manager1.delta.value(), name="delta")
ranker = InfluenceRanker(manager=manager, on=proc.complain)
fixer = AutoFixer(manager, corrsel, K)

AQs = []
weighted_f1 = []
rank_list = []
# Cumulative number of fixes applied at each measurement. The first measurement
# is taken before any fix, so it belongs at 0.
fix_counts = [0]
rank_time_lcomb = 0
model_time_lcomb = 0

# Baseline measurement (no fixes applied yet). AQ is the second element of the
# tuple returned by proc.complain.
_, AQ, _, _ = proc.complain(manager)
f1 = f1_score(proc.ytest.numpy(), manager.predict(proc.X_Atest, proc.X_Btest).numpy(), average='weighted')
AQs.append(float(AQ))
weighted_f1.append(f1)

step_size = 10
rain_k = int(np.ceil(K / step_size))
for k in trange(0, rain_k):
    # The last step may be shorter when K is not a multiple of step_size.
    nfix = min(step_size, K - step_size * k)
    assert nfix > 0

    # perf_counter is monotonic, so the measured intervals cannot be distorted
    # by system clock adjustments the way time.time() differences can.
    start = time.perf_counter()
    rank = rank_fix(ranker, fixer, nfix)
    middle = time.perf_counter()
    manager.fit(print_value=True, tol=1e-6)
    end = time.perf_counter()

    rank_list.append(rank.numpy())
    rank_time_lcomb += middle - start
    model_time_lcomb += end - middle
    fix_counts.append(fix_counts[-1] + nfix)

    _, AQ, _, _ = proc.complain(manager)
    f1 = f1_score(proc.ytest.numpy(), manager.predict(proc.X_Atest, proc.X_Btest).numpy(), average='weighted')
    AQs.append(float(AQ))
    weighted_f1.append(f1)

print("Rank_time:", rank_time_lcomb)
print("Model_time:", model_time_lcomb)

# One row per measurement; "K" is the number of fixes actually applied so far,
# so the curve starts at 0 and ends exactly at K.
df_lcomb = pd.DataFrame({
    "Complain": np.array(AQs),
    "F1": np.array(weighted_f1),
    "K": fix_counts,
    "Method": np.repeat("Lcomb", len(AQs)),
})
alt.Chart(df_lcomb).mark_line().encode(
    x = "K",
    y = "Complain",
    color = "Method"
)

In [None]:
# Overlay the complaint curves of both methods, one line per "Method" value.
complain_curves = pd.concat([df_rain, df_lcomb])
alt.Chart(
    complain_curves,
    title="Complain of query data vs. K, AC=0, MLP",
).mark_line().encode(
    x="K",
    y="Complain",
    color="Method",
)

In [None]:
# Overlay the weighted test-F1 curves of both methods, one line per "Method" value.
# The y-axis is fitted to the data (without forcing zero into view) instead of a
# hard-coded (0.75, 0.85) window: the MLP baseline report recorded earlier in this
# notebook shows a weighted test F1 of 0.65, which that fixed window would push
# outside the plotting area.
alt.Chart(pd.concat([df_rain, df_lcomb]), title="F1 score of test data vs. K, MLP",).mark_line().encode(
    alt.Y('F1',
        scale=alt.Scale(zero=False)
    ),
    x = "K",
    color = "Method"
)

In [None]:
# Snapshot the entries of the linear-combination run's fixer.deletions as a set.
lcomb_del = {deleted for deleted in fixer.deletions.numpy()}

In [None]:
# Number of deletions shared by the Rain and Lcomb runs, divided by the fix budget K.
# Dividing by K instead of a literal 2600 keeps the ratio right if the budget changes.
len(rain_del & lcomb_del) / K