In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import sys
sys.path.append("..")

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score

from processor import NLProcessor

from tqdm import tnrange

In [None]:
# Corruption rate for the experiment: passed to processor.corrupt_random and
# multiplied by the training-set size to budget correction rounds and to
# normalise the guess counts in the plots.
corrupt_rate = 0.1

In [None]:
# Single NLProcessor instance shared by every later cell; the experiment reads
# X_test / Y_test / Y_train from it and calls corrupt_random and
# complaint_influence on it.
processor = NLProcessor()
# NOTE(review): presumably loads the spam dataset and populates the train/test
# attributes used below — confirm against processor.py.
processor.load_spam()

In [None]:
from sklearn.linear_model import LogisticRegression

# Classifier that is refitted after every corrected label. With
# warm_start=True each fit() call is initialised from the previous solution
# instead of starting from scratch.
model = LogisticRegression(
    C=1,
    solver="lbfgs",
    max_iter=800,
    fit_intercept=False,
    warm_start=True,
)

def evaluate(model):
    """Score ``model`` on the processor's test data.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.

    Returns
    -------
    The result of ``accuracy_score`` comparing ``processor.Y_test`` with the
    model's predictions for ``processor.X_test``.
    """
    return accuracy_score(processor.Y_test, model.predict(processor.X_test))
    
    
# One trajectory per seed. Index 0 holds the value for the initially fitted
# model and index r the value after r correction rounds.
accuracy_scores = []
guess_scores = []
for seed in range(15):

    # Training features and the (partly corrupted) labels for this seed.
    X_train, ycrptd = processor.corrupt_random(seed, corrupt_rate)
    model.fit(X_train, ycrptd)

    # Test accuracy before any label has been corrected.
    initial_score = evaluate(model)

    # Budget of correction rounds: twice the nominal number of corrupted samples.
    correction_rounds = int(2 * X_train.shape[0] * corrupt_rate)

    # Pre-fill the trajectories; the same array objects are stored in the
    # result lists, so the in-place updates below are visible through them.
    run_accuracy = np.full(correction_rounds + 1, initial_score)
    run_guesses = np.zeros(correction_rounds + 1)
    accuracy_scores.append(run_accuracy)
    guess_scores.append(run_guesses)

    # Mask of already inspected samples so that none is checked twice.
    # The builtin ``bool`` is used because the ``np.bool`` alias was removed
    # from NumPy.
    checked = np.zeros(ycrptd.shape[0], dtype=bool)

    for i in tnrange(correction_rounds):
        try:
            # Get influences
            influences = processor.complaint_influence(model, ycrptd, C = 1)
        except Exception:
            # The complaint no longer stands: carry the last values forward
            # and stop this seed. Only ``Exception`` is caught so that
            # KeyboardInterrupt / SystemExit still abort the run.
            print(f'Stopped at iteration {i}')
            run_guesses[i+1:] = run_guesses[i]
            run_accuracy[i+1:] = run_accuracy[i]
            break

        # Push already checked instances above every other value so that
        # argmin can never select them again.
        influences[checked] = np.max(influences) + 1

        # Our candidate will be the instance with the most negative influence
        to_check = np.argmin(influences)

        # List the example as checked
        checked[to_check] = True
        if ycrptd[to_check] == processor.Y_train[to_check]:
            # The guess was incorrect. No need to retrain the model
            score = run_accuracy[i]
            correct_guess = 0
        else:
            # It was a good guess: restore the true label and refit
            ycrptd[to_check] = processor.Y_train[to_check]
            model.fit(X_train, ycrptd)
            score = evaluate(model)
            correct_guess = 1

        run_accuracy[i+1] = score
        # Cumulative number of correct guesses after this round.
        run_guesses[i+1] = run_guesses[i] + correct_guess

In [None]:
# Stack the per-seed trajectories into arrays with one row per seed and one
# column per correction round.
accuracy_array = np.array(accuracy_scores)
guess_array = np.array(guess_scores)

# Subsample every 15th round for plotting.
accuracy_sampled = accuracy_array[:, ::15]
accuracy_means = accuracy_sampled.mean(axis=0)
accuracy_mins = accuracy_sampled.min(axis=0)
accuracy_maxs = accuracy_sampled.max(axis=0)
# Asymmetric error bars: distance from the mean down to the minimum (row 0)
# and up to the maximum (row 1) across seeds.
stacked = np.stack([accuracy_means - accuracy_mins, accuracy_maxs - accuracy_means])

plt.figure(figsize=(8, 8))
plt.errorbar(range(0, len(accuracy_array[0]), 15), accuracy_means, yerr=stacked)
plt.show()

In [None]:
# Cumulative correct guesses are divided by the nominal number of corrupted
# samples (training-set size times corrupt_rate), again keeping every 15th round.
n_corrupted = X_train.shape[0] * corrupt_rate
guess_fraction = guess_array[:, ::15] / n_corrupted
guess_means = guess_fraction.mean(axis=0)
guess_mins = guess_fraction.min(axis=0)
guess_maxs = guess_fraction.max(axis=0)
# Asymmetric error bars: mean-to-min in row 0, max-to-mean in row 1.
stacked = np.stack([guess_means - guess_mins, guess_maxs - guess_means])

plt.figure(figsize=(8, 8))
plt.errorbar(range(0, len(guess_array[0]), 15), guess_means, yerr=stacked, label='Influence Correction')
# Reference line rising linearly from 0 to 1 over the first int(n_corrupted) rounds.
oracle_rounds = int(n_corrupted)
plt.plot(range(len(guess_array[0]))[:oracle_rounds:15],
         np.linspace(0, 1, oracle_rounds)[::15], label='Oracle')
plt.legend()
plt.show()

In [None]:
# Per-round mean over seeds of the cumulative correct guesses, divided by
# X_train.shape[0] * corrupt_rate (no subsampling here).
guess_means_inf_complaint_retrain = (guess_array / (X_train.shape[0] * corrupt_rate)).mean(axis=0)

In [None]:
%store guess_means_inf_complaint_retrain