In [1]:
# Import the libraries
%matplotlib inline
%reload_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import warnings

import sklearn.linear_model as sk
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, log_loss
from sklearn.naive_bayes import GaussianNB

warnings.filterwarnings("ignore")

**Self-Implemented Training Functions**

In [None]:
def train_one_vs_all(X, y, interested_in, num_classes, lambda_val):
    """
    Fit one binary logistic regression per class of interest (one vs. all)

    :param X: The input matrix (ndarray)
    :param y: Label vector (ndarray)
    :param interested_in: Class labels to train for; the k-th label fills column k (sequence)
    :param num_classes: Number of columns allocated in the outputs (int)
    :param lambda_val: Regularization parameter (float)

    :return weight_vectors: Weight vector matrix, one column per class (ndarray)
    :return intercepts: Intercept vector, one entry per class (ndarray)

    """
    num_features = X.shape[1]
    weight_vectors = np.zeros((num_features, num_classes))
    intercepts = np.zeros(num_classes)

    for col, label in enumerate(interested_in):
        # A label that never occurs in y keeps its all-zero column and intercept
        if label not in y:
            continue
        is_label = (y == label).astype(int)
        weight_vectors[:, col], intercepts[col] = train_logistic_regression(X, is_label, lambda_val)

    return weight_vectors, intercepts

In [None]:
def predict_one_vs_all(X, weight_vectors, intercepts):
    """
    Pick, for every sample, the one vs. all classifier with the highest score

    :param X: The input matrix (ndarray)
    :param weight_vectors: Weight vector matrix, one column per classifier (ndarray)
    :param intercepts: Intercept vector, one entry per classifier (ndarray)

    :return predictions: Column index of the best-scoring classifier per row (ndarray)

    """
    # Linear score of every sample under every classifier
    scores = np.dot(X, weight_vectors) + intercepts
    predictions = np.argmax(scores, axis=1)
    return predictions

In [None]:
def train_logistic_regression(X, y, lambda_val):
    """
    Train a regularized (binary) logistic regression model

    :param X: The input matrix (ndarray)
    :param y: Binary label vector (ndarray)
    :param lambda_val: Regularization parameter; mapped to scikit-learn's
                       inverse strength as C = 2 / lambda_val (float)

    :return weights: Weight vector (ndarray)
    :return intercept: Intercept parameter (float)

    """
    # sklearn.linear_model is imported under the alias `sk` in this notebook;
    # the bare name `linear_model` is not defined.
    model = sk.LogisticRegression(C=2./lambda_val, solver='lbfgs')

    # call model.fit(X, y) while suppressing warnings about convergence
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        model.fit(X, y)

    weight_vector = model.coef_.ravel()
    # intercept_ is a length-1 array for a binary problem; return it as the
    # plain float the contract promises so it can be stored in a scalar slot.
    intercept = float(model.intercept_[0])
    return weight_vector, intercept

In [None]:
def logistic(z):
    """
    The logistic (sigmoid) function, 1 / (1 + exp(-z))

    Evaluated in a numerically stable way so that large |z| saturates to
    0 or 1 instead of overflowing.

    :param z: Value(s) to be converted (ndarray or scalar)

    :return p: logistic(z) entrywise of the same shape (ndarray)

    """
    z = np.asarray(z, dtype=float)
    # exp(-|z|) always lies in (0, 1], so it can never overflow
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z)  (algebraically identical)
    p = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    return p

In [None]:
def nll_cost_function(X, y, theta):
    """
    Compute the negative log likelihood (nll) cost function for a particular data
    set and hypothesis (weight vector)

    :param X: The input matrix (ndarray)
    :param y: Label vector with entries in {0, 1} (ndarray)
    :param theta: Parameter vector to evaluate (ndarray)

    :return cost: The value of the cost function (float)

    """
    y = np.asarray(y)
    z = np.dot(X, theta)
    # With h = logistic(z):
    #   -[y*log(h) + (1-y)*log(1-h)] = log(1 + exp(z)) - y*z
    # logaddexp(0, z) evaluates log(1 + exp(z)) without overflow, and the
    # probabilities are never rounded to integers (which would give log(0)).
    cost = np.sum(np.logaddexp(0, z) - y * z)
    return cost

In [None]:
def gradient_descent(X, y, theta, alpha, iters):
    """
    Fit a logistic regression model with a fixed number of gradient steps.

    :param X: The input matrix (ndarray)
    :param y: Label vector (ndarray)
    :param theta: Initial parameter vector (ndarray)
    :param alpha: Step size (float)
    :param iters: Number of iterations (int)

    :return theta: Learned parameter vector (ndarray)
    :return J_history: Cost after each update, one entry per iteration (ndarray)

    """
    J_history = np.zeros(iters)

    for step in range(iters):
        residual = np.subtract(logistic(np.dot(X, theta)), y)
        # NOTE(review): the update direction is 2 * X^T (h - y), i.e. twice the
        # usual NLL gradient, so the effective step size is 2 * alpha.
        d_J = 2*(np.dot(X.T, residual))
        theta = theta - (alpha*d_J)
        # The cost is recorded after the parameters have been updated
        J_history[step] = nll_cost_function(X, y, theta)

    return theta, J_history

**Vital status**

In [2]:
def vital_status_train(X, y, alpha, iters, num_groups, feature_labels, num_tests = 1, output = True):
    """
    Run the vital-status experiment num_tests times on freshly drawn balanced folds.

    With output == True every run prints the full report of both models;
    otherwise only the test accuracy averaged over all runs is printed.

    :param X: The input matrix (ndarray)
    :param y: Label vector (ndarray)
    :param alpha: Step size for the in-class gradient descent (float)
    :param iters: Number of gradient descent iterations (int)
    :param num_groups: Number of cross-validation folds (int)
    :param feature_labels: Names of the columns of X (list)
    :param num_tests: Number of repetitions (int)
    :param output: Verbose per-run reports when True (bool)
    """
    if output == True:
        # Verbose mode: each repetition prints its own report
        for _ in range(num_tests):
            X_groups, y_groups = get_vital_status_data(X, y, num_groups)

            print("In class model:")
            vital_status_in_class(X_groups, y_groups, alpha, iters, num_groups)
            print()
            print("Scikit model:")
            vital_status_scikit(X_groups, y_groups, num_groups, feature_labels)
        return

    # Quiet mode: accumulate test accuracies and report their mean
    total_class = 0
    total_scikit = 0
    for _ in range(num_tests):
        X_groups, y_groups = get_vital_status_data(X, y, num_groups)
        total_class = total_class + vital_status_in_class(X_groups, y_groups, alpha, iters, num_groups, output)
        total_scikit = total_scikit + vital_status_scikit(X_groups, y_groups, num_groups, feature_labels, output)

    print("In class model:")
    print("Average test accuracy: %.2f" % (total_class/num_tests),"%")
    print("Scikit model:")
    print("Average test accuracy: %.2f" % (total_scikit/num_tests),"%")
        

In [3]:
# Cross-validation groups for vital_status
def get_vital_status_data(X, y, num_groups):
    """
    Split the data into num_groups class-balanced cross-validation folds.

    Samples with y == 0 and y == 1 are shuffled and truncated to the size of
    the smaller class, then dealt evenly into the folds.

    :param X: The input matrix (ndarray)
    :param y: Label vector with entries 0 / 1 (ndarray)
    :param num_groups: Number of folds (int)

    :return X_groups: Feature rows of every fold (list of ndarray)
    :return y_groups: Labels of every fold (list of ndarray)
    """
    # Row indices of each class, in random order
    dead = np.nonzero(y == 0)[0]
    alive = np.nonzero(y == 1)[0]
    np.random.shuffle(dead)
    np.random.shuffle(alive)

    # Keep equally many cases of both classes
    num_cases = min(len(dead), len(alive))
    dead_groups = np.array_split(dead[:num_cases], num_groups)
    alive_groups = np.array_split(alive[:num_cases], num_groups)

    X_groups = []
    y_groups = []
    for dead_part, alive_part in zip(dead_groups, alive_groups):
        assert(len(dead_part) == len(alive_part))
        fold = np.concatenate((dead_part, alive_part))
        # Mix the two classes inside the fold
        np.random.shuffle(fold)
        X_groups.append(X[fold,:])
        y_groups.append(y[fold])
    return X_groups, y_groups

In [4]:
def vital_status_in_class(X_groups, y_groups, alpha, iters, num_groups, output = True):
    """
    k-fold cross-validation of the in-class gradient-descent logistic regression.

    :param X_groups: Feature rows of every fold (list of ndarray)
    :param y_groups: 0 / 1 labels of every fold (list of ndarray)
    :param alpha: Step size (float)
    :param iters: Number of gradient descent iterations (int)
    :param num_groups: Number of folds (int)
    :param output: Print the report and draw the confusion matrix when truthy (bool)

    :return: Test accuracy in percent, averaged over the folds (float)
    """
    # Number of features
    n = X_groups[0].shape[1]

    train_accuracy = 0
    test_accuracy = 0
    train_f1_score = 0
    test_f1_score = 0
    cv_cost = 0
    test_confusion = np.zeros(shape=(2, 2))

    # Perform k-fold cross-validation and record the results
    for i in range(num_groups):
        X_test = X_groups[i]
        y_test = y_groups[i]
        X_train, y_train = get_train_groups(X_groups, y_groups, num_groups, i)

        # Every fold starts from the same initial parameters. Re-using the
        # theta of the previous fold would start this fold from a model that
        # has already been fitted on this fold's held-out data.
        theta, J = gradient_descent(X_train, y_train, np.ones(n), alpha, iters)
        cv_cost = cv_cost + J[-1]

        # Threshold the predicted probabilities at 0.5
        train_preds = np.where(logistic(np.dot(X_train, theta)) < 0.5, 0, 1)
        test_preds = np.where(logistic(np.dot(X_test, theta)) < 0.5, 0, 1)

        accuracy, f1 = get_stats(y_train, train_preds)
        train_accuracy = train_accuracy + accuracy
        train_f1_score = train_f1_score + f1

        accuracy, f1 = get_stats(y_test, test_preds)
        test_accuracy = test_accuracy + accuracy
        test_f1_score = test_f1_score + f1
        # Fixed labels keep the matrix 2x2 even if a fold shows only one class
        test_confusion = test_confusion + confusion_matrix(y_test, test_preds, labels=[0, 1])
    if output:
        print("Average train final cost: %.2f" % (cv_cost/num_groups))
        print("Average train accuracy: %.2f" % (train_accuracy/num_groups),"%")
        print("Average train F1 score: %.2f" % (train_f1_score/num_groups))

        print("Average test accuracy: %.2f" % (test_accuracy/num_groups),"%")
        print("Average test F1 score: %.2f" % (test_f1_score/num_groups))
        create_heatmap((test_confusion/num_groups).astype('int'), [0, 1])
    return test_accuracy/num_groups

In [5]:
def vital_status_scikit(X_groups, y_groups, num_groups, feature_labels,  output = True):
    """
    k-fold cross-validation of scikit-learn's logistic regression, with a
    Gaussian Naive Bayes accuracy as a baseline.

    :param X_groups: Feature rows of every fold (list of ndarray)
    :param y_groups: 0 / 1 labels of every fold (list of ndarray)
    :param num_groups: Number of folds (int)
    :param feature_labels: Names of the columns of X (list)
    :param output: Print the report and draw the confusion matrix when truthy (bool)

    :return: Test accuracy in percent, averaged over the folds (float)
    """
    train_accuracy = 0
    test_accuracy = 0
    train_f1_score = 0
    test_f1_score = 0
    train_cost = 0
    test_cost = 0
    test_weights = np.zeros(X_groups[0].shape[1])
    test_confusion = np.zeros(shape=(2, 2))
    test_bayes = 0

    # Perform k-fold cross-validation and record the results
    for i in range(num_groups):
        X_test = X_groups[i]
        y_test = y_groups[i]
        X_train, y_train = get_train_groups(X_groups, y_groups, num_groups, i)

        lr = sk.LogisticRegression(solver='liblinear')
        lr.fit(X_train, y_train)
        train_preds = lr.predict(X_train)
        test_preds = lr.predict(X_test)

        gnb = GaussianNB()
        gnb_preds = gnb.fit(X_train, y_train).predict(X_test)
        test_bayes = test_bayes + accuracy_score(y_test, gnb_preds)*100

        train_accuracy = train_accuracy + accuracy_score(y_train, train_preds)*100
        train_f1_score = train_f1_score + f1_score(y_train, train_preds)
        # Log-loss is defined on predicted probabilities, not on hard 0/1 labels
        train_cost = train_cost + log_loss(y_train, lr.predict_proba(X_train), labels=lr.classes_)

        test_accuracy = test_accuracy + accuracy_score(y_test, test_preds)*100
        test_f1_score = test_f1_score + f1_score(y_test, test_preds)
        test_cost = test_cost + log_loss(y_test, lr.predict_proba(X_test), labels=lr.classes_)

        # Summed (not averaged) coefficients across folds
        test_weights = np.add(test_weights, lr.coef_[0])
        # Fixed labels keep the matrix 2x2 even if a fold shows only one class
        test_confusion = test_confusion + confusion_matrix(y_test, test_preds, labels=[0, 1])

    # Merge the columns that belong to one compressed feature
    actual_labels, actual_weights = get_actual_weights(feature_labels, list(test_weights))
    abs_weights = np.abs(actual_weights)
    # Positions of the largest magnitudes; a stable sort keeps tied weights
    # distinct instead of reporting the first of them repeatedly
    top_positions = np.argsort(-abs_weights, kind='stable')[:3]

    if output:
        print("Average train final cost: %.2f" % (train_cost/num_groups))
        print("Average train accuracy: %.2f" % (train_accuracy/num_groups),"%")
        print("Average train F1 score: %.2f" % (train_f1_score/num_groups))
        print("Average test final cost: %.2f" % (test_cost/num_groups))
        print("Average test accuracy: %.2f" % (test_accuracy/num_groups),"%")
        print("Average test F1 score: %.2f" % (test_f1_score/num_groups))
        print("Average Gaussian Naive Bayes accuracy: %.2f" % (test_bayes/num_groups),"%")
        create_heatmap((test_confusion/num_groups).astype('int'), [0, 1])
        print()
        print("Highest average weights (absolute values!):")
        for index in top_positions:
            print(actual_labels[index], ': %.2f' % abs_weights[index])
    return test_accuracy/num_groups

**Death days to**

In [6]:
def death_days_to_train(X, y, alpha, iters, num_groups):
    """
    Cross-validate a regression of y on X and print a summary.

    scikit-learn's LinearRegression supplies the predictions; a prediction
    counts as correct when it lies within 180 of the true value. The in-class
    gradient descent is run only to report its final cost.

    :param X: The input matrix (ndarray)
    :param y: Target vector (ndarray)
    :param alpha: Step size for the in-class gradient descent (float)
    :param iters: Number of gradient descent iterations (int)
    :param num_groups: Number of cross-validation folds (int)
    """
    tolerance = 180
    train_accuracy = 0
    test_accuracy = 0
    final_cost = 0

    # Get cross-validation grouped X and y data
    X_groups, y_groups = get_death_days_to_data(X, y, num_groups)

    # Perform k-fold cross-validation and record the results
    for fold in range(num_groups):
        X_test, y_test = X_groups[fold], y_groups[fold]
        X_train, y_train = get_train_groups(X_groups, y_groups, num_groups, fold)

        # Scikit model training
        lr = sk.LinearRegression()
        lr.fit(X_train, y_train)
        train_preds = lr.predict(X_train)
        test_preds = lr.predict(X_test)

        # In-class, only cost
        _, cost = gradient_descent_v2(normalize_v2(X_train), y_train, alpha, iters)
        final_cost = final_cost + cost[-1]

        # Share of train predictions inside the tolerance band
        train_hits = np.count_nonzero((train_preds <= y_train + tolerance) & (train_preds >= y_train - tolerance))
        train_accuracy = train_accuracy + train_hits/len(y_train)*100

        # Share of test predictions inside the tolerance band
        test_hits = np.count_nonzero((test_preds <= y_test + tolerance) & (test_preds >= y_test - tolerance))
        test_accuracy = test_accuracy + test_hits/len(y_test)*100

    print("In-class model:")
    print("Average final cost: %.2f" % (final_cost/num_groups))
    print("Mean in y: %.2f" % np.mean(y))
    print("Variance in y: %.2f" % np.var(y))
    print()
    print("Scikit model:")
    print("Average train accuracy: %.2f" % (train_accuracy/num_groups),"%")
    print("Average test accuracy: %.2f" % (test_accuracy/num_groups),"%")

In [None]:
# Cross-validation groups for death_days_to
def get_death_days_to_data(X, y, num_groups):
    """
    Randomly partition all samples into num_groups cross-validation folds.

    :param X: The input matrix (ndarray)
    :param y: Target vector (ndarray)
    :param num_groups: Number of folds (int)

    :return X_groups: Feature rows of every fold (list of ndarray)
    :return y_groups: Targets of every fold (list of ndarray)
    """
    # Random ordering of every row index
    order = np.arange(len(y))
    np.random.shuffle(order)

    folds = np.array_split(order, num_groups)
    X_groups = [X[fold,:] for fold in folds]
    y_groups = [y[fold] for fold in folds]
    return X_groups, y_groups

**Outcome**

In [None]:
def outcome_train(X, y, interested_in, lambda_val, num_groups, feature_labels, alpha = 0.00000001, iters = 2000, num_tests = 1, output = True):
    """
    Run the outcome experiment num_tests times on freshly drawn balanced folds.

    For two classes the in-class logistic regression is evaluated as well,
    on labels recoded to 0 (first entry of interested_in) and 1 (anything else).
    With output == True every run prints full reports and the lambda plot;
    otherwise only average test accuracies are printed.

    :param X: The input matrix (ndarray)
    :param y: Label vector (ndarray)
    :param interested_in: Class labels to keep (sequence)
    :param lambda_val: Regularization parameter for one vs. all (float)
    :param num_groups: Number of cross-validation folds (int)
    :param feature_labels: Names of the columns of X (list)
    :param alpha: Step size for the in-class logistic regression (float)
    :param iters: Gradient descent iterations (int)
    :param num_tests: Number of repetitions (int)
    :param output: Verbose per-run reports when True (bool)
    """
    if output == True:
        for _ in range(num_tests):
            X_groups, y_groups, num_classes = get_outcome_data(X, y, interested_in, num_groups)

            print("In class model:")
            if num_classes == 2:
                # NOTE(review): the recoding overwrites y_groups in place, so the
                # one vs. all and scikit models below also see the 0/1 labels.
                for g, fold_labels in enumerate(y_groups):
                    y_groups[g] = np.where(fold_labels == interested_in[0], 0, 1)
                print("Logistic regression:")
                vital_status_in_class(X_groups, y_groups, alpha, iters, num_groups)
                print()

            print("One vs all:")
            outcome_in_class(X_groups, y_groups, num_classes, interested_in, lambda_val, num_groups)

            print()
            print("Scikit model:")
            outcome_scikit(X_groups, y_groups, num_classes, interested_in, num_groups, feature_labels)
            plot_lambdas(X_groups, y_groups, num_classes, interested_in, num_groups)
        return

    # Quiet mode: accumulate the test accuracy of every model
    total_regression = 0
    total_classifier = 0
    total_scikit = 0

    for _ in range(num_tests):
        X_groups, y_groups, num_classes = get_outcome_data(X, y, interested_in, num_groups)

        if num_classes == 2:
            # NOTE(review): same in-place recoding as in the verbose branch
            for g, fold_labels in enumerate(y_groups):
                y_groups[g] = np.where(fold_labels == interested_in[0], 0, 1)
            total_regression = total_regression + vital_status_in_class(X_groups, y_groups, alpha, iters, num_groups, output)
        total_classifier = total_classifier + outcome_in_class(X_groups, y_groups, num_classes, interested_in, lambda_val, num_groups, output)
        total_scikit = total_scikit + outcome_scikit(X_groups, y_groups, num_classes, interested_in, num_groups, feature_labels, output)

    # num_classes is whatever the last repetition produced
    if num_classes == 2:
        print("Logistic regression:")
        print("Average test accuracy: %.2f" % (total_regression/num_tests),"%")
    print("One vs all model:")
    print("Average test accuracy: %.2f" % (total_classifier/num_tests),"%")
    print("Scikit model:")
    print("Average test accuracy: %.2f" % (total_scikit/num_tests),"%")

In [45]:
def get_outcome_data(X, y, interested_in, num_groups):
    """
    Build num_groups class-balanced cross-validation folds from the samples
    whose label is one of interested_in.

    Classes are processed in ascending label order. Each class is shuffled
    and truncated to the size of the smallest selected class before being
    dealt into the folds. Any label may be requested; the selection is not
    limited to a fixed label range.

    :param X: The input matrix (ndarray)
    :param y: Label vector (ndarray)
    :param interested_in: Class labels to keep (sequence)
    :param num_groups: Number of folds (int)

    :return X_groups: Feature rows of every fold (list of ndarray)
    :return y_groups: Labels of every fold (list of ndarray)
    :return num_classes: Number of distinct classes selected (int)

    :raises ValueError: if interested_in names no class
    """
    classes = sorted(set(interested_in))
    if not classes:
        raise ValueError("interested_in must name at least one class")

    # Row indices of every selected class
    data = [np.nonzero(y == label)[0] for label in classes]
    num_cases = min(len(indices) for indices in data)
    num_classes = len(data)

    # Truncate to balance
    for i in range(num_classes):
        np.random.shuffle(data[i])
        data[i] = data[i][0:num_cases]

    # Break into balanced groups
    batch_groups = [np.array_split(indices, num_groups) for indices in data]

    X_groups = []
    y_groups = []
    for g in range(num_groups):
        # One slice of every class, mixed together
        batch = np.concatenate([batch_groups[c][g] for c in range(num_classes)])
        np.random.shuffle(batch)
        batch = batch.astype('int')
        X_groups.append(X[batch,:])
        y_groups.append(y[batch])
    return X_groups, y_groups, num_classes

In [46]:
# Bug 1.1: confusion matrix is not exactly what we need it to be. 
# Bug 1.2: test_confusion is not initialized right

def outcome_in_class(X_groups, y_groups, num_classes, interested_in, lambda_val, num_groups, output = True):
    """
    k-fold cross-validation of the one vs. all logistic regression.

    :param X_groups: Feature rows of every fold (list of ndarray)
    :param y_groups: Labels of every fold (list of ndarray)
    :param num_classes: Number of classes (int)
    :param interested_in: Class labels to train for (sequence)
    :param lambda_val: Regularization parameter (float)
    :param num_groups: Number of folds (int)
    :param output: Print the report and draw the confusion matrix when truthy (bool)

    :return: Test accuracy in percent, averaged over the folds (float)
    """
    size = len(interested_in)
    train_accuracy, test_accuracy = 0, 0
    train_f1_score, test_f1_score = 0, 0
    test_confusion = np.zeros(shape=(size, size))

    for fold in range(num_groups):
        X_test, y_test = X_groups[fold], y_groups[fold]
        X_train, y_train = get_train_groups(X_groups, y_groups, num_groups, fold)

        weight_vectors, intercepts = train_one_vs_all(X_train, y_train, interested_in, num_classes, lambda_val)
        # NOTE(review): predictions are column indices into interested_in and are
        # compared with the values in y as they are - confirm both use the same coding.
        train_preds = predict_one_vs_all(X_train, weight_vectors, intercepts)
        test_preds  = predict_one_vs_all(X_test,  weight_vectors, intercepts)

        fold_accuracy, fold_f1 = get_stats(y_train, train_preds)
        train_accuracy = train_accuracy + fold_accuracy
        train_f1_score = train_f1_score + fold_f1

        fold_accuracy, fold_f1 = get_stats(y_test, test_preds)
        test_accuracy = test_accuracy + fold_accuracy
        test_f1_score = test_f1_score + fold_f1
        test_confusion = test_confusion + confusion_matrix(list(y_test), list(test_preds))

    if output:
        print("Average train accuracy: %.2f" % (train_accuracy/num_groups),"%")
        print("Average train F1 score: %.2f" % (train_f1_score/num_groups))

        print("Average test accuracy: %.2f" % (test_accuracy/num_groups),"%")
        print("Average test F1 score: %.2f" % (test_f1_score/num_groups))
        create_heatmap((test_confusion/num_groups).astype('int'), interested_in)
    return test_accuracy/num_groups

In [None]:
def outcome_scikit(X_groups, y_groups, num_classes, interested_in, num_groups, feature_labels, output = True):
    """
    k-fold cross-validation of scikit-learn's logistic regression on the
    outcome classes, with a Gaussian Naive Bayes accuracy as a baseline.

    :param X_groups: Feature rows of every fold (list of ndarray)
    :param y_groups: Labels of every fold (list of ndarray)
    :param num_classes: Number of classes (int; not used by this function)
    :param interested_in: Class labels under study (sequence)
    :param num_groups: Number of folds (int)
    :param feature_labels: Names of the columns of X (list)
    :param output: Print the report and draw the confusion matrix when truthy (bool)

    :return: Test accuracy in percent, averaged over the folds (float)
    """
    train_accuracy = 0
    test_accuracy = 0
    train_f1_score = 0
    test_f1_score = 0
    test_weights = np.zeros(X_groups[0].shape[1])
    test_confusion = np.zeros(shape=(len(interested_in), len(interested_in)))
    test_bayes = 0

    for i in range(num_groups):
        X_test = X_groups[i]
        y_test = y_groups[i]
        X_train, y_train = get_train_groups(X_groups, y_groups, num_groups, i)

        lr = sk.LogisticRegression()
        lr.fit(X_train, y_train.astype('int'))
        train_preds = lr.predict(X_train)
        test_preds = lr.predict(X_test)

        gnb = GaussianNB()
        gnb_preds = gnb.fit(X_train, y_train.astype('int')).predict(X_test)
        test_bayes = test_bayes + accuracy_score(list(y_test), list(gnb_preds))*100

        train_accuracy = train_accuracy + np.mean(train_preds == y_train)*100
        test_accuracy = test_accuracy + np.mean(test_preds == y_test)*100

        # Unweighted mean of the per-class F1 scores. The macro average works
        # on whatever classes occur in the fold, so a fold that lacks a class
        # no longer clashes with a fixed list of class names.
        train_f1_score = train_f1_score + f1_score(list(y_train), list(train_preds), average='macro')
        test_f1_score = test_f1_score + f1_score(list(y_test), list(test_preds), average='macro')

        # NOTE(review): only the first row of coef_ is accumulated; with more
        # than two classes that is the weight vector of the first class only.
        test_weights = np.add(test_weights, lr.coef_[0])
        test_confusion = test_confusion + confusion_matrix(list(y_test), list(test_preds))

    # Merge the columns that belong to one compressed feature
    actual_labels, actual_weights = get_actual_weights(feature_labels, list(test_weights))
    abs_weights = np.abs(actual_weights)
    # Positions of the largest magnitudes; a stable sort keeps tied weights
    # distinct instead of reporting the first of them repeatedly
    top_positions = np.argsort(-abs_weights, kind='stable')[:3]

    if output:
        print("Average train accuracy: %.2f" % (train_accuracy/num_groups),"%")
        print("Average train F1 score: %.2f" % (train_f1_score/num_groups))
        print("Average test accuracy: %.2f" % (test_accuracy/num_groups),"%")
        print("Average test F1 score: %.2f" % (test_f1_score/num_groups))
        print("Average Gaussian Naive Bayes accuracy: %.2f" % (test_bayes/num_groups),"%")
        create_heatmap((test_confusion/num_groups).astype('int'), interested_in)
        print()
        print("Highest average weights (absolute values!):")
        for index in top_positions:
            print(actual_labels[index], ': %.2f' % abs_weights[index])
    return test_accuracy/num_groups

**Common training helpers**

In [12]:
# Returns the assembled train set without the test group
def get_train_groups(X_groups, y_groups, num_groups, test_group):
    """
    Stack every fold except test_group into one training set.

    :param X_groups: Feature rows of every fold (list of ndarray)
    :param y_groups: Labels of every fold (list of ndarray)
    :param num_groups: Number of folds (int)
    :param test_group: Index of the held-out fold (int)

    :return X_train: Stacked feature rows (ndarray; an empty list if no fold is left)
    :return y_train: Stacked labels (ndarray; an empty list if no fold is left)
    """
    kept = [g for g in range(num_groups) if g != test_group]

    X_train = []
    y_train = []
    for g in kept:
        if len(X_train):
            # Append this fold below what has been collected so far
            X_train = np.concatenate((X_train, X_groups[g]))
            y_train = np.concatenate((y_train, y_groups[g]))
            continue
        # Nothing collected yet: start from a copy of this fold
        X_train = np.array(X_groups[g])
        y_train = np.array(y_groups[g])
    return X_train, y_train

In [None]:
def gradient_descent_v2( X, y, alpha, iters, theta=None):
    """
    Fit a linear model by gradient descent on the squared error.

    :param X: The input matrix (ndarray)
    :param y: Target vector (ndarray)
    :param alpha: Step size (float)
    :param iters: Number of iterations (int)
    :param theta: Initial parameter vector; a vector of ones when omitted (ndarray)

    :return theta: Learned parameter vector (ndarray)
    :return J_history: Cost after each update, one entry per iteration (ndarray)
    """
    num_features = X.shape[1]
    if theta is None:
        theta = np.ones(num_features)
    J_history = np.zeros(iters)

    # The scaled transpose does not change between iterations
    scaled_Xt = alpha*X.T
    for step in range(iters):
        residual = np.dot(X, theta) - y
        theta = theta - np.dot(scaled_Xt, residual)
        J_history[step] = cost_function_v2(X, y, theta)
    return theta, J_history

def cost_function_v2(X, y, theta):
    """
    Half the sum of squared residuals, where residuals smaller than 30 in
    absolute value are treated as zero.

    :param X: The input matrix (ndarray)
    :param y: Target vector (ndarray)
    :param theta: Parameter vector (ndarray)

    :return cost: The value of the cost function (float)
    """
    residual = (np.dot(X, theta) - y).T
    # Residuals inside the tolerance band do not contribute to the cost
    inside_band = abs(residual) < 30
    residual = np.where(inside_band, 0, residual)
    return 0.5*np.dot(residual, residual)

def normalize_v2(M):
    """
    Standardize every column of M except the first to zero mean and unit
    standard deviation. The first column is copied through unchanged.

    The result is always floating point, so integer input is not truncated,
    and a constant column is only centred (it becomes all zeros) instead of
    being divided by a zero standard deviation.

    :param M: The input matrix (ndarray)

    :return norm_M: Standardized copy of M (float ndarray)
    """
    norm_M = np.array(M, dtype=float)
    features = norm_M[:, 1:]
    mean = np.mean(features, axis = 0)
    std = np.std(features, axis = 0)
    # A zero spread would turn the whole column into nan/inf
    std = np.where(std == 0, 1.0, std)
    norm_M[:, 1:] = np.divide(np.subtract(features, mean), std)
    return norm_M

In [None]:
def get_stats(trues, predictions):
    """
    Accuracy and F1 score of a set of predictions, with 1 as the positive class.

    :param trues: True labels (sequence)
    :param predictions: Predicted labels (sequence)

    :return accuracy: Share of exact matches in percent (float)
    :return f1: F1 score of class 1; 0 whenever it is undefined (float)
    """
    pairs = list(zip(predictions, trues))

    match = sum(1 for pred, true in pairs if pred == true)
    tp = sum(1 for pred, true in pairs if pred == true and pred == 1)
    # A wrong prediction counts as false positive when it says 1 and as
    # false negative when it says 0; any other wrong value counts as neither
    fp = sum(1 for pred, true in pairs if pred != true and pred == 1)
    fn = sum(1 for pred, true in pairs if pred != true and pred == 0)

    accuracy = match/len(trues)*100
    precision = tp/(tp+fp) if tp+fp != 0 else 0
    recall = tp/(tp+fn) if tp+fn != 0 else 0
    f1 = 2*precision*recall/(precision+recall) if precision+recall != 0 else 0

    return accuracy, f1

In [None]:
def print_num_samples(y):
    """
    Print how many entries of y carry each of the labels 0 to 4, followed by a blank line.

    :param y: Label vector (ndarray)
    """
    for label in range(5):
        count = np.nonzero(y == label)[0].size
        print("Samples of class", label, ":", count)
    print()

In [None]:
def get_comp_labels():
    """
    Name prefixes of the features whose columns are merged into one weight.

    :return: The prefixes, in a fresh list (list of str)
    """
    comp_labels = [
        'histological_grade',
        'clinical_stage',
        'tumor_grade',
        'histological_type',
        'tumor_stage',
    ]
    return comp_labels

In [None]:
def get_actual_weights(feature_labels, weights, compressed_labels=None):
    """
    Collapse the weights of all columns that belong to one compressed feature
    into their mean.

    A column belongs to a compressed feature when its label starts with one of
    the compressed prefixes (the first matching prefix wins). The merged entry
    takes the position of the group's first column. Columns of a group do not
    have to be adjacent, and the weights passed in are left untouched.

    :param feature_labels: Names of the columns, aligned with weights (list of str)
    :param weights: One weight per column (sequence of float)
    :param compressed_labels: Prefixes to merge; get_comp_labels() when omitted (list of str)

    :return actual_labels: One label per remaining entry (list of str)
    :return actual_weights: Weights aligned with actual_labels (list)
    """
    if compressed_labels is None:
        compressed_labels = get_comp_labels()

    actual_labels = []
    # One (is_group, members) entry per output position
    slots = []
    # Compressed prefix -> member list shared with its slot
    members_of = {}

    for label, weight in zip(feature_labels, weights):
        prefix = next((c for c in compressed_labels if label.startswith(c)), None)
        if prefix is None:
            # Ordinary feature: passed through unchanged
            actual_labels.append(label)
            slots.append((False, [weight]))
        elif prefix in members_of:
            members_of[prefix].append(weight)
        else:
            # First column of a compressed feature claims the output position
            members_of[prefix] = [weight]
            actual_labels.append(prefix)
            slots.append((True, members_of[prefix]))

    actual_weights = [np.mean(members) if is_group else members[0]
                      for is_group, members in slots]
    return actual_labels, actual_weights

**Visual representation**

In [None]:
def create_heatmap(cnf_matrix, interested_in):
    """
    Draw a confusion matrix as an annotated heatmap.

    :param cnf_matrix: Square confusion matrix, rows = actual, columns = predicted (ndarray)
    :param interested_in: Class labels shown on both axes, applied in the
                          given order; must have one entry per row (sequence)
    """
    fig, ax = plt.subplots()

    # Labelling the frame makes seaborn print the class labels on the ticks
    class_labels = list(interested_in)
    frame = pd.DataFrame(cnf_matrix, index=class_labels, columns=class_labels)

    # create heatmap
    sns.heatmap(frame, annot=True, cmap="YlGnBu", fmt='g', ax=ax)
    ax.xaxis.set_label_position("top")
    fig.tight_layout()
    ax.set_title('Confusion matrix', y=1.1)
    ax.set_ylabel('Actual label')
    ax.set_xlabel('Predicted label')
    plt.show()

In [None]:
def plot_lambdas(X_groups, y_groups, num_classes, interested_in, num_groups):
    """
    Plot cross-validated train and test accuracy of the one vs. all model
    for lambda = 1e-5, 1e-4, ..., 1e5.

    :param X_groups: Feature rows of every fold (list of ndarray)
    :param y_groups: Labels of every fold (list of ndarray)
    :param num_classes: Number of classes (int)
    :param interested_in: Class labels to train for (sequence)
    :param num_groups: Number of folds (int)
    """
    lambda_vals = 10.0 ** np.linspace(-5, 5, 11)
    train_acc = np.zeros(lambda_vals.size)
    test_acc = np.zeros(lambda_vals.size)

    for j, lambda_val in enumerate(lambda_vals):
        train_total = 0
        test_total = 0

        for fold in range(num_groups):
            X_test, y_test = X_groups[fold], y_groups[fold]
            X_train, y_train = get_train_groups(X_groups, y_groups, num_groups, fold)

            weight_vectors, intercepts = train_one_vs_all(X_train, y_train, interested_in, num_classes, lambda_val)
            # NOTE(review): predictions are column indices into interested_in and are
            # compared with the values in y as they are - confirm both use the same coding.
            train_preds = predict_one_vs_all(X_train, weight_vectors, intercepts)
            test_preds  = predict_one_vs_all(X_test, weight_vectors, intercepts)

            train_total = train_total + np.mean(train_preds == y_train)*100
            test_total = test_total + np.mean(test_preds == y_test)*100

        # Average over the folds for this lambda
        train_acc[j] = train_total/num_groups
        test_acc[j] = test_total/num_groups

    plt.xlabel('lambda (log10)')
    plt.ylabel('accuracy')
    plt.xscale('log')
    # The legend entries are positional: train is plotted first, then test
    plt.plot(lambda_vals, train_acc, 'bo', linestyle='dashed')
    plt.plot(lambda_vals, test_acc, 'go', linestyle='dashed')
    plt.legend(('train', 'test'))
    plt.show()