In [3]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

import sklearn.linear_model as sk
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, log_loss
from classifier import train_one_vs_all, predict_one_vs_all
from logistic_regression import logistic, nll_cost_function, gradient_descent
from sklearn.naive_bayes import GaussianNB

# Treatment outcome classes, encoded as the integer labels compared against y:
# 0: Complete Remission/Response
# 1: Stable Disease
# 2: Progressive Disease
# 3: Partial Remission/Response
# 4: Persistent Disease
# Number of outcome classes listed above; helpers iterate over range(OUTCOMES_NUM).
OUTCOMES_NUM = 5

**Vital status**

In [None]:
def vital_status_train(X, y, alpha, iters, cv_groups, feature_labels, train_x_times = 1, full_output = True):
    """
    Runs the vital status experiment: for every requested batch it draws fresh
    cross-validation groups and trains the in-class and the scikit models on them.

    :param X: The input matrix (ndarray)
    :param y: The output vector (ndarray)
    :param alpha: The gradient descent coefficient (float)
    :param iters: The number of gradient descent iterations (int)
    :param cv_groups: The total number of cross-validation groups (int)
    :param feature_labels: The list of feature labels (list)
    :param train_x_times: The number of different batches to train the models on (int)
    :param full_output: True to print per-batch statistics and graphics,
                        False to print only accuracies averaged over the batches (bool)

    """
    # Verbose mode: each batch reports its own statistics, nothing is aggregated
    if full_output:
        for _ in range(train_x_times):
            X_groups, y_groups, X_groups_full, y_groups_full = get_vital_status_data(X, y, cv_groups)
            print("Logistic Regression (in class):")
            vital_status_in_class(X_groups, y_groups, alpha, iters, cv_groups, full_output)
            print("Logistic Regression (scikit):") 
            vital_status_scikit(X_groups, y_groups, X_groups_full, y_groups_full,
                                cv_groups, feature_labels, full_output)
        return

    # Summary mode: collect one accuracy per batch and model, then average them
    in_class_scores = []
    scikit_train_scores = []
    scikit_test_scores = []
    bayes_scores = []
    for _ in range(train_x_times):
        X_groups, y_groups, X_groups_full, y_groups_full = get_vital_status_data(X, y, cv_groups)
        in_class_scores.append(
            vital_status_in_class(X_groups, y_groups, alpha, iters, cv_groups, full_output))
        batch_train, batch_test, batch_bayes = vital_status_scikit(
            X_groups, y_groups, X_groups_full, y_groups_full, cv_groups, feature_labels, full_output)
        scikit_train_scores.append(batch_train)
        scikit_test_scores.append(batch_test)
        bayes_scores.append(batch_bayes)

    print("Logistic Regression (in class):")
    print("Average test accuracy: %.2f" % (sum(in_class_scores)/train_x_times),"%")
    print("Logistic Regression (scikit):")      
    print("Average train accuracy: %.2f" % (sum(scikit_train_scores)/train_x_times),"%")
    print("Average test accuracy: %.2f" % (sum(scikit_test_scores)/train_x_times),"%")
    print("Bayes (scikit):")
    print("Average test accuracy: %.2f" % (sum(bayes_scores)/train_x_times),"%") 

In [None]:
def get_vital_status_data(X, y, cv_groups):  
    """
    Builds shuffled cross-validation groups for the vital status training.

    Two families of groups are produced: "full" groups that use every sample
    with y == 0 or y == 1, and "reduced" groups in which both classes are
    truncated to the size of the smaller one.

    :param X: The input matrix (ndarray)
    :param y: The output vector (ndarray)
    :param cv_groups: The total number of cross-validation groups (int)

    :return X_groups: Reduced cross-validation groups for X (list)
    :return y_groups: Reduced cross-validation groups for y (list)
    :return X_groups_full: Cross-validation groups for X that include all cases (list)
    :return y_groups_full: Cross-validation groups for y that include all cases (list)

    """
    # Row indices of each class, shuffled in place (y == 0 first, then y == 1)
    rows_zero_full = np.where(y == 0)[0]
    rows_one_full = np.where(y == 1)[0]
    for class_rows in (rows_zero_full, rows_one_full):
        np.random.shuffle(class_rows)

    # The reduced variant keeps the same number of rows from both classes
    balanced_size = min(len(rows_zero_full), len(rows_one_full))
    rows_zero = rows_zero_full[:balanced_size]
    rows_one = rows_one_full[:balanced_size]

    # Cut every class into cv_groups nearly equal parts
    zero_parts_full = np.array_split(rows_zero_full, cv_groups)
    one_parts_full = np.array_split(rows_one_full, cv_groups)
    zero_parts = np.array_split(rows_zero, cv_groups)
    one_parts = np.array_split(rows_one, cv_groups)

    X_groups, y_groups = [], []
    X_groups_full, y_groups_full = [], []
    for fold in range(cv_groups):
        # Mix both classes inside the fold; the full fold is shuffled before the reduced one
        fold_rows_full = np.concatenate((zero_parts_full[fold], one_parts_full[fold]))
        np.random.shuffle(fold_rows_full)
        fold_rows = np.concatenate((zero_parts[fold], one_parts[fold]))
        np.random.shuffle(fold_rows)

        X_groups_full.append(X[fold_rows_full, :])
        y_groups_full.append(y[fold_rows_full])
        X_groups.append(X[fold_rows, :])
        y_groups.append(y[fold_rows])

    return X_groups, y_groups, X_groups_full, y_groups_full

In [1]:
def vital_status_in_class(X_groups, y_groups, alpha, iters, cv_groups, full_output = True):
    """
    Trains the in-class logistic regression with k-fold cross-validation
    and reports statistics averaged over the folds.

    Every fold starts gradient descent from a fresh all-ones theta, so the
    weights fitted in a previous fold (whose training set contained the
    current test group) never leak into the current fold.

    :param X_groups: The cross-validation groups for X (list)
    :param y_groups: The cross-validation groups for y, with labels 0 and 1 (list) 
    :param alpha: The gradient descent coefficient (float)
    :param iters: The number of gradient descent iterations (int)
    :param cv_groups: The total number of cross-validation groups (int)
    :param full_output: True for the full output (bool)

    :return: Average LR test accuracy (float)
        
    """
    # Fixed label order keeps every per-fold confusion matrix 2x2, even when
    # a fold happens to contain (or predict) only one of the two classes
    binary_labels = [0, 1]
    num_features = X_groups[0].shape[1]

    train_accuracy = test_accuracy = train_f1_score = test_f1_score = 0
    gradient_cost = 0
    test_confusion = np.zeros(shape=(2, 2))

    for i in range(cv_groups):
        # Get train and test X and y batches
        X_train, y_train = get_train_groups(X_groups, y_groups, cv_groups, i)
        X_test = X_groups[i]
        y_test = y_groups[i]
        
        # Train the in-class model from a fresh starting point for this fold
        theta = np.ones(num_features)
        theta, J = gradient_descent(X_train, y_train, theta, alpha, iters)
        gradient_cost = gradient_cost + J[-1]
               
        # Threshold the predicted probabilities at 0.5 (values below 0.5 -> 0, otherwise 1)
        train_preds = np.where(logistic(np.dot(X_train, theta)) < 0.5, 0, 1)
        test_preds = np.where(logistic(np.dot(X_test, theta)) < 0.5, 0, 1)
        
        # Get the statistics
        accuracy, f1 = get_stats(y_train, train_preds)
        train_accuracy = train_accuracy + accuracy
        train_f1_score = train_f1_score + f1
        accuracy, f1 = get_stats(y_test, test_preds)
        test_accuracy = test_accuracy + accuracy
        test_f1_score = test_f1_score + f1
        test_confusion = test_confusion + confusion_matrix(y_test, test_preds, labels=binary_labels)
    
    # Display the full statistics and graphics
    if full_output:
        print("Average train final cost: %.2f" % (gradient_cost/cv_groups))
        print("Average train accuracy: %.2f" % (train_accuracy/cv_groups),"%")
        print("Average train F1 score: %.2f" % (train_f1_score/cv_groups))
        print("Average test accuracy: %.2f" % (test_accuracy/cv_groups),"%")
        print("Average test F1 score: %.2f" % (test_f1_score/cv_groups))
        create_heatmap((test_confusion/cv_groups).astype('int'), binary_labels)
        
    return test_accuracy/cv_groups

In [None]:
def vital_status_scikit(X_groups, y_groups, X_groups_full, y_groups_full, cv_groups, feature_labels, full_output = True):
    """
    Trains the scikit models to predict the vital status with k-fold cross-validation.

    Logistic regression (balanced class weights) is fitted on the full groups,
    Gaussian Naive Bayes on the reduced groups. Column 0 of every X group is
    dropped before fitting.

    :param X_groups: The cross-validation groups for reduced X (list)
    :param y_groups: The cross-validation groups for reduced y (list) 
    :param X_groups_full: The cross-validation groups for full X (list)
    :param y_groups_full: The cross-validation groups for full y (list) 
    :param cv_groups: The total number of cross-validation groups (int)
    :param feature_labels: The list of feature labels (list)
    :param full_output: True for the full output (bool)
    
    :return: Average LR train accuracy (float)
    :return: Average LR test accuracy (float)
    :return: Average Bayes test accuracy (float)
        
    """    
    # Fixed label order keeps every per-fold confusion matrix 2x2
    binary_labels = [0, 1]

    # Define parameters for the training
    train_accuracy = test_accuracy = bayes_accuracy = 0
    train_cost = test_cost = 0
    train_f1_score = test_f1_score = bayes_f1_score = 0
    test_weights = np.zeros(X_groups[0].shape[1] - 1)
    test_confusion = np.zeros(shape=(2, 2)) 

    for i in range(cv_groups):
        # Get train and test X and y batches (column 0 is excluded from the features)
        X_test = X_groups[i][:, 1:]
        y_test = y_groups[i] 
        X_train, y_train = get_train_groups(X_groups, y_groups, cv_groups, i)
        X_train = X_train[:, 1:]
        X_test_full = X_groups_full[i][:, 1:]
        y_test_full = y_groups_full[i] 
        X_train_full, y_train_full = get_train_groups(X_groups_full, y_groups_full, cv_groups, i)
        X_train_full = X_train_full[:, 1:]
        
        # Train the Logistic Regression and Gaussian Bayes models
        lr = sk.LogisticRegression(solver='liblinear', class_weight = 'balanced')
        lr.fit(X_train_full, y_train_full)
        gnb = GaussianNB()
        gnb.fit(X_train, y_train)
        
        # Get train and test predictions
        train_preds = lr.predict(X_train_full)
        test_preds = lr.predict(X_test_full)
        gnb_preds = gnb.predict(X_test)

        # The cross-entropy cost needs probabilities; hard 0/1 labels would be
        # clipped to extreme probabilities and only measure the error count
        train_proba = lr.predict_proba(X_train_full)
        test_proba = lr.predict_proba(X_test_full)
        
        # Get the statistics
        bayes_accuracy = bayes_accuracy + accuracy_score(y_test, gnb_preds)*100
        bayes_f1_score = bayes_f1_score + f1_score(y_test, gnb_preds)       
        train_accuracy = train_accuracy + accuracy_score(y_train_full, train_preds)*100
        train_f1_score = train_f1_score + f1_score(y_train_full, train_preds)
        train_cost = train_cost + log_loss(y_train_full, train_proba, labels=lr.classes_)
        test_accuracy = test_accuracy + accuracy_score(y_test_full, test_preds)*100
        test_f1_score = test_f1_score + f1_score(y_test_full, test_preds)
        test_cost = test_cost + log_loss(y_test_full, test_proba, labels=lr.classes_)
        test_weights = np.add(test_weights, lr.coef_[0])
        test_confusion = test_confusion + confusion_matrix(y_test_full, test_preds, labels=binary_labels)

    # Display the full statistics and graphics
    if full_output:
        print("Average train final cost: %.2f" % (train_cost/cv_groups))
        print("Average train accuracy: %.2f" % (train_accuracy/cv_groups),"%")
        print("Average train F1 score: %.2f" % (train_f1_score/cv_groups))
        print("Average test final cost: %.2f" % (test_cost/cv_groups))
        print("Average test accuracy: %.2f" % (test_accuracy/cv_groups),"%")
        print("Average test F1 score: %.2f" % (test_f1_score/cv_groups))
        print("Average Gaussian Naive Bayes accuracy: %.2f" % (bayes_accuracy/cv_groups),"%")
        print("Average Gaussian Naive Bayes F1 score: %.2f" % (bayes_f1_score/cv_groups))
        create_heatmap((test_confusion/cv_groups).astype('int'), binary_labels)
        print()

        # Turn the per-fold sum into a real average before ranking, then merge
        # the weights of the compressed feature groups
        mean_weights = test_weights / cv_groups
        actual_labels, actual_weights = get_actual_weights(feature_labels[1:], list(mean_weights))
        weights_abs = np.abs(np.asarray(actual_weights, dtype=float))

        print("Highest average weights (absolute values):")
        # Rank by position so tied weights still print distinct labels
        for index in np.argsort(-weights_abs, kind='stable')[:3]:
            print(actual_labels[index], ': %.2f' % weights_abs[index])
            
    return train_accuracy/cv_groups, test_accuracy/cv_groups, bayes_accuracy/cv_groups

**Death days to**

In [None]:
def death_days_to_train(X, y, alpha, iters, num_groups, tolerance_days = 180):
    """
    Trains regression models for the "death days to" target with k-fold
    cross-validation and prints the averaged results.

    The scikit linear regression is scored by "accuracy": the share of
    predictions that fall within tolerance_days of the true value. The
    in-class gradient descent is only used to report its final cost.

    :param X: The input matrix (ndarray)
    :param y: The output vector (ndarray)
    :param alpha: The gradient descent coefficient (float)
    :param iters: The number of gradient descent iterations (int)
    :param num_groups: The total number of cross-validation groups (int)
    :param tolerance_days: Largest absolute error still counted as a match (number)

    """
    train_accuracy = 0
    test_accuracy = 0
    final_cost = 0
    
    # Get cross-validation grouped X and y data
    X_groups, y_groups = get_death_days_to_data(X, y, num_groups)

    # Perform k-fold cross-validation and record the results 
    for fold in range(num_groups):
        X_test = X_groups[fold]
        y_test = y_groups[fold]
        X_train, y_train = get_train_groups(X_groups, y_groups, num_groups, fold)

        # Scikit model training
        lr = sk.LinearRegression()
        lr.fit(X_train, y_train)
        train_preds = lr.predict(X_train)
        test_preds = lr.predict(X_test)

        # In-class, only cost (the fitted weights are not used further)
        _, cost = gradient_descent_v2(normalize_v2(X_train), y_train, alpha, iters)   
        final_cost = final_cost + cost[-1]

        # Share of predictions within the tolerance window, in percent
        train_hits = np.abs(np.asarray(train_preds) - np.asarray(y_train)) <= tolerance_days
        train_accuracy = train_accuracy + np.mean(train_hits)*100
        test_hits = np.abs(np.asarray(test_preds) - np.asarray(y_test)) <= tolerance_days
        test_accuracy = test_accuracy + np.mean(test_hits)*100

    print("In-class model:")
    print("Average final cost: %.2f" % (final_cost/num_groups))
    print("Mean in y: %.2f" % np.mean(y))
    print("Variance in y: %.2f" % np.var(y))
    print()
    print("Scikit model:")
    print("Average train accuracy: %.2f" % (train_accuracy/num_groups),"%")
    print("Average test accuracy: %.2f" % (test_accuracy/num_groups),"%")

In [None]:
# Cross-validation groups for death_days_to
def get_death_days_to_data(X, y, num_groups):    
    """
    Splits all samples into randomly shuffled cross-validation groups.

    :param X: The input matrix (ndarray)
    :param y: The output vector (ndarray)
    :param num_groups: The total number of cross-validation groups (int)

    :return X_groups: Cross-validation groups for X (list)
    :return y_groups: Cross-validation groups for y (list)

    """
    # Every row takes part; only the order is randomised
    shuffled_rows = np.arange(len(y))
    np.random.shuffle(shuffled_rows)
    folds = np.array_split(shuffled_rows, num_groups)

    X_groups = [X[folds[k], :] for k in range(num_groups)]
    y_groups = [y[folds[k]] for k in range(num_groups)]
    return X_groups, y_groups

**Outcome**

In [None]:
def outcome_train(X, y, interested_in, lambda_val, num_groups, feature_labels, \
                  alpha = 0.00000001, iters = 2000, num_tests = 1, output = True):
    """
    Runs the treatment outcome experiment on freshly drawn balanced
    cross-validation groups, num_tests times.

    :param X: The input matrix (ndarray)
    :param y: The output vector of outcome class labels (ndarray)
    :param interested_in: The outcome classes to distinguish (list)
    :param lambda_val: The regularization value passed to the one-vs-all model (float)
    :param num_groups: The total number of cross-validation groups (int)
    :param feature_labels: The list of feature labels (list)
    :param alpha: The gradient descent coefficient (float)
    :param iters: The number of gradient descent iterations (int)
    :param num_tests: The number of different batches to train the models on (int)
    :param output: True to print full statistics and graphics for every batch,
                   False to print only test accuracies averaged over the batches (bool)

    """
    # Verbose mode: every batch prints its own statistics and plots
    if output:
        for _ in range(num_tests):    
            X_groups, y_groups, num_classes = get_outcome_data(X, y, interested_in, num_groups)

            print("In class model:")
            if num_classes == 2:
                # Two classes: relabel to 0 (interested_in[0]) / 1 (the other class)
                # for the binary logistic regression.
                # NOTE(review): the relabelled groups are also what the one-vs-all and
                # scikit models below receive, while interested_in still holds the
                # original class ids - confirm train_one_vs_all expects that.
                y_groups = [np.where(group == interested_in[0], 0, 1) for group in y_groups]
                print("Logistic regression:")
                vital_status_in_class(X_groups, y_groups, alpha, iters, num_groups)
                print()

            print("One vs all:")  
            outcome_in_class(X_groups, y_groups, num_classes, interested_in, lambda_val, num_groups)

            print()
            print("Scikit model:")
            outcome_scikit(X_groups, y_groups, num_classes, interested_in, num_groups, feature_labels)
            plot_lambdas(X_groups, y_groups, num_classes, interested_in, num_groups)
        return

    # Summary mode needs at least one batch to average over
    if num_tests < 1:
        raise ValueError("num_tests must be at least 1 to average the accuracies")

    accuracy_regression = 0
    accuracy_classifier = 0
    accuracy_scikit = 0
    accuracy_bayes = 0

    for _ in range(num_tests):  
        X_groups, y_groups, num_classes = get_outcome_data(X, y, interested_in, num_groups)

        if num_classes == 2:
            # Same 0/1 relabelling as in verbose mode (see the review note there)
            y_groups = [np.where(group == interested_in[0], 0, 1) for group in y_groups]
            accuracy_regression = accuracy_regression + \
                vital_status_in_class(X_groups, y_groups, alpha, iters, num_groups, output) 
        accuracy_classifier = accuracy_classifier + \
            outcome_in_class(X_groups, y_groups, num_classes, interested_in, lambda_val, num_groups, output)
        scikit_acc, bayes_acc = outcome_scikit(X_groups, y_groups, num_classes, interested_in, \
                                               num_groups, feature_labels, output)
        accuracy_scikit = accuracy_scikit + scikit_acc       
        accuracy_bayes = accuracy_bayes + bayes_acc    

    if num_classes == 2:
        print("Logistic regression:")
        print("Average test accuracy: %.2f" % (accuracy_regression/num_tests),"%")
    print("One vs all model:")
    print("Average test accuracy: %.2f" % (accuracy_classifier/num_tests),"%")
    print("Scikit model:")
    print("Average test accuracy: %.2f" % (accuracy_scikit/num_tests),"%")
    print("Bayes model:")
    print("Average test accuracy: %.2f" % (accuracy_bayes/num_tests),"%")

In [45]:
def get_outcome_data(X, y, interested_in, num_groups):    
    """
    Builds class-balanced, shuffled cross-validation groups for the outcome training.

    Only the classes listed in interested_in (and within range(OUTCOMES_NUM))
    are used. Every class is truncated to the size of the smallest one and
    spread evenly over the groups.

    :param X: The input matrix (ndarray)
    :param y: The output vector of outcome class labels (ndarray)
    :param interested_in: The outcome classes to include (list)
    :param num_groups: The total number of cross-validation groups (int)

    :return X_groups: Cross-validation groups for X (list)
    :return y_groups: Cross-validation groups for y (list)
    :return num_classes: The number of classes actually included (int)

    :raises ValueError: If interested_in selects no class

    """
    # Row indices of every requested class, in ascending label order
    class_rows = [np.nonzero(y == label)[0]
                  for label in range(OUTCOMES_NUM) if label in interested_in]
    num_classes = len(class_rows)
    if num_classes == 0:
        raise ValueError("interested_in contains no label in range(OUTCOMES_NUM)")

    # Number of samples to take from each class: the size of the smallest one
    num_cases = min(len(rows) for rows in class_rows)

    # Shuffle, truncate to balance, then break each class into num_groups parts
    class_folds = []
    for rows in class_rows:
        np.random.shuffle(rows)
        class_folds.append(np.array_split(rows[:num_cases], num_groups))

    # Put the per-class parts of each fold together and mix them
    X_groups = []
    y_groups = []
    for fold in range(num_groups):
        fold_rows = np.concatenate([folds[fold] for folds in class_folds])
        np.random.shuffle(fold_rows)
        X_groups.append(X[fold_rows, :])
        y_groups.append(y[fold_rows])

    return X_groups, y_groups, num_classes

In [46]:
def outcome_in_class(X_groups, y_groups, num_classes, interested_in, lambda_val, num_groups, output = True):
    """
    Trains the in-class one-vs-all classifier with k-fold cross-validation.

    Column 0 of every X group is dropped before training and prediction.

    :param X_groups: The cross-validation groups for X (list)
    :param y_groups: The cross-validation groups for y (list)
    :param num_classes: The number of classes in the groups (int)
    :param interested_in: The outcome classes being distinguished (list)
    :param lambda_val: The regularization value passed to train_one_vs_all (float)
    :param num_groups: The total number of cross-validation groups (int)
    :param output: True to print the statistics and show the confusion matrix (bool)

    :return: Average test accuracy (float)

    """
    train_accuracy = 0
    test_accuracy = 0
    train_f1_score = 0
    test_f1_score = 0 
    # Test labels and predictions of every fold, pooled for one confusion matrix
    fold_truth = []
    fold_preds = []

    for i in range(num_groups):
        X_test = X_groups[i][:, 1:]
        y_test = y_groups[i]
        X_train, y_train = get_train_groups(X_groups, y_groups, num_groups, i)
        X_train = X_train[:, 1:]
        
        weight_vectors, intercepts = train_one_vs_all(X_train, y_train, interested_in, num_classes, lambda_val)
        train_preds = predict_one_vs_all(X_train, weight_vectors, intercepts)
        test_preds  = predict_one_vs_all(X_test,  weight_vectors, intercepts)

        accuracy, f1 = get_stats(y_train, train_preds)
        train_accuracy = train_accuracy + accuracy
        train_f1_score = train_f1_score + f1
        
        accuracy, f1 = get_stats(y_test, test_preds)
        test_accuracy = test_accuracy + accuracy
        test_f1_score = test_f1_score + f1
        fold_truth.append(np.asarray(y_test))
        fold_preds.append(np.asarray(test_preds))

    if output:
        print("Average train accuracy: %.2f" % (train_accuracy/num_groups),"%")
        print("Average train F1 score: %.2f" % (train_f1_score/num_groups))

        print("Average test accuracy: %.2f" % (test_accuracy/num_groups),"%")
        print("Average test F1 score: %.2f" % (test_f1_score/num_groups))   

        # The confusion matrix of the pooled folds equals the sum of the per-fold
        # matrices, but its shape no longer depends on which classes happen to
        # show up in a single fold
        test_confusion = confusion_matrix(np.concatenate(fold_truth), np.concatenate(fold_preds))
        create_heatmap((test_confusion/num_groups).astype('int'), interested_in)
    return test_accuracy/num_groups

In [None]:
def outcome_scikit(X_groups, y_groups, num_classes, interested_in, num_groups, feature_labels, output = True):
    """
    Trains the scikit logistic regression and Gaussian Naive Bayes models to
    predict the treatment outcome with k-fold cross-validation.

    Column 0 of every X group is dropped before fitting and prediction.

    :param X_groups: The cross-validation groups for X (list)
    :param y_groups: The cross-validation groups for y (list)
    :param num_classes: The number of classes in the groups (int); not used here
    :param interested_in: The outcome classes being distinguished (list)
    :param num_groups: The total number of cross-validation groups (int)
    :param feature_labels: The list of feature labels (list)
    :param output: True to print the statistics and show the confusion matrix (bool)

    :return: Average LR test accuracy (float)
    :return: Average Bayes test accuracy (float)

    """
    train_accuracy = 0
    test_accuracy = 0
    train_f1_score = 0
    test_f1_score = 0
    bayes_f1_score = 0
    test_bayes = 0
    test_weights = np.zeros(X_groups[0].shape[1] - 1)
    # Test labels and LR predictions of every fold, pooled for one confusion matrix
    fold_truth = []
    fold_preds = []
    
    for i in range(num_groups):
        X_test = X_groups[i][:, 1:]
        y_test = y_groups[i]
        X_train, y_train = get_train_groups(X_groups, y_groups, num_groups, i)
        X_train = X_train[:, 1:]
        
        # Logistic regression model
        lr = sk.LogisticRegression()
        lr.fit(X_train, y_train.astype('int'))
        train_preds = lr.predict(X_train)
        test_preds = lr.predict(X_test)
        
        # Naive Bayes model
        gnb = GaussianNB()
        gnb.fit(X_train, y_train.astype('int'))       
        gnb_preds = gnb.predict(X_test)
        
        train_accuracy = train_accuracy + np.mean(train_preds == y_train)*100
        test_accuracy = test_accuracy + np.mean(test_preds == y_test)*100       
        test_bayes = test_bayes + accuracy_score(list(y_test), list(gnb_preds))*100

        # Support-weighted F1 over the classes present in the fold; unlike a
        # classification report with fixed target names this does not fail
        # when a fold is missing one of the classes
        train_f1_score = train_f1_score + f1_score(y_train, train_preds, average='weighted')
        test_f1_score = test_f1_score + f1_score(y_test, test_preds, average='weighted')
        bayes_f1_score = bayes_f1_score + f1_score(y_test, gnb_preds, average='weighted')

        # NOTE(review): coef_[0] is the only coefficient row in the two-class case;
        # with more classes it is just the first row - confirm that is intended.
        test_weights = np.add(test_weights, lr.coef_[0])
        fold_truth.append(np.asarray(y_test))
        fold_preds.append(np.asarray(test_preds))
    
    if output:
        print("Average train accuracy: %.2f" % (train_accuracy/num_groups),"%")
        print("Average train F1 score: %.2f" % (train_f1_score/num_groups))
        print("Average test accuracy: %.2f" % (test_accuracy/num_groups),"%")
        print("Average test F1 score: %.2f" % (test_f1_score/num_groups))
        print("Average Gaussian Naive Bayes accuracy: %.2f" % (test_bayes/num_groups),"%")
        print("Average Gaussian Naive Bayes F1 score: %.2f" % (bayes_f1_score/num_groups))

        # The confusion matrix of the pooled folds equals the sum of the per-fold
        # matrices, with a shape that does not depend on any single fold
        test_confusion = confusion_matrix(np.concatenate(fold_truth), np.concatenate(fold_preds))
        create_heatmap((test_confusion/num_groups).astype('int'), interested_in)
        print()

        # Turn the per-fold sum into a real average before ranking, then merge
        # the weights of the compressed feature groups
        mean_weights = test_weights / num_groups
        actual_labels, actual_weights = get_actual_weights(feature_labels[1:], list(mean_weights))
        weights_abs = np.abs(np.asarray(actual_weights, dtype=float))

        print("Highest average weights (absolute values!):")
        # Rank by position so tied weights still print distinct labels
        for index in np.argsort(-weights_abs, kind='stable')[:3]:
            print(actual_labels[index], ': %.2f' % weights_abs[index])
    return test_accuracy/num_groups, test_bayes/num_groups

**Common training helpers**

In [12]:
# Returns the assembled train set without the test group
def get_train_groups(X_groups, y_groups, num_groups, test_group):
    """
    Stacks all cross-validation groups except the test group into one train set.

    :param X_groups: The cross-validation groups for X (list)
    :param y_groups: The cross-validation groups for y (list)
    :param num_groups: The total number of cross-validation groups (int)
    :param test_group: The index of the group held out for testing (int)

    :return X_train: The stacked train inputs (ndarray, or an empty list if no group is left)
    :return y_train: The stacked train outputs (ndarray, or an empty list if no group is left)

    """
    X_train, y_train = [], []
    for group in range(num_groups):
        if group == test_group:
            continue
        if len(X_train):
            # Append to what has been collected so far
            X_train = np.concatenate((X_train, X_groups[group]))
            y_train = np.concatenate((y_train, y_groups[group]))
        else:
            # Nothing collected yet: start from a copy of this group
            X_train = np.array(X_groups[group])
            y_train = np.array(y_groups[group])
    return X_train, y_train

In [None]:
def gradient_descent_v2( X, y, alpha, iters, theta=None):
    """
    Runs batch gradient descent for linear regression and records the cost
    after every step.

    :param X: The input matrix (ndarray)
    :param y: The output vector (ndarray)
    :param alpha: The gradient descent coefficient (float)
    :param iters: The number of gradient descent iterations (int)
    :param theta: The starting weights; all ones when omitted (ndarray)

    :return: The final weights (ndarray)
    :return: The cost after each iteration (ndarray)

    """
    _, num_features = X.shape
    weights = np.ones(num_features) if theta is None else theta
    cost_trace = np.zeros(iters)

    # alpha * X.T does not change between steps, so it is built once
    scaled_Xt = alpha * X.T
    for step in range(iters):
        residual = np.dot(X, weights) - y
        weights = weights - np.dot(scaled_Xt, residual)
        cost_trace[step] = cost_function_v2(X, y, weights)
    return weights, cost_trace

def cost_function_v2(X, y, theta):  
    """
    Half the sum of squared residuals, where residuals with an absolute
    value below 30 are treated as zero.

    :param X: The input matrix (ndarray)
    :param y: The output vector (ndarray)
    :param theta: The weights (ndarray)

    :return: The cost

    """
    residual = np.transpose(np.dot(X, theta) - y)
    # Errors inside the +/-30 band do not contribute to the cost
    penalised = np.where(np.abs(residual) < 30, 0, residual)
    return 0.5 * np.dot(penalised, penalised)

def normalize_v2(M):
    """
    Returns a standardized copy of M: every column except column 0 is shifted
    to zero mean and scaled to unit standard deviation. Column 0 is kept as is.

    Columns with zero standard deviation (constant columns) are only centred,
    so they become all zeros instead of NaN/inf. The result is always a float
    array, so integer input is not truncated. M itself is not modified.

    :param M: The input matrix (2-D ndarray)

    :return: The standardized matrix (float ndarray)

    """
    norm_M = np.array(M, dtype=float)
    features = norm_M[:, 1:]
    mean = features.mean(axis=0)
    std = features.std(axis=0)
    # A constant column has std 0; dividing by 1 leaves its centred zeros untouched
    std[std == 0] = 1.0
    norm_M[:, 1:] = (features - mean) / std
    return norm_M

In [None]:
def get_stats(trues, predictions):
    """
    Computes the accuracy (in percent) and the F1 score of the predictions,
    with label 1 counted as the positive class.

    A wrong prediction only counts as a false positive when it is 1 and as a
    false negative when it is 0; wrong predictions with any other value affect
    the accuracy only.

    :param trues: The true labels (sequence)
    :param predictions: The predicted labels (sequence)

    :return: The accuracy in percent (float)
    :return: The F1 score

    """
    hits = true_pos = false_pos = false_neg = 0
    for predicted, actual in zip(predictions, trues):
        if predicted == actual:
            hits += 1
            if predicted == 1:
                true_pos += 1
        elif predicted == 1:
            false_pos += 1
        elif predicted == 0:
            false_neg += 1

    accuracy = hits/len(trues)*100

    # Each ratio falls back to 0 when its denominator is empty
    if true_pos + false_pos == 0:
        precision = 0
    else:
        precision = true_pos/(true_pos + false_pos)
    if true_pos + false_neg == 0:
        recall = 0
    else:
        recall = true_pos/(true_pos + false_neg)
    if precision + recall == 0:
        f1 = 0
    else:
        f1 = 2*precision*recall/(precision + recall)

    return accuracy, f1

In [None]:
def print_num_samples(y):
    """
    Prints how many entries of y carry each outcome class label in
    range(OUTCOMES_NUM), followed by a blank line.

    :param y: The output vector of outcome class labels (ndarray)

    """
    for label in range(OUTCOMES_NUM): 
        label_rows = np.nonzero(y == label)[0]
        print("Samples of class", label, ":", label_rows.size)
    print()

In [None]:
def get_comp_labels():
    """
    Returns the label prefixes of the features whose weights are merged
    into a single entry by get_actual_weights.

    :return: A new list of label prefixes (list)

    """
    return [
        'histological_grade',
        'clinical_stage',
        'tumor_grade',
        'histological_type',
        'tumor_stage',
    ]

In [None]:
def get_actual_weights(feature_labels, weights):
    """
    Merges the weights of features that share one of the prefixes from
    get_comp_labels() into a single entry holding their mean.

    A merged entry is labelled with the prefix and placed where the first
    feature of its group appeared; all other features keep their own label
    and weight. The group members do not have to be adjacent, and the input
    lists are not modified.

    :param feature_labels: The feature labels (list)
    :param weights: The weights, one per feature label (list)

    :return: The labels after merging (list)
    :return: The weights after merging, aligned with the labels (list)

    :raises ValueError: If feature_labels and weights differ in length

    """
    if len(feature_labels) != len(weights):
        raise ValueError("feature_labels and weights must have the same length")

    prefixes = get_comp_labels()
    actual_labels = []
    grouped_weights = []   # raw weights behind every output entry
    slot_of_prefix = {}    # prefix -> position of its merged entry

    for label, weight in zip(feature_labels, weights):
        # First matching prefix wins, in the order given by get_comp_labels()
        prefix = next((p for p in prefixes if label.startswith(p)), None)
        if prefix is None:
            actual_labels.append(label)
            grouped_weights.append([weight])
        elif prefix in slot_of_prefix:
            grouped_weights[slot_of_prefix[prefix]].append(weight)
        else:
            slot_of_prefix[prefix] = len(actual_labels)
            actual_labels.append(prefix)
            grouped_weights.append([weight])

    actual_weights = [group[0] if len(group) == 1 else np.mean(group)
                      for group in grouped_weights]
    return actual_labels, actual_weights

**Visual representation**

In [None]:
def create_heatmap(cnf_matrix, interested_in):
    """
    Shows a confusion matrix as an annotated heatmap.

    :param cnf_matrix: The confusion matrix to draw (ndarray)
    :param interested_in: The classes of the matrix; only their count is used,
                          to place the initial axis ticks (list)

    """
    fig, ax = plt.subplots()
    tick_positions = np.arange(len(interested_in))
    ax.set_xticks(tick_positions)
    ax.set_yticks(tick_positions)

    # Draw the heatmap on the axes created above
    sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu", fmt='g', ax=ax)
    ax.xaxis.set_label_position("top")
    fig.tight_layout()
    ax.set_title('Confusion matrix', y=1.1)
    ax.set_ylabel('Actual label')
    ax.set_xlabel('Predicted label')
    plt.show()

In [None]:
def plot_lambdas(X_groups, y_groups, num_classes, interested_in, num_groups):
    """
    Plots the cross-validated train and test accuracy of the one-vs-all
    classifier for regularization values 10^-5 ... 10^5.

    Column 0 of every X group is dropped before training and prediction, the
    same way outcome_in_class does, so the curve describes the same model
    whose statistics are reported there.

    :param X_groups: The cross-validation groups for X (list)
    :param y_groups: The cross-validation groups for y (list)
    :param num_classes: The number of classes in the groups (int)
    :param interested_in: The outcome classes being distinguished (list)
    :param num_groups: The total number of cross-validation groups (int)

    """
    lambda_vals = 10.0 ** np.linspace(-5, 5, 11)
    test_acc = np.zeros(lambda_vals.size)
    train_acc = np.zeros(lambda_vals.size)

    for j, lambda_val in enumerate(lambda_vals):
        train_accuracy = 0
        test_accuracy = 0

        for i in range(num_groups):
            X_test = X_groups[i][:, 1:]
            y_test = y_groups[i]
            X_train, y_train = get_train_groups(X_groups, y_groups, num_groups, i)
            X_train = X_train[:, 1:]

            weight_vectors, intercepts = train_one_vs_all(X_train, y_train, interested_in, num_classes, lambda_val)
            train_preds = predict_one_vs_all(X_train, weight_vectors, intercepts)
            test_preds  = predict_one_vs_all(X_test, weight_vectors, intercepts)

            train_accuracy = train_accuracy + np.mean(train_preds == y_train)*100
            test_accuracy = test_accuracy + np.mean(test_preds == y_test)*100 

        # Average over the folds for this regularization value
        train_acc[j] = train_accuracy/num_groups
        test_acc[j] = test_accuracy/num_groups

    plt.xlabel('lambda (log10)')
    plt.ylabel('accuracy')
    plt.xscale('log')
    plt.plot(lambda_vals, train_acc, 'bo', linestyle='dashed')
    plt.plot(lambda_vals, test_acc, 'go', linestyle='dashed')
    plt.legend(('train', 'test')) 
    plt.show()