In [1]:
from sklearn.metrics import precision_recall_curve, average_precision_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler, StandardScaler, label_binarize
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from matplotlib.cm import get_cmap
import matplotlib.pyplot as plt
import numpy as np

## Common Functions (used across more than 1 task)

In [2]:
def load_data():
    '''
    Loads the train/test feature and target arrays from disk, reports their
    missing values, and imputes the feature arrays if necessary.

    The files 'x_train.npy', 'x_test.npy', 'y_train.npy' and 'y_test.npy' are
    read relative to the current working directory. NaN and +/-Inf entries in
    the feature arrays are both treated as missing.

    Parameters:
        - None

    Returns:
        - x_train: Training features (imputed if they contained NaN/Inf)
        - x_test: Testing features (imputed if they contained NaN/Inf)
        - y_train: Training targets
        - y_test: Testing targets
    '''
    # Loads of data
    x_train = np.load('x_train.npy')
    x_test = np.load('x_test.npy')
    y_train = np.load('y_train.npy')
    y_test = np.load('y_test.npy')

    # Summarizes missing values
    train_nans, train_infs = summarize_missing_values(x_train, 'x_train')
    test_nans, test_infs = summarize_missing_values(x_test, 'x_test')
    train_has_missing = train_nans > 0 or train_infs > 0
    test_has_missing = test_nans > 0 or test_infs > 0

    # Imputation of data (if necessary)
    if train_has_missing or test_has_missing:
        # Inf is replaced with NaN so the imputer treats it as missing;
        # np.where builds a new array, leaving the loaded one untouched
        x_train_nan = np.where(np.isinf(x_train), np.nan, x_train)

        # The medians are learned from the training set only and reused for the
        # test set, so no test-set statistics leak into the preprocessing
        imputer = SimpleImputer(strategy='median')
        imputer.fit(x_train_nan)

        if train_has_missing:
            x_train = imputer.transform(x_train_nan)
        if test_has_missing:
            x_test = imputer.transform(np.where(np.isinf(x_test), np.nan, x_test))

    return x_train, x_test, y_train, y_test

In [3]:
def summarize_missing_values(data, name):
    '''
    Reports how many NaN and infinite entries a dataset contains

    Parameters:
        - data: Dataset
        - name: Name of the dataset (used as the label in the printed summary)

    Returns:
        - Total number of NaN entries
        - Total number of infinite (+Inf/-Inf) entries
    '''
    # Boolean masks flag the offending entries; summing a mask counts them
    nan_mask = np.isnan(data)
    inf_mask = np.isinf(data)
    nan_total = np.sum(nan_mask)
    inf_total = np.sum(inf_mask)

    print(f"{name} - Total NaNs: {nan_total}, Total Infs: {inf_total}")
    return nan_total, inf_total

In [4]:
def impute_data(data):
    '''
    Imputes missing values in the dataset (if any)

    Both NaN and +/-Inf entries are treated as missing and are filled in by a
    median-strategy imputer fitted on the dataset itself. The input array is
    left unmodified.

    Parameters:
        - data: Dataset

    Returns:
        - Imputed copy of the dataset
    '''
    # Replace Inf values with NaN. np.where allocates a new array, so the
    # caller's array is not silently overwritten
    cleaned = np.where(np.isinf(data), np.nan, data)
    # Uses median to reduce the impact of outliers
    imputer = SimpleImputer(strategy='median')
    return imputer.fit_transform(cleaned)

In [5]:
def scale_data(train_data, test_data=None, minmax=False, lower_lim=0, upper_lim=1):
    '''
    Fits a scaler on the training set and applies it to the given datasets

    Parameters:
        - train_data: Training dataset (the scaler is fitted on this)
        - test_data: Testing dataset (optional)
        - minmax: If equal to False a StandardScaler is used, otherwise a MinMaxScaler
        - lower_lim: Lower limit of the MinMaxScaler feature range
        - upper_lim: Upper limit of the MinMaxScaler feature range

    Returns:
        - train_scaled: Scaled training data
        - test_scaled: Scaled testing data (None when no test data is given)
        - scaler: Fitted scaler object
    '''
    # The explicit comparison with False is intentional: only values equal to
    # False select the standard scaler
    scaler = (
        StandardScaler()
        if minmax == False
        else MinMaxScaler(feature_range=(lower_lim, upper_lim))
    )

    # Fit on the training data first ...
    train_scaled = scaler.fit_transform(train_data)
    # ... then reuse the fitted scaler for the test data, if there is any
    test_scaled = scaler.transform(test_data) if test_data is not None else None

    return train_scaled, test_scaled, scaler

In [6]:
def plot_precision_recall_curve_multiclass(y_test, probabilities, classes):
    '''
    Plots a precision-recall curve for a multi-class classification problem

    One curve is drawn per class (one-vs-all), labelled with that class's
    average precision (AP).

    Parameters:
        - y_test: Target test dataset
        - probabilities: Contains stored probability scores for all classes;
          column i is used as the score for classes[i]
        - classes: Unique classes in the target dataset

    Returns:
        - None
    '''
    # Binarize the output classes in a one-vs-all fashion
    y_test_binarized = label_binarize(y_test, classes=classes)
    # For exactly two classes label_binarize returns a single column (the
    # indicator of classes[1]); expand it so that column i matches classes[i]
    if len(classes) == 2 and y_test_binarized.shape[1] == 1:
        y_test_binarized = np.hstack((1 - y_test_binarized, y_test_binarized))

    # Setup colormap (pyplot's get_cmap is used because matplotlib.cm.get_cmap
    # is deprecated in recent matplotlib releases)
    cmap = plt.get_cmap("tab20")
    colors = [cmap(i) for i in np.linspace(0, 1, len(classes))]

    plt.figure(figsize=(12, 10))
    # Plots each of the curves
    for i, color in enumerate(colors):
        class_truth = y_test_binarized[:, i]
        class_scores = probabilities[:, i]
        precision, recall, _ = precision_recall_curve(class_truth, class_scores)
        average_precision = average_precision_score(class_truth, class_scores)
        plt.plot(recall, precision, color=color, linestyle='-', marker='.',
                 label=f'Class {classes[i]} (AP={average_precision:.2f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('Precision-Recall curve per class')
    plt.legend(loc="best")
    plt.grid(True)
    plt.show()

In [7]:
def plot_confusion_matrix(cm, y_test):
    '''
    Plots a confusion matrix

    Parameters:
        - cm: Confusion matrix as a NumPy array; may hold integer counts or
          float values (e.g. a normalized matrix)
        - y_test: Target test dataset; its unique values are used as tick labels

    Returns:
        - None
    '''
    # Unique labels are computed once and reused for both axes
    labels = np.unique(y_test)
    tick_marks = np.arange(len(labels))

    # Integer counts are printed as-is; other dtypes (e.g. normalized floats)
    # are shown with two decimals, since the 'd' format code rejects floats
    cell_format = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'

    # Plotting the confusion matrix
    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.colorbar()
    plt.xticks(tick_marks, labels, rotation=45)
    plt.yticks(tick_marks, labels)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')

    # Annotates squares with the numeric values; text turns white on dark
    # cells (above half of the maximum) so it stays readable
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j, i, format(cm[i, j], cell_format),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.show()