# Machine Learning - Project 1

In [None]:
# Setup and imports
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import os
%load_ext autoreload
%autoreload 2

from mlcomp import *
from mlcomp.config import DATA_PATH
from mlcomp.feature_eng import build_advanced_poly, build_simple_poly, standardize, replace_nan_by_median
from mlcomp.helpers import split_data, compute_rmse, predict_labels
from mlcomp.performance import eval_correctness
from mlcomp.data import load_csv_data, create_csv_submission
from mlcomp.models import ridge_regression

## Higgs Data Analysis

### Cross validation

In [None]:
def cross_validation_visualization(lambds, mse_tr, mse_te):
    """Plot the train and test error curves against lambda.

    Both curves are drawn on the current matplotlib figure with a
    log-scaled x axis, and the figure is then saved through
    ``plt.savefig("cross_validation")``.

    Args:
        lambds: sequence of regularisation strengths (x axis).
        mse_tr: training error for each entry of ``lambds``.
        mse_te: test error for each entry of ``lambds``.
    """
    curves = (
        (mse_tr, 'b', 'train error'),
        (mse_te, 'r', 'test error'),
    )
    for errors, colour, name in curves:
        plt.semilogx(lambds, errors, marker=".", color=colour, label=name)

    plt.xlabel("lambda")
    plt.ylabel("rmse")
    plt.title("cross validation")
    plt.legend(loc=2)
    plt.grid(True)
    plt.savefig("cross_validation")

In [None]:
def build_k_indices(y, k_fold, seed):
    """Shuffle the row indices of ``y`` and cut them into ``k_fold`` equal folds.

    Args:
        y: array whose first dimension gives the number of rows.
        k_fold: number of folds.
        seed: value passed to ``np.random.seed`` before shuffling (this
            reseeds NumPy's global random state).

    Returns:
        Integer array of shape ``(k_fold, num_rows // k_fold)``; one row of
        indices per fold. Rows left over after the integer division are
        not assigned to any fold.
    """
    n_rows = y.shape[0]
    fold_size = n_rows // k_fold
    np.random.seed(seed)
    shuffled = np.random.permutation(n_rows)
    # Keep the first k_fold * fold_size shuffled indices, one fold per row.
    return shuffled[:k_fold * fold_size].reshape(k_fold, fold_size)

def get_train_indices(k_indices, k):
    """Return the indices of every fold except fold number ``k``.

    ``k`` is 1-based: the fold stored in row ``k - 1`` of ``k_indices`` is
    the one left out.

    Args:
        k_indices: array with one row of indices per fold.
        k: 1-based number of the fold to exclude.

    Returns:
        1-D integer array with the remaining folds concatenated in order.
    """
    keep = np.arange(k_indices.shape[0]) != k - 1
    return k_indices[keep].ravel().astype(int)

def cross_validation_step(yb, tx, k_indices, k, lambda_):
    """Fit ridge regression with fold ``k`` held out and score both parts.

    ``k`` is 1-based: row ``k - 1`` of ``k_indices`` is the held-out fold,
    and all other folds are used for fitting.

    Args:
        yb: labels, indexable by row.
        tx: feature matrix, indexable by row.
        k_indices: array with one row of indices per fold.
        k: 1-based number of the held-out fold.
        lambda_: regularisation strength passed to ``ridge_regression``.

    Returns:
        Tuple ``(train_loss, test_loss)``, each taken as element ``[0][0]``
        of the corresponding ``compute_rmse`` result.
    """
    fit_rows = get_train_indices(k_indices, k)
    x_fit, y_fit = tx[fit_rows], yb[fit_rows]

    held_out = k_indices[k - 1]
    x_val, y_val = tx[held_out], yb[held_out]

    # Only the weights are needed; the loss returned by the solver is unused.
    weights, _ = ridge_regression(y_fit, x_fit, lambda_)

    rmse_fit = compute_rmse(y_fit, x_fit, weights)
    rmse_val = compute_rmse(y_val, x_val, weights)
    return rmse_fit[0][0], rmse_val[0][0]

def cross_validation(yb, tx, k_fold, seed=1):
    """Pick the ridge lambda with the lowest average held-out RMSE.

    Thirty lambdas, log-spaced between 1e-4 and 1, are each evaluated with
    ``k_fold``-fold cross validation. Per-lambda averages are printed, the
    curves are plotted with ``cross_validation_visualization`` and the best
    lambda is returned.

    Args:
        yb: labels.
        tx: feature matrix.
        k_fold: number of folds.
        seed: seed forwarded to ``build_k_indices``.

    Returns:
        The lambda whose mean test RMSE is lowest (first one wins on ties).
    """
    lambdas = np.logspace(-4, 0, 30)
    k_indices = build_k_indices(yb, k_fold, seed)

    rmse_tr, rmse_te = [], []
    best_ind, best_rmse_te = 0, float("inf")

    for ind, lambda_ in enumerate(lambdas):
        # Folds are numbered from 1 because cross_validation_step expects a 1-based k.
        fold_losses = [
            cross_validation_step(yb, tx, k_indices, fold, lambda_)
            for fold in range(1, k_fold + 1)
        ]
        mean_tr = sum(tr for tr, _ in fold_losses) / k_fold
        mean_te = sum(te for _, te in fold_losses) / k_fold
        rmse_tr.append(mean_tr)
        rmse_te.append(mean_te)

        if mean_te < best_rmse_te:
            best_rmse_te = mean_te
            best_ind = ind
        print("lambda={l:.9f}, Training RMSE={tr:.3f}, Testing RMSE={te:.3f}".format(
               l=lambda_, tr=mean_tr, te=mean_te))

    cross_validation_visualization(lambdas, rmse_tr, rmse_te)
    return lambdas[best_ind]

### Experimental functions

All of these functions can be used experimentally to, in one way or the other, improve the predictions.

In [None]:
def sorted_correlations(yb, tx):
    """Absolute correlation of every feature column with the labels.

    Args:
        yb: labels as a column (transposed before being passed to
            ``np.corrcoef``).
        tx: feature matrix with one feature per column.

    Returns:
        Tuple ``(corr_abs, sorted_idxs)``: the absolute correlation
        coefficient between the labels and each column of ``tx``, and the
        column indices ordered by ``np.argsort`` of those values (ascending,
        so the most strongly correlated columns come last).
    """
    corr_matrix = np.corrcoef(yb.T, tx.T)
    # Row/column 0 belongs to the labels; the remaining rows are the features.
    label_corr = np.abs(corr_matrix[1:, 0])
    return label_corr, np.argsort(label_corr)

# Testing on data

Import training data:

In [None]:
# Load the training set from DATA_PATH with sub-sampling switched off.
TRAIN_PATH = os.path.join(DATA_PATH, 'train.csv')
yb, input_data, ids = load_csv_data(TRAIN_PATH, sub_sample=False)
# Reshape the labels into a column of shape (N, 1).
yb = yb.reshape((yb.shape[0], 1))

In [None]:
# Sanity check: shapes of the feature matrix and of the label column.
print(input_data.shape, yb.shape)

Transform the raw inputs into more usable features by replacing NaNs with the column median (see `replace_nan_by_median`) and creating a polynomial basis expansion.

Based on previous data analysis, `important_cols` is set to the few "most influential" features. Only those features are used in the polynomial basis expansion, for computational efficiency.

| Index | Feature                   |
|-------|---------------------------|
| 0     | DER_mass_MMC                 |
| 1     | DER_mass_transverse_met_lep  |
| 2     | DER_mass_vis              |
| 13    | PRI_tau_pt                |
| 11    | DER_met_phi_centrality    |
| 10    | DER_pt_ratio_lep_tau      |
| 7     | DER_deltar_tau_lep        |
| 19    | PRI_met                   |

In [None]:
#important_cols = [0, 1, 2, 7, 10, 11, 13, 19] # forrest
#important_cols = [0, 1, 2, 7, 10, 11, 13, 5] # -
#important_cols = [0, 2, 7, 1, 11, 13, 5, 9] # (replace 10 by 9)
#important_cols = [0, 2, 7, 1, 11, 13, 5, 9, 19] # 82.592 %
#important_cols = [0, 2, 7, 1, 11, 13, 5, 9, 19, 10] # 82.7284 %
#important_cols = [0, 2, 7, 1, 11, 13, 5, 9, 19, 10, 4] # 82.9896 % best

In [None]:
# Experimental feature pipeline: keep the n features most correlated with the
# labels, add their pairwise combinations and expand everything polynomially.
tx = replace_nan_by_median(input_data, -999)

corr_abs, sorted_idxs = sorted_correlations(yb, tx)
n = 15
D = len(sorted_idxs)
# sorted_idxs is ascending, so reverse it and keep the first n entries.
# (The former slice `[:D-n-1:-1]` returned nothing when n == D and wrapped
# around when n > D.)
important_cols = sorted_idxs[::-1][:n]
print(corr_abs, important_cols)

degree = 11 # With current implementation, higher than 5 is computationally infeasible, but it gives the best results!

# NOTE(review): build_mult_comb is not imported by name; presumably it comes
# from `from mlcomp import *` -- confirm.
x_comb = build_mult_comb(tx, 2, important_cols)
tx = np.concatenate((tx[:, important_cols], x_comb), axis = 1)
x_poly = build_simple_poly(tx, degree)
# standardize is unpacked into three values elsewhere in this notebook
# (data, mean, std); unpack here too so x_poly stays an array, not a tuple.
x_poly, mean_poly, std_poly = standardize(x_poly)

#x_train, x_test, y_train, y_test = split_data(x_poly, yb, ratio=0.99, seed=12345)

In [None]:
# Rebuild the feature matrix from the raw inputs; this overwrites tx, important_cols,
# degree and x_poly from the correlation-based cell above.
# presumably -999 is the placeholder for missing values in the raw data -- TODO confirm
tx = replace_nan_by_median(input_data, -999)
# Earlier column selections, kept with the score noted next to each:
#important_cols = [0, 2, 7, 1, 11, 13, 5, 9, 19] # 82.592 %
#important_cols = [0, 2, 7, 1, 11, 13, 5, 9, 19, 10] # 82.7284 %
#important_cols = [0, 2, 7, 1, 11, 13, 5, 9, 19, 10, 4] # 82.9896 %
important_cols = [0, 1, 2, 4, 5, 7, 9, 10, 11, 13, 19]
degree = 5
x_poly = build_advanced_poly(tx, degree, important_cols)
# Display the shape of the expanded feature matrix as the cell output.
x_poly.shape

In [None]:
# Standardize the expanded features and keep the returned mean/std.
# NOTE(review): x_poly is overwritten in place, so re-running this cell alone
# standardizes already-standardized data and replaces mean_poly / std_poly.
x_poly, mean_poly, std_poly = standardize(x_poly)

In [None]:
# Hold-out split of the standardized features and labels (ratio=0.9, fixed seed).
x_train, x_test, y_train, y_test = split_data(x_poly, yb, ratio=0.9, seed=12345)

In [None]:
from plots import *

def ridge_regression_sim(x_train, y_train, x_test, y_test, seed=1):
    """Sweep the ridge penalty on a fixed train/test split.

    Ten lambdas, log-spaced between 1e-5 and 1, are each fitted on the
    training part and scored on both parts. Per-lambda results are printed
    and the two curves are plotted with ``plot_train_test``.

    Args:
        x_train: training feature matrix.
        y_train: training labels.
        x_test: test feature matrix.
        y_test: test labels.
        seed: unused; kept so existing calls keep working.

    Returns:
        Tuple ``(best_lambda, best_weights, best_test_rmse)`` for the lambda
        with the lowest test RMSE (first one wins on ties). ``best_weights``
        stays ``0`` if no lambda ever beats the initial ``inf``.
    """
    lambdas = np.logspace(-5, 0, 10)
    # define lists to store the loss of training data and test data
    rmse_tr = []
    rmse_te = []
    best_rmse_te = float("inf")
    best_weights = 0
    best_ind = 0

    for ind, lambda_ in enumerate(lambdas):
        # Only the weights are needed; the loss returned by the solver is unused.
        weights, _ = ridge_regression(y_train, x_train, lambda_)

        # cross_validation_step indexes compute_rmse's result with [0][0], so it
        # appears to be a 1x1 array; an array cannot be formatted with '.3f'.
        # Collapse it to a plain float (this also accepts a scalar result).
        rmse_tr.append(float(np.squeeze(compute_rmse(y_train, x_train, weights))))
        rmse_te.append(float(np.squeeze(compute_rmse(y_test, x_test, weights))))

        if rmse_te[ind] < best_rmse_te:
            best_rmse_te = rmse_te[ind]
            best_weights = weights
            best_ind = ind
        print("lambda={l:.9f}, Training RMSE={tr:.3f}, Testing RMSE={te:.3f}".format(
               l=lambda_, tr=rmse_tr[ind], te=rmse_te[ind]))

    # Plot the obtained results
    plot_train_test(rmse_tr, rmse_te, lambdas, 2)

    return lambdas[best_ind], best_weights, rmse_te[best_ind]

In [None]:
# Sweep lambda on the hold-out split; keep the best lambda, its weights and its test RMSE.
best_lambda, best_weights, rmse_te = ridge_regression_sim(x_train, y_train, x_test, y_test, seed=1)

In [None]:
# Fill in best lambda from above and get weights
# Fill in best lambda from above and get weights
# NOTE(review): ridge_regression_sim already fitted this lambda on the same
# training data and returned the result as best_weights -- this refit looks redundant; confirm.
weights, rmse = ridge_regression(y_train, x_train, lambda_=best_lambda)

In [None]:
# Predict labels with found weights and print some useful information about quality of fit
y_pred = predict_labels(weights, x_poly)
eval_correctness(yb, y_pred, verbose=True)

print("-----------------------------")

rmse_train = compute_rmse(y_train, x_train, weights)
rmse_test = compute_rmse(y_test, x_test, weights)
print("Train RMSE:", rmse_train, ", Test RMSE:", rmse_test)

### With cross validation

In [None]:
# Select lambda by 4-fold cross validation on the full data set.
# This overwrites the best_lambda found by the split-based sweep.
k_fold = 4
best_lambda = cross_validation(yb, x_poly, k_fold, seed=1)

In [None]:
# Refit on the full data set with the lambda selected by cross validation,
# rather than a hard-coded value that lies outside the cross-validation grid.
weights, rsme = ridge_regression(yb, x_poly, best_lambda)

In [None]:
# Predict labels with found weights and print some useful information about quality of fit
y_pred = predict_labels(weights, x_poly)
correctness(yb, y_pred, verbose=True)
print("-----------------")
print(rsme)

In [None]:
# Number of fitted coefficients.
weights.size

### Testing different cutoff value

In [None]:
def predict_values(weights, X):
    """Return the raw linear-model outputs ``np.dot(X, weights)`` (no thresholding)."""
    return np.dot(X, weights)

def predict(y_values, cutoff):
    """Turn raw scores into -1 / +1 labels using a decision threshold.

    Args:
        y_values: array-like of raw scores; any shape is accepted (for
            example a 1-D vector or an ``(N, 1)`` column).
        cutoff: scores strictly greater than this value are labelled ``1``;
            everything else, including NaN, is labelled ``-1``.

    Returns:
        Float array with the same shape as ``y_values`` holding -1.0 / 1.0.
    """
    scores = np.asarray(y_values)
    # np.where keeps the input shape, so column-vector scores no longer fail
    # on a boolean mask applied to a 1-D buffer, and no entry is left
    # uninitialised.
    return np.where(scores > cutoff, 1.0, -1.0)

def correct_by_cutoff(y_true, y_pred, search_space):
    """Score the thresholded predictions for every cutoff in ``search_space``.

    Args:
        y_true: reference labels passed to ``correctness``.
        y_pred: raw scores that ``predict`` thresholds.
        search_space: iterable of cutoff values to try.

    Returns:
        List with one ``correctness(y_true, labels)`` result per cutoff, in
        the order of ``search_space``.
    """
    labels_per_cutoff = [predict(y_pred, cutoff) for cutoff in search_space]
    return [correctness(y_true, labels) for labels in labels_per_cutoff]

# Raw model outputs for every row of x_poly.
y_hat_train = predict_values(weights, x_poly)

# Search cutoffs from -4 to 4 in steps of 0.01. np.linspace's `retstep` is a
# boolean flag (also return the step), not a step size, so the former
# `retstep=0.01` call only produced the default 50 points.
cutoff_search_space = np.linspace(-4, 4, num=801)
correctness_by_cutoff = correct_by_cutoff(yb, y_hat_train, cutoff_search_space)
plt.plot(cutoff_search_space, correctness_by_cutoff);

# First cutoff that reaches the highest score.
best_cutoff = cutoff_search_space[correctness_by_cutoff.index(max(correctness_by_cutoff))]
print('Best cutoff in train: {}'.format(best_cutoff))

In [None]:
# y_hat_train holds predictions for every row of x_poly, so compare against yb
# (all labels) rather than y_train, which only contains the rows of the
# hold-out split's training part.
print('Correctness in train with cutoff 0: {}'.format(correctness(yb, predict(y_hat_train, 0))))
print('Correctness in train with best cutoff: {}'.format(correctness(yb, predict(y_hat_train, best_cutoff))))

## Load and transform test data:

In [None]:
# Build the path from DATA_PATH, as is done for the training file, instead of
# hard-coding a directory relative to the notebook.
test_path = os.path.join(DATA_PATH, 'test.csv')
yb_test, input_data_test, ids_test = load_csv_data(test_path, sub_sample=False)

In [None]:
# Clean the test inputs and expand them with the current `degree` and `important_cols`.
# NOTE(review): the training features were passed through standardize() (which
# also returned mean_poly / std_poly), but nothing equivalent is applied to
# x_submit_poly here -- confirm whether the training mean/std must be applied
# before predicting.
input_data_test = replace_nan_by_median(input_data_test, -999)
#input_data_test = (input_data_test - input_data_test.min(0)) / input_data_test.ptp(0) # normalize
x_submit_poly = build_advanced_poly(input_data_test, degree, important_cols)

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# NOTE(review): this cell overwrites the x_submit_poly built with
# build_advanced_poly in the previous cell with scikit-learn's
# PolynomialFeatures expansion, while the training features came from
# build_advanced_poly -- confirm both expansions yield the same columns,
# otherwise `weights` will not line up with x_submit_poly.
input_data_test = replace_nan_by_median(input_data_test, -999)
submit_poly = PolynomialFeatures(degree)
x_submit_poly = submit_poly.fit_transform(input_data_test[:, important_cols])

In [None]:
# Label the test rows with the fitted weights.
# Alternative kept for reference: threshold the raw outputs with the local predict() helper.
#y_test_pred = predict(predict_values(weights, x_submit_poly), 0)
y_test_pred = predict_labels(weights, x_submit_poly)

In [None]:
# Save predictions of test data in csv file, ready for the upload on kaggle
# The output name "test_output.csv" has no directory part.
create_csv_submission(ids_test, y_test_pred, "test_output.csv")