# Finding strongest features

In [5]:
# Setup and imports
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import os
import itertools
%load_ext autoreload
%autoreload 2

from mlcomp import *
from mlcomp.config import DATA_PATH
from mlcomp.feature_eng import build_advanced_poly, build_simple_poly, build_mult_comb
from mlcomp.helpers import split_data, compute_rmse, predict_labels
from mlcomp.performance import correctness
from mlcomp.data import load_csv_data, create_csv_submission

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
def ridge_regression(y, tx, lambda_):
    """Fit ridge regression by solving the regularised normal equations.

    Solves ``(tx.T @ tx + 2 * N * lambda_ * I) w = tx.T @ y`` for ``w``, where
    ``N`` is the number of rows of ``tx`` and ``I`` is the ``D x D`` identity.

    Args:
        y: targets, one entry per row of ``tx``.
        tx: 2-D design matrix of shape ``(N, D)``.
        lambda_: regularisation strength.

    Returns:
        ``(w, rmse)`` where ``w`` has shape ``(D, 1)`` and ``rmse`` is the
        value returned by ``compute_rmse(y, tx, w)`` for the fitted weights.
    """
    N = tx.shape[0]
    D = tx.shape[1]

    gram = np.dot(tx.T, tx) + 2 * N * lambda_ * np.identity(D)
    # Solve the linear system directly rather than building the explicit
    # inverse and multiplying by it: less work and better numerical accuracy
    # when the regularised Gram matrix is ill-conditioned.
    w = np.linalg.solve(gram, np.dot(tx.T, y)).reshape((D, 1))

    rmse = compute_rmse(y, tx, w)

    return w, rmse

In [7]:
def replace_nan_by_median(tx, nan_value):
    """Return a copy of ``tx`` with missing entries replaced by the column median.

    An entry counts as missing when it equals ``nan_value`` or is already NaN.
    The median of each column (axis 0) is computed over its non-missing
    entries only. A column made up entirely of missing entries stays NaN.

    Args:
        tx: array-like of numbers; it is not modified.
        nan_value: sentinel value that marks a missing entry.

    Returns:
        A new floating-point array with the same shape as ``tx``.
    """
    # Work on a private copy so the caller's array is left untouched.
    data = np.array(tx)
    if not np.issubdtype(data.dtype, np.floating):
        # Integer (or boolean) arrays cannot hold the temporary NaN markers.
        data = data.astype(float)

    data[data == nan_value] = np.nan
    col_median = np.nanmedian(data, axis=0)
    return np.where(np.isnan(data), col_median, data)

In [8]:
def sorted_correlations(yb, tx):
    """Absolute correlation of every column of ``tx`` with the labels ``yb``.

    Returns:
        A pair ``(corr_abs, order)``. ``corr_abs[j]`` is the absolute
        correlation coefficient between column ``j`` of ``tx`` and the labels.
        ``order`` lists the column indices by increasing ``corr_abs``, so the
        most strongly correlated columns come last.
    """
    # In the stacked correlation matrix, row/column 0 belongs to the labels
    # and the remaining rows correspond to the columns of tx.
    corr_matrix = np.corrcoef(yb.T, tx.T)
    label_corr = np.abs(corr_matrix[1:, 0])
    order = np.argsort(label_corr)
    return label_corr, order

## Cross validation

In [9]:
def cross_validation_visualization(lambds, mse_tr, mse_te):
    """Plot the train and test error curves against lambda and save the figure.

    Both curves are drawn on a logarithmic x axis in the current matplotlib
    figure, which is then written out with ``plt.savefig("cross_validation")``.
    """
    curves = (
        (mse_tr, 'b', 'train error'),
        (mse_te, 'r', 'test error'),
    )
    for errors, colour, name in curves:
        plt.semilogx(lambds, errors, marker=".", color=colour, label=name)

    plt.xlabel("lambda")
    plt.ylabel("rmse")
    plt.title("cross validation")
    plt.legend(loc=2)
    plt.grid(True)
    plt.savefig("cross_validation")

In [10]:
def build_k_indices(y, k_fold, seed):
    """Split shuffled row indices into ``k_fold`` equally sized folds.

    The global NumPy random generator is seeded with ``seed`` before the
    row indices ``0 .. y.shape[0] - 1`` are permuted. Each fold receives
    ``int(y.shape[0] / k_fold)`` indices; indices left over after the last
    fold are not assigned to any fold.

    Returns:
        An array with one row of indices per fold.
    """
    n_rows = y.shape[0]
    fold_size = int(n_rows / k_fold)

    np.random.seed(seed)
    shuffled = np.random.permutation(n_rows)

    folds = []
    for fold in range(k_fold):
        start = fold * fold_size
        folds.append(shuffled[start:start + fold_size])
    return np.array(folds)

def get_train_indices(k_indices, k):
    """Concatenate the indices of every fold except fold number ``k``.

    ``k`` is 1-based: row ``k - 1`` of ``k_indices`` is the fold left out.

    Returns:
        A 1-D integer array with the indices of the remaining folds, in
        fold order.
    """
    held_out = k - 1
    kept = [k_indices[i] for i in range(k_indices.shape[0]) if i != held_out]
    # The leading empty array keeps the result defined when no fold is kept.
    return np.hstack([np.array([])] + kept).astype(int)

def cross_validation_step(yb, tx, k_indices, k, lambda_):
    """Train ridge regression with fold ``k`` held out and report both losses.

    ``k`` is 1-based: row ``k - 1`` of ``k_indices`` is used as the test fold
    and all other rows form the training set.

    Returns:
        ``(loss_tr, loss_te)``: element ``[0][0]`` of the ``compute_rmse``
        result on the training rows and on the held-out rows respectively.
    """
    held_out = k_indices[k-1]
    kept = get_train_indices(k_indices, k)

    x_train, y_train = tx[kept], yb[kept]
    x_test, y_test = tx[held_out], yb[held_out]

    # Only the weights are needed; the RMSE returned by the fit is recomputed
    # below for both splits.
    weights, _ = ridge_regression(y_train, x_train, lambda_)

    train_loss = compute_rmse(y_train, x_train, weights)
    test_loss = compute_rmse(y_test, x_test, weights)

    return train_loss[0][0], test_loss[0][0]

def cross_validation(yb, tx, k_fold, seed=1):
    """Select the ridge penalty by k-fold cross-validation.

    Thirty values of lambda, log-spaced between 1e-4 and 1, are evaluated.
    For each one the train and test RMSE are averaged over the ``k_fold``
    folds and printed. Once all values have been tried, both curves are
    plotted with ``cross_validation_visualization``.

    Returns:
        The lambda with the lowest average test RMSE (the first one on ties).
    """
    lambdas = np.logspace(-4, 0, 30)
    k_indices = build_k_indices(yb, k_fold, seed)

    rmse_tr, rmse_te = [], []
    best_ind, best_rmse_te = 0, float("inf")

    for ind, lambda_ in enumerate(lambdas):
        # One (train loss, test loss) pair per fold; folds are numbered from 1.
        fold_losses = [cross_validation_step(yb, tx, k_indices, k, lambda_)
                       for k in range(1, k_fold + 1)]
        mean_tr = sum(tr for tr, _ in fold_losses) / k_fold
        mean_te = sum(te for _, te in fold_losses) / k_fold
        rmse_tr.append(mean_tr)
        rmse_te.append(mean_te)

        if mean_te < best_rmse_te:
            best_rmse_te, best_ind = mean_te, ind

        print("lambda={l:.9f}, Training RMSE={tr:.3f}, Testing RMSE={te:.3f}".format(
               l=lambda_, tr=mean_tr, te=mean_te))

    cross_validation_visualization(lambdas, rmse_tr, rmse_te)
    return lambdas[best_ind]

## Find best features

In [11]:
# Location of the training CSV inside the project's data directory.
TRAIN_PATH = os.path.join(DATA_PATH, 'train.csv')
# Load the training set without sub-sampling; the three returned values are
# kept as labels (yb), raw inputs (input_data) and ids.
yb, input_data, ids = load_csv_data(TRAIN_PATH, sub_sample=False)
# Reshape the labels into a single column: (n_rows, 1).
yb = yb.reshape((yb.shape[0], 1))

In [23]:
from sklearn.preprocessing import PolynomialFeatures

# Entries equal to -999 are treated as missing and replaced by the column median.
tx = replace_nan_by_median(input_data, -999)
# Degree of the polynomial expansion applied to the selected columns in the
# feature search below.
degree = 5
poly = PolynomialFeatures(degree)

In [24]:
def ridge_regression_sim(x_train, y_train, x_test, y_test, seed=1):
    """Pick the ridge penalty with the lowest test RMSE on a fixed split.

    Five values of lambda, log-spaced between 1e-4 and 1, are fitted on the
    training split and scored with ``compute_rmse`` on both splits.

    Args:
        x_train, y_train: training inputs and targets.
        x_test, y_test: held-out inputs and targets used for the selection.
        seed: accepted by the signature but not used by this function.

    Returns:
        ``(best_lambda, best_weights, best_rmse_te)`` for the lambda whose
        test RMSE is lowest (the first one on ties). ``best_weights`` stays
        ``0`` if no test RMSE compares lower than infinity.
    """
    lambdas = np.logspace(-4, 0, 5)

    rmse_tr, rmse_te = [], []
    best_ind, best_weights, best_rmse_te = 0, 0, float("inf")

    for ind in range(len(lambdas)):
        weights, _ = ridge_regression(y_train, x_train, lambdas[ind])

        rmse_tr.append(compute_rmse(y_train, x_train, weights))
        current_te = compute_rmse(y_test, x_test, weights)
        rmse_te.append(current_te)

        if current_te < best_rmse_te:
            best_ind, best_weights, best_rmse_te = ind, weights, current_te

    return lambdas[best_ind], best_weights, rmse_te[best_ind]

In [27]:
def correctness(yb, y_pred, verbose=False):
    """Return the percentage of predictions that equal the known labels.

    Both inputs are flattened before the comparison, so a column vector of
    shape ``(N, 1)`` and a flat vector of shape ``(N,)`` are compared element
    by element instead of being broadcast into an ``N x N`` comparison.

    NOTE(review): a function with this name is also imported from
    ``mlcomp.performance`` at the top of the notebook; this definition
    shadows it.

    Args:
        yb: known labels.
        y_pred: predicted labels, one per known label.
        verbose: when true, also print the correct/incorrect counts and the
            percentage.

    Returns:
        The share of matching labels, in percent.

    Raises:
        ValueError: if the inputs do not hold the same number of labels.
    """
    truth = np.ravel(yb)
    predicted = np.ravel(y_pred)
    if truth.size != predicted.size:
        raise ValueError(
            "yb and y_pred must hold the same number of labels, got {} and {}".format(
                truth.size, predicted.size))

    corrects = (predicted == truth).sum()
    perc = corrects / len(predicted) * 100
    if verbose:
        incorrect = len(predicted) - corrects
        print("Total correct:", corrects, "\nTotal incorrect:", incorrect, "\nCorrect percentage:", perc, "%")

    return perc

In [25]:
# Column indices considered for the feature search; currently referenced only
# by the commented-out combinations call in the search cell.
candidates = [0, 2, 7, 1, 11, 13, 5, 4, 10, 3, 6, 12, 9]
# Seed passed to split_data (and to ridge_regression_sim) in the search cell.
seed = 12345
# Ratio handed to split_data for the train/test split.
# NOTE(review): presumably the training share -- confirm against split_data.
ratio_split = 0.9

In [29]:
# Search over feature subsets: every subset is expanded with `poly`, split
# into train/test, fitted with ridge regression, and scored twice -- by test
# RMSE and by the percentage of correctly predicted labels on the full data set.
best_features_te = []      # subset with the lowest test loss so far
best_features_corr = []    # subset with the highest correctness so far
best_te_loss = float("inf")
best_correctness = 0

# Alternative search space: every 8-element subset of `candidates`.
#comb_iter = itertools.combinations(candidates, 8)
comb_iter = itertools.combinations([0, 1, 2, 7, 10, 11, 13, 19], 8)

for comb in comb_iter:
    features = np.asarray(comb)
    print("Features -->", features)

    x_poly = poly.fit_transform(tx[:, features])
    x_train, x_test, y_train, y_test = split_data(x_poly, yb, ratio_split, seed)

    best_lambda, w, loss_te = ridge_regression_sim(x_train, y_train, x_test, y_test, seed)

    y_pred = predict_labels(w, x_poly)
    # Store the score under its own name: assigning it to `correctness` would
    # replace the function and break the next iteration of the loop.
    accuracy = correctness(yb, y_pred, verbose=False)

    if (loss_te < best_te_loss):
        best_features_te = features
        best_te_loss = loss_te
        print("\tBest test loss features found:", best_features_te)
        print("\tBest lambda:", best_lambda)
        print("\tCorrectness:", accuracy)
        print("\tLoss:", loss_te)
    if (accuracy > best_correctness):
        best_features_corr = features
        best_correctness = accuracy
        # Report the subset selected by correctness, not the test-loss one.
        print("\tBest correctness features found:", best_features_corr)
        print("\tBest lambda:", best_lambda)
        print("\tCorrectness:", accuracy)
        print("\tLoss:", loss_te)


Features --> [ 0  1  2  7 10 11 13 19]
	Best test loss features found: [ 0  1  2  7 10 11 13 19]
	Best lambda: 0.000278255940221
	Correctness: 81.878
	Loss: [[ 179.6840572]]
	Best correctness features found: [ 0  1  2  7 10 11 13 19]
	Best lambda: 0.000278255940221
	Correctness: 81.878
	Loss: [[ 179.6840572]]


In [None]:
# `best_features` is never defined in this notebook; show the two results the
# search cell actually produces (best by test loss, best by correctness).
best_features_te, best_features_corr