In [1]:
import os
import pandas as pd
import numpy as np
import copy

from mlcomp.config import DATA_PATH
from mlcomp.data import load_csv_data
from mlcomp.helpers import split_data
from mlcomp.costs import compute_correctness, compute_mae, compute_mse
from mlcomp.cross_validation import ridge_lambda_cv, get_best_parameter
from mlcomp.performance import predict_values, predict
from mlcomp.models import ridge_regression
from mlcomp.feature_eng import build_simple_poly, replace_nan_by_median

import matplotlib.pyplot as plt
%matplotlib inline  

TRAIN_PATH = os.path.join(DATA_PATH, 'train.csv')
RATIO_SPLIT = 0.3
SEED_SPLIT = 872
LAMBDA_SPACE = np.logspace(-5, 0, 30)

## Reading data

In [2]:
y, X, ids = load_csv_data(TRAIN_PATH)
X_train, X_test, y_train, y_test = split_data(X, y, RATIO_SPLIT, seed=SEED_SPLIT)
col_names = list(np.genfromtxt(TRAIN_PATH, delimiter=",", dtype=None,  max_rows=1))
col_names = list(map(lambda x: x.decode("utf-8"), col_names))

## Feature Eng

In [69]:
def dummyrize(X, col_index, allowed_values):
    X_plus = copy.deepcopy(X)
    col_to_dummy = X[: , col_index-2]
    dummies = np.empty((len(X_plus), len(allowed_values)-1))
    
    for i, v in enumerate(allowed_values[0:-1]):
        dummies[:, i] = (col_to_dummy == v)*1
    
    return np.delete(np.concatenate((X_plus, dummies), axis=1), [col_index-2], axis=1)

In [None]:
def to_drop():
    return

In [None]:
DER_mass_MMC
1	DER_mass_traverse_met_lep
2	DER_mass_vis
13	PRI_tau_pt
11	DER_met_phi_centrality
10	DER_pt_ratio_lep_tau
7	DER_deltar_tau_lep
19	PRI_met

In [77]:
def apply_feature_eng(X):

    X_plus = copy.deepcopy(X)
    
    
    X_plus = replace_nan_by_median(X, -999)
    X_plus = (X_plus - X_plus.mean(axis=0)) / X_plus.std(axis=0)
   
    X_plus = build_simple_poly(X_plus, 2)
    
#     dummy_cols = {'PRI_jet_num': [1,2,3,4]}
#     for c in dummy_cols:
#         X_plus = dummyrize(X_plus, col_names.index(c), dummy_cols[c]) # decrese performance 
    
    return X_plus

In [78]:
X_feat = apply_feature_eng(X)
X_feat_train = apply_feature_eng(X_train)
X_feat_test = apply_feature_eng(X_test)

## Using `compute_correctness` as loss function to optimize

In [79]:
LOSS_FN = compute_correctness
LOSS_GREATER_IS_BETTER = True

loss_test = ridge_lambda_cv(y_train, X_feat_train, compute_correctness, LAMBDA_SPACE)[1]
best_lambda = get_best_parameter(loss_test, LAMBDA_SPACE, LOSS_GREATER_IS_BETTER)

best_w = ridge_regression(y, X_feat, best_lambda)
y_test_values = predict_values(best_w, X_feat_test)
y_hat_test = predict(y_test_values, 0)
performance_test = compute_correctness(y_test, y_hat_test)

print('Best lambda: {lambda_}\nPerformance in test set: {pf_test}'
      .format(lambda_=best_lambda, pf_test=round(performance_test, 3)))

Best lambda: 1.4873521072935119e-05
Performance in test set: 77.417
