In [564]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA, KernelPCA
from sklearn.linear_model import Ridge, Lasso, HuberRegressor
from sklearn.metrics import r2_score
from sklearn.kernel_ridge import KernelRidge
import numpy as np
import copy
import matplotlib.pyplot as plt
from sklearn import mixture
from sklearn.feature_selection import RFE
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.cluster import DBSCAN
import itertools
from scipy import linalg
import matplotlib as mpl

In [565]:
X = pd.read_csv("X_train.csv")
X = X.drop(columns=["id"])

In [566]:
y = pd.read_csv("y_train.csv")['y']

In [567]:
X_test = pd.read_csv("X_test.csv")
ids = X_test["id"]
X_test = X_test.drop(columns=["id"])

Remove constant columns

In [568]:
X_test = X_test.loc[:, X.var() != 0.0]
X = X.loc[:, X.var() != 0.0]
X.shape

(1212, 828)

Impute median

In [569]:
X_test = (X_test - X.mean()) / X.std()
X = (X - X.mean()) / X.std ()
X = X.fillna(X.median())
X_test = X_test.fillna(X.median())

In [570]:
X = X.to_numpy()
y = y.to_numpy()

Truncate

In [571]:
def truncate(train, test, delta=0.01):
    
    bot = X.quantile(delta)
    top = X.quantile(1-delta)
    
    for column in train.columns:
        train.loc[train[column] < bot[column], column] = train[column].median()
        train.loc[train[column] > top[column], column] = train[column].median()
        test.loc[test[column] < bot[column], column] = train[column].median()
        test.loc[test[column] > top[column], column] = train[column].median()
        
    return train, test

In [572]:
def rfe(train, val, target, n=20):
    
    estimator = Ridge(alpha=5)
    selector = RFE(estimator, n_features_to_select=n, step=5, verbose=0)
    selector = selector.fit(train, target)
    train = selector.transform(train)
    val = selector.transform(val)
    
    return train, val

PCA

In [573]:
def pca(train, val, n=20):
    
    pca = KernelPCA(kernel='rbf', n_components=n)
    train_t = pca.fit_transform(train)
    val_t = pca.transform(val)
    
    return train_t, val_t

In [574]:
def lasso_select(train, val, target, alpha=1):
    
    clf = Lasso(alpha=alpha)
    clf.fit(train, target)
    train = train[:,clf.coef_ != 0]
    val = val[:, clf.coef_ != 0]
    print("Selected features: " ,np.count_nonzero(clf.coef_))
    
    return train, val

In [575]:
def sel_best(train, val, target, n=20):
    
    sel = SelectKBest(f_regression, k=n)
    train_t = sel.fit_transform(train, target)
    val_t = sel.transform(val)
    
    return train_t, val_t

Cross Validation

In [576]:
X_t = copy.copy(X)
X_test_t = X_test.to_numpy()
mask = np.arange(X_t.shape[0])
np.random.shuffle(mask)
X_t = X_t[mask]
y_t = y[mask]

In [577]:
step = X.shape[0] // 5
r2_val_ridge, r2_train_ridge = 0, 0
r2_val_xgb, r2_train_xgb = 0, 0
r2_val_lasso, r2_train_lasso = 0, 0
r2_val_gbr, r2_train_gbr = 0, 0

for i in range(5):
    
    X_val, y_val = X_t[i*step:(i+1)*step], y_t[i*step:(i+1)*step]
    X_train = np.concatenate((X_t[(i+1)*step:], X_t[:i*step]), axis=0)
    y_train = np.concatenate((y_t[(i+1)*step:], y_t[:i*step]), axis=0)
    
    # X_train, X_val = rfe(X_train, X_val, y_train, n=20)
    
    # feature selection
    
    X_train, X_val = lasso_select(X_train, X_val, y_train, alpha=0.8)

    # fit gmm and remove 0.05 quantile of the loglikelihood scores
    gmm = mixture.GaussianMixture(n_components=1, covariance_type='full')
    gmm.fit(np.concatenate((X_train, np.expand_dims(y_train, axis=-1)), axis=-1))
    loglike = gmm.score_samples(np.concatenate((X_train, np.expand_dims(y_train, axis=-1)), axis=-1))
    cutoff = np.quantile(loglike, 0.05)
    print("Removed outliers: ", X_train.shape[0] - (loglike > cutoff).sum())
    X_train = X_train[loglike > cutoff]
    y_train = y_train[loglike > cutoff]
    
    # X_train, X_val = sel_best(X_train, X_val, y_train, n=100)
    # X_train, X_val = pca(X_train, X_val, n=30)
    
    ridge = Ridge(alpha=2)
    # xgb = XGBRegressor(max_depth=2, reg_lambda=200)
    lasso = Lasso(alpha=1)
    gbr = GradientBoostingRegressor(max_depth=2, learning_rate=0.03, n_estimators=400)
    
    ridge.fit(X_train, y_train)
    # xgb.fit(X_train, y_train)
    lasso.fit(X_train, y_train)
    gbr.fit(X_train, y_train)
    
    r2_val_ridge += r2_score(y_val, ridge.predict(X_val))
    # r2_val_xgb += r2_score(y_val, xgb.predict(X_val))
    r2_val_lasso += r2_score(y_val, lasso.predict(X_val))
    r2_val_gbr += r2_score(y_val, gbr.predict(X_val))
    
    r2_train_ridge += r2_score(y_train, ridge.predict(X_train))
    # r2_train_xgb += r2_score(y_train, xgb.predict(X_train))
    r2_train_lasso += r2_score(y_train, lasso.predict(X_train))
    r2_train_gbr += r2_score(y_train, gbr.predict(X_train))

Selected features:  18
Removed outliers:  49




Selected features:  21
Removed outliers:  49




Selected features:  18
Removed outliers:  49




Selected features:  21
Removed outliers:  49




Selected features:  18
Removed outliers:  49




Validation Scores

In [578]:
print("Ridge: ", r2_val_ridge/5)
# print("XBG: ", r2_val_xgb/5)
print("Lasso: ", r2_val_lasso/5)
print("GBR:", r2_val_gbr/5)

Ridge:  0.3089440569439783
Lasso:  0.31046257598656785
GBR: 0.43197724031539486


Train Scores

In [579]:
print("Ridge: ", r2_train_ridge/5)
# print("XBG: ", r2_train_xgb/5)
print("Lasso: ", r2_train_lasso/5)
print("GBR:", r2_train_gbr/5)

Ridge:  0.5203651558907085
Lasso:  0.4478652472832977
GBR: 0.7527813576639846


Submission

In [None]:
X_t_s, X_test_s = lasso_select(X_t, X_test_t, y_t, alpha=1)
gbr = GradientBoostingRegressor(max_depth=2, learning_rate=0.03, n_estimators=400)
gmm = mixture.GaussianMixture(n_components=2, covariance_type='full')
gmm.fit(np.concatenate((X_t_s, np.expand_dims(y_t, axis=-1)), axis=-1))
loglike = gmm.score_samples(np.concatenate((X_t_s, np.expand_dims(y_t, axis=-1)), axis=-1))
cutoff = np.quantile(loglike, 0.05)
print(X_t_s.shape[0] - (loglike > cutoff).sum())
X_t_s = X_t_s[loglike > cutoff]
y_t = y_t[loglike > cutoff]
gbr.fit(X_t_s, y_t)
sub = gbr.predict(X_test_s)

In [None]:
pd.DataFrame({"id": ids, "y": sub}).to_csv("sub_gbr.csv", index=False)