In [709]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA, KernelPCA
from sklearn.linear_model import Ridge, Lasso, HuberRegressor
from sklearn.metrics import r2_score
from sklearn.kernel_ridge import KernelRidge
import numpy as np
import copy
import matplotlib.pyplot as plt
from sklearn import mixture
from sklearn.feature_selection import RFE
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.feature_selection import SelectKBest, f_regression
import itertools
from scipy import linalg
import matplotlib as mpl
import lightgbm

In [710]:
X = pd.read_csv("X_train.csv")
X = X.drop(columns=["id"])

In [711]:
y = pd.read_csv("y_train.csv")['y']

In [712]:
X_test = pd.read_csv("X_test.csv")
ids = X_test["id"]
X_test = X_test.drop(columns=["id"])

Remove constant columns

In [713]:
X_test = X_test.loc[:, X.var() != 0.0]
X = X.loc[:, X.var() != 0.0]
X.shape

(1212, 828)

Impute median

In [714]:
X_test = (X_test - X.mean()) / X.std()
X = (X - X.mean()) / X.std ()
X = X.fillna(X.median())
X_test = X_test.fillna(X.median())

Remove highly correlated features

In [715]:
corr_matrix = X.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find features with correlation greater than 0.75
to_drop = [column for column in upper.columns if any(upper[column] > 0.75)]
print("Removed columns: ", len(to_drop))
# Drop features 
X.drop(to_drop, axis=1, inplace=True)
X_test.drop(to_drop, axis=1, inplace=True)

Removed columns:  110


In [716]:
X = X.to_numpy()
y = y.to_numpy()
X_test = X_test.to_numpy()

Remove features with low correlation with target <br>
Not entirely sure if this is useful, it removes a lot of features though

In [642]:
combined = np.concatenate((X, np.expand_dims(y, axis=-1)), axis=1)
corr = np.corrcoef(combined, rowvar=False)
corr_y = np.abs(corr[-1,:-1])
print("Removed columns: ", ((corr_y) < 0.1).sum())
X = X[:, corr_y > 0.05]
X_test = X_test[:, corr_y > 0.05]

Removed columns:  620


In [717]:
X.shape

(1212, 718)

Truncate

In [718]:
def truncate(train, test, delta=0.01):
    
    bot = X.quantile(delta)
    top = X.quantile(1-delta)
    
    for column in train.columns:
        train.loc[train[column] < bot[column], column] = train[column].median()
        train.loc[train[column] > top[column], column] = train[column].median()
        test.loc[test[column] < bot[column], column] = train[column].median()
        test.loc[test[column] > top[column], column] = train[column].median()
        
    return train, test

Feature selection methods

In [719]:
def rfe(train, val, target, n=20):
    
    estimator = lightgbm.LGBMRegressor()
    selector = RFE(estimator, n_features_to_select=n, step=1, verbose=0)
    selector = selector.fit(train, target)
    train = selector.transform(train)
    val = selector.transform(val)
    
    return train, val

In [720]:
def pca(train, val, n=20):
    
    pca = KernelPCA(kernel='rbf', n_components=n)
    train_t = pca.fit_transform(train)
    val_t = pca.transform(val)
    
    return train_t, val_t

In [721]:
def lasso_select(train, val, target, alpha=1):
    
    clf = Lasso(alpha=alpha)
    clf.fit(train, target)
    train = train[:,clf.coef_ != 0]
    val = val[:, clf.coef_ != 0]
    print("Selected features: " ,np.count_nonzero(clf.coef_))
    
    return train, val

In [722]:
def sel_best(train, val, target, n=20):
    
    sel = SelectKBest(f_regression, k=n)
    train_t = sel.fit_transform(train, target)
    val_t = sel.transform(val)
    
    return train_t, val_t

Cross Validation

In [774]:
X_t = copy.copy(X)
mask = np.arange(X_t.shape[0])
np.random.shuffle(mask)
X_t = X_t[mask]
y_t = y[mask]

In [775]:
step = X.shape[0] // 5
r2_val_ridge, r2_train_ridge = 0, 0
r2_val_xgb, r2_train_xgb = 0, 0
r2_val_lgb, r2_train_lgb = 0, 0
r2_val_gbr, r2_train_gbr = 0, 0

for i in range(5):
    
    X_val, y_val = X_t[i*step:(i+1)*step], y_t[i*step:(i+1)*step]
    X_train = np.concatenate((X_t[(i+1)*step:], X_t[:i*step]), axis=0)
    y_train = np.concatenate((y_t[(i+1)*step:], y_t[:i*step]), axis=0)
    
    # feature selection
    
    X_train, X_val = lasso_select(X_train, X_val, y_train, alpha=0.7)
    
    # fit models
    
    ridge = Ridge(alpha=2)
    xgb = XGBRegressor(max_depth=2, eta=0.1, reg_lambda=5)
    lgb = lightgbm.LGBMRegressor(max_depth=2, learning_rate=0.015, n_estimators=500)
    gbr = GradientBoostingRegressor(max_depth=2, learning_rate=0.03, n_estimators=400)
    
    ridge.fit(X_train, y_train)
    xgb.fit(X_train, y_train)
    lgb.fit(X_train, y_train)
    gbr.fit(X_train, y_train)
    
    r2_val_ridge += r2_score(y_val, ridge.predict(X_val))
    r2_val_xgb += r2_score(y_val, xgb.predict(X_val))
    r2_val_lgb += r2_score(y_val, lgb.predict(X_val))
    r2_val_gbr += r2_score(y_val, gbr.predict(X_val))
    
    r2_train_ridge += r2_score(y_train, ridge.predict(X_train))
    r2_train_xgb += r2_score(y_train, xgb.predict(X_train))
    r2_train_lgb += r2_score(y_train, lgb.predict(X_train))
    r2_train_gbr += r2_score(y_train, gbr.predict(X_train))

Selected features:  19
Selected features:  18
Selected features:  19
Selected features:  21
Selected features:  18


Validation Scores

In [776]:
print("Ridge: ", r2_val_ridge/5)
print("XGB: ", r2_val_xgb/5)
print("LGB: ", r2_val_lgb/5)
print("GBR:", r2_val_gbr/5)

Ridge:  0.31653372130153573
XGB:  0.4553854027301091
LGB:  0.46582396759095585
GBR: 0.4573052606407234


Train Scores

In [777]:
print("Ridge: ", r2_train_ridge/5)
print("XGB: ", r2_train_xgb/5)
print("LGB: ", r2_train_lgb/5)
print("GBR:", r2_train_gbr/5)

Ridge:  0.403011163327629
XGB:  0.6823728611828044
LGB:  0.6288417113650716
GBR: 0.7249101329167192


Submission

In [None]:
sub = gbr.predict(X_test_s)

In [None]:
pd.DataFrame({"id": ids, "y": sub}).to_csv("sub_gbr.csv", index=False)