In [412]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import r2_score
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.cluster import DBSCAN

In [413]:
# Read the training features and discard the "id" column in one step.
X = pd.read_csv("X_train.csv").drop(columns=["id"])

In [414]:
# Read the training targets and keep only the 'y' column (a Series).
y = pd.read_csv("y_train.csv").loc[:, 'y']

In [415]:
# Read the test features, set the "id" column aside for the submission file,
# and keep the remaining columns as the test feature frame.
X_test = pd.read_csv("X_test.csv")
ids, X_test = X_test["id"], X_test.drop(columns=["id"])

Remove constant columns

In [416]:
# Columns whose variance on the training set differs from zero. The mask is
# derived from the training frame only and applied to both frames, so the
# train and test columns stay aligned.
non_constant = X.var() != 0.0
X_test = X_test.loc[:, non_constant]
X = X.loc[:, non_constant]

Remove highly correlated features

In [417]:
# Absolute pairwise correlations between the training features.
corr_matrix = X.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so every
# pair of features is inspected once; all other cells become NaN.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# A column is dropped when its absolute correlation with any column that
# precedes it exceeds 0.9 (NaN cells compare as False and are ignored).
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
print("Removed columns: ", len(to_drop))

# Drop the same columns from both frames. Reassignment is used instead of
# inplace=True so that frames produced by an earlier .loc selection are not
# mutated in place.
X = X.drop(columns=to_drop)
X_test = X_test.drop(columns=to_drop)

Removed columns:  53


Scale

In [418]:
# Fit the standard scaler on the training features only, then apply the
# fitted scaler to both the training and the test features.
transformer = StandardScaler().fit(X)
X = transformer.transform(X)
X_test = transformer.transform(X_test)

Robust scaling and median imputation

In [419]:
# Robust scaling: fitted on the training features, applied to train and test.
transformer = RobustScaler().fit(X)
X, X_test = transformer.transform(X), transformer.transform(X_test)

# Replace NaN entries with the per-column median learned from the (scaled)
# training features; the same medians are used for the test features.
imp_median = SimpleImputer(missing_values=np.nan, strategy='median').fit(X)
X = imp_median.transform(X)
X_test = imp_median.transform(X_test)

In [420]:
# Convert the target to a NumPy array so it can be indexed with the integer
# and boolean index arrays used further down. np.asarray is used instead of
# Series.to_numpy() so the cell can be re-run safely: it is a no-op when y is
# already an ndarray, whereas .to_numpy() would raise AttributeError.
y = np.asarray(y)

Feature selection with Lasso

In [421]:
def lasso_select(train, val, target, alpha=1):
    """Keep only the features that a Lasso fit gives a non-zero coefficient.

    A ``Lasso(alpha=alpha)`` model is fitted on ``train`` and ``target``.
    Columns whose fitted coefficient equals zero are removed from both
    ``train`` and ``val``, and the number of non-zero coefficients is printed.

    Parameters
    ----------
    train : array indexed as ``train[:, mask]``
        Features the Lasso model is fitted on.
    val : array indexed as ``val[:, mask]``
        Second feature array, reduced to the same columns as ``train``.
    target : array-like
        Regression target passed to ``Lasso.fit``.
    alpha : float, default 1
        Regularisation strength forwarded to ``Lasso``.

    Returns
    -------
    tuple
        ``(train, val)`` restricted to the selected columns.
    """
    model = Lasso(alpha=alpha)
    model.fit(train, target)

    # Boolean column mask, computed once and shared by both arrays.
    keep = model.coef_ != 0
    selected_train, selected_val = train[:, keep], val[:, keep]

    print("Selected features: ", np.count_nonzero(model.coef_))
    return selected_train, selected_val

Cross Validation

In [422]:
# Select features on the full training set and apply the same column
# selection to the test set.
X_s, X_test_s = lasso_select(train=X, val=X_test, target=y, alpha=0.73)

Selected features:  19


In [423]:
# Cluster the selected training features; rows that DBSCAN labels -1 are
# treated as outliers and removed from both the features and the target.
dbscan = DBSCAN(eps=4.9, min_samples=40).fit(X_s)
is_outlier = dbscan.labels_ == -1
X_t, y_t = X_s[~is_outlier], y[~is_outlier]
print("Outliers removed: ", is_outlier.sum())

Outliers removed:  64


In [None]:
k = 1        # number of times the K-fold split is repeated
n_folds = 5  # number of folds per repetition

# A fixed random_state makes the fold assignment identical on every run, so
# the reported scores can be compared between runs of the notebook.
rkf = RepeatedKFold(n_splits=n_folds, n_repeats=k, random_state=0)

# One factory per candidate model, so every fold trains a fresh, unfitted
# estimator with the same hyper-parameters.
model_factories = {
    "ridge": lambda: Ridge(alpha=1),
    "xgb": lambda: XGBRegressor(max_depth=2, eta=0.1, reg_lambda=5),
    "lgb": lambda: LGBMRegressor(max_depth=2, reg_lambda=1),
    "gbr": lambda: GradientBoostingRegressor(),
    "cat": lambda: CatBoostRegressor(verbose=False, allow_writing_files=False),
}

# Running sums of the per-fold R^2 scores; the reporting cells divide them by
# n_folds * k to obtain the mean.
r2_val_sum = dict.fromkeys(model_factories, 0.0)
r2_train_sum = dict.fromkeys(model_factories, 0.0)

# NOTE(review): X_t / y_t come from a Lasso selection and a DBSCAN filter that
# were both fitted on the full training set before this loop, so the
# validation scores below may be optimistic.
for train_index, val_index in rkf.split(X_t):

    X_train, X_val = X_t[train_index], X_t[val_index]
    y_train, y_val = y_t[train_index], y_t[val_index]

    for model_name, make_model in model_factories.items():
        model = make_model()
        model.fit(X_train, y_train)

        r2_val_sum[model_name] += r2_score(y_val, model.predict(X_val))
        r2_train_sum[model_name] += r2_score(y_train, model.predict(X_train))

# Expose the totals under the names the score-reporting cells read.
r2_val_ridge, r2_train_ridge = r2_val_sum["ridge"], r2_train_sum["ridge"]
r2_val_xgb, r2_train_xgb = r2_val_sum["xgb"], r2_train_sum["xgb"]
r2_val_lgb, r2_train_lgb = r2_val_sum["lgb"], r2_train_sum["lgb"]
r2_val_gbr, r2_train_gbr = r2_val_sum["gbr"], r2_train_sum["gbr"]
r2_val_cat, r2_train_cat = r2_val_sum["cat"], r2_train_sum["cat"]

Validation Scores

In [None]:
# Mean validation R^2 per model: each accumulated sum is divided by the
# number of evaluated folds (n_folds * k).
for label, total in (
    ("Ridge:", r2_val_ridge),
    ("XGB:", r2_val_xgb),
    ("LGB:", r2_val_lgb),
    ("GBR:", r2_val_gbr),
    ("Cat", r2_val_cat),
):
    print(label, total/(n_folds*k))

Train Scores

In [None]:
# Mean training R^2 per model: each accumulated sum is divided by the
# number of evaluated folds (n_folds * k).
for label, total in (
    ("Ridge:", r2_train_ridge),
    ("XGB:", r2_train_xgb),
    ("LGB:", r2_train_lgb),
    ("GBR:", r2_train_gbr),
    ("Cat:", r2_train_cat),
):
    print(label, total/(n_folds*k))

Submission

In [390]:
# Refit CatBoost on all outlier-filtered training rows, then predict the
# test rows using the same Lasso-selected columns.
cat = CatBoostRegressor(allow_writing_files=False, verbose=False)
cat.fit(X=X_t, y=y_t)
sub = cat.predict(X_test_s)

In [391]:
# Pair each test id with its prediction and write the submission file
# without the DataFrame index.
submission = pd.DataFrame({"id": ids, "y": sub})
submission.to_csv("sub_cat.csv", index=False)

In [119]:
# Inspect the dimensions of X (shown as the cell output).
X.shape

(1212, 775)