In [86]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import r2_score
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.cluster import DBSCAN

In [87]:
# Training features; the "id" column is an identifier, not a feature, so it is dropped.
X = pd.read_csv("X_train.csv").drop(columns=["id"])

In [88]:
# Training targets: only the "y" column of y_train.csv is kept.
y = pd.read_csv("y_train.csv").loc[:, "y"]

In [89]:
# Test features; the "id" column is split off into `ids` so it can be written
# back next to the predictions in the submission file.
X_test = pd.read_csv("X_test.csv")
ids = X_test.pop("id")

Remove constant columns

In [90]:
# The keep-mask is derived from the *training* variances and applied to both
# frames, so train and test keep exactly the same columns.
nonconstant = X.var() != 0.0
X_test = X_test.loc[:, nonconstant]
X = X.loc[:, nonconstant]

Remove highly correlated features

In [91]:
# Absolute pairwise correlations between the training features.
corr_matrix = X.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so each
# feature pair is inspected once; everything else becomes NaN.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

# A column is dropped when it correlates above 0.9 with any earlier column.
to_drop = upper.columns[(upper > 0.9).any(axis=0)].tolist()
print("Removed columns: ", len(to_drop))

# Remove the same columns from train and test.
X = X.drop(columns=to_drop)
X_test = X_test.drop(columns=to_drop)

Removed columns:  53


Scale

In [92]:
# The scaler is fitted on the training features only and then applied to both
# sets, so the test data does not influence the scaling statistics.
transformer = RobustScaler().fit(X)
X = transformer.transform(X)
X_test = transformer.transform(X_test)

Impute median

In [93]:
# Replace NaNs with per-column medians learned from the training features;
# the same fitted imputer is reused for the test features.
imp_median = SimpleImputer(strategy='median', missing_values=np.nan).fit(X)
X = imp_median.transform(X)
X_test = imp_median.transform(X_test)

In [94]:
# Convert the target to a plain ndarray so it can be indexed with the integer
# fold indices used later. np.asarray (unlike Series.to_numpy) also accepts an
# ndarray, so re-running this cell is harmless.
y = np.asarray(y)

Feature selection methods

In [95]:
def pca(train, val, n=20):
    """Project ``train`` and ``val`` onto ``n`` whitened principal components.

    The PCA is fitted on ``train`` only and then applied unchanged to ``val``.

    Returns:
        The transformed ``(train, val)`` pair.
    """
    reducer = PCA(n_components=n, whiten=True)
    return reducer.fit_transform(train), reducer.transform(val)

In [96]:
def lasso_select(train, val, target, alpha=1):
    """Keep only the columns that a Lasso fit on ``train`` gives a non-zero weight.

    Args:
        train: 2-D array of training features; the Lasso is fitted on it.
        val: 2-D array with the same columns as ``train``; it is only sliced.
        target: Regression target for ``train``.
        alpha: Lasso regularisation strength.

    Returns:
        ``(train, val)`` restricted to the selected columns. The number of
        selected columns is printed as a side effect.
    """
    model = Lasso(alpha=alpha).fit(train, target)

    # Boolean column mask: True where the Lasso coefficient is non-zero.
    keep = model.coef_ != 0
    print("Selected features: " ,np.count_nonzero(model.coef_))

    return train[:, keep], val[:, keep]

Cross Validation

In [97]:
k = 10
n_folds = 5

# Repeated K-fold: n_folds splits, repeated k times (n_folds * k fits per model).
# No random_state is passed, so the splits differ from run to run.
rkf = RepeatedKFold(n_splits=n_folds, n_repeats=k)

# Running sums of R^2 over all splits; the reporting cells divide them by
# n_folds * k to get the mean score.
r2_val_ridge, r2_train_ridge = 0, 0
r2_val_xgb, r2_train_xgb = 0, 0
r2_val_lgb, r2_train_lgb = 0, 0
r2_val_gbr, r2_train_gbr = 0, 0
r2_val_cat, r2_train_cat = 0, 0

for train_index, val_index in rkf.split(X):
    
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # feature selection (fitted on the training part of the fold only)

    X_train, X_val = lasso_select(X_train, X_val, y_train, alpha=0.7)
        
    # outlier removal
    # DBSCAN labels noise points with -1. The inlier mask has to be applied to
    # the targets as well as the features; otherwise X_train and y_train end
    # up with different lengths and every fit below fails.
    
    dbscan = DBSCAN(eps=4, min_samples=40)
    dbscan.fit(X_train)
    inlier_mask = dbscan.labels_ != -1
    before = X_train.shape[0]
    X_train, y_train = X_train[inlier_mask], y_train[inlier_mask]
    print("Outliers removed: ", before - X_train.shape[0])

    # fit models

    ridge = Ridge(alpha=2)
    xgb = XGBRegressor(max_depth=2, eta=0.1, reg_lambda=5)
    lgb = LGBMRegressor()
    gbr = GradientBoostingRegressor()
    cat = CatBoostRegressor(verbose=False, allow_writing_files=False)

    ridge.fit(X_train, y_train)
    xgb.fit(X_train, y_train)
    lgb.fit(X_train, y_train)
    gbr.fit(X_train, y_train)
    cat.fit(X_train, y_train)

    # validation scores (the validation fold is never outlier-filtered)
    r2_val_ridge += r2_score(y_val, ridge.predict(X_val))
    r2_val_xgb += r2_score(y_val, xgb.predict(X_val))
    r2_val_lgb += r2_score(y_val, lgb.predict(X_val))
    r2_val_gbr += r2_score(y_val, gbr.predict(X_val))
    r2_val_cat += r2_score(y_val, cat.predict(X_val))

    # train scores, computed on the outlier-filtered training data
    r2_train_ridge += r2_score(y_train, ridge.predict(X_train))
    r2_train_xgb += r2_score(y_train, xgb.predict(X_train))
    r2_train_lgb += r2_score(y_train, lgb.predict(X_train))
    r2_train_gbr += r2_score(y_train, gbr.predict(X_train))
    r2_train_cat += r2_score(y_train, cat.predict(X_train))

Selected features:  18
Selected features:  18
Selected features:  17
Selected features:  18
Selected features:  19
Selected features:  19
Selected features:  19
Selected features:  17
Selected features:  15
Selected features:  20
Selected features:  21
Selected features:  16
Selected features:  19
Selected features:  17
Selected features:  17
Selected features:  16
Selected features:  18
Selected features:  20
Selected features:  18
Selected features:  15
Selected features:  17
Selected features:  17
Selected features:  17
Selected features:  19
Selected features:  18
Selected features:  18
Selected features:  19
Selected features:  18
Selected features:  17
Selected features:  20
Selected features:  21
Selected features:  17
Selected features:  18
Selected features:  17
Selected features:  17
Selected features:  16
Selected features:  19
Selected features:  17
Selected features:  18
Selected features:  15
Selected features:  18
Selected features:  17
Selected features:  17
Selected fe

Validation Scores

In [98]:
# Mean validation R^2 per model: every accumulator holds a sum over all
# n_folds * k splits, so it is divided by that count here.
for label, total in (
    ("Ridge:", r2_val_ridge),
    ("XGB:", r2_val_xgb),
    ("LGB:", r2_val_lgb),
    ("GBR:", r2_val_gbr),
    ("Cat", r2_val_cat),
):
    print(label, total/(n_folds*k))

Ridge: 0.34653804976979624
XGB: 0.468803692637444
LGB: 0.4777992111814588
GBR: 0.4634140095833609
Cat 0.5088410553727866


Train Scores

In [99]:
# Mean training R^2 per model: every accumulator holds a sum over all
# n_folds * k splits, so it is divided by that count here.
for label, total in (
    ("Ridge:", r2_train_ridge),
    ("XGB:", r2_train_xgb),
    ("LGB:", r2_train_lgb),
    ("GBR:", r2_train_gbr),
    ("Cat:", r2_train_cat),
):
    print(label, total/(n_folds*k))

Ridge: 0.3963338704560547
XGB: 0.6857695832735453
LGB: 0.9556834804563412
GBR: 0.8172977853483961
Cat: 0.9774356534497847


<h3> Results: </h3>
Lasso (alpha = 0.7), correlated-feature removal at threshold 0.9: <br>
1. Ridge: 0.34106056914528515 <br>
2. XGB: 0.4732512178890452 <br>
3. LGB: 0.48213427727725927 <br>
4. GBR: 0.47345544221294494 <br>
5. Cat: 0.5147129879649083 <br>

<br>
Removing features that have low correlation with the target is not useful.
<br>
Robust Scaler <br>
1. Ridge: 0.34223038357372415 <br>
2. XGB: 0.46853086163501784 <br>
3. LGB: 0.4827957318822244 <br>
4. GBR: 0.4638285545174996 <br>
5. Cat: 0.5156525376456286 <br>


Submission

In [54]:
# Final model: Lasso feature selection on the full training set, then CatBoost.
# NOTE(review): unlike the cross-validation loop, no DBSCAN outlier filtering
# is applied here, so this pipeline is not exactly the one that was scored.
X_sub, X_test_sub = lasso_select(X, X_test, y, alpha=0.7)
cat = CatBoostRegressor(verbose=False, allow_writing_files=False).fit(X_sub, y)
sub = cat.predict(X_test_sub)

Selected features:  17


In [55]:
# Pair each test id with its prediction and write the submission file
# (index=False keeps the row index out of the CSV).
(
    pd.DataFrame({"id": ids, "y": sub})
    .to_csv("sub_cat.csv", index=False)
)

In [119]:
# Quick check of the feature matrix dimensions as (rows, columns).
X.shape

(1212, 775)