In [363]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA, KernelPCA
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import r2_score
from sklearn.kernel_ridge import KernelRidge
import numpy as np
import copy

In [364]:
# Training features; the "id" column is removed right after loading.
X = pd.read_csv("X_train.csv").drop(columns=["id"])

In [365]:
# Regression target: the 'y' column of the training labels file, as a Series.
y = pd.read_csv("y_train.csv").loc[:, 'y']

In [366]:
# Test features. `pop` removes the "id" column from the frame and hands it
# back, so the ids stay available for the submission file.
X_test = pd.read_csv("X_test.csv")
ids = X_test.pop("id")

Remove constant columns

In [367]:
# Keep only the columns whose variance in the training set is non-zero.
# The mask is computed once, from the training frame, and applied to both
# frames so that they keep the same set of columns.
nonconstant = X.var() != 0.0
X_test = X_test.loc[:, nonconstant]
X = X.loc[:, nonconstant]
X.shape

(1212, 828)

Impute missing values with the column median

In [368]:
# Fill missing entries of each frame with that frame's own per-column medians.
# NOTE(review): the test set is imputed with test-set medians rather than the
# training ones — confirm this is intended.
X, X_test = (
    X.fillna(X.median()),
    X_test.fillna(X_test.median()),
)

Truncate outliers: replace values below the δ quantile or above the 1−δ quantile with the column median

In [369]:
def truncate(train, delta=0.01, reference=None):
    """Replace extreme values in each column with that column's median.

    A value counts as extreme when it lies below the ``delta`` quantile or
    above the ``1 - delta`` quantile of the same column in ``reference``.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame to clean. It is modified in place and also returned, so pass a
        copy if the original must be preserved.
    delta : float, default 0.01
        Tail probability cut off on each side.
    reference : pandas.DataFrame, optional
        Frame the quantile thresholds are computed from; it must contain
        every column of ``train``. Defaults to ``train`` itself. Pass the
        training frame here to clip a held-out set with training thresholds.

    Returns
    -------
    pandas.DataFrame
        ``train``, with the out-of-range entries overwritten by the column
        median of ``train``.
    """
    if reference is None:
        # Work from the argument, never from a notebook-level global, so the
        # function gives the same answer regardless of kernel state.
        reference = train

    # All statistics are taken before anything is overwritten, so both tails
    # of a column are filled with the same (original) median.
    bot = reference.quantile(delta)
    top = reference.quantile(1 - delta)
    fill = train.median()

    for column in train.columns:
        values = train[column]
        outliers = (values < bot[column]) | (values > top[column])
        train.loc[outliers, column] = fill[column]

    return train

Normalize (z-score each column: subtract the mean, divide by the standard deviation)

In [370]:
def normalize(train, reference=None):
    """Standardise every column to zero mean and unit standard deviation.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame to standardise. It is not modified.
    reference : pandas.DataFrame, optional
        Frame the per-column mean and standard deviation are taken from.
        Defaults to ``train`` itself; pass the training frame to scale a
        held-out set with training statistics.

    Returns
    -------
    numpy.ndarray
        The standardised values, same shape as ``train``.
    """
    if reference is None:
        reference = train

    center = reference.mean()
    spread = reference.std()
    # A constant column has zero spread; dividing by it would produce NaN/inf
    # and break the estimators downstream. Dividing by 1 instead leaves such a
    # column at exactly zero after centring.
    spread = spread.mask(spread == 0, 1.0)

    return ((train - center) / spread).to_numpy()

PCA (fit on the training split, project both splits onto the first 20 principal components)

In [371]:
def pca(train, val, target=None, n_components=20, random_state=None):
    """Fit PCA on ``train`` and project both ``train`` and ``val``.

    Parameters
    ----------
    train : array-like
        Data the principal components are fitted on.
    val : array-like
        Data projected with the components fitted on ``train``.
    target : optional
        Unused; kept so existing calls that pass it keep working.
    n_components : default 20
        Forwarded to ``PCA(n_components=...)``.
    random_state : optional
        Forwarded to ``PCA(random_state=...)``. Set it to get repeatable
        projections when PCA selects a randomized solver.

    Returns
    -------
    tuple
        ``(train_t, val_t)``: the projections of ``train`` and ``val``.
    """
    # Named `reducer` so the local does not shadow this function's own name.
    reducer = PCA(n_components=n_components, random_state=random_state)
    train_t = reducer.fit_transform(train)
    val_t = reducer.transform(val)

    return train_t, val_t

Cross Validation

In [372]:
# Preprocess a copy of the training frame, then shuffle the rows so that the
# contiguous folds taken in the next cell are random subsets.
# NOTE(review): truncation and normalisation are applied to the whole training
# set before it is split, so validation folds influence the preprocessing
# statistics — the reported validation score may be slightly optimistic.
X_t = truncate(copy.copy(X))
X_t = normalize(X_t)

# Seeded permutation: the fold assignment, and therefore the reported scores,
# stay the same from one run to the next.
mask = np.random.default_rng(0).permutation(X_t.shape[0])
X_t = X_t[mask]
# Positional selection. `y[mask]` selects by index label, which only matches
# row position while `y` carries a default 0..n-1 index.
y_t = y.iloc[mask].to_numpy()

In [373]:
# K-fold cross-validation of PCA followed by ridge regression on the shuffled
# training data prepared in the previous cell.
n_folds = 5
r2_train = []
r2_val = []

# Plain arrays give positional indexing whether y_t is a Series or an array.
X_cv = np.asarray(X_t)
y_cv = np.asarray(y_t)
n_samples = X_cv.shape[0]

# array_split spreads any remainder over the first folds, so every sample is
# validated exactly once. Equal-width slices of len // n_folds rows left the
# trailing remainder rows out of every validation fold.
for val_idx in np.array_split(np.arange(n_samples), n_folds):

    in_val = np.zeros(n_samples, dtype=bool)
    in_val[val_idx] = True

    X_val, y_val = X_cv[in_val], y_cv[in_val]
    X_train, y_train = X_cv[~in_val], y_cv[~in_val]

    # PCA is fitted on the training part of the fold only.
    X_train_t, X_val_t = pca(X_train, X_val)

    ridge = Ridge(alpha=1.0)
    ridge.fit(X_train_t, y_train)

    r2_train.append(r2_score(y_train, ridge.predict(X_train_t)))
    r2_val.append(r2_score(y_val, ridge.predict(X_val_t)))

In [374]:
# Mean validation R^2 over the cross-validation folds.
sum(r2_val) / len(r2_val)

0.43479970021500536

In [375]:
# Mean training R^2 over the cross-validation folds, for comparison with the
# validation score.
sum(r2_train) / len(r2_train)

0.46762678182429207

Submission

In [340]:
# Fit the final model on the whole training set and predict the test set.
X_t = normalize(truncate(copy.copy(X)))

# NOTE(review): the test set is standardised with its own mean/std here, not
# with the training statistics — confirm this is intended.
X_test_std = normalize(truncate(copy.copy(X_test)))

# No shuffling: nothing in this cell splits the data by row position, so the
# unseeded permutation only made the run non-repeatable. Taking the target as
# an array keeps it aligned with the rows of X_t by position.
y_t = y.to_numpy()

X_train_t, X_test_t = pca(X_t, X_test_std)

ridge = Ridge(alpha=1.0)
ridge.fit(X_train_t, y_t)
sub = ridge.predict(X_test_t)

In [344]:
# Pair each test id with its prediction and save the result as a CSV file
# without the frame's index column.
submission = pd.DataFrame({"id": ids, "y": sub})
submission.to_csv("sub_med.csv", index=False)

In [342]:
# Sanity check: shape of the prediction vector (compare with the number of
# rows in X_test).
sub.shape

(776,)

In [343]:
# Sanity check: (rows, columns) of the test features after preprocessing.
X_test.shape

(776, 828)