In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA, KernelPCA
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import r2_score
from sklearn.kernel_ridge import KernelRidge

In [2]:
# Load the training features and discard the "id" column in one chained step,
# so only the remaining columns are carried forward.
X_train = pd.read_csv("X_train.csv").drop(columns=["id"])

In [3]:
# Read the target file and keep only its "y" column (as a Series).
y_train = pd.read_csv("y_train.csv").loc[:, "y"]

In [4]:
# Load the test features. pop() removes "id" from the frame and returns it as a
# Series, which is kept so the submission file can be keyed by it later.
X_test = pd.read_csv("X_test.csv")
ids = X_test.pop("id")

Remove constant columns

In [5]:
# Boolean mask over columns: True where the training variance is non-zero.
# It is computed once, before X_train is reassigned, and applied to both frames
# so they keep the same set of columns.
nonconstant = X_train.var() != 0.0
X_test = X_test.loc[:, nonconstant]
X_train = X_train.loc[:, nonconstant]

Impute median

In [6]:
# Fill missing values with per-column medians estimated on the training set.
# The same training medians are applied to X_test so that both sets go through
# an identical transform; no statistic is estimated from the test data.
train_median = X_train.median()
X_train = X_train.fillna(train_median)
X_test = X_test.fillna(train_median)

Truncate outliers: clip each feature to its [DELTA, 1 − DELTA] training quantiles

In [7]:
# Fraction of mass cut off on each tail when truncating.
DELTA = 0.05

# Per-column lower / upper truncation bounds, taken from the training features.
bot, top = X_train.quantile(DELTA), X_train.quantile(1 - DELTA)

In [8]:
# Clip every column to its [bot, top] range in one vectorized call
# (axis=1 aligns the bound Series with the columns).
# The bounds come from the training features and are applied to X_test as well,
# so the test set receives the same outlier treatment as the training and
# validation data instead of being left unclipped.
X_train = X_train.clip(lower=bot, upper=top, axis=1)
X_test = X_test.clip(lower=bot, upper=top, axis=1)

Normalize

In [9]:
# Standardize with statistics estimated on the training set only, and reuse
# them for X_test so both sets live on the same scale. The statistics are
# captured before X_train is overwritten.
train_mean = X_train.mean()
# A column with zero spread would cause a 0/0 division and yield NaNs; dividing
# by 1 instead leaves such a column at 0 after centering.
train_std = X_train.std().replace(0.0, 1.0)

X_train = (X_train - train_mean) / train_std
X_test = (X_test - train_mean) / train_std

Shuffle and train/validation split

In [10]:
# Hold out 20% of the labelled data for validation; the fixed seed keeps the
# shuffle reproducible.
X_train_, X_val, y_train_, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=0,
)

PCA

In [11]:
# Fit a 20-component PCA on the training split only and project it.
# random_state is fixed so the fit is reproducible when scikit-learn picks the
# randomized SVD solver (svd_solver="auto" may do so for large inputs); it has
# no effect with the exact solvers.
pca = PCA(n_components=20, random_state=0)
X_train_t = pca.fit_transform(X_train_)

# Share of variance captured by all 20 components, then by the first 10.
print(pca.explained_variance_ratio_.sum())
print(pca.explained_variance_ratio_[:10].sum())

0.23104555837030805
0.19376525424652366


In [12]:
# Project the validation and test features with the already-fitted PCA
# (transform only, no refit).
X_val_t, X_test_t = (pca.transform(features) for features in (X_val, X_test))

LASSO

In [None]:
# L1-regularised linear model fitted on the (untransformed) training split.
clf = Lasso(alpha=0.5)
clf.fit(X=X_train_, y=y_train_)

In [None]:
# Number of features the Lasso kept, i.e. coefficients that are non-zero.
# Negative coefficients count too: a `> 0` test would ignore every feature
# that is negatively associated with the target.
np.count_nonzero(clf.coef_)

In [None]:
# Keep the columns whose Lasso coefficient is non-zero. The sign is irrelevant
# for selection: testing `> 0` would discard every feature with a negative
# weight even though the Lasso retained it.
selected = clf.coef_ != 0
X_train_t = X_train_.loc[:, selected]
X_val_t = X_val.loc[:, selected]
X_test_t = X_test.loc[:, selected]

Ridge

In [13]:
# Ridge regression on whichever transformed training features are currently
# held in X_train_t.
ridge = Ridge(alpha=1.0)
ridge.fit(X=X_train_t, y=y_train_)

Ridge()

Evaluate on the validation split (R²)

In [14]:
# R² of the ridge model on the held-out validation split.
score = r2_score(y_true=y_val, y_pred=ridge.predict(X_val_t))
score

0.36170754629481505

Training Score

In [15]:
# R² on the data the model was fitted on, for comparison with the validation score.
r2_score(y_true=y_train_, y_pred=ridge.predict(X_train_t))

0.4394639041990358

Submission

In [None]:
# Predict on the test features that went through the SAME transform as the
# features ridge was fitted on. X_test_t is produced alongside X_train_t in
# every feature-construction cell, so this stays consistent whichever of them
# ran last; calling pca.transform(X_test) here would feed PCA features to a
# model fitted on a different feature set if the Lasso selection cell had
# overwritten X_train_t.
sub = ridge.predict(X_test_t)

In [None]:
# Write the submission (one row per test id) relative to the working
# directory, like the input CSVs, instead of to a user-specific absolute path
# that only exists on one machine.
submission = pd.DataFrame({"id": ids, "y": sub})
submission.to_csv("sub.csv", index=False)