In [44]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split
import pandas as pd
import numpy as np
import copy
import matplotlib.pyplot as plt
from ignite.contrib.metrics.regression import R2Score
import time
from sklearn.linear_model import Ridge
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge, Lasso
from sklearn.cluster import DBSCAN

In [45]:
X = pd.read_csv("X_train.csv")
X = X.drop(columns=["id"])

In [46]:
y = pd.read_csv("y_train.csv")['y']

In [47]:
X_test = pd.read_csv("X_test.csv")
ids = X_test["id"]
X_test = X_test.drop(columns=["id"])

Remove constant columns

In [48]:
X_test = X_test.loc[:, X.var() != 0.0]
X = X.loc[:, X.var() != 0.0]

Remove highly correlated features

In [49]:
corr_matrix = X.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find features with correlation greater than 0.9
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
print("Removed columns: ", len(to_drop))
# Drop features 
X.drop(to_drop, axis=1, inplace=True)
X_test.drop(to_drop, axis=1, inplace=True)

Removed columns:  53


Scale

In [50]:
transformer = RobustScaler()
X = transformer.fit_transform(X)
X_test = transformer.transform(X_test)

Impute median

In [51]:
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
X = imp_median.fit_transform(X)
X_test = imp_median.transform(X_test)

In [52]:
y = y.to_numpy()

Feature selection with Lasso

In [53]:
def lasso_select(train, val, target, alpha=1):
    
    clf = Lasso(alpha=alpha)
    clf.fit(train, target)
    coef = clf.coef_
    
    # select features with non-zero lasso coefficients
    
    train = train[:, coef != 0]
    val = val[:, coef != 0]
    print("Selected features: " ,np.count_nonzero(clf.coef_))
    
    return train, val

Cross Validation

In [54]:
X_s, X_test_s = lasso_select(X, X_test, y, alpha=0.56)

Selected features:  20


In [55]:
dbscan = DBSCAN(eps=4.9, min_samples=40)
dbscan.fit(X_s)
X_t = X_s[dbscan.labels_ != -1]
y_t = y[dbscan.labels_ != -1]
print("Outliers removed: ", (dbscan.labels_ == -1).sum())

Outliers removed:  68


In [56]:
dataset = np.concatenate((X_t, np.expand_dims(y_t, axis=-1)), axis=1)
dataset = torch.from_numpy(dataset).float()

In [167]:
class Net(nn.Module):
    
    def __init__(self):
        super().__init__()
        
        self.l1 = nn.Linear(20, 5)
        self.l2 = nn.Linear(5, 1)
        self.l3 = nn.Linear(20, 1)
        self.relu = nn.LeakyReLU()
        
    def forward(self, x):
        
        z = self.l1(x)
        z = self.relu(z)
        z = self.l2(z)
        
        return z

In [168]:
model = Net()

In [169]:
trainsize = int(0.8*X_t.shape[0])
valsize = X_t.shape[0] - trainsize

In [170]:
train, val = random_split(dataset, [trainsize, valsize])

In [171]:
trainloader = DataLoader(train, batch_size=4)
validloader = DataLoader(val, batch_size=4)

In [172]:
criterion = nn.MSELoss()
optim = torch.optim.Adam(model.parameters(), lr=5e-4)
metric = R2Score()

In [173]:
epochs = 800

for e in range(epochs):
    
    t = time.time()
    
    metric.reset()
    
    for batch in trainloader:
        
        x = batch[:,:-1]
        y = torch.squeeze(batch[:,-1])
        
        optim.zero_grad()
        model.train()
        
        out = model(x)
        
        loss = criterion(y, out)
        
        loss.backward()
        
        optim.step()
        
        metric.update((torch.squeeze(out), y))

    r2 = metric.compute()
        
    epoch_duration = time.time() - t
        
    if e % 20 == 0:
        print(f'Epoch {e} | Train R2: {r2:.4f} | '
              # f' Validation R2: {r2_val:.4f} | '
              f' Duration {epoch_duration:.2f} sec')

Epoch 0 | Train R2: -55.5419 |  Duration 0.29 sec
Epoch 20 | Train R2: -16.6339 |  Duration 0.26 sec
Epoch 40 | Train R2: -5.2368 |  Duration 0.26 sec
Epoch 60 | Train R2: -0.8248 |  Duration 0.27 sec
Epoch 80 | Train R2: 0.0741 |  Duration 0.27 sec
Epoch 100 | Train R2: 0.2074 |  Duration 0.27 sec
Epoch 120 | Train R2: 0.2379 |  Duration 0.30 sec
Epoch 140 | Train R2: 0.2475 |  Duration 0.27 sec
Epoch 160 | Train R2: 0.2526 |  Duration 0.30 sec
Epoch 180 | Train R2: 0.2561 |  Duration 0.31 sec
Epoch 200 | Train R2: 0.2587 |  Duration 0.26 sec
Epoch 220 | Train R2: 0.2606 |  Duration 0.28 sec
Epoch 240 | Train R2: 0.2635 |  Duration 0.27 sec
Epoch 260 | Train R2: 0.2651 |  Duration 0.28 sec
Epoch 280 | Train R2: 0.2661 |  Duration 0.27 sec


KeyboardInterrupt: 