In [1]:
import argparse
import copy
import os

import numpy as np
import pandas as pd
import torch

from src.preprocessor import basic_preprocess, test_preprocess, select_features
from src.model import get_model, get_autoencoder
from src.metric import rmsle
from src.train import train_cv, train_pca_cv, train_ae_cv
from src.inference import cv_ensemble, cv_ensemble_pca, cv_ensemble_kmeans, cv_ensemble_ae

# Use the first CUDA device when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [2]:
# Notebook configuration, declared as a table of (flag, type, default) so all
# tunables can be scanned in one place. Registration order matches the order
# of the rows below.
# Note: the on/off switches (--run_all, --ae_train) are plain strings
# ("true"/"false"), not booleans.
_ARG_SPECS = (
    # Set Tasks
    ("--run_all", str, "true"),
    ("--feature_selection_method", str, "embedded"),      # Option: none, wrapper, filter, embedded
    ("--feature_extraction_method", str, "autoencoder"),  # Option: none, pca, kmeans, autoencoder
    ("--base_model", str, "lgbm"),                        # Option: linear_reg, random_forest, lgbm

    # General Options
    ("--extract_dim", int, 8),
    ("--seed", int, 42),
    ("--train_data_path", str, "./data/sberbank-russian-housing-market/train.csv"),
    ("--test_data_path", str, "./data/sberbank-russian-housing-market/test.csv"),
    ("--sample_submission_path", str, "./data/sberbank-russian-housing-market/sample_submission.csv"),
    ("--submission_path", str, "./data/"),

    # Training Options
    ("--num_cv_split", int, 5),
    ("--use_top_n_features", int, 20),

    # K-Means Options
    ("--kmeans_max_iter", int, 300),
    ("--kmeans_n_clusters", int, 8),

    # Autoencoder Options
    ("--ae_train", str, "false"),
    ("--ae_val_size", float, 0.1),
    ("--ae_learning_rate", float, 3e-4),
    ("--ae_batch_size", int, 32),
    ("--ae_num_workers", int, 4),
    ("--ae_hidden_dim", int, 8),
    ("--ae_num_epochs", int, 500),
    ("--ae_checkpoint_dir", str, "./checkpoints"),
    ("--ae_load_model_dir", str, "./checkpoints/model-457.bin"),

    # PCA Options
    ("--pca_n_components", int, 8),
)

parser = argparse.ArgumentParser()
for _flag, _type, _default in _ARG_SPECS:
    parser.add_argument(_flag, type=_type, default=_default)

# Parse an explicit empty argument list so every option takes its default
# instead of argparse reading the kernel's own sys.argv.
args = parser.parse_args([])

# Carry the torch device chosen in the import cell on the same namespace that
# is handed to the src.* helpers.
args.device = device

In [3]:
# Load and process data
#
# Read the training CSV, run it through the project's basic_preprocess step,
# then split it into the target column ('price_doc') and the predictors.

train_data = basic_preprocess(pd.read_csv(args.train_data_path), args)
y = train_data['price_doc']
X = train_data.drop(columns='price_doc')

# Record the number of predictor columns on args for later steps.
args.processed_dim = X.shape[1]

In [4]:
# Get pipeline
#
# Build the object that the "Feature extraction & Train base model" cell
# passes to the train_*_cv helpers.
# NOTE(review): presumably selected by args.base_model -- confirm in
# src.model.get_model.

pipeline = get_model(args)

In [5]:
# Train
#
# Initial cross-validated fit on the full predictor set. It only runs for the
# "embedded" and "none" selection methods; for "embedded" the resulting
# models_ are handed to select_features in the next cell.
# NOTE(review): the meaning of get_model's second positional argument (True)
# is not visible from this notebook -- confirm in src.model.

if (args.feature_selection_method == "embedded"
        or args.feature_selection_method == "none"):
    models_, scores_ = train_cv(X, y, get_model(args, True), args)



[1/5] Train Size: 24080 | Val Size: 6021 | RMSLE: 0.623
[2/5] Train Size: 24081 | Val Size: 6020 | RMSLE: 0.427
[3/5] Train Size: 24081 | Val Size: 6020 | RMSLE: 0.478
[4/5] Train Size: 24081 | Val Size: 6020 | RMSLE: 0.457
[5/5] Train Size: 24081 | Val Size: 6020 | RMSLE: 0.428
Average RMSLE: 0.48 | CV RMSLE Std. Dev: 0.07 



In [6]:
# Feature selection
#
# Produces two column lists, selected_features and excluded_features, plus the
# matching views of X. Every advertised method either assigns both lists or
# fails with a clear message; previously "none" and "wrapper" matched no
# branch, so the indexing below raised NameError (or silently reused lists
# left in the kernel by an earlier run).

if args.feature_selection_method == "embedded":
    # Uses the models fitted by the initial train_cv cell.
    selected_features, excluded_features = select_features(models_, train_data, args)

elif args.feature_selection_method == "filter":
    # No fitted models are passed for the filter method.
    selected_features, excluded_features = select_features(None, train_data, args)

elif args.feature_selection_method == "none":
    # No selection: keep every predictor column and exclude nothing.
    selected_features, excluded_features = list(X.columns), []

else:
    raise ValueError(
        f"Unsupported feature_selection_method {args.feature_selection_method!r}; "
        "this notebook handles 'none', 'filter' and 'embedded'."
    )

X_selected = X[selected_features]
X_unselected = X[excluded_features]

In [7]:
# Feature extraction & Train base model
#
# Dispatch on args.feature_extraction_method and fit the base model with
# cross-validation. Each branch stores the fitted per-fold objects that the
# inference cell needs later (models_, plus scalers_/pcas_/clusterers_ where
# applicable) and the CV scores in scores_.

if args.feature_extraction_method == "none":
    models_, scores_ = train_cv(X_selected, y, pipeline, args)

elif args.feature_extraction_method == "pca":
    models_, scalers_, pcas_, scores_ = train_pca_cv(X, y, pipeline, selected_features, excluded_features, args)

elif args.feature_extraction_method == "kmeans":
    # train_kmeans_cv is not among the names imported at the top of the
    # notebook, so this branch used to fail with NameError. It is imported
    # here, next to its only use, so the other branches are unaffected.
    # NOTE(review): assumes src.train exposes train_kmeans_cv alongside
    # train_pca_cv / train_ae_cv -- confirm, then move to the import cell.
    from src.train import train_kmeans_cv

    models_, clusterers_, scores_ = train_kmeans_cv(X, y, pipeline, selected_features, excluded_features, args)

elif args.feature_extraction_method == "autoencoder":
    ae_model = get_autoencoder(args)
    models_, scalers_, scores_ = train_ae_cv(X, y, pipeline, selected_features, excluded_features, ae_model, args)

else:
    # Fail loudly instead of leaving models_ from an earlier cell in place.
    raise ValueError(
        f"Unsupported feature_extraction_method {args.feature_extraction_method!r}; "
        "expected one of 'none', 'pca', 'kmeans', 'autoencoder'."
    )




[1/5] Train Size: 24080 | Val Size: 6021 | RMSLE: 0.618
[2/5] Train Size: 24081 | Val Size: 6020 | RMSLE: 0.434
[3/5] Train Size: 24081 | Val Size: 6020 | RMSLE: 0.481
[4/5] Train Size: 24081 | Val Size: 6020 | RMSLE: 0.461
[5/5] Train Size: 24081 | Val Size: 6020 | RMSLE: 0.423
Average RMSLE: 0.48 | CV RMSLE Std. Dev: 0.07 



In [9]:
# Inference
#
# Preprocess the test set and ensemble the per-fold models into test_pred.
# The ensemble helper must match the way models_ were trained, so the dispatch
# is a single chain keyed on the feature-extraction method. The original used
# two independent `if` chains: with feature selection "none" and a non-"none"
# extraction method, the plain cv_ensemble ran first on raw test data against
# models trained on extracted features, and only then was overwritten.

test_data = pd.read_csv(args.test_data_path)
test_data = test_preprocess(test_data, args)

if args.feature_extraction_method == "none":
    if args.feature_selection_method == "none":
        # Models were trained on every column, so no feature list is passed.
        test_pred = cv_ensemble(models_, test_data)
    else:
        test_pred = cv_ensemble(models_, test_data, selected_features)

elif args.feature_extraction_method == "pca":
    test_pred = cv_ensemble_pca(models_, scalers_, pcas_, test_data, selected_features, excluded_features)

elif args.feature_extraction_method == "kmeans":
    test_pred = cv_ensemble_kmeans(models_, clusterers_, test_data, selected_features, excluded_features)

elif args.feature_extraction_method == "autoencoder":
    test_pred = cv_ensemble_ae(models_, scalers_, test_data, selected_features, excluded_features, ae_model)

else:
    # Without this, test_pred would be undefined (or stale) in the Save cell.
    raise ValueError(
        f"Unsupported feature_extraction_method {args.feature_extraction_method!r}; "
        "expected one of 'none', 'pca', 'kmeans', 'autoencoder'."
    )

In [10]:
# Save
#
# Fill the sample-submission template with the ensembled predictions and write
# it to args.submission_path, with a file name that records the configuration
# (selection method, extraction method, base model, seed).

submission = pd.read_csv(args.sample_submission_path)
submission['price_doc'] = test_pred

# Join directory and file name with os.path.join so the path is correct
# whether or not args.submission_path ends with a separator. Plain string
# concatenation turned "./data" into "./dataresult_...csv".
sub_path = os.path.join(
    args.submission_path,
    f"result_{args.feature_selection_method}_{args.feature_extraction_method}_{args.base_model}_seed{args.seed}.csv",
)
submission.to_csv(sub_path, index=False)