# Step 0.0. Install LightAutoML

In [None]:
pip install -U lightautoml

# Step 0.1. Import necessary libraries 

In [None]:
%matplotlib inline

# Standard python libraries
import os
import time
import re

# Installed libraries
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split
import torch
import matplotlib.pyplot as plt

# Imports from our package
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task

# Step 0.2. Parameters 

In [None]:
# Thread count: used for torch, the AutoML cpu_limit and the reader n_jobs
N_THREADS = 4

# Number of cross-validation folds handed to the AutoML reader
N_FOLDS = 5

# Seed for numpy and for the train/holdout split
RANDOM_STATE = 42

# Fraction of the training data kept aside as a holdout for metric checks
TEST_SIZE = 0.2

# Time budget in seconds for each AutoML run (2 hours)
TIMEOUT = 2 * 60 * 60

# Step 0.3. Fix torch number of threads and numpy seed 

In [None]:
# Cap the number of threads torch may use and seed numpy's global generator.
# The two calls are independent of each other.
torch.set_num_threads(N_THREADS)
np.random.seed(RANDOM_STATE)

# Step 0.4. Data load 

In [None]:
%%time

# Read the competition training table and preview its first rows.
train_csv_path = '../input/house-prices-advanced-regression-techniques/train.csv'
train_data = pd.read_csv(train_csv_path)
train_data.head()

In [None]:
# Read the competition test table and preview its first rows.
test_csv_path = '../input/house-prices-advanced-regression-techniques/test.csv'
test_data = pd.read_csv(test_csv_path)
test_data.head()

In [None]:
# Read the sample submission; its 'SalePrice' column is overwritten before saving.
submission_csv_path = '../input/house-prices-advanced-regression-techniques/sample_submission.csv'
submission = pd.read_csv(submission_csv_path)
submission.head()

# Step 0.5. Add new features

In [None]:
def create_extra_features(data):
    """Return a copy of ``data`` extended with four engineered columns.

    The input frame is left untouched, so re-running the cell (or inspecting
    the raw frame afterwards) never shows half-transformed data.

    Added columns:
        SqFtPerRoom        -- GrLivArea divided by the sum of TotRmsAbvGrd,
                              FullBath, HalfBath and KitchenAbvGr
                              (NaN when that sum is 0).
        Total_Home_Quality -- OverallQual + OverallCond.
        Total_Bathrooms    -- FullBath + BsmtFullBath, plus half weight for
                              HalfBath and BsmtHalfBath.
        HighQualSF         -- 1stFlrSF + 2ndFlrSF.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing all source columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns plus the four new ones.

    Raises
    ------
    KeyError
        If any of the required source columns is missing.
    """
    # Work on a copy so the caller's frame is not mutated as a side effect.
    data = data.copy()

    room_count = (data["TotRmsAbvGrd"] +
                  data["FullBath"] +
                  data["HalfBath"] +
                  data["KitchenAbvGr"])
    # A zero room count would produce +/-inf; mask it to NaN instead.
    data["SqFtPerRoom"] = data["GrLivArea"] / room_count.where(room_count != 0)

    data['Total_Home_Quality'] = data['OverallQual'] + data['OverallCond']

    data['Total_Bathrooms'] = (data['FullBath'] + (0.5 * data['HalfBath']) +
                               data['BsmtFullBath'] + (0.5 * data['BsmtHalfBath']))

    data["HighQualSF"] = data["1stFlrSF"] + data["2ndFlrSF"]
    return data

# Add the engineered columns to both tables (train first, then test).
train_data, test_data = (
    create_extra_features(train_data),
    create_extra_features(test_data),
)

# Step 0.6. Data splitting for train-test 

In [None]:
# Split the labelled data into a fitting part and a holdout part of TEST_SIZE.
split_parts = train_test_split(train_data, test_size=TEST_SIZE, random_state=RANDOM_STATE)
tr_data, te_data = split_parts

split_report = 'Data splitted. Parts sizes: tr_data = {}, te_data = {}'
print(split_report.format(tr_data.shape, te_data.shape))

# ========= AutoML preset usage =========


## Step 1. Create Task

In [None]:
%%time

# Regression task that uses RMSLE both as the loss and as the metric.
task_settings = {'loss': 'rmsle', 'metric': 'rmsle'}
task = Task('reg', **task_settings)

## Step 2. Setup columns roles

In [None]:
%%time

# Column roles: 'SalePrice' is the target, 'Id' is dropped.
roles = dict(target='SalePrice', drop=['Id'])

## Step 3. Create AutoML from preset and train on 80% of data

In [None]:
%%time 

# Multi-start on the 80% split: one TabularAutoML per reader random_state.
# Predictions are summed across runs so the running average can be scored
# on the holdout next to each individual run.
rs_list = list(range(2000, 2005))
results = []
cnt_trained = 0
holdout_target = te_data['SalePrice'].values
for it, rs in enumerate(rs_list):
    for banner_line in ('=' * 30, 'START RANDOM_STATE = {}'.format(rs), '=' * 30):
        print(banner_line)

    # Train AutoML
    reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': rs}
    automl = TabularAutoML(task=task,
                           timeout=TIMEOUT,
                           cpu_limit=N_THREADS,
                           reader_params=reader_params,
                           verbose=1)
    oof_pred = automl.fit_predict(tr_data, roles=roles)

    # Predict on the holdout
    test_pred = automl.predict(te_data)
    cnt_trained += 1

    # Accumulate predictions (the first run initialises the sums)
    oof_current = oof_pred.data[:, 0]
    test_current = test_pred.data[:, 0]
    if it > 0:
        oof_pred_full += oof_current
        test_pred_full += test_current
    else:
        oof_pred_full = oof_current.copy()
        test_pred_full = test_current.copy()

    # Score the current run and the average of all runs so far
    rmsle_usual = mean_squared_log_error(holdout_target, test_current) ** 0.5
    rmsle_full = mean_squared_log_error(holdout_target, test_pred_full / cnt_trained) ** 0.5
    score_gap = rmsle_full - rmsle_usual
    results.append((rmsle_usual, rmsle_full, score_gap))

    print('Check scores...')
    print('Holdout score: {}'.format(rmsle_usual))
    print('Holdout score full: {}'.format(rmsle_full))
    print('Difference: {}'.format(score_gap))

In [None]:
# Average the summed holdout predictions over the trained runs
# (the division already produces a new array).
test_pred_full_0 = test_pred_full / cnt_trained

## Step 4. Graphical check

In [None]:
# Per-run RMSLE versus RMSLE of the running average, by run number.
run_numbers = range(1, cnt_trained + 1)
single_scores = [res[0] for res in results]
averaged_scores = [res[1] for res in results]

plt.figure(figsize=(20, 10))
plt.plot(run_numbers, single_scores,
         color='b', linewidth=2, label='Usual LightAutoML model RMSLE')
plt.plot(run_numbers, averaged_scores,
         color='g', linewidth=2, label='RMSLE for averaged LightAutoMLs')
plt.grid()
plt.legend()
plt.title('LightAutoML RMSLE vs. averaged LightAutoMLs composition RMSLE')
plt.xlabel('Iteration number')
plt.ylabel('RMSLE')
plt.show()

In [None]:
# Mean RMSLE of the individual runs (first element of each results tuple).
single_run_scores = [single for single, _, _ in results]
mean_rmsle = np.mean(single_run_scores)
mean_rmsle

In [None]:
# Gap between the averaged-composition RMSLE at each run and the mean single-run RMSLE.
differences = np.array([averaged - mean_rmsle for _, averaged, _ in results])

In [None]:
# Difference curve, its cumulative mean, and a zero reference line.
run_numbers = range(1, cnt_trained + 1)
cumulative_mean = [differences[:upto].mean() for upto in range(1, len(differences) + 1)]
zero_line = [0.0] * len(results)

plt.figure(figsize=(20, 10))
plt.plot(run_numbers, differences, color='g', linewidth=2, label='Difference')
plt.plot(run_numbers, cumulative_mean, 'b-.', linewidth=2, label='Cumulative mean difference')
plt.plot(run_numbers, zero_line, 'r--', linewidth=2, label='Zero line')
plt.grid()
plt.legend()
plt.title('Difference between mean LightAutoML RMSLE and averaged LightAutoMLs composition RMSLE at each iteration')
plt.xlabel('Iteration number')
plt.ylabel('RMSLE difference')
plt.show()

## Step 5. Create AutoML with pseudo labelled data from holdout

Below we use the pseudo-labelling technique: the trained models predict the holdout data, and the real holdout target is replaced with these predictions. We then train a new AutoML model on the train data (with the real target) together with the holdout data (with the pseudo-target):

In [None]:
# Holdout rows with the real target replaced by the averaged predictions.
data_with_pseudolabels = te_data.assign(SalePrice=test_pred_full_0.copy())

# Stack real-target and pseudo-target rows, shuffle with a fixed seed, renumber rows.
stacked = pd.concat([tr_data, data_with_pseudolabels])
shuffled = stacked.sample(frac=1, random_state=13)
new_dataset = shuffled.reset_index(drop=True)
print(tr_data.shape, new_dataset.shape)

In [None]:
# Summary statistics of the real target on the train split.
real_target = tr_data['SalePrice']
real_target.describe()

In [None]:
# Summary statistics of the target after adding the pseudolabelled holdout rows.
mixed_target = new_dataset['SalePrice']
mixed_target.describe()

In [None]:
%%time 

# Multi-start on train + pseudolabelled holdout: one TabularAutoML per reader
# random_state, scored on the holdout rows against their real target.
rs_list = list(range(2000, 2005))
results = []
cnt_trained = 0
holdout_target = te_data['SalePrice'].values
for it, rs in enumerate(rs_list):
    for banner_line in ('=' * 30, 'START RANDOM_STATE = {}'.format(rs), '=' * 30):
        print(banner_line)

    # Train AutoML
    reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': rs}
    automl = TabularAutoML(task=task,
                           timeout=TIMEOUT,
                           cpu_limit=N_THREADS,
                           reader_params=reader_params,
                           verbose=1)
    oof_pred = automl.fit_predict(new_dataset, roles=roles)

    # Predict on the holdout
    test_pred = automl.predict(te_data)
    cnt_trained += 1

    # Accumulate predictions (the first run initialises the sums)
    oof_current = oof_pred.data[:, 0]
    test_current = test_pred.data[:, 0]
    if it > 0:
        oof_pred_full += oof_current
        test_pred_full += test_current
    else:
        oof_pred_full = oof_current.copy()
        test_pred_full = test_current.copy()

    # Score the current run and the average of all runs so far
    rmsle_usual = mean_squared_log_error(holdout_target, test_current) ** 0.5
    rmsle_full = mean_squared_log_error(holdout_target, test_pred_full / cnt_trained) ** 0.5
    score_gap = rmsle_full - rmsle_usual
    results.append((rmsle_usual, rmsle_full, score_gap))

    print('Check scores...')
    print('Holdout score: {}'.format(rmsle_usual))
    print('Holdout score full: {}'.format(rmsle_full))
    print('Difference: {}'.format(score_gap))

## Step 6. New graphical check with pseudo label trained LightAutoML model

In [None]:
%%time

# Per-run RMSLE versus RMSLE of the running average for the pseudolabel-trained models.
run_numbers = range(1, cnt_trained + 1)
single_scores = [res[0] for res in results]
averaged_scores = [res[1] for res in results]

plt.figure(figsize=(20, 10))
plt.plot(run_numbers, single_scores,
         color='b', linewidth=2, label='Usual LightAutoML model RMSLE')
plt.plot(run_numbers, averaged_scores,
         color='g', linewidth=2, label='RMSLE for averaged LightAutoMLs')
plt.grid()
plt.legend()
plt.title('LightAutoML RMSLE vs. averaged LightAutoMLs composition RMSLE')
plt.xlabel('Iteration number')
plt.ylabel('RMSLE')
plt.show()

In [None]:
# Mean RMSLE of the individual pseudolabel-trained runs.
single_run_scores = [single for single, _, _ in results]
mean_rmsle = np.mean(single_run_scores)
mean_rmsle

In [None]:
# Gap between the averaged-composition RMSLE at each run and the mean single-run RMSLE.
differences = np.array([averaged - mean_rmsle for _, averaged, _ in results])

In [None]:
# Difference curve, its cumulative mean, and a zero reference line.
run_numbers = range(1, cnt_trained + 1)
cumulative_mean = [differences[:upto].mean() for upto in range(1, len(differences) + 1)]
zero_line = [0.0] * len(results)

plt.figure(figsize=(20, 10))
plt.plot(run_numbers, differences, color='g', linewidth=2, label='Difference')
plt.plot(run_numbers, cumulative_mean, 'b-.', linewidth=2, label='Cumulative mean difference')
plt.plot(run_numbers, zero_line, 'r--', linewidth=2, label='Zero line')
plt.grid()
plt.legend()
plt.title('Difference between mean LightAutoML RMSLE and averaged LightAutoMLs composition RMSLE at each iteration')
plt.xlabel('Iteration number')
plt.ylabel('RMSLE difference')
plt.show()

## Step 7. Train LightAutoML on full training dataset and predict pseudolabels for real test data

In [None]:
%%time 

# Multi-start on the full training table: one TabularAutoML per reader
# random_state. Test predictions are summed for later averaging, and the
# out-of-fold predictions are scored against the real training target.
rs_list = list(range(2000, 2010))
results = []
cnt_trained = 0
train_target = train_data['SalePrice'].values
for it, rs in enumerate(rs_list):
    for banner_line in ('=' * 30, 'START RANDOM_STATE = {}'.format(rs), '=' * 30):
        print(banner_line)

    # Train AutoML
    reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': rs}
    automl = TabularAutoML(task=task,
                           timeout=TIMEOUT,
                           cpu_limit=N_THREADS,
                           reader_params=reader_params,
                           verbose=1)
    oof_pred = automl.fit_predict(train_data, roles=roles)

    # Predict on the real test table
    test_pred = automl.predict(test_data)
    cnt_trained += 1

    # Accumulate predictions (the first run initialises the sums)
    oof_current = oof_pred.data[:, 0]
    test_current = test_pred.data[:, 0]
    if it > 0:
        oof_pred_full += oof_current
        test_pred_full += test_current
    else:
        oof_pred_full = oof_current.copy()
        test_pred_full = test_current.copy()

    # Score the current out-of-fold prediction and the average of all runs so far
    rmsle_usual = mean_squared_log_error(train_target, oof_current) ** 0.5
    rmsle_full = mean_squared_log_error(train_target, oof_pred_full / cnt_trained) ** 0.5
    score_gap = rmsle_full - rmsle_usual
    results.append((rmsle_usual, rmsle_full, score_gap))

    print('Check scores...')
    print('OOF score: {}'.format(rmsle_usual))
    print('OOF score full: {}'.format(rmsle_full))
    print('Difference: {}'.format(score_gap))

In [None]:
# Average the summed test predictions over the trained runs
# (the division already produces a new array).
test_pred_full_0 = test_pred_full / cnt_trained

## Step 8. Pseudolabel train on full train and test datasets

In [None]:
# Real test rows with the averaged predictions filled in as 'SalePrice'.
test_data_with_pseudolabels = test_data.assign(SalePrice=test_pred_full_0)

# Stack train rows and pseudolabelled test rows, shuffle with a fixed seed, renumber rows.
stacked_full = pd.concat([train_data, test_data_with_pseudolabels])
shuffled_full = stacked_full.sample(frac=1, random_state=13)
full_dataset = shuffled_full.reset_index(drop=True)
print(train_data.shape, full_dataset.shape)

In [None]:
%%time 

# Final multi-start on train + pseudolabelled test rows, restricted to the
# 'lgb', 'lgb_tuned', 'cb' and 'cb_tuned' algorithms. Test predictions are
# summed so they can be averaged when the submission is written.
rs_list = list(range(2000, 2007))
results = []
cnt_trained = 0
for it, rs in enumerate(rs_list):
    for banner_line in ('=' * 30, 'START RANDOM_STATE = {}'.format(rs), '=' * 30):
        print(banner_line)

    # Train AutoML
    general_params = {'use_algos': ['lgb', 'lgb_tuned', 'cb', 'cb_tuned']}
    reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': rs}
    automl = TabularAutoML(task=task,
                           timeout=TIMEOUT,
                           cpu_limit=N_THREADS,
                           general_params=general_params,
                           reader_params=reader_params,
                           verbose=1)
    oof_pred = automl.fit_predict(full_dataset, roles=roles)

    # Predict on the real test table
    test_pred = automl.predict(test_data)
    cnt_trained += 1

    # Accumulate predictions (the first run initialises the sums)
    oof_current = oof_pred.data[:, 0]
    test_current = test_pred.data[:, 0]
    if it > 0:
        oof_pred_full += oof_current
        test_pred_full += test_current
    else:
        oof_pred_full = oof_current.copy()
        test_pred_full = test_current.copy()

## Step 9. Prepare submission

In [None]:
# Average the summed test predictions over the runs and save them as the submission file.
final_prediction = test_pred_full / cnt_trained
submission['SalePrice'] = final_prediction
submission.to_csv('LightAutoML_pseudolabelled_multistart.csv', index=False)

In [None]:
# Show the final submission frame as the cell output for a visual check.
submission