In [None]:
!pip install -U lightautoml

In [None]:
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task
from sklearn.metrics import r2_score

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD

import xgboost as xgb

from catboost import CatBoostRegressor

from h2o.automl import H2OAutoML
import h2o

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the three zipped competition CSVs (train, test, sample submission) into
# pandas DataFrames, in that order.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/mercedes-benz-greener-manufacturing/train.csv.zip',
        '/kaggle/input/mercedes-benz-greener-manufacturing/test.csv.zip',
        '/kaggle/input/mercedes-benz-greener-manufacturing/sample_submission.csv.zip',
    )
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Preview the first rows of the sample submission to see the expected output layout.
sample_submission.head()

In [None]:
# Constant baseline: predict the mean of the training target for every test row.
sample_submission['y'] = train['y'].mean()
# index=False (an explicit bool, matching the other submission cells) keeps the
# row index out of the CSV.
sample_submission.to_csv('submission.baseline.csv', index=False)
# private score: -0.00168
# public score: -0.00038

In [None]:
# Integer-encode every object-dtype column of `train`. Each encoder is fitted on
# the combined train + test values of that column so both frames share one
# mapping, then both frames are overwritten in place with the encoded values.
object_columns = [col for col in train.columns if train[col].dtype == 'object']
for col in object_columns:
    encoder = LabelEncoder()
    encoder.fit(train[col].tolist() + test[col].tolist())
    train[col] = encoder.transform(train[col].tolist())
    test[col] = encoder.transform(test[col].tolist())

# Report the resulting frame shapes.
print('Shape train: {}\nShape test: {}'.format(train.shape, test.shape))

n_comp = 12

# Columns fed to the decompositions: everything except the target and any
# pca_/ica_ component columns appended by a previous run of this code, so that
# re-running it does not decompose its own output.
base_features = [
    col for col in train.columns
    if col != 'y' and not col.startswith(('pca_', 'ica_'))
]
X_train = train[base_features]
X_test = test[base_features]

# NOTE: a TruncatedSVD decomposition used to be fitted here as well, but its
# components were never added to the frames; that unused fit has been dropped.

# PCA: fitted on the training features, then applied to the test features.
pca = PCA(n_components=n_comp, random_state=42)
pca2_results_train = pca.fit_transform(X_train)
pca2_results_test = pca.transform(X_test)

# ICA: fitted on the training features, then applied to the test features.
ica = FastICA(n_components=n_comp, random_state=42)
ica2_results_train = ica.fit_transform(X_train)
ica2_results_test = ica.transform(X_test)

# Append decomposition components to both datasets as pca_1..pca_N / ica_1..ica_N.
# The pca/ica columns are interleaved per component index, as before, so the
# resulting column order is unchanged.
for i in range(n_comp):
    suffix = str(i + 1)
    train['pca_' + suffix] = pca2_results_train[:, i]
    test['pca_' + suffix] = pca2_results_test[:, i]

    train['ica_' + suffix] = ica2_results_train[:, i]
    test['ica_' + suffix] = ica2_results_test[:, i]

# Target vector and its mean (the mean is used as the XGBoost base score).
y_train = train["y"]
y_mean = np.mean(y_train)

# Parameters for the native XGBoost training API.
# - 'n_trees' was removed: it is not an XGBoost parameter; the number of trees
#   is controlled solely by num_boost_round below.
# - 'reg:squarederror' replaces its deprecated alias 'reg:linear'.
# - 'verbosity' replaces the deprecated 'silent' flag (1 = warnings only).
xgb_params = {
    'eta': 0.005,
    'max_depth': 4,
    'subsample': 0.95,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'base_score': float(y_mean),  # base prediction = mean(target)
    'verbosity': 1,
}

# form DMatrices for XGBoost training; every column of `train` except the
# target is used as a feature
dtrain = xgb.DMatrix(train.drop('y', axis=1), y_train)
dtest = xgb.DMatrix(test)

num_boost_rounds = 1500
# train model
model = xgb.train(xgb_params, dtrain, num_boost_round=num_boost_rounds)


# check r2-score on the training data (to get a higher score, increase num_boost_rounds above)
# r2_score expects (y_true, y_pred) and is not symmetric, so the true labels go first.
print(r2_score(dtrain.get_label(), model.predict(dtrain)))

# make predictions and save results
y_pred = model.predict(dtest)

sample_submission['y'] = y_pred
sample_submission.to_csv('submission.community-baseline.csv', index=False)

sample_submission.head()

# private score: 0.54370
# public score: 0.56292

In [None]:
# Columns passed to CatBoost as categorical features.
cat_features = ['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']

# Train on every column except the row identifier and the target.
# NOTE: if the label-encoding / decomposition code above has been run, `train`
# and `test` already hold integer-encoded categoricals plus the pca_/ica_
# columns, and all of those are used as features here.
useful_features = [col for col in train.columns if col not in ('ID', 'y')]

reg = CatBoostRegressor(cat_features=cat_features, random_seed=42)
reg.fit(train[useful_features], train['y'], verbose=100)
pred = reg.predict(test[useful_features])

sample_submission['y'] = pred
# index=False (explicit bool, matching the other submission cells) keeps the
# row index out of the CSV.
sample_submission.to_csv('submission.catboost.csv', index=False)

sample_submission.head()

# private score: 0.54187
# public score: 0.54393

In [None]:
# LightAutoML preset for a regression task, using scikit-learn's r2_score as the metric.
task = Task(name="reg", metric=r2_score)
automl = TabularAutoML(task)

# Column roles: 'y' is the target and 'ID' is dropped.
column_roles = {'target': 'y', 'drop': ['ID']}

# Fit on the training frame (returning out-of-fold predictions), then predict on test.
oof_pred = automl.fit_predict(train, roles=column_roles, verbose=1)
test_pred = automl.predict(test)

# The first column of the prediction array is written to the submission.
sample_submission['y'] = test_pred.data[:, 0]
sample_submission.to_csv("submission.lama.csv", index=False)

sample_submission.head()

# private score: 0.54605
# public score: 0.54860

In [None]:
h2o.init()

# Import the raw competition files into H2O frames. They get their own names so
# the pandas `train` / `test` frames used by the cells above are not overwritten
# (which would break re-running those cells).
train_h2o = h2o.import_file('/kaggle/input/mercedes-benz-greener-manufacturing/train.csv.zip')
test_h2o = h2o.import_file('/kaggle/input/mercedes-benz-greener-manufacturing/test.csv.zip')

# Predict 'y' from every column except the target itself and the row identifier.
target = 'y'
useful_features = [col for col in train_h2o.columns if col not in (target, 'ID')]

# Run AutoML capped at 20 base models, with a fixed seed.
aml = H2OAutoML(max_models=20, seed=1)
aml.train(x=useful_features, y=target, training_frame=train_h2o)

pred = aml.predict(test_h2o)

# Copy the 'predict' column of the H2O prediction frame into the submission.
sample_submission['y'] = pred.as_data_frame()['predict']
# index=False (explicit bool, matching the other submission cells) keeps the
# row index out of the CSV.
sample_submission.to_csv('submission.h2o.csv', index=False)

sample_submission.head()

# private score: 0.54596
# public score: 0.55265