AutoGluon is an AutoML package developed by J. Mueller, X. Shi, and A. Smola:

Mueller, Jonas, Xingjian Shi, and Alexander Smola. "Faster, Simpler, More Accurate: Practical Automated Machine Learning with Tabular, Text, and Image Data." Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. 2020.

For tabular data, AutoGluon can produce models to predict the values in one column based on the values in the other columns. With just a single call to fit(), you can achieve high accuracy in standard supervised learning tasks (both classification and regression).

In the context of a competition, it can help you create benchmarks, gain insight into how the models work, and accelerate your experimentation.

Installing the latest Scikit-learn

In [None]:
# Upgrade scikit-learn to the newest release pip can find (-U is short for --upgrade).
!pip install scikit-learn -U

Installing LightGBM for GPU

In [None]:
# Delete the LightGBM package directory from the conda environment.
# NOTE(review): the path is hard-coded to python3.7 -- confirm it matches the runtime's Python version.
!rm -r /opt/conda/lib/python3.7/site-packages/lightgbm

In [None]:
# Fetch the LightGBM sources together with their git submodules (--recursive).
!git clone --recursive https://github.com/Microsoft/LightGBM

In [None]:
# Install the libboost-all-dev package without prompting (-y) and with quiet output (-qq).
!apt-get install -y -qq libboost-all-dev

In [None]:
# If you have trouble with cmake, run this: ldd "$(type -p cmake)"
# and find out what library is missing or has a wrong version
# !rm /opt/conda/lib/libcurl.so.4

In [None]:
# %%bash
# cd LightGBM
# mkdir build
# cd build
# cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ ..
# make -j$(nproc)

In [None]:
# Make sure the OpenCL vendors directory exists and write an ICD file naming the NVIDIA OpenCL library.
!mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
# Remove the cloned LightGBM source tree.
# NOTE(review): the cmake/make cell in this notebook is commented out, so the clone is deleted
# without having been built or installed here -- confirm this is intended.
!rm -r LightGBM

Installing AutoGluon

In [None]:
# Install AutoGluon with pip (no version pin, so the installed release depends on when this runs).
!pip install autogluon 

In [None]:
# Importing core libraries
import numpy as np
import pandas as pd
import gc

# Importing AutoGluon
from autogluon.tabular import TabularDataset, TabularPredictor

# Scikit Learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
from sklearn.pipeline import Pipeline

In [None]:
# Derived from the original script https://www.kaggle.com/gemartin/load-data-reduce-memory-usage 
# by Guillaume Martin

def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to narrower dtypes to save memory.

    Integer columns are cast to the smallest of int8/int16/int32/int64 whose
    range contains the column's minimum and maximum (bounds are inclusive, so
    a column peaking at exactly 127 still fits int8). Float columns are cast
    to float32 when their min/max lie within the float32 range, otherwise to
    float64. Columns whose dtype is not one of the listed numeric dtypes are
    left untouched.

    Args:
        df: pandas DataFrame to shrink. It is modified in place.
        verbose: when true, print the resulting memory usage and the
            percentage saved.

    Returns:
        The same DataFrame object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Try the candidates from narrowest to widest and stop at the first fit.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            limits = np.finfo(np.float32)
            # A NaN min/max (e.g. an all-NaN column) fails both comparisons
            # and therefore falls through to float64.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Read the competition train/test CSVs, using the 'id' column as the row index
X_train = pd.read_csv("../input/tabular-playground-series-nov-2021/train.csv", index_col='id')
X_test = pd.read_csv("../input/tabular-playground-series-nov-2021/test.csv", index_col='id')
X_train.head()

In [None]:
# Feature engineering: append row-wise summary statistics of the raw features
def _add_row_stats(frame, cols):
    """Add mean/std/min/max across ``cols`` to ``frame`` as new columns (in place)."""
    base = frame[cols]
    frame['mean_numeric'] = base.mean(axis=1)
    frame['std_numeric'] = base.std(axis=1)
    frame['min_numeric'] = base.min(axis=1)
    frame['max_numeric'] = base.max(axis=1)


# Every column except the label is treated as a numeric input feature.
numeric = [name for name in X_train.columns if name != 'target']
print("numeric", numeric)

_add_row_stats(X_train, numeric)
_add_row_stats(X_test, numeric)

# Record the engineered columns as features too.
numeric.extend(['mean_numeric', 'std_numeric', 'min_numeric', 'max_numeric'])
print("numeric", numeric)

In [None]:
# Display the first rows of the training frame.
X_train.head()
# X_train.columns.tolist()

In [None]:
# Feature selection: the raw columns f0..f99 followed by the four row-wise statistics
features = [f'f{i}' for i in range(100)]
features += ['mean_numeric', 'std_numeric', 'min_numeric', 'max_numeric']

# Keep only the selected columns (plus the label for the training frame).
X_train = X_train[features + ['target']]
X_test = X_test[features]

In [None]:
### REDUCE MEMORY USAGE
# Downcast the numeric columns of both frames with reduce_mem_usage, then trigger
# a garbage-collection pass.
X_train = reduce_mem_usage(X_train)
X_test = reduce_mem_usage(X_test)
gc.collect()

In [None]:
# Hold-out switch: when enabled, 20% of the rows are split off X_train for validation.
VALIDATION = False
if VALIDATION:
    X_train, X_val = train_test_split(X_train, test_size=int(len(X_train) * 0.2), random_state=42)
train_data = TabularDataset(X_train)
# Without a hold-out, the first 100,000 training rows stand in as the validation frame.
val_data = TabularDataset(X_val if VALIDATION else X_train.iloc[:100_000, :])

# Optional down-sampling of the training frame for quicker experiments.
SUBSAMPLE = False
RANDOM_STATE = 0
if SUBSAMPLE:
    subsample_size = 100_000  # raise this for a larger, slower run
    train_data = train_data.sample(n=subsample_size, random_state=RANDOM_STATE)

train_data.head()

In [None]:
# Name of the column to predict; print its descriptive statistics as a sanity check.
label = 'target'
print("Summary of target variable: \n", train_data[label].describe())

In [None]:
!mkdir agModels

You can reuse the optimized hyperparameters published in public kernels to boost AutoGluon's performance.

For instance, these parameters are from the high-scoring notebook https://www.kaggle.com/dlaststark/tps-1021-la-dee-da by DLASTSTARK.

In [None]:
# XGBoost settings; handed to AutoGluon under the 'XGB' key of `hyperparameters` in the fit call.
# NOTE(review): 'gpu_hist' presumably needs a GPU-enabled XGBoost build -- confirm, since the
# other GPU-related options below are commented out.
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': 'gpu_hist',
    'use_label_encoder': False,
    'n_estimators': 10000,
    'max_depth': 3,
    'subsample': 0.5,
    'colsample_bytree': 0.5,
    'learning_rate': 0.01187,
#     'gpu_id': 0,
#     'predictor': 'gpu_predictor'
}

# CatBoost settings; handed to AutoGluon under the 'CAT' key of `hyperparameters` in the fit call.
# The GPU options ('task_type', 'devices') are commented out.
cb_params = {
    'loss_function' : 'CrossEntropy',
    'eval_metric' : 'AUC',
    'iterations' : 10000,
    'grow_policy' : 'SymmetricTree',
    'use_best_model' : True,
    'depth' : 5,
    'l2_leaf_reg' : 3.0,
    'random_strength' : 1.0,
    'learning_rate' : 0.1,
#     'task_type' : 'GPU',
#     'devices' : '0',
    'verbose' : 0
}

# LightGBM settings; handed to AutoGluon under the 'GBM' key of `hyperparameters` in the fit call.
# The 'device': 'gpu' option is commented out.
lgb_params = {
    'objective' : 'binary',
    'metric' : 'auc',
    'max_depth' : 3,
    'num_leaves' : 7,
    'n_estimators' : 5000,
    'colsample_bytree' : 0.3,
    'subsample' : 0.5,
    'reg_alpha' : 18,
    'reg_lambda' : 17,
    'learning_rate' : 0.095,
#     'device' : 'gpu'
}

In [None]:
# Training configuration
save_path = 'agModels'  # folder where the trained models are written
presets = 'best_quality'
metric = 'roc_auc'
hours = 8.0  # time budget for fit(); converted to whole seconds below

# Build the predictor first, then fit it with the tuned GBM / CatBoost / XGBoost settings.
predictor = TabularPredictor(label=label, eval_metric=metric, path=save_path)
predictor = predictor.fit(
    train_data,
    excluded_model_types=['KNN', 'XT', 'RF', 'NN', 'FASTAI'],
    hyperparameters={'GBM': lgb_params, 'CAT': cb_params, 'XGB': xgb_params},
    presets=presets,
    time_limit=int(60 * 60 * hours),
)



In [None]:
# Collect the predictor's fit summary, with plotting enabled.
results = predictor.fit_summary(show_plot=True)

In [None]:
# Build the predictor's leaderboard using val_data.
# NOTE(review): with VALIDATION = False, val_data is the first 100,000 rows of the training
# frame, so these scores are measured on rows the models were fit on -- treat them as optimistic.
leaderboard = predictor.leaderboard(val_data)

In [None]:
# Wrap the test frame for AutoGluon and request probability predictions for it.
test_data = TabularDataset(X_test)
test_preds = predictor.predict_proba(test_data)

In [None]:
# Predicting and submission
# The second column of the predict_proba output is submitted as the score
# (presumably the probability of the positive class -- confirm the column order).
# to_numpy() replaces the deprecated Series.ravel(); the column is already 1-D.
submission = pd.DataFrame({
    'id': X_test.index,
    'target': test_preds.iloc[:, 1].to_numpy(),
})

submission.to_csv("submission_autogluon_PL.csv", index=False)

In [None]:
# Display the submission frame as the cell output.
submission