AutoGluon is an AutoML package developed by J. Mueller, X. Shi, and A. Smola:

Mueller, Jonas, Xingjian Shi, and Alexander Smola. "Faster, Simpler, More Accurate: Practical Automated Machine Learning with Tabular, Text, and Image Data." Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. 2020.

For tabular data, AutoGluon can produce models to predict the values in one column based on the values in the other columns. With just a single call to fit(), you can achieve high accuracy in standard supervised learning tasks (both classification and regression).

In the context of a competition, it can help you create benchmarks, gain insight into how the models work, and accelerate your experimentation.

In [None]:
!pip install autogluon 

In [None]:
!pip install scikit-learn -U

In [None]:
# Importing core libraries
import numpy as np
import pandas as pd
import gc
from scipy.stats import skew

# Importing AutoGluon
from autogluon.tabular import TabularDataset, TabularPredictor

# Scikit Learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [None]:
# Derived from the original script https://www.kaggle.com/gemartin/load-data-reduce-memory-usage 
# by Guillaume Martin

def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to smaller dtypes.

    Integer columns (int16/int32/int64) are cast to the narrowest signed
    integer type whose range contains the column's minimum and maximum,
    boundary values included. Float columns (float16/float32/float64) are
    cast to float32 when their range fits, otherwise to float64. Columns of
    any other dtype are left untouched.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When true, print the final memory footprint in Mb and the
            percentage saved.

    Returns:
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Try the candidates from narrowest to widest. The bounds are
            # inclusive so that a column holding exactly the type's min or
            # max still gets the narrow type.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            limits = np.finfo(np.float32)
            # NaN or infinite extremes fail this comparison and fall back to
            # float64.
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes cannot divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Loading data
train_csv = "../input/tabular-playground-series-sep-2021/train.csv"
test_csv = "../input/tabular-playground-series-sep-2021/test.csv"
X_train = pd.read_csv(train_csv).set_index('id')
X_test = pd.read_csv(test_csv).set_index('id')

# Keep the target aside and leave only the features in X_train.
y_train = X_train['claim']
X_train = X_train.drop(columns='claim')

In [None]:
def get_stats_per_row(data):
    """Append per-row summary statistics of the input columns to ``data``.

    Three columns are added in place:

    * ``mv_row``  - number of missing values in the row
    * ``min_row`` - minimum of the row
    * ``std_row`` - standard deviation of the row

    Args:
        data: DataFrame of features. It is modified in place.

    Returns:
        The same ``data`` object with the three extra columns.
    """
    # Compute every statistic before adding any column, so each one is derived
    # from the original features only and not from the statistics themselves.
    missing_per_row = data.isna().sum(axis=1)
    min_per_row = data.min(axis=1)
    std_per_row = data.std(axis=1)

    data['mv_row'] = missing_per_row
    data['min_row'] = min_per_row
    data['std_row'] = std_per_row
    return data

def impute_skewed_features(data):
    """Fill missing values of strongly skewed columns with the column median.

    A column counts as skewed when the absolute value of its skewness is
    greater than 1. Other columns are left as they are.

    Args:
        data: DataFrame of features. It is modified in place.

    Returns:
        The same ``data`` object.
    """
    skewness = data.skew()
    heavy_tailed = skewness.index[abs(skewness.values) > 1]

    for column in heavy_tailed:
        data[column] = data[column].fillna(data[column].median())

    return data

# Mean-impute whatever is still missing, then standardise every column.
pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler()),
])

# Fit the pipeline on the training features only.
train_features = impute_skewed_features(get_stats_per_row(X_train))
X_train = pd.DataFrame(
    pipeline.fit_transform(train_features),
    columns=train_features.columns,
    index=train_features.index,
)

# Reuse the already-fitted pipeline for the test features.
test_features = impute_skewed_features(get_stats_per_row(X_test))
X_test = pd.DataFrame(
    pipeline.transform(test_features),
    columns=test_features.columns,
    index=test_features.index,
)

In [None]:
# Adding t-SNE and UMAP projections
prj_train = pd.read_csv("../input/really-not-missing-at-random/train.csv")
prj_test = pd.read_csv("../input/really-not-missing-at-random/test.csv")

projections = ['t_sne_0', 't_sne_1', 't_umap_0', 't_umap_1']

# X_train / X_test are indexed by 'id', while the projection files are read
# with the default 0..n-1 index. Assigning pandas objects aligns on index
# labels, so any row whose id is not also a row number in the projection file
# would silently receive NaN. Copy the values by position instead.
# NOTE(review): this assumes the projection files list their rows in the same
# order as the competition files - confirm against the dataset.
for features, projected in ((X_train, prj_train), (X_test, prj_test)):
    if len(features) != len(projected):
        raise ValueError(
            "Projection file has {} rows but the feature table has {}".format(
                len(projected), len(features)))
    for name in projections:
        features[name] = projected[name].to_numpy()

In [None]:
# Put the target back next to the features; the 'claim' column is the one
# named as the label further down.
X_train['claim'] = y_train

In [None]:
### REDUCE MEMORY USAGE
# Downcast the training table first, then the test table, and finally ask the
# garbage collector to run.
X_train, X_test = reduce_mem_usage(X_train), reduce_mem_usage(X_test)
gc.collect()

In [None]:
VALIDATION = False
if not VALIDATION:
    # No hold-out: train on every row and use the first 100,000 training rows
    # as the evaluation frame.
    train_data = TabularDataset(X_train)
    val_data = TabularDataset(X_train.iloc[:100_000])
else:
    # Hold out 20% of the rows (as an absolute row count) for validation.
    holdout_rows = int(len(X_train) * 0.2)
    X_train, X_val = train_test_split(X_train, test_size=holdout_rows, random_state=42)
    train_data = TabularDataset(X_train)
    val_data = TabularDataset(X_val)

SUBSAMPLE = False
if SUBSAMPLE:
    subsample_size = 10_000  # subsample subset of data for faster demo, try setting this to much larger values
    train_data = train_data.sample(n=subsample_size, random_state=0)

train_data.head()

In [None]:
# Name of the target column, followed by a quick look at its distribution.
label = 'claim'
target_summary = train_data[label].describe()
print("Summary of target variable: \n", target_summary)

In [None]:
# Create the model output folder; -p keeps the cell re-runnable when the
# folder already exists.
!mkdir -p agModels

In [None]:
# First LightGBM-style configuration: binary objective scored by AUC, shallow
# trees (depth 3, 7 leaves) and a GPU device. This is the dictionary passed
# under the 'GBM' key in the fit call of this notebook.
lgbm1_params = {
    'metric' : 'auc',
    'max_depth' : 3,
    'num_leaves' : 7,
    'n_estimators' : 5000,
    'colsample_bytree' : 0.3,
    'subsample' : 0.5,
    'reg_alpha' : 18,
    'reg_lambda' : 17,
    'learning_rate' : 0.095,
    'device' : 'gpu',
    'objective' : 'binary'
}

# Second LightGBM-style configuration: same tree shape as the first one
# (depth 3, 7 leaves) with more estimators and different regularisation.
# NOTE(review): not referenced by the fit call visible in this notebook.
lgbm2_params = {
    'metric' : 'auc',
    'objective': 'binary',
    'n_estimators': 10000,
    'learning_rate': 0.095,
    'subsample': 0.6,
    'subsample_freq': 1,
    'colsample_bytree': 0.4,
    'reg_alpha': 10.0,
    'reg_lambda': 1e-1,
    'min_child_weight': 256,
    'min_child_samples': 20,
    'device' : 'gpu',
    'max_depth' : 3,
    'num_leaves' : 7
}

# Third LightGBM-style configuration: deeper trees (depth 8, up to 1400
# leaves) with a large minimum leaf size. It selects the GPU through the
# 'device_type' key, whereas the other two use 'device'.
# NOTE(review): not referenced by the fit call visible in this notebook.
lgbm3_params = {
    'metric' : 'auc',
    'objective' : 'binary',
    'device_type': 'gpu', 
    'n_estimators': 10000, 
    'learning_rate': 0.12230165751633416, 
    'num_leaves': 1400, 
    'max_depth': 8, 
    'min_child_samples': 3100, 
    'reg_alpha': 10, 
    'reg_lambda': 65, 
    'min_split_gain': 5.157818977461183, 
    'subsample': 0.5, 
    'subsample_freq': 1, 
    'colsample_bytree': 0.2
}

# First CatBoost-style configuration: CrossEntropy objective scored by AUC,
# depth-7 trees, Bernoulli bootstrap, GPU device '0', silent training.
# This is the dictionary passed under the 'CAT' key in the fit call of this
# notebook.
catb1_params = {
    'eval_metric' : 'AUC',
    'iterations': 15585, 
    'objective': 'CrossEntropy',
    'bootstrap_type': 'Bernoulli', 
    'od_wait': 1144, 
    'learning_rate': 0.023575206684596582, 
    'reg_lambda': 36.30433203563295, 
    'random_strength': 43.75597655616195, 
    'depth': 7, 
    'min_data_in_leaf': 11, 
    'leaf_estimation_iterations': 1, 
    'subsample': 0.8227911142845009,
    'task_type' : 'GPU',
    'devices' : '0',
    'verbose' : 0
}

# Second CatBoost-style configuration: depth-5 symmetric trees with a higher
# learning rate, CrossEntropy loss, GPU device '0', silent training.
# NOTE(review): not referenced by the fit call visible in this notebook.
catb2_params = {
    'eval_metric' : 'AUC',
    'depth' : 5,
    'grow_policy' : 'SymmetricTree',
    'l2_leaf_reg' : 3.0,
    'random_strength' : 1.0,
    'learning_rate' : 0.1,
    'iterations' : 10000,
    'loss_function' : 'CrossEntropy',
    'task_type' : 'GPU',
    'devices' : '0',
    'verbose' : 0
}

# First XGBoost-style configuration: binary logistic objective scored by AUC,
# loss-guided growth up to depth 18, column sampling at three levels, GPU
# histogram tree method. This is the dictionary passed under the 'XGB' key in
# the fit call of this notebook.
xgb1_params = {
    'eval_metric' : 'auc',
    'lambda': 0.004562711234493688, 
    'alpha': 7.268146704546314, 
    'colsample_bytree': 0.6468987558386358, 
    'colsample_bynode': 0.29113878257290376, 
    'colsample_bylevel': 0.8915913499148167, 
    'subsample': 0.37130229826185135, 
    'learning_rate': 0.021671163563123198, 
    'grow_policy': 'lossguide', 
    'max_depth': 18, 
    'min_child_weight': 215, 
    'max_bin': 272,
    'n_estimators': 10000,
    'random_state': 0,
    'use_label_encoder': False,
    'objective': 'binary:logistic',
    'tree_method': 'gpu_hist',
    'gpu_id': 0,
    'predictor': 'gpu_predictor'
}

# Second XGBoost-style configuration: shallow depth-3 trees with a small
# learning rate, binary logistic objective scored by AUC, GPU histogram tree
# method.
# NOTE(review): not referenced by the fit call visible in this notebook.
xgb2_params = dict(
    eval_metric='auc',
    max_depth=3,
    subsample=0.5,
    colsample_bytree=0.5,
    learning_rate=0.01187431306013263,
    n_estimators=10000,
    n_jobs=-1,
    use_label_encoder=False,
    objective='binary:logistic',
    tree_method='gpu_hist',
    gpu_id=0,
    predictor='gpu_predictor'
)

In [None]:
# Run settings: output folder, AutoGluon preset, evaluation metric and the
# training budget in hours.
save_path = 'agModels'
presets = 'best_quality'
metric = 'roc_auc'
hours = 5.0

# One custom parameter dictionary per model family.
custom_hyperparameters = {
    'GBM': lgbm1_params,
    'CAT': catb1_params,
    'XGB': xgb1_params,
}

predictor = TabularPredictor(
    label=label,
    eval_metric=metric,
    path=save_path,
).fit(
    train_data,
    excluded_model_types=['KNN', 'XT', 'RF', 'NN', 'FASTAI'],
    hyperparameters=custom_hyperparameters,
    presets=presets,
    time_limit=int(60 * 60 * hours),  # hours converted to whole seconds
)

In [None]:
# Collect the predictor's summary of the fit, with plot output requested.
results = predictor.fit_summary(show_plot=True)

In [None]:
# Build the leaderboard on val_data. With VALIDATION = False that frame is a
# slice of the training rows, so the scores are not a held-out estimate.
leaderboard = predictor.leaderboard(val_data)

In [None]:
# Wrap the test features and ask the fitted predictor for class probabilities.
test_data = TabularDataset(X_test)
test_preds = predictor.predict_proba(test_data)

In [None]:
# Predicting and submission
# The second column of the probability table is used as the 'claim' score.
# NOTE(review): presumably the positive class - confirm the column order.
# .to_numpy() replaces Series.ravel(), which recent pandas releases deprecate;
# both yield the column as a 1-D array.
submission = pd.DataFrame({'id': X_test.index,
                           'claim': test_preds.iloc[:, 1].to_numpy()})

submission.to_csv("submission.csv", index=False)

In [None]:
# Display the submission table as the cell output.
submission