In [None]:
!pip install autogluon --quiet # autogluon==0.2.0

In [None]:
!pip install scikit-learn -U --quiet

In [None]:
# Importing core libraries
import numpy as np
import pandas as pd

# Importing AutoGluon
from autogluon.tabular import TabularDataset, TabularPredictor

# Scikit Learn
from sklearn.model_selection import train_test_split
# Feature engineering utilities

from sklearn.decomposition import FactorAnalysis
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the competition train/test CSV files and key both frames by 'id'.
X_train = pd.read_csv(
    "../input/tabular-playground-series-sep-2021/train.csv"
).set_index('id')
X_test = pd.read_csv(
    "../input/tabular-playground-series-sep-2021/test.csv"
).set_index('id')

In [None]:
# Feature engineering: missing-value count, mean imputation, standardization
# and per-row summary statistics, applied identically to train and test.

# Raw feature columns are the ones whose name starts with "f".
num_cols = [c for c in X_train.columns if c.startswith("f")]

# Count missing values per row before they are imputed away.
X_train["nan_count"] = X_train.isnull().sum(axis=1)
X_test["nan_count"] = X_test.isnull().sum(axis=1)

# Impute with the training-set column means (also used for the test set),
# and set the target aside so it is not scaled.
means = X_train.mean()
claim = X_train.claim
X_train = X_train.fillna(means).drop('claim', axis='columns')
X_test = X_test.fillna(means)

# Standardize in place; the scaler is fitted on the training data only.
ss = StandardScaler()
X_train[:] = ss.fit_transform(X_train)

# NOTE(review): `fa` and `valid_factors` are only consumed by the
# commented-out factor projections below, so this fit currently has no
# effect on the features that are produced.
fa = FactorAnalysis(rotation='varimax').fit(X_train)
valid_factors = (fa.components_**2).sum(axis=1) > 0

#X_train = pd.DataFrame(fa.transform(X_train)[:, valid_factors])
X_train['claim'] = claim
#X_test = pd.DataFrame(fa.transform(ss.transform(X_test))[:, valid_factors])
X_test[:] = ss.transform(X_test)

# Per-row statistics over the scaled raw features. Both frames go through the
# same code path so each statistic is computed the same way for train and test
# (previously `mean_row` on train and `max_row` on test were computed with
# `.min()` by mistake).
for frame in (X_train, X_test):
    feature_values = frame[num_cols]
    frame['min_row'] = feature_values.min(axis=1)
    frame['mean_row'] = feature_values.mean(axis=1)
    frame['max_row'] = feature_values.max(axis=1)
    frame['std_row'] = feature_values.std(axis=1)

In [None]:
# Toggle: hold out 20% of the training rows as a validation set.
VALIDATION = False
if VALIDATION:
    holdout_rows = int(len(X_train) * 0.2)
    X_train, X_val = train_test_split(X_train, test_size=holdout_rows, random_state=42)
    val_data = TabularDataset(X_val)
else:
    # No hold-out: the "validation" data is the first 100,000 training rows.
    val_data = TabularDataset(X_train.iloc[:100_000, :])
train_data = TabularDataset(X_train)

# Toggle: train on a random subset of rows for a quicker run.
SUBSAMPLE = False
if SUBSAMPLE:
    subsample_size = 10_000  # subsample subset of data for faster demo, try setting this to much larger values
    train_data = train_data.sample(n=subsample_size, random_state=0)

train_data.head()

In [None]:
# Name of the target column, followed by its descriptive statistics.
label = 'claim'
target_summary = train_data[label].describe()
print("Summary of target variable: \n", target_summary)

In [None]:
!mkdir agModels

In [None]:
# Training configuration.
save_path = 'agModels'  # folder where trained models are stored
presets = 'best_quality'
metric = 'roc_auc'
hours = 4  # training time budget, converted to seconds below

# Model families to train; an empty dict passes no overrides for that family.
# The 'NN', 'KNN' and 'custom' entries are left disabled.
hyperparameters = {
    'GBM': {},
    'CAT': {'iterations': 18000},
    'RF': {},
    'XT': {},
}

# Build the predictor and fit it with 10-fold bagging (one bagging set).
predictor = TabularPredictor(
    label=label,
    eval_metric=metric,
    path=save_path,
).fit(
    train_data,
    presets=presets,
    hyperparameters=hyperparameters,
    num_bag_folds=10,
    num_bag_sets=1,
    time_limit=int(60 * 60 * hours),
)

In [None]:
# Summarize the completed fit (with plot output enabled) and keep the returned
# summary object for later inspection.
results = predictor.fit_summary(show_plot=True)

In [None]:
# Score the trained models on `val_data`.
# NOTE: when VALIDATION is False, `val_data` is the first 100,000 rows of the
# training frame, so these scores are computed on data the models were fitted on.
leaderboard = predictor.leaderboard(val_data)

In [None]:
# Predict class probabilities for the test set and keep the column for class 1.
test_data = TabularDataset(X_test)
test_proba = predictor.predict_proba(test_data)
test_preds = test_proba[1]

In [None]:
# Predict class-1 probabilities for the rows the predictor was trained on and
# save them to CSV.
train_preds = predictor.predict_proba(train_data)[1]

# Take the ids from `train_data` itself rather than from `X_train`: when
# SUBSAMPLE is enabled, `train_data` holds only a sample of `X_train`, and
# mixing the two would produce mismatched lengths.
train_sub = pd.DataFrame({'id': train_data.index,
                          'claim': train_preds})

train_sub.to_csv("train_autogluon_pred.csv", index=False)
train_sub.head()

In [None]:
# Assemble the submission (test ids plus predicted class-1 probability)
# and write it to disk.
submission_columns = {
    'id': X_test.index,
    'claim': test_preds,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission.csv", index=False)
# Public score 0.811

In [None]:
# Preview the first rows of the submission frame.
submission.head()