In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import seaborn as sns

import matplotlib.pyplot as plt

!pip -q install --upgrade autogluon

from autogluon.tabular import TabularDataset , TabularPredictor

In [None]:
# Read the competition's training CSV into a plain pandas DataFrame.
train_csv = '../input/tabular-playground-series-jun-2021/train.csv'
df = pd.read_csv(filepath_or_buffer=train_csv)

# Bare last expression: rendered as the cell's output.
df

In [None]:
# Show the column labels of the frame loaded from train.csv.
df.columns

In [None]:
# Per-column summary statistics of the frame loaded from train.csv.
df.describe()

In [None]:
# Work on a copy so the raw frame `df` is left untouched.
df_norm = df.copy()

# Max-abs scaling: divide each selected column by its largest absolute value.
# The first and last columns are skipped (presumably the row id and the
# target label - confirm against df.columns).
feature_cols = df_norm.columns[1:-1]
max_abs = df_norm[feature_cols].abs().max()

# A column that is entirely zero has max_abs == 0, and 0 / 0 would turn the
# whole column into NaN. Divide by 1 instead so such a column stays all-zero.
max_abs = max_abs.where(max_abs != 0, 1)

# DataFrame / Series aligns the Series index with the column labels, so every
# column is divided by its own scale in one vectorised step.
df_norm[feature_cols] = df_norm[feature_cols] / max_abs

df_norm

In [None]:
# Scatter plot of the second column (x axis) against the third column (y axis).
df.plot.scatter(x=df.columns[1], y=df.columns[2])

In [None]:
# Keep the columns at positions 1..9 of the training frame for a pairwise overview.
df1 = df.loc[:, df.columns[1:10]]

# corner=True draws only the lower triangle of the pair grid.
sns.pairplot(data=df1, corner=True)

In [None]:
# Column roles for the grid: one facet per value of the second column of `df`,
# with the third column on the x axis and the fourth on the y axis.
facet_col, x_col, y_col = df.columns[1], df.columns[2], df.columns[3]

g = sns.FacetGrid(
    df1,
    col=facet_col,
    col_wrap=4,
    height=2,
    ylim=(0, 10),
)

# Point plot in every facet, restricted to x values 1..5, without error bars.
g.map(
    sns.pointplot,
    x_col,
    y_col,
    order=[1, 2, 3, 4, 5],
    color=".3",
    ci=None,
)

# Training a model using AutoML - AutoGluon

## Train on partial data

In [None]:
# Load the training CSV and keep a reproducible 5000-row sample for a quick first fit.
train_data = TabularDataset(
    '../input/tabular-playground-series-jun-2021/train.csv'
).sample(n=5000, random_state=8)

# Column the predictor should learn; print its summary as a sanity check.
label = 'target'
print(train_data[label].describe())

# Predictor artifacts are written to the current working directory.
save_path = './'
predictor = TabularPredictor(label=label, path=save_path, verbosity=3)
# Re-bind the returned predictor so the cell ends on an assignment and shows no output.
predictor = predictor.fit(train_data)

## Let's Go Deeper

In [None]:
# --- Run configuration ---------------------------------------------------
label = 'target'             # column to predict
eval_metric = 'log_loss'     # metric handed to the predictor
save_path = './AutoGlon/'    # directory the fitted predictor is saved to
timeLimit = 10 * 60          # value passed as time_limit: 600, i.e. 10 minutes when read as seconds
num_trials = 10              # number of hyperparameter-search trials
search_strategy = 'auto'

# Settings forwarded as hyperparameter_tune_kwargs.
hyperparameter_kwargs = dict(
    num_trials=num_trials,
    scheduler='local',
    searcher=search_strategy,
)

# LightGBM is requested three times: an extra-trees variant (name suffix "XT"),
# a default GPU variant and the 'GBMLarge' preset entry.
gbm_variants = [
    {'device': 'gpu', 'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}},
    {'device': 'gpu'},
    'GBMLarge',
]

# Model families to train; the tree models are configured to run on the GPU.
models = {
    'NN': {},
    'GBM': gbm_variants,
    'CAT': {'task_type': 'GPU'},
    'XGB': {'tree_method': 'gpu_hist'},
    'FASTAI': {},
}

# verbosity ranges from 0 (silent) to 4 (most detailed logging).
predictor = TabularPredictor(
    label=label,
    eval_metric=eval_metric,
    path=save_path,
    verbosity=1,
)

# Bagging (5 folds, 1 set) with one stacking level, then a refit on all data;
# only the best model is kept and auxiliary files are removed to save space.
fit_options = dict(
    hyperparameters=models,
    presets='best_quality',
    keep_only_best=True,
    num_bag_folds=5,
    num_bag_sets=1,
    num_stack_levels=1,
    refit_full=True,
    save_space=True,
    hyperparameter_tune_kwargs=hyperparameter_kwargs,
    time_limit=timeLimit,
)

# NOTE(review): `df` is passed with all of its columns, so everything except
# `label` is offered to the models as a feature - if the frame still carries a
# row-identifier column, consider dropping it first (confirm).
predictor.fit(df, **fit_options)

# Summary of the fit, displayed as the cell output.
results = predictor.fit_summary()
results

In [None]:
# Reload the predictor that was saved under `save_path` during fitting.
predictor = TabularPredictor.load(save_path)

# Test-set rows to predict for (read from the competition's test CSV).
y_test = TabularDataset('../input/tabular-playground-series-jun-2021/test.csv')

# Class-probability predictions for every test row.
y_pred = predictor.predict_proba(y_test)

# Put the row identifier in the FIRST column rather than appending it last, so
# the file starts with the key column (the conventional submission layout -
# confirm against the competition's sample_submission.csv).
y_pred.insert(0, 'id', y_test['id'])

# index=False is the documented way to omit the pandas index from the CSV.
y_pred.to_csv('submission.csv', index=False)

y_pred

In [None]:
# Number of rows used to score the trained models.
subsample_size = 500

# Reproducible evaluation sample; its size is driven by `subsample_size`
# instead of a second hard-coded literal.
# NOTE(review): `train_data` comes from the same train.csv the predictor was
# fitted on, so these rows are not held out and the scores are likely
# optimistic - use a separate validation split for an unbiased estimate.
y_eval = train_data.sample(n=subsample_size, random_state=16)

# Per-model scores on the evaluation sample, shown as the cell output.
predictor.leaderboard(y_eval, silent=True)

In [None]:
# Reload the complete training CSV (replaces any earlier, smaller `train_data`).
train_data = TabularDataset(
    '../input/tabular-playground-series-jun-2021/train.csv'
)

# Feature importance is computed on a reproducible 500-row subset.
sub_train_data = train_data.sample(n=500, random_state=18)

# Feature-importance table for the current predictor, shown as the cell output.
predictor.feature_importance(data=sub_train_data)