# AutoGluon 

- Hello Kagglers, in this Notebook I will use simple features and AutoGluon to get a good robust model.

- This notebook is inspired by C4rl05/V's work:
https://www.kaggle.com/code/cv13j0/tps-apr-2022-xgboost-model


- This is my first notebook and I am trying to learn to contribute in a better way

- Please `Upvote` if you find this notebook useful.

### Data Descriptions
In this competition, you'll classify 60-second sequences of sensor data, indicating whether a subject was in either of two activity states for the duration of the sequence

### Files and Field Descriptions
train.csv - the training set, comprising ~26,000 60-second recordings of thirteen biological sensors for almost one thousand experimental participants
* sequence - a unique id for each sequence
* subject - a unique id for the subject in the experiment
* step - time step of the recording, in one second intervals
* sensor_00 - sensor_12 - the value for each of the thirteen sensors at that time step

train_labels.csv - the class label for each sequence
* sequence - the unique id for each sequence.
* state - the state associated with each sequence. This is the target which you are trying to predict.

test.csv - the test set. For each of the ~12,000 sequences, you should predict a value for that sequence's state.

sample_submission.csv - a sample submission file in the correct format.

---

# 1. Downloading / Loading the Required Libraries

In [None]:
! pip install autogluon

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.metrics import roc_auc_score
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the read-only input directory and echo the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings

___

# 2. Setting the Notebook

In [None]:
%%time
# I like to disable my Notebook Warnings.
# NOTE: an 'ignore' filter with no category silences every warning, including
# deprecation warnings raised by the libraries used later in this notebook.
import warnings
warnings.filterwarnings('ignore')



___

# 3. Data Loading

In [None]:
%%time
# Read the four competition CSVs in one pass: training sensor readings,
# training labels, test sensor readings and the submission template.
trn_data, trn_label_data, tst_data, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-apr-2022/train.csv',
        '/kaggle/input/tabular-playground-series-apr-2022/train_labels.csv',
        '/kaggle/input/tabular-playground-series-apr-2022/test.csv',
        '/kaggle/input/tabular-playground-series-apr-2022/sample_submission.csv',
    )
)


# 4. Exploring the Information Available

## 4.1. Analysing the Train Dataset

In [None]:
%%time
# Print the training frame's summary as produced by DataFrame.info().
trn_data.info()

In [None]:
%%time
# Preview the first ten rows of the training frame.
trn_data.head(10)

In [None]:
%%time
# Descriptive statistics of the training frame via DataFrame.describe().
trn_data.describe()

In [None]:
%%time
# Count the missing values in each column of the training frame.
trn_data.isnull().sum()

---

## 4.2. Analysing the Train Labels Dataset

In [None]:
%%time
# Print the label frame's summary as produced by DataFrame.info().
trn_label_data.info()

In [None]:
%%time
# Preview the first rows of the label frame.
trn_label_data.head()

In [None]:
%%time
# Descriptive statistics of the label frame via DataFrame.describe().
trn_label_data.describe()

In [None]:
%%time
# Count the missing values in each column of the label frame.
trn_label_data.isnull().sum()

---

## 4.3. Analysing the Train Dataset, Using Groups

In [None]:
%%time
# One row per (sequence, subject) pair; 'step' holds the number of non-null
# step values recorded for that pair.
trn_summary = (
    trn_data.loc[:, ['sequence', 'subject', 'step']]
    .groupby(by=['sequence', 'subject'])
    .count()
    .reset_index()
)

In [None]:
%%time
# Shape of the summary rows belonging to subject 66 (one row per sequence).
trn_summary.loc[trn_summary['subject'] == 66].shape

In [None]:
%%time
# Count how many summary rows (sequences) each subject has.
summary_by_subject = (
    trn_summary.loc[:, ['sequence', 'subject']]
    .groupby(by=['subject'])
    .count()
    .reset_index()
)
summary_by_subject.head()

In [None]:
%%time
# Compare the subject ids seen in train and test; the intersection is the set
# of subjects that appear in both files.
trn_unique_subjects = set(trn_data['subject'].unique())
tst_unique_subjects = set(tst_data['subject'].unique())
overlap_subjets = trn_unique_subjects & tst_unique_subjects
print('Repeated Subjects in Test Dataset:', len(overlap_subjets))

---



# 5. Creating New Model Features

## 5.1. Creating Aggregated Features by Subject and Sequence

In [None]:
%%time
from scipy.stats import kurtosis
def kurtosis_func(series):
    """Return the kurtosis of ``series`` using ``scipy.stats.kurtosis`` defaults.

    A named wrapper so the statistic can be listed alongside the string
    aggregations in a pandas ``aggregate`` specification.
    """
    result = kurtosis(series)
    return result

def q01(series):
    """Return the 0.01 quantile (1st percentile) of ``series``."""
    level = 0.01
    return np.quantile(series, level)

def q05(series):
    """Return the 0.05 quantile (5th percentile) of ``series``."""
    level = 0.05
    return np.quantile(series, level)

def q95(series):
    """Return the 0.95 quantile (95th percentile) of ``series``."""
    level = 0.95
    return np.quantile(series, level)

def q99(series):
    """Return the 0.99 quantile (99th percentile) of ``series``."""
    level = 0.99
    return np.quantile(series, level)

def aggregated_features(df, aggregation_cols = None, prefix = ''):
    """Build one row of summary statistics per group for the thirteen sensors.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the grouping columns and ``sensor_00`` .. ``sensor_12``.
    aggregation_cols : list of str, optional
        Columns to group by. Defaults to ``['sequence']``.
    prefix : str, optional
        Text prepended to the name of every generated column.

    Returns
    -------
    pandas.DataFrame
        The grouping columns, a ``<prefix>size`` column with the number of rows
        in each group, and one ``<prefix><sensor>_<statistic>`` column for each
        sensor/statistic combination.
    """
    # Use None as the sentinel instead of a shared mutable list default.
    if aggregation_cols is None:
        aggregation_cols = ['sequence']

    def mad(series):
        # Mean absolute deviation around the mean. The built-in 'mad'
        # aggregation was removed in pandas 2.0, so it is computed explicitly.
        # The function is named 'mad' so the generated column names
        # ('<sensor>_mad') stay the same as before.
        return (series - series.mean()).abs().mean()

    statistics = ['mean', 'max', 'min', 'var', mad, 'sum', 'median', 'skew',
                  kurtosis_func, q01, q05, q95, q99]
    sensor_cols = ['sensor_{:02d}'.format(idx) for idx in range(13)]
    # Every sensor is summarised with the same statistics.
    agg_strategy = {sensor: list(statistics) for sensor in sensor_cols}

    group = df.groupby(aggregation_cols).aggregate(agg_strategy)
    # Flatten the (sensor, statistic) column pairs into single prefixed names.
    group.columns = [str(prefix) + '_'.join(col).strip() for col in group.columns]
    group.reset_index(inplace = True)

    # Row count per group, joined back so it sits next to the grouping keys.
    temp = df.groupby(aggregation_cols).size().reset_index(name = str(prefix) + 'size')
    group = pd.merge(temp, group, how = 'left', on = aggregation_cols)
    return group

In [None]:
%%time
# Sequence-level statistics (grouped by sequence and subject) for train and test.
trn_merge_data, tst_merge_data = (
    aggregated_features(frame, aggregation_cols = ['sequence', 'subject'])
    for frame in (trn_data, tst_data)
)

## 5.2. Creating Aggregated Features by Subject

In [None]:
%%time
# Subject-level statistics; the 'subject_' prefix keeps these columns distinct
# from the sequence-level ones.
trn_subjects_merge_data, tst_subjects_merge_data = (
    aggregated_features(frame, aggregation_cols = ['subject'], prefix = 'subject_')
    for frame in (trn_data, tst_data)
)

In [None]:
%%time
# Preview the first rows of the subject-level training features.
trn_subjects_merge_data.head()

---



# 6. Merging the Datasets for Training

In [None]:
%%time
# Left-join the labels onto the sequence-level training features by sequence id.
trn_merge_data = pd.merge(
    trn_merge_data,
    trn_label_data,
    how = 'left',
    on = 'sequence',
)

In [None]:
%%time
# Left-join each split's subject-level features onto its sequence-level rows.
trn_merge_data = pd.merge(
    trn_merge_data, trn_subjects_merge_data, how = 'left', on = 'subject'
)
tst_merge_data = pd.merge(
    tst_merge_data, tst_subjects_merge_data, how = 'left', on = 'subject'
)

In [None]:
%%time
# Preview the first rows of the merged training table.
trn_merge_data.head()

In [None]:
%%time
# Preview the first rows of the merged test table.
tst_merge_data.head()

---

# 7. Post Processing the Information for the Model

In [None]:
%%time
# Target column name; 'target_feature' is an alias of the same name.
label = 'state'
target_feature = label
# Identifier and target columns are kept out of the model inputs.
ignore = ['sequence', 'state', 'subject']
features = [col for col in trn_merge_data.columns if col not in ignore]

---

# 8. Creating a Simple Train / Validation Split Strategy

In [None]:
%%time
from sklearn.model_selection import train_test_split

# Hold out 10% of the sequences for validation, with a fixed seed for repeatability.
# NOTE(review): rows are split at random, so sequences from one subject can land
# on both sides while sharing the same subject-level features - confirm this is intended.
test_size_pct = 0.10
X_train, X_valid, y_train, y_valid = train_test_split(
    trn_merge_data[features],
    trn_merge_data[target_feature],
    test_size = test_size_pct,
    random_state = 42,
)

In [None]:
# Put the target back on the training features as the 'state' column, so a
# single frame carries both inputs and label.
X_train[label] = y_train

---

# 9. Modeling using AutoGluon

In [None]:
from autogluon.tabular import TabularDataset, TabularPredictor
# Wrap the split frames in AutoGluon's TabularDataset. X_train already holds
# the label column; X_valid holds features only.
train_data = TabularDataset(X_train)
valid_data = TabularDataset(X_valid)

In [None]:
# Optimisation metric and training time budget handed to AutoGluon.
eval_metric = 'roc_auc'
time_limit = 6 * 60 * 60  # six hours, in seconds

In [None]:
# Configure the predictor first, then train it on the labelled frame with the
# 'best_quality' preset inside the time budget. fit() hands back the predictor.
predictor = TabularPredictor(label=label, eval_metric=eval_metric)
predictor = predictor.fit(
    train_data,
    presets='best_quality',
    time_limit=time_limit,
)

In [None]:
# Show the leaderboard of the models trained by the predictor.
predictor.leaderboard()

In [None]:
# Score the hold-out rows: ROC AUC of the column labelled 1 against the true states.
valid_pred_proba = predictor.predict_proba(valid_data)
score = roc_auc_score(y_true=y_valid, y_score=valid_pred_proba[1])
print(score)

In [None]:
import pandas as pd
# Select the same feature columns used for training from the test table and
# predict class probabilities for every test row.
test_data = TabularDataset(tst_merge_data[features])
test_pred_proba = predictor.predict_proba(test_data)

In [None]:
# Store the column labelled 1 (presumably the probability of state 1) as the
# submitted 'state' value.
# NOTE(review): this assignment aligns on the index, so it presumably relies on
# `sub` and the prediction frame sharing the same index and row order - confirm.
sub["state"] = test_pred_proba[1]
# Preview the first rows of the filled-in submission.
sub.head()

In [None]:
# Write the submission to the working directory without the index column.
sub.to_csv('submission.csv', index=False)


