#### This notebook intends to share a basic, beginner-level time series classification approach. Statistical features are calculated using the tsfresh library, relevant features are selected, and the model used is XGBoost.

# Loading Data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#### Read the csv files and see a few rows of the train.csv and train_labels.csv files

In [None]:
# Single base directory for all competition files, so every read uses the same
# (absolute) location instead of a mix of "../input" and "/kaggle/input" paths.
DATA_DIR = "/kaggle/input/tabular-playground-series-apr-2022"

train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
# Labels are indexed by `sequence` so they can be matched to per-sequence features later.
train_labels = pd.read_csv(f"{DATA_DIR}/train_labels.csv", index_col="sequence")
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")
sample_sub = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

# Show only the first few rows, as the heading above promises.
display(train_df.head())
display(train_labels.head())

# Feature Extraction using tsfresh

In [None]:
from tsfresh.feature_extraction import extract_features, MinimalFCParameters


def _extract_minimal_features(frame):
    """Run tsfresh on `frame` with the minimal feature set.

    Rows are grouped by the `sequence` column and ordered by the `step` column.
    A fresh MinimalFCParameters object is created on every call.
    """
    return extract_features(
        frame,
        default_fc_parameters=MinimalFCParameters(),
        column_id="sequence",
        column_sort="step",
    )


# Same extraction settings for train and test so both get the same feature columns.
train_X = _extract_minimal_features(train_df)
test_X = _extract_minimal_features(test_df)

**Note (cell above):** *MinimalFCParameters* requires significantly less computation than *ComprehensiveFCParameters*, but computes fewer features. 
*column_id* tells tsfresh which column to group by. Each sequence is identified by the *sequence* column, so that is what we pass; tsfresh extracts features for each unique sequence. *column_sort* tells tsfresh which column orders the observations within a sequence; here that is *step*.

In [None]:
# Label series in the format tsfresh expects: integer `state` values, indexed like train_labels.
y = train_labels["state"].astype(int)

In [None]:
# Preview the first rows of the label series.
y.head()

# Select the most relevant features 

In [None]:
from tsfresh import select_features

# Keep only the extracted features that tsfresh selects as relevant for the labels `y`.
# NOTE(review): selection runs on all training rows before the train/validation split,
# so the validation score may be slightly optimistic — confirm this is acceptable.
X_selected = select_features(train_X, y)

# Restrict the test features to exactly the columns chosen on the training data.
selected_columns = X_selected.columns
test_X_selected = test_X.loc[:, selected_columns]

In [None]:
# Display the selected training features.
X_selected

# Train-Val-Split and XGBoost Model

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sequences for validation / early stopping.
# random_state pins the shuffle so "Restart & Run All" reproduces the same split
# (and therefore the same early-stopping point and submission); stratify=y keeps
# the class ratio of `state` the same in both parts.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_selected,
    y,
    train_size=0.8,
    random_state=42,
    stratify=y,
)

In [None]:
from xgboost import XGBClassifier

# Hyperparameters gathered in one place: many trees with a small learning rate,
# GPU tree construction/prediction, and AUC as the monitored validation metric.
# NOTE(review): `gpu_hist`, `predictor` and fit-time `early_stopping_rounds` are
# version-sensitive in xgboost — confirm against the installed version.
xgb_params = {
    "n_estimators": 2500,
    "learning_rate": 0.01,
    "tree_method": "gpu_hist",
    "predictor": "gpu_predictor",
    "eval_metric": "auc",
}
model = XGBClassifier(**xgb_params)

# The validation split is passed as eval_set, with early stopping set to 10 rounds.
model.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    early_stopping_rounds=10,
)

#### Note - the XGBoost model (above) uses GPU acceleration, but it is not necessary. To run without a GPU, simply remove *tree_method="gpu_hist", predictor="gpu_predictor"* from the *XGBClassifier* parameters.

# Prediction and submission

In [None]:
# The model is monitored with AUC (eval_metric="auc"), which scores how well the
# predictions *rank* the sequences. Hard 0/1 labels from `predict` throw that ranking
# away, so use the predicted probability of the positive class (column 1) instead.
preds = model.predict_proba(test_X_selected)[:, 1]

In [None]:
# Fail loudly (instead of just printing False) if the number of predictions does not
# match the number of test rows, so a bad submission file is never written.
assert len(preds) == len(test_X_selected), (
    f"expected {len(test_X_selected)} predictions, got {len(preds)}"
)

In [None]:
# `preds` is ordered like the rows of test_X_selected, whose index holds the sequence
# ids produced by tsfresh. Label each prediction with its id and look it up by the ids
# in the sample submission, rather than relying on both happening to be in the same order.
preds_by_sequence = pd.Series(preds, index=test_X_selected.index)

submission = pd.DataFrame({
    "sequence": sample_sub.sequence,
    "state": preds_by_sequence.reindex(sample_sub.sequence).to_numpy(),
})

# A missing value here would mean a sequence in the sample submission has no prediction.
assert submission["state"].notna().all(), "some sequences have no prediction"

In [None]:
# Write the submission file to the working directory, without the DataFrame index column.
submission.to_csv("submission.csv", index=False)