# Getting data -> (X,y) with all the features

In [25]:
# Load the 'handheld_data' dataset through the project helper and bind it to `df`.
# Later cells group this table by `exp_unique_id` and sort it by `timesteps`.
from raptor_functions.supervised.datasets import get_data
df = get_data('handheld_data')

In [26]:
# Keep an independent snapshot of the raw data: `df` is rebound to the extracted-feature
# table further down, while the raw-data integrity suite still needs the original rows.
# A copy (rather than an alias) also protects the snapshot from in-place edits to `df`.
df_0 = df.copy()

In [27]:
import numpy as np
import pandas as pd
import xgboost as xgb
from boruta import BorutaPy
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import (
    ComprehensiveFCParameters,
    extract_features,
)
from tsfresh import extract_features

In [28]:
# Columns of the raw table: the experiment id, the time index,
# and the sensor channels sensor_1 ... sensor_24.
FEATURES = ["exp_unique_id", "timesteps"] + [f"sensor_{i}" for i in range(1, 25)]

# Name of the target column.
TARGET_COL = "result"

# Column-role names used by the cells below.
unique_id, timesteps, label = "exp_unique_id", "timesteps", "result"

In [29]:
# tsfresh feature-calculator settings; passed to extract_features below
# as `default_fc_parameters`.
extraction_settings = ComprehensiveFCParameters()

In [30]:
# One label per experiment: the first non-null label value within each experiment id.
y = df.groupby(unique_id)[label].first()
# Everything except the label column stays as the input table.
X = df.drop(columns=label)

In [31]:
# Build the tsfresh input from the raw snapshot `df_0` rather than from `X`.
# `X` is rebound to the extracted feature matrix at the end of this cell, so reading
# it here would make the cell fail when it is run a second time.
features = [unique_id, timesteps] + [
    col for col in df_0.columns if col.startswith('sensor')
]
X_long = df_0[features]

X = extract_features(
    X_long,
    column_id=unique_id,
    column_sort=timesteps,
    default_fc_parameters=extraction_settings,
    # impute replaces NaN / +-inf values in the extracted features column-wise;
    # it does not drop features
    impute_function=impute,
)

Feature Extraction: 100%|██████████| 20/20 [00:22<00:00,  1.15s/it]


In [32]:
# Preview the first rows of the extracted feature matrix.
X.head()

Unnamed: 0,sensor_1__variance_larger_than_standard_deviation,sensor_1__has_duplicate_max,sensor_1__has_duplicate_min,sensor_1__has_duplicate,sensor_1__sum_values,sensor_1__abs_energy,sensor_1__mean_abs_change,sensor_1__mean_change,sensor_1__mean_second_derivative_central,sensor_1__median,...,sensor_24__permutation_entropy__dimension_6__tau_1,sensor_24__permutation_entropy__dimension_7__tau_1,sensor_24__query_similarity_count__query_None__threshold_0.0,"sensor_24__matrix_profile__feature_""min""__threshold_0.98","sensor_24__matrix_profile__feature_""max""__threshold_0.98","sensor_24__matrix_profile__feature_""mean""__threshold_0.98","sensor_24__matrix_profile__feature_""median""__threshold_0.98","sensor_24__matrix_profile__feature_""25""__threshold_0.98","sensor_24__matrix_profile__feature_""75""__threshold_0.98",sensor_24__mean_n_absolute_max__number_of_maxima_7
0,1.0,0.0,0.0,1.0,15693.25,1962165.0,1.147378,0.002984,-0.005024,131.184,...,4.3731,4.569595,0.0,1.17142,3.822228,2.409795,2.474262,1.771601,2.914718,155.948143
1,1.0,0.0,0.0,1.0,15906.505,2001377.0,0.895402,-0.003984,0.003008,131.121,...,4.400694,4.604489,0.0,0.842265,3.331958,1.880553,1.853738,1.349697,2.452201,156.120286
2,1.0,1.0,0.0,1.0,15674.239,1960253.0,1.115709,-0.004008,0.003028,131.4975,...,4.273651,4.507078,0.0,0.781496,4.026202,2.332053,2.333184,1.543296,3.029155,158.410286
3,1.0,0.0,0.0,1.0,15502.825,1904814.0,0.942016,0.000976,0.005905,128.703,...,4.50433,4.642867,0.0,0.936207,3.382332,2.014803,2.100663,1.232587,2.590493,156.301571
4,1.0,0.0,0.0,1.0,16154.553,2069512.0,1.050535,0.0,-0.003599,133.588,...,4.336223,4.530518,0.0,0.913677,3.611633,2.139663,2.118371,1.551142,2.609582,158.603143


In [33]:
# Preview the first per-experiment labels.
y.head()

exp_unique_id
0    Control
1      Covid
2    Control
3      Covid
4    Control
Name: result, dtype: object

In [34]:
# Disabled: one-hot encode the labels and keep the 'Covid' indicator as a binary target.
# NOTE(review): if re-enabled, the last line would set `y` to None, because
# `rename(..., inplace=True)` returns None; drop either `inplace=True` or the assignment.
# y_binary = pd.get_dummies(y)
# y_binary = y_binary['Covid']
# y = y_binary
# y = y.to_frame()
# y = y.rename(columns = {'Covid':'result'}, inplace = True)

In [35]:
# Put the extracted features and the label side by side (pd.concat aligns them on the index).
# NOTE: this rebinds `df`, which until now held the raw data; the raw data stays
# reachable through `df_0`.
df = pd.concat([X,y], axis=1)

# Train / Test datasets + Model

In [36]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing. The split is seeded (same seed value as the
# RandomForestClassifier) so that Restart & Run All reproduces the same train/test
# partition and therefore the same reports.
# NOTE(review): consider stratify=y to keep the class ratio equal in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=0
)

In [37]:
# Re-attach the labels to each split so deepchecks can be given one frame per split.
df_train, df_test = (
    pd.concat([features_part, label_part], axis=1)
    for features_part, label_part in ((X_train, y_train), (X_test, y_test))
)

In [38]:
# Fit a seeded random forest on the training split.
from sklearn.ensemble import RandomForestClassifier

# fit() returns the estimator itself, so construction and training can be chained.
model = RandomForestClassifier(random_state=0).fit(X_train, y_train)
model

RandomForestClassifier(random_state=0)

In [39]:
# Disabled alternative model (XGBoost).
# NOTE(review): XGBRegressor is a regressor, while `y` holds the class labels
# 'Control' / 'Covid' -- confirm the intended estimator and label encoding before re-enabling.
# import xgboost as xgb
# model = xgb.XGBRegressor()
# model.fit(X_train, y_train)


# Deep checks

In [40]:
# Name of the label column handed to deepchecks; same value as `label` / `TARGET_COL`
# defined in the configuration cell.
label_col = 'result'

In [41]:
from deepchecks.tabular import Dataset

# cat_features=[] states explicitly that there are no categorical features; without it
# deepchecks would infer them automatically. If categorical features existed, the best
# practice would be to list their names here instead.
ds_train, ds_test = (
    Dataset(frame, label=label_col, cat_features=[])
    for frame in (df_train, df_test)
)

New Data: Single Dataset Validation

When you start working with a new dataset, you have only a single dataset (no train-test split), and you probably don’t have a model. As part of your EDA you want to ensure your data’s integrity, and have it ready for your needs. For example, you want to know if there are many duplicate samples, problems with string or categorical features, significant outliers, inconsistent labels, etc.

For these purposes you can use the deepchecks.tabular.suites.single_dataset_integrity() suite.

In [42]:
from deepchecks.tabular.suites import single_dataset_integrity

# Wrap the feature table in a deepchecks Dataset declared the same way as ds_train / ds_test:
# the label column is named explicitly and cat_features=[] prevents automatic inference of
# categorical features. A bare DataFrame carries neither piece of information.
ds_full = Dataset(df, label=label_col, cat_features=[])

suite = single_dataset_integrity()
suite_result = suite.run(ds_full)
suite_result.save_as_html('df_integrify.html')

Single Dataset Integrity Suite:   0%|          | 0/8 [00:00<?, ? Check/s]

In [43]:
# Run the same integrity suite on the raw data kept in `df_0`.
# This reuses the `suite` object created in the previous cell.
suite_result = suite.run(df_0)
# Save with an explicit .html extension, like the other saved reports, so the file
# is recognised as an HTML report.
suite_result.save_as_html('df0_integrify.html')


Single Dataset Integrity Suite:   0%|          | 0/8 [00:00<?, ? Check/s]

After Splitting the Data: Train-Test Validation

When you split your data (for whichever purpose and manner), you have two or more separate datasets, however you might not have a model yet. Just before you continue working with your data you want to ensure that the splits are indeed representative as you want them to be. For example, you want to verify that the classes are balanced similarly, that there is no significant change in distributions between the features or labels in each of the classes, that there is no potential data leakage that may contaminate your model or perceived results, etc.

For these purposes you can use the deepchecks.tabular.suites.train_test_validation() suite.

In [44]:
# Disabled: train/test validation suite.
# NOTE(review): if re-enabled, consider saving as 'train_test_validation.html' to match the
# reports that are saved with an explicit .html extension.
# from deepchecks.tabular.suites import train_test_validation
# suite = train_test_validation()
# suite_result = suite.run(train_dataset=ds_train, test_dataset=ds_test, model=model)
# suite_result.save_as_html('train_test_validation')

After Training a Model: Analysis & Validation

At this phase you have a trained model which you want to evaluate. Thus, you probably want to examine several performance metrics, compare them to various benchmarks, and construct a clear picture of the model’s performance. You may also want to try to identify where it under-performs, and investigate whether there are any insights you can use to improve its performance.

For these purposes you can use the deepchecks.tabular.suites.model_evaluation() suite.

In [45]:
from deepchecks.tabular.suites import model_evaluation

# Run the model-evaluation suite on the fitted model with both splits and save the report.
suite = model_evaluation()
suite_result = suite.run(
    train_dataset=ds_train,
    test_dataset=ds_test,
    model=model,
)
suite_result.save_as_html('model_evaluation.html')

Model Evaluation Suite:   0%|          | 0/11 [00:00<?, ? Check/s]

General Overview: Full Suite

Here you want to have a quick overview of the project, and receive all of the insights that you can get, given a specific state of the model and the data.

For this purpose you can use the deepchecks.tabular.suites.full_suite().

In [46]:
from deepchecks.tabular.suites import full_suite

# Run the deepchecks full suite on both splits together with the fitted model,
# then save the report.
suite = full_suite()
suite_result = suite.run(
    model=model,
    train_dataset=ds_train,
    test_dataset=ds_test,
)
suite_result.save_as_html('deep_check_full.html')

Full Suite:   0%|          | 0/35 [00:00<?, ? Check/s]


invalid value encountered in double_scalars

