<a href="https://colab.research.google.com/github/Bryant-Dental/raptor_functions/blob/main/raptor_functions/examples/supervised_end_to_end.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install Libraries

In [1]:
# Uncomment to install the package (e.g. on Colab); %pip targets the running kernel's environment.
# %pip install raptor_functions

In [2]:
# Uncomment to install the additional third-party dependencies into the kernel's environment.
# %pip install mlflow optuna Boruta pycaret awscli boto3 tsfresh

### Configure AWS
- This is necessary for `train_experiments` to log results and artifacts to the AWS instance.

In [3]:
# !aws configure

### Import Packages

In [4]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import mlflow
from raptor_functions.supervised.prediction import load_model, make_prediction, get_model_and_features, get_prediction_features
import xgboost as xgb
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from raptor_functions.supervised.train import train_experiments
from raptor_functions.supervised.datasets import get_data
from raptor_functions.supervised.feature_extraction import get_training_features

### Load Data

In [5]:
# Load the 'validated_breath_data' dataset and preview its first rows.
df = get_data('validated_breath_data')
df.head()

Unnamed: 0,exp_unique_id,exp_name,timesteps,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,sensor_10,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,sensor_22,sensor_23,sensor_24,humidity,measurement_stage,date_exp,time_elapsed,datetime_exp,filename,result
0,0,39A,0.0,316.594,315.854,315.735,313.18,307.498,315.926,316.188,315.854,315.711,315.424,315.591,315.759,103.213,103.81,103.81,105.792,111.593,103.571,103.404,103.213,103.142,103.786,103.404,103.691,52.467,baseline,20/01/2022,0.0,2022-01-20 13:31:47.000,39A_13_31_00.03.txt,Covid
1,0,39A,1.0,316.928,315.854,316.188,313.324,307.331,315.997,316.499,316.093,316.141,315.281,315.257,315.782,103.094,103.762,103.81,106.03,111.784,103.452,103.524,103.333,103.357,103.882,103.357,103.595,45.112,baseline,20/01/2022,0.25,2022-01-20 13:31:47.250,39A_13_31_00.03.txt,Covid
2,0,39A,2.0,315.711,316.045,315.759,313.514,307.403,315.95,315.926,315.95,316.188,315.114,315.233,315.735,103.404,103.834,103.858,105.959,111.808,103.524,103.285,103.189,103.309,104.001,103.452,103.643,35.637,baseline,20/01/2022,0.5,2022-01-20 13:31:47.500,39A_13_31_00.03.txt,Covid
3,0,39A,3.0,316.499,315.854,315.329,312.536,308.095,315.854,316.403,316.093,315.973,315.162,315.711,315.735,103.357,104.025,104.049,106.102,111.832,103.571,103.404,103.237,103.476,104.073,103.38,103.619,28.102,baseline,20/01/2022,0.75,2022-01-20 13:31:47.750,39A_13_31_00.03.txt,Covid
4,0,39A,4.0,317.048,315.568,315.52,312.679,307.188,315.926,316.451,316.355,316.594,315.353,315.353,315.735,103.333,103.786,103.906,105.959,111.784,103.595,103.476,103.357,103.428,103.929,103.357,103.428,26.794,baseline,20/01/2022,1.0,2022-01-20 13:31:48.000,39A_13_31_00.03.txt,Covid


In [6]:
# Inspect the experiment ids on the last rows of the frame.
df['exp_unique_id'].tail()

148    137
149    137
150    137
151    137
152    137
Name: exp_unique_id, dtype: int64

### Feature Engineering
- Features are extracted from each sensor array for all cycles of the experiment.
- These include simple statistical features such as mean, median, std and variance, and transform features, e.g. FFT and wavelet transform.
- There is an option to use only the raw signals or to add the offset and gradient signals.
- The Boruta algorithm is used to select relevant features.
- The default model for Boruta is XGBoost, but it currently does not work in Google Colab.
- It is advisable to use another tree-based model, e.g. a random forest.

In [7]:

# Tree model passed as `tree_model` to the feature-extraction step. A fixed
# random_state makes the forest itself deterministic, so repeated runs of the
# notebook do not differ because of the forest's internal randomness.
forest = RandomForestClassifier(random_state=42)

In [8]:
# Extract training features with the offset and gradient signals enabled,
# using `forest` as the tree model for feature selection.
# NOTE(review): this rebinds `df`, so re-running the cell feeds the extracted
# features back into get_training_features instead of the raw data — reload
# the data before re-running this cell.
df = get_training_features(df, offset=True, gradient=True, tree_model=forest)

Extracting all features


Feature Extraction:  20%|██        | 4/20 [00:42<01:34,  5.88s/it]

In [None]:
# Preview the frame returned by the feature-extraction step.
df.head()

In [None]:
# NOTE(review): leftover debug output — no later cell depends on it; consider removing this cell.
print("hello world 2")

### Model Training

In [None]:
# Train on the extracted features. Per the "Configure AWS" section, results and
# artifacts are logged on the AWS instance, so AWS must be configured first.
train_experiments(df)

### Load Model

In [None]:

# S3 URI of the logged model artifact used by the loading and prediction cells.
# NOTE(review): hard-coded to one specific logged model; presumably this should be
# replaced with the URI of your own trained model — confirm.
model_uri = 's3://raptor-mlflow-data/mlartifacts/1/1a2c4d859a074573869d1add23fb6075/artifacts/Random Forest Classifier'
# loaded_model = load_model(model_uri)


In [None]:
# Load model as a PyFuncModel. Kept under its own name so it is not silently
# discarded by the scikit-learn load that follows.
pyfunc_model = mlflow.pyfunc.load_model(model_uri)

# Load model as a Sklearn flavor. `loaded_model` is the object the prediction
# cells call `.predict` on.
loaded_model = mlflow.sklearn.load_model(model_uri)



In [None]:
# # Load model from pickle file.
# model_pickle_filepath  = 'path/to/pickle_model'

# loaded_model = load_model(model_pickle_filepath)

## Prediction

In [None]:
# Load the handheld dataset under its own name so the training features held
# in `df` are not overwritten (get_data is already imported in the imports cell).
handheld_df = get_data('handheld_data')

# select one single experiment
df_exp = handheld_df.groupby('exp_unique_id').get_group(0)

In [None]:
# Model inputs: every column of the selected experiment except the 'result' label.
X = df_exp.drop(columns='result')

### Method 1

In [None]:
# Build the input that is passed to loaded_model.predict in the next cell.
# Presumably model_uri is used to look up the features the model expects — confirm.
prediction_data = get_prediction_features(X, model_uri)

In [None]:
# Predict with the previously loaded model on the prepared prediction features.
prediction = loaded_model.predict(prediction_data)

### Method 2

In [None]:
# Single-call alternative to Method 1: takes the experiment data and the model URI directly.
# Presumably it performs feature preparation and model loading internally — confirm.
pred = make_prediction(X, model_uri)