# Experiment tool with open dataset

In [1]:
import pandas as pd

from src.xtrees.TreeDash import *
from src.xtrees.TreePlot import *
from src.xtrees.VizTree import *
from src.xtrees.ForestBasedTree import *

from jupyter_dash import JupyterDash

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Shared random seed: passed as `random_state` to the train/test splits and to
# every model fitted below so that runs are reproducible.
seed = 42

In [2]:
# Cleaning functions

def dummies(df):
    """One-hot encode every ``category``-dtype column of ``df``.

    Each categorical column ``col`` is replaced by one integer (0/1) indicator
    column per category, named ``"{col}_is_{value}"`` where ``value`` is the
    category label lower-cased, with spaces and underscores turned into
    hyphens. Non-categorical columns are kept unchanged and come first; the
    indicator columns follow in the order of the original categorical columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the categorical columns replaced by their indicators.
    """
    categorical_columns = df.select_dtypes(include=['category']).columns
    encoded_frames = []
    for col in categorical_columns:
        prefix = f"{col}_is_"
        encoded = pd.get_dummies(df[col], prefix=col, prefix_sep='_is_', dtype=int)
        # Strip the known prefix instead of splitting on '_is_': a label (or a
        # column name) that itself contains '_is_' would otherwise be truncated
        # to the text after its last '_is_'.
        encoded.columns = [
            prefix + str(name)[len(prefix):].lower().replace(' ', '-').replace('_', '-')
            for name in encoded.columns
        ]
        encoded_frames.append(encoded)
    # Concatenate once rather than growing a frame inside the loop.
    return pd.concat([df.drop(columns=categorical_columns), *encoded_frames], axis=1)

def clean_feature_names(df):
    """Normalise the column names of ``df`` in place and return ``df``.

    Names are lower-cased and every run of spaces, underscores and/or hyphens
    is collapsed into a single hyphen, e.g. ``"BMI_Category"`` ->
    ``"bmi-category"`` and ``"A - B"`` -> ``"a-b"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose (string) column labels are rewritten. The frame itself is
        mutated.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the call can be used in an assignment.
    """
    # One regex pass collapses separator runs of any length; a single literal
    # '--' -> '-' replacement would leave e.g. '---' as '--'.
    df.columns = df.columns.str.lower().str.replace(r'[ _-]+', '-', regex=True)
    return df



## Sleep Health and Lifestyle Dataset
[source](https://www.kaggle.com/datasets/uom190346a/sleep-health-and-lifestyle-dataset)

To run this notebook, download the CSV file from Kaggle and save it as:
`data/Sleep_health_and_lifestyle_dataset.csv`

## Cleaning

In [3]:
# Load the raw data, indexed by respondent id.
# NOTE(review): recent pandas versions parse the literal string "None" as a
# missing value by default, so 'Sleep Disorder' may contain NaN after this
# read — confirm against the CSV and the installed pandas version.
sleep = pd.read_csv("data/Sleep_health_and_lifestyle_dataset.csv", index_col="Person ID")

# Declare every column's dtype in one place; 'category' columns are the ones
# that get one-hot encoded by `dummies` further down.
sleep = sleep.astype({
    'Gender': 'category',
    'Age': int,
    'Occupation': 'category',
    'Sleep Duration': float,
    'Quality of Sleep': int,
    'Physical Activity Level': int,
    'Stress Level': int,
    'BMI Category': 'category',
    'Heart Rate': int,
    'Daily Steps': int,
    'Sleep Disorder': 'category',
})

# 'Blood Pressure' holds two '/'-separated readings; split them into two
# integer columns and drop the combined text column.
sleep[['systolic', 'diastolic']] = sleep['Blood Pressure'].str.split('/', expand=True).astype(int)
sleep = sleep.drop(columns='Blood Pressure')

# Normalise the column names first, then one-hot encode the categoricals.
sleep = dummies(clean_feature_names(sleep))


In [4]:
# Preview the first rows of the cleaned, one-hot-encoded frame.
print(sleep.head())

           age  sleep-duration  quality-of-sleep  physical-activity-level  \
Person ID                                                                   
1           27             6.1                 6                       42   
2           28             6.2                 6                       60   
3           28             6.2                 6                       60   
4           28             5.9                 4                       30   
5           28             5.9                 4                       30   

           stress-level  heart-rate  daily-steps  systolic  diastolic  \
Person ID                                                               
1                     6          77         4200       126         83   
2                     8          75        10000       125         80   
3                     8          75        10000       125         80   
4                     8          85         3000       140         90   
5                     

## Regression

In [5]:
# Regression task: predict sleep duration. 'quality-of-sleep' is removed from
# the feature matrix together with the target itself.
target = "sleep-duration"
sleep_y = sleep[target]
sleep_X_reg = sleep.drop(columns=[target, 'quality-of-sleep'])

sleep_feature_names, sleep_feature_types = sleep_X_reg.columns, sleep_X_reg.dtypes

# Hold out 30% of the rows for testing; the shared seed keeps the split stable.
sleep_X_train, sleep_X_test, sleep_y_train, sleep_y_test = train_test_split(
    sleep_X_reg,
    sleep_y,
    test_size=0.3,
    random_state=seed,
)

In [6]:
# Generic aliases for the regression data, used by the modelling cells below.
X, y = sleep_X_reg, sleep_y
feature_names, feature_types = sleep_feature_names, sleep_feature_types

X_train, X_test = sleep_X_train, sleep_X_test
y_train, y_test = sleep_y_train, sleep_y_test

In [7]:

# Random-forest hyperparameters
num_of_estimators = 20
max_depth = 4
# 2% of the training rows per leaf, but never fewer than one sample.
min_sample_leaf = max(1, int(0.02 * len(X_train)))

sleep_rf = RandomForestRegressor(
    n_estimators=num_of_estimators,
    max_depth=max_depth,
    min_samples_leaf=min_sample_leaf,
    random_state=seed,
)
sleep_rf.fit(X_train, y_train)

# Settings forwarded to ForestBasedTree.fit
minimal_forest_size = 10
max_number_of_branches = 50
exclusion_threshold = 0.8

sleep_fbt = ForestBasedTree(random_state=seed)
sleep_fbt.fit(
    sleep_rf,
    X_train,
    y_train,
    feature_types,
    feature_names,
    minimal_forest_size=minimal_forest_size,
    amount_of_branches_threshold=max_number_of_branches,
    exclusion_threshold=exclusion_threshold,
)

# Inspect the first rows of the fitted tree's `cs_df` table.
print(sleep_fbt.cs_df.head())


   0_upper  0_lower  1_upper  1_lower  2_upper  2_lower  3_upper  3_lower  \
0     48.0     -inf     65.0     -inf      6.5     -inf      inf     -inf   
1     48.0     -inf     70.0     65.0      6.5     -inf      inf     -inf   
2     48.0     -inf     70.0     -inf      6.5     -inf      inf     -inf   
3     48.0     -inf     70.0     -inf      6.5     -inf      inf     -inf   
4     30.5     -inf      inf     72.5      6.5     -inf      inf     -inf   

   4_upper  4_lower  ...  22_lower  23_upper  23_lower  24_upper  24_lower  \
0      inf     -inf  ...      -inf       0.5      -inf       inf      -inf   
1      inf     -inf  ...      -inf       0.5      -inf       inf      -inf   
2      inf     -inf  ...      -inf       inf       0.5       inf      -inf   
3      inf     -inf  ...      -inf       inf       0.5       inf      -inf   
4      inf     -inf  ...      -inf       0.5      -inf       inf      -inf   

   25_upper  25_lower  number_of_samples  branch_probability  regres

In [8]:

# Wrap the fitted forest-based tree together with the full regression feature
# matrix for visualisation.
sleep_fbt_viz = VizTree(sleep_fbt, sleep_X_reg)

# Build the combined dashboard from the tree view, the held-out test data, the
# full feature matrix and the underlying random forest, then run it with
# port=8070 (presumably starts a local Dash server — confirm in TreeDash).
combined_dashboard = CombinedDashboard(sleep_fbt_viz, X_test, y_test, X, sleep_rf)
combined_dashboard.run(port=8070)


In [9]:
# Rebuild the tree view (same arguments as in the dashboard cell) and render
# it as a Sankey plot with text and labels switched off.
sleep_fbt_viz = VizTree(sleep_fbt, sleep_X_reg)
sankey_plot = SankeyTreePlot(sleep_fbt_viz, show_text=False, show_label=False)
sankey_plot.show()

## Classification

In [10]:
# Classification task: predict sleep quality. 'sleep-duration' is removed from
# the feature matrix together with the target itself.
# NOTE: this cell rebinds target, sleep_y, the sleep_* split variables and the
# generic X / y / X_train / ... aliases that the regression cells also use;
# re-run the regression setup cells before re-running any regression cell.
target = "quality-of-sleep"
sleep_X_class = sleep.drop(columns=[target, 'sleep-duration'])

sleep_feature_names, sleep_feature_types = sleep_X_class.columns, sleep_X_class.dtypes

# Encode the target values as integer class ids and keep the original labels,
# as strings, for display.
le = LabelEncoder()
sleep_y = le.fit_transform(sleep[target])
sleep_class_names = list(map(str, le.classes_))

# Hold out 30% of the rows for testing; the shared seed keeps the split stable.
sleep_X_train, sleep_X_test, sleep_y_train, sleep_y_test = train_test_split(
    sleep_X_class,
    sleep_y,
    test_size=0.3,
    random_state=seed,
)

# Generic aliases for the classification data.
X, y = sleep_X_class, sleep_y
feature_names, feature_types = sleep_feature_names, sleep_feature_types

X_train, X_test = sleep_X_train, sleep_X_test
y_train, y_test = sleep_y_train, sleep_y_test

In [11]:
# rf parameters
# Single decision tree classifier, depth capped at 10 and seeded for
# reproducibility, fitted on the classification training split.
sleep_dt = DecisionTreeClassifier(max_depth=10,random_state=seed)
sleep_dt.fit(X_train, y_train)

In [12]:

# Wrap the fitted decision tree with the full classification feature matrix
# and the string class labels.
viz_dt = VizTree(sleep_dt, X=sleep_X_class, class_names=sleep_class_names)

# Prune before plotting; the argument 5 is presumably a depth limit — TODO
# confirm against VizTree.prune. The pruned result is what gets plotted.
prune = viz_dt.prune(5)
sankey_plot = SankeyTreePlot(prune, show_text=True, show_label=True)
sankey_plot.show()


In [13]:

# Rebuild the unpruned tree view and run its dashboard on the held-out test
# data with port=8071 (a different port from the regression dashboard's 8070).
viz_dt = VizTree(sleep_dt, X=sleep_X_class, class_names=sleep_class_names)
dt_dashboard = VizTreeDashboard(viz_dt, X_test, y_test)
dt_dashboard.run(port=8071)
