In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download(
    "uom190346a/sleep-health-and-lifestyle-dataset")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: C:\Users\User\.cache\kagglehub\datasets\uom190346a\sleep-health-and-lifestyle-dataset\versions\2


## Implementasi Teknik Data Preparation

In [2]:
import pandas as pd

file_path = r"./Sleep_health_and_lifestyle_dataset.csv"
df = pd.read_csv(file_path)

df.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea


### Data Preparation dengan One Hot Encoding

In [3]:
from sklearn.preprocessing import OneHotEncoder

categorical_cols = ["Gender", "Occupation", "BMI Category", "Sleep Disorder"]

numerical_cols = df.drop(columns=categorical_cols).columns

encoder = OneHotEncoder(drop='first', sparse_output=False)

encoded_array = encoder.fit_transform(df[categorical_cols])

encoded_cols = encoder.get_feature_names_out(categorical_cols)

encoded_df = pd.DataFrame(encoded_array, columns=encoded_cols)

df_encoded = pd.concat(
    [df[numerical_cols].reset_index(drop=True), encoded_df], axis=1)

df_encoded.head()

Unnamed: 0,Person ID,Age,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,Blood Pressure,Heart Rate,Daily Steps,Gender_Male,...,Occupation_Sales Representative,Occupation_Salesperson,Occupation_Scientist,Occupation_Software Engineer,Occupation_Teacher,BMI Category_Normal Weight,BMI Category_Obese,BMI Category_Overweight,Sleep Disorder_Sleep Apnea,Sleep Disorder_nan
0,1,27,6.1,6,42,6,126/83,77,4200,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,2,28,6.2,6,60,8,125/80,75,10000,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3,28,6.2,6,60,8,125/80,75,10000,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,28,5.9,4,30,8,140/90,85,3000,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,5,28,5.9,4,30,8,140/90,85,3000,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


### Data Preparation dengan Outlier Removal

In [4]:
numerical_columns = ['Sleep Duration', 'Physical Activity Level',
                     'Stress Level', 'Heart Rate', 'Daily Steps']

df_no_outliers = df.copy()

for col in numerical_columns:
    Q1 = df_no_outliers[col].quantile(0.25)
    Q3 = df_no_outliers[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df_no_outliers = df_no_outliers[(df_no_outliers[col] >= lower_bound) & (
        df_no_outliers[col] <= upper_bound)]

df_no_outliers.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
7,8,Male,29,Doctor,7.8,7,75,6,Normal,120/80,70,8000,
8,9,Male,29,Doctor,7.8,7,75,6,Normal,120/80,70,8000,


In [5]:
print("Jumlah baris sebelum:", df.shape[0])
print("Jumlah baris sesudah:", df_no_outliers.shape[0])

Jumlah baris sebelum: 374
Jumlah baris sesudah: 359


### Data Preparation dengan Normalization

In [6]:
from sklearn.preprocessing import MinMaxScaler

norm_cols = ['Sleep Duration', 'Physical Activity Level',
             'Stress Level', 'Heart Rate', 'Daily Steps']
scaler = MinMaxScaler()
df_normalized = df.copy()
df_normalized[norm_cols] = scaler.fit_transform(df_normalized[norm_cols])

df_normalized.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,0.111111,6,0.2,0.6,Overweight,126/83,0.571429,0.171429,
1,2,Male,28,Doctor,0.148148,6,0.5,1.0,Normal,125/80,0.47619,1.0,
2,3,Male,28,Doctor,0.148148,6,0.5,1.0,Normal,125/80,0.47619,1.0,
3,4,Male,28,Sales Representative,0.037037,4,0.0,1.0,Obese,140/90,0.952381,0.0,Sleep Apnea
4,5,Male,28,Sales Representative,0.037037,4,0.0,1.0,Obese,140/90,0.952381,0.0,Sleep Apnea


### Data Preparaion dengan Standarization

In [7]:
from sklearn.preprocessing import StandardScaler

std_cols = ['Sleep Duration', 'Physical Activity Level',
            'Stress Level', 'Heart Rate', 'Daily Steps']
scaler_std = StandardScaler()
df_standardized = df.copy()
df_standardized[std_cols] = scaler_std.fit_transform(df_standardized[std_cols])

df_standardized.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,-1.298887,6,-0.825418,0.347021,Overweight,126/83,1.654719,-1.619584,
1,2,Male,28,Doctor,-1.173036,6,0.039844,1.475592,Normal,125/80,1.170474,1.970077,
2,3,Male,28,Doctor,-1.173036,6,0.039844,1.475592,Normal,125/80,1.170474,1.970077,
3,4,Male,28,Sales Representative,-1.550588,4,-1.40226,1.475592,Obese,140/90,3.591698,-2.362273,Sleep Apnea
4,5,Male,28,Sales Representative,-1.550588,4,-1.40226,1.475592,Obese,140/90,3.591698,-2.362273,Sleep Apnea


### Pembuatan DataSet

In [8]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder

df.isnull().sum()

Person ID                    0
Gender                       0
Age                          0
Occupation                   0
Sleep Duration               0
Quality of Sleep             0
Physical Activity Level      0
Stress Level                 0
BMI Category                 0
Blood Pressure               0
Heart Rate                   0
Daily Steps                  0
Sleep Disorder             219
dtype: int64

In [9]:
df_encoded = df.copy()
for col in df_encoded.select_dtypes(include=['object']).columns:
    df_encoded[col] = LabelEncoder().fit_transform(df_encoded[col])

X = df_encoded.drop('Sleep Duration', axis=1)
y = df_encoded['Sleep Duration']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Jumlah data latih:", X_train.shape[0])
print("Jumlah data uji:", X_test.shape[0])

Jumlah data latih: 299
Jumlah data uji: 75


In [10]:
model = DecisionTreeRegressor(random_state=42)

cv_scores = cross_val_score(model, X, y, cv=5, scoring='r2')

print("Skor R² dari Cross Validation (5-fold):", cv_scores)
print("Rata-rata R²:", cv_scores.mean())

Skor R² dari Cross Validation (5-fold): [0.82106023 0.69550198 0.8454479  0.96903624 0.96797266]
Rata-rata R²: 0.8598038013477151


In [11]:
print(f"📊 Rata-rata skor R² dari cross-validation: {cv_scores.mean():.4f}")

📊 Rata-rata skor R² dari cross-validation: 0.8598
