# Sleep Research - XGBoost Model

In [2]:
! pip install pandas
! pip install matplotlib
! pip install seaborn
! pip install openpyxl
! pip install xgboost
! pip install scikit-learn



In [None]:

import pandas as pd
import numpy as np
from sklearn.metrics import classification_report

In [None]:
# Load the dataset
# Alternative inputs (swap the active assignment to use one of them):
# DATA_FILE = 'RawDataset.xlsx'
# DATA_FILE = '5_Features.xlsx'
DATA_FILE = '70_Features.xlsx'
try:
    # Read the workbook with the first row (header=0) used as column names.
    df = pd.read_excel(DATA_FILE, header=0, engine='openpyxl')
    print("Loaded successfully:", DATA_FILE)
except Exception as e:
    # Best-effort load: report the problem and leave `df` as None so that the
    # following cells, which check `df is not None`, can skip gracefully.
    print("Failed to load. Update the `DATA_FILE` variable to the correct location. Error:", e)
    df = None

Loaded successfully: 70_Features.xlsx


In [3]:
# Quick overview: preview the first rows, or show nothing when loading failed
None if df is None else df.head()

Unnamed: 0,Mean,StdDev,Skew,Kurt,Peak2Peak,EnergyRaw,Median,IQR,P01,P05,...,AmpEntropy,DWT_A3,DWT_D3,DWT_D2,DWT_D1,WaveletEntropy,SampleEntropy,PermutationEntropy,DFA,Label
0,3.414019e-16,1.0,0.788815,0.423563,4.531697,0.998223,-0.013784,1.244292,-1.545235,-1.377913,...,3.05911,0.97798,0.010728,0.007757,0.003535,0.128075,0,0,0,2
1,-5.690032e-17,1.0,0.541208,-0.592408,3.976004,0.99658,-0.126803,1.414762,-1.497482,-1.350678,...,3.220521,0.972925,0.018076,0.005841,0.003157,0.147468,0,0,0,2
2,-3.983022e-16,1.0,0.507958,-0.591504,4.063041,0.999362,-0.105651,1.44672,-1.561757,-1.393596,...,3.234085,0.984795,0.007653,0.004495,0.003057,0.094376,0,0,0,2
3,-3.698521e-16,1.0,0.276668,-0.900604,4.127095,0.999542,-0.18935,1.565204,-1.802516,-1.334522,...,3.254256,0.973561,0.018094,0.004841,0.003504,0.144302,0,0,0,2
4,7.397041e-16,1.0,0.33201,-0.677892,4.19047,0.999081,-0.144789,1.473932,-1.873234,-1.454931,...,3.264122,0.982235,0.008991,0.004324,0.00445,0.107604,0,0,0,2


In [4]:
# Missing values and basic stats (skipped entirely when the load failed)
if df is not None:
    # Per-column count of missing cells, largest first
    na_counts = df.isna().sum()
    missing = na_counts.sort_values(ascending=False)
    # Show only the columns that actually have missing values
    display(missing.loc[missing > 0])
    # Summary statistics, transposed so that each row describes one column
    display(df.describe(include='all').transpose())

Series([], dtype: int64)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Mean,2119.0,-2.685244e-20,2.759895e-16,-3.357119e-15,-1.138006e-16,0.000000,9.957556e-17,3.470919e-15
StdDev,2119.0,1.000000e+00,7.552664e-16,1.000000e+00,1.000000e+00,1.000000,1.000000e+00,1.000000e+00
Skew,2119.0,1.123786e-01,8.880763e-01,-3.768204e+00,-4.040333e-01,0.112730,5.995871e-01,4.783542e+00
Kurt,2119.0,4.265156e-01,2.255863e+00,-1.803445e+00,-7.537274e-01,-0.180930,6.615262e-01,2.377076e+01
Peak2Peak,2119.0,4.902540e+00,1.062709e+00,2.711897e+00,4.177208e+00,4.765481,5.516499e+00,1.105119e+01
...,...,...,...,...,...,...,...,...
WaveletEntropy,2119.0,3.421243e-01,1.816498e-01,2.722998e-02,2.019265e-01,0.305638,4.535465e-01,1.174381e+00
SampleEntropy,2119.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00
PermutationEntropy,2119.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00
DFA,2119.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00


In [5]:
# Prepare the feature matrix X and the target vector y for XGBoost
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

# Ensure dataframe 'df' is present; reload it if it is missing or None
if 'df' not in globals() or df is None:
    path = DATA_FILE
    try:
        df = pd.read_excel(path, header=0, engine='openpyxl')
        print('Loaded', path)
    except Exception as e:
        raise RuntimeError(f'Dataframe `df` not found and failed to load from {path}') from e

# use Label column as target
TARGET_COLUMN = 'Label'
print('Using target column:', TARGET_COLUMN)

# Prepare X and y. Rows without a label are dropped; dropna returns a new
# frame, so `df` itself is left untouched.
data = df.dropna(subset=[TARGET_COLUMN])

# XGBoost expects class ids 0..n_classes-1. LabelEncoder produces exactly that
# for any set of raw labels (for contiguous labels starting at 1 the result is
# the same as `label - 1`), and `le.classes_` keeps the mapping back to the
# original label values.
le = LabelEncoder()
y = pd.Series(
    le.fit_transform(data[TARGET_COLUMN]),
    index=data.index,
    name=TARGET_COLUMN,
)  # kept as a Series so y.value_counts() / y.unique() keep working
X = data.drop(columns=[TARGET_COLUMN])

Using target column: Label


In [6]:
# Number of rows per class id in the target, most frequent class first
y.value_counts()

Label
0    728
1    701
2    262
3    180
5    146
4    102
Name: count, dtype: int64

In [7]:
# Column names, non-null counts and dtypes of the feature matrix
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2119 entries, 0 to 2118
Data columns (total 70 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Mean                  2119 non-null   float64
 1   StdDev                2119 non-null   float64
 2   Skew                  2119 non-null   float64
 3   Kurt                  2119 non-null   float64
 4   Peak2Peak             2119 non-null   float64
 5   EnergyRaw             2119 non-null   float64
 6   Median                2119 non-null   float64
 7   IQR                   2119 non-null   float64
 8   P01                   2119 non-null   float64
 9   P05                   2119 non-null   float64
 10  P10                   2119 non-null   float64
 11  P25                   2119 non-null   float64
 12  P75                   2119 non-null   float64
 13  P90                   2119 non-null   float64
 14  P95                   2119 non-null   float64
 15  P99                  

In [8]:
# Distinct class ids present in the target, in order of first appearance
y.unique()

array([1, 5, 4, 0, 3, 2])

In [9]:
# Feature names as loaded from the spreadsheet
X.columns

Index(['Mean', 'StdDev', 'Skew', 'Kurt', 'Peak2Peak', 'EnergyRaw', 'Median',
       'IQR', 'P01', 'P05', 'P10', 'P25', 'P75', 'P90', 'P95', 'P99',
       'TrimMean_5_95', 'MAD', 'RMS', 'HjorthActivity', 'HjorthMobility',
       'HjorthComplexity', 'RMSSD', 'WaveformLen', 'ZCR', 'DiffSkew',
       'DiffKurt', 'SSC', 'ACF@1', 'ACF@2', 'ACF@3', 'ACF@5', 'ACF@10',
       'ACF@15', 'ACF@20', 'ACF@25', 'DominantFreq', 'SpectralEntropy',
       'SpectralCentroid', 'SpectralBandwidth', 'SpectralFlatness',
       'RelPow_<2Hz', 'RelPow_2-5Hz', 'RelPow_5-10Hz', 'RelPow_10-20Hz',
       'RollOff50', 'RollOff75', 'RollOff85', 'RollOff90', 'RollOff95',
       'SpecSlope', 'WelchCentroid', 'WelchBandwidth', 'WelchEntropy',
       'SpecSkew', 'SpecKurt', 'SpecCrest', 'Ratio_(2-5)/(5-10)',
       'Ratio_(<2)/(2-5)', 'Ratio_(5-10)/(10-20)', 'Ratio_(<2)/(5-10)',
       'AmpEntropy', 'DWT_A3', 'DWT_D3', 'DWT_D2', 'DWT_D1', 'WaveletEntropy',
       'SampleEntropy', 'PermutationEntropy', 'DFA'],
      dtyp

In [None]:
import re

# XGBoost rejects feature names that contain '[', ']' or '<'.
# Each such character is replaced by an underscore; names without any of
# them are passed through unchanged.
regex = re.compile(r"\[|\]|<", re.IGNORECASE)
forbidden_chars = {'[', ']', '<'}

X.columns = [
    name if forbidden_chars.isdisjoint(str(name)) else regex.sub("_", name)
    for name in X.columns.values
]

In [None]:
# Cross-validate an XGBoost classifier (library defaults apart from the seed)
# on X / y using cv=5; `scores` holds one score per fold.
model = XGBClassifier(n_jobs=-1, random_state=42)
scores = cross_val_score(model, X, y, cv=5, n_jobs=-1)

# Summarise the folds: mean and (population) standard deviation
mean_score = scores.mean()
score_std = scores.std()
print(f'Cross-validated scores: {scores}')
print(f'Mean score: {mean_score:.4f} ± {score_std:.4f}')

Cross-validated scores: [0.55188679 0.65801887 0.60141509 0.60377358 0.58628842]
Mean score: 0.6003 ± 0.0343
