# Sleep Research - XGBoost Model

In [2]:
! pip install pandas
! pip install matplotlib
! pip install seaborn
! pip install openpyxl
! pip install xgboost
! pip install scikit-learn



In [1]:

import pandas as pd
import numpy as np
from sklearn.metrics import classification_report

In [12]:
# Load the dataset
# DATA_FILE selects the workbook to read; the commented-out alternatives can be
# swapped in instead (presumably reduced feature sets -- confirm with the data owner).
DATA_FILE = 'RawDataset.xlsx'
#DATA_FILE = '5_Features.xlsx'
#DATA_FILE = '70_Features.xlsx'
try:
    # The first spreadsheet row is used as the column labels (header=0).
    df = pd.read_excel(DATA_FILE, header=0, engine='openpyxl')
    print("Loaded successfully:", DATA_FILE)
except Exception as e:
    # Best-effort load: report the problem and leave `df` as None so the
    # following cells, which check for None, can skip their work.
    print("Failed to load. Update the `DATA_FILE` variable to the correct location. Error:", e)
    df = None

Loaded successfully: RawDataset.xlsx


In [13]:
# Quick overview: show the first rows, or nothing when the load failed
None if df is None else df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,991,992,993,994,995,996,997,998,999,1000
0,0.850866,-0.448092,-0.511831,-0.401936,-0.335998,-0.382155,-0.194234,0.011268,-0.070053,0.037642,...,1.411329,1.446495,1.422318,1.40034,1.378359,1.36737,1.334403,1.347589,1.332204,2
1,0.104705,0.068097,0.066169,0.066169,0.052681,0.031489,-0.066779,-0.163118,-0.188167,-0.23441,...,2.104716,2.097009,2.073888,2.100862,2.160594,2.177935,2.050767,1.960207,1.848454,2
2,-1.39395,-1.397894,-1.39395,-1.460988,-1.500423,-1.551687,-1.555631,-1.646329,-1.703509,-1.737028,...,0.560025,0.528477,0.560025,0.532421,0.55411,0.668963,0.824235,0.765084,0.798604,2
3,-1.123399,-1.139669,-1.149837,-1.153904,-1.123399,-1.145769,-1.115265,-1.143734,-1.113232,-1.149837,...,1.320972,1.25793,1.445021,1.365712,1.554835,1.784631,1.815134,1.510096,0.676325,2
4,-1.309756,-1.34342,-1.299125,-1.33279,-1.318616,-1.322159,-1.320388,-1.357595,-1.320388,-1.355822,...,1.877716,1.652697,1.682819,1.620805,1.388698,1.271761,1.337317,0.630368,0.95815,2


In [14]:
# Null counts per column (only columns that actually have nulls are shown),
# followed by the transposed summary statistics of every column.
if df is not None:
    null_counts = df.isna().sum()
    missing = null_counts.sort_values(ascending=False)
    has_nulls = missing > 0
    display(missing[has_nulls])
    summary_stats = df.describe(include='all')
    display(summary_stats.T)

Series([], dtype: int64)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,2119.0,-0.006783,1.138254,-6.730676,-0.746862,-0.066822,0.759308,5.547731
1,2119.0,-0.025213,1.141473,-5.757372,-0.758837,-0.083490,0.728584,5.670820
2,2119.0,-0.016369,1.125723,-6.730676,-0.736172,-0.073427,0.736363,5.375439
3,2119.0,-0.019514,1.128614,-5.181815,-0.743879,-0.092537,0.722054,5.547731
4,2119.0,-0.008702,1.116778,-5.064086,-0.729302,-0.078169,0.733480,5.498501
...,...,...,...,...,...,...,...,...
996,2119.0,0.039216,1.098088,-4.177942,-0.692720,-0.031201,0.780299,7.340937
997,2119.0,0.033716,1.108407,-4.932188,-0.711155,-0.037977,0.777024,7.450354
998,2119.0,0.037261,1.095251,-5.351619,-0.706308,-0.049077,0.785438,6.739191
999,2119.0,0.028164,1.098774,-5.757808,-0.707022,-0.047974,0.768466,6.119199


In [16]:
# Prepare the feature matrix X and the encoded target y for the
# cross-validated XGBoost model (the model itself is fit in a later cell).
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

# Ensure dataframe 'df' is present; if the earlier load failed or was skipped,
# retry from DATA_FILE and fail loudly, since nothing below can run without it.
if 'df' not in globals() or df is None:
    path = DATA_FILE
    try:
        df = pd.read_excel(path, header=0, engine='openpyxl')
        print('Loaded', path)
    except Exception as e:
        raise RuntimeError(f'Dataframe `df` not found and failed to load from {path}') from e

# use Label column as target
TARGET_COLUMN = df.columns[-1]  # Assuming the last column is the target
print('Using target column:', TARGET_COLUMN)

# Prepare X and y: rows without a label cannot be used for supervised training.
# `dropna` returns a new frame, so `df` itself is left untouched.
data = df.dropna(subset=[TARGET_COLUMN])
X = data.drop(columns=[TARGET_COLUMN])

# XGBClassifier expects class labels 0..n_classes-1. Encoding the labels
# (instead of the previous hard-coded `y - 1`) yields the same values when the
# raw labels are 1..K, and still produces valid labels when they are not
# contiguous or do not start at 1. `le.classes_` maps codes back to raw labels.
le = LabelEncoder()
y = pd.Series(
    le.fit_transform(data[TARGET_COLUMN]),
    index=data.index,
    name=TARGET_COLUMN,
)

Using target column: 1000


In [17]:
# Number of rows for each distinct value of the target y
y.value_counts()

1000
0    728
1    701
2    262
3    180
5    146
4    102
Name: count, dtype: int64

In [18]:
# Summary of the feature frame X: index range, column count, dtypes, memory usage
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2119 entries, 0 to 2118
Columns: 1000 entries, 0 to 999
dtypes: float64(1000)
memory usage: 16.2 MB


In [19]:
# Distinct values present in the target y
y.unique()

array([1, 5, 4, 0, 3, 2])

In [20]:
# Column labels of the feature frame X
X.columns

Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
       ...
       990, 991, 992, 993, 994, 995, 996, 997, 998, 999],
      dtype='int64', length=1000)

In [21]:
import re

# XGBoost does not allow the characters '[', ']' or '<' in feature names,
# so each occurrence is replaced with '_'.
regex = re.compile(r"[\[\]<]")

# Only string labels can contain these characters as text; non-string labels
# (such as integer column labels) are passed through unchanged rather than
# being handed to `regex.sub`, which would raise a TypeError on them.
X.columns = [
    regex.sub("_", col) if isinstance(col, str) else col
    for col in X.columns
]

In [22]:
# Cross-validate an XGBoost classifier on (X, y) and report the per-fold
# scores together with their mean and standard deviation.
N_SPLITS = 5       # number of cross-validation folds
RANDOM_STATE = 42  # seed passed to the classifier for reproducibility

# Parallelism is enabled at one level only: XGBoost uses all cores for each
# fit, while the folds run one after another. Requesting all cores for both
# the estimator and cross_val_score nests the parallelism and can cause
# thread contention.
model = XGBClassifier(random_state=RANDOM_STATE, n_jobs=-1)
scores = cross_val_score(estimator=model, X=X, y=y, cv=N_SPLITS)
print(f'Cross-validated scores: {scores}')
print(f'Mean score: {np.mean(scores):.4f} ± {np.std(scores):.4f}')

Cross-validated scores: [0.56839623 0.43867925 0.40566038 0.51886792 0.4964539 ]
Mean score: 0.4856 ± 0.0577
