<a href="https://colab.research.google.com/github/breathofthe/predicting-sleep-stage-/blob/main/%EB%8B%A4%EC%8B%9C%ED%95%98%EC%9E%90.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

# Load the training data
data = pd.read_csv('/content/train.csv')

# Interquartile range (IQR) of the pulse column
Q1 = data['pulse'].quantile(0.25)
Q3 = data['pulse'].quantile(0.75)
IQR = Q3 - Q1

# Outlier fences: anything further than 1.5 * IQR outside the quartiles is treated as an outlier
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only rows whose pulse lies inside the fences (both bounds inclusive).
# Rows with a NaN pulse are dropped too, because NaN fails the range check.
# .copy() makes data_filtered an independent frame, so adding columns to it later
# does not raise SettingWithCopyWarning.
data_filtered = data[data['pulse'].between(lower_bound, upper_bound)].copy()

In [None]:
# Count the number of rows per sleep_stage label.
# Note: this is computed on `data` (the unfiltered frame), not on `data_filtered`.
sleep_stage_counts = data['sleep_stage'].value_counts()

# Print the per-class counts (value_counts sorts them from most to least frequent)
print(sleep_stage_counts)


sleep_stage
3.0    6052652
0.0    2782033
2.0    1880966
1.0    1758155
4.0     324668
Name: count, dtype: int64


In [None]:
# Apply a log(1 + x) transform to the pulse signal.
# assign() returns a new frame instead of writing into what pandas regards as a slice of `data`,
# which avoids SettingWithCopyWarning and keeps the cell safe to re-run.
data_filtered = data_filtered.assign(log_pulse=np.log1p(data_filtered['pulse']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered['log_pulse'] = np.log1p(data_filtered['pulse'])


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the log-transformed pulse.
# The fitted `scaler` is kept at module level because the inference cell re-uses it via scaler.transform().
scaler = MinMaxScaler()

# fit_transform returns an (n_rows, 1) array; ravel() flattens it into a plain column.
# assign() builds a new frame rather than writing into a slice, which avoids SettingWithCopyWarning.
data_filtered = data_filtered.assign(
    normalized_log_pulse=scaler.fit_transform(data_filtered[['log_pulse']]).ravel()
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered['normalized_log_pulse'] = scaler.fit_transform(data_filtered[['log_pulse']])


In [None]:
# Feature-construction helper
def create_lagged_features(df, lag=29, feature_col='normalized_log_pulse', target_col='sleep_stage'):
    """Build a table of lagged copies of one signal column plus the target column.

    For every row, columns ``lag_1`` .. ``lag_<lag>`` hold the value of
    ``feature_col`` from 1 .. ``lag`` rows earlier. Lags are positional (row
    shifts), so they follow the row order of ``df`` as passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain ``feature_col`` and ``target_col``.
    lag : int, default 29
        Number of lagged columns to create.
    feature_col : str, default 'normalized_log_pulse'
        Column whose past values become the lag features.
    target_col : str, default 'sleep_stage'
        Column copied unchanged into the result as the label.

    Returns
    -------
    pandas.DataFrame
        Columns ``lag_1`` .. ``lag_<lag>`` followed by ``target_col``. Every row
        containing a NaN is removed (this includes the first ``lag`` rows, which
        have no complete history, and rows with a missing target) and the index
        is reset to 0..n-1.
    """
    signal = df[feature_col]

    # Build all lag columns in one go instead of growing the frame column by column.
    lagged_data = pd.DataFrame(
        {f'lag_{i}': signal.shift(i) for i in range(1, lag + 1)},
        index=df.index,
    )
    lagged_data[target_col] = df[target_col]

    # Drop incomplete rows and discard the original index.
    return lagged_data.dropna().reset_index(drop=True)

# Build the lag features separately for each subject so that lags never cross subject boundaries,
# then flatten the per-subject results into a single frame with a fresh 0..n-1 index.
grouped_by_subject = data_filtered.groupby('subjectID')
lagged_data = grouped_by_subject.apply(create_lagged_features).reset_index(drop=True)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Prepare features (lag columns) and labels from lagged_data
X = lagged_data.drop('sleep_stage', axis=1)
y = lagged_data['sleep_stage']

# Split into training and test sets (80 / 20, fixed seed for reproducibility).
# NOTE(review): this is a random row-level split. Consecutive rows of the same subject share most of
# their lag values, so near-duplicate windows can land in both train and test and inflate the scores.
# Consider splitting by subject instead.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialise and fit the random forest.
# n_jobs=-1 builds the trees on all available cores; with a fixed random_state the fitted model is unchanged.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Predict on the held-out rows
y_pred = rf_model.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")

In [None]:
pip install lightgbm==3.3.2



# 단순 light gbm

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# Split features and labels
X = lagged_data.drop('sleep_stage', axis=1)
y = lagged_data['sleep_stage']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the LightGBM datasets (the validation set re-uses the training set's binning via `reference`)
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Training parameters
params = {
    'objective': 'multiclass',
    'num_class': len(y.unique()),  # number of distinct sleep_stage classes
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

# Train the model.
# Early stopping and per-iteration logging are passed as callbacks: the `early_stopping_rounds=`
# keyword of lgb.train() is deprecated in LightGBM 3.3 and removed in 4.x, while the callbacks
# work in both.
gbm = lgb.train(
    params,
    train_data,
    num_boost_round=100,
    valid_sets=[train_data, valid_data],
    callbacks=[
        lgb.early_stopping(stopping_rounds=10),
        lgb.log_evaluation(period=1),
    ],
)

# Predict: gbm.predict returns one probability per class for every row;
# take the index of the highest probability as the predicted class (vectorised argmax).
y_pred = np.argmax(gbm.predict(X_test, num_iteration=gbm.best_iteration), axis=1)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")

# class 가중치 추가

In [None]:
import lightgbm as lgb
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils.class_weight import compute_class_weight  # 추가된 부분

# Compute 'balanced' class weights.
# The dict is keyed by the actual class labels (not by enumerate position), so the mapping below
# stays correct even if the labels are not exactly 0..n-1.
classes = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
class_weights_dict = dict(zip(classes, class_weights))

# Build the LightGBM datasets; every training row gets the weight of its class
train_weights = y_train.map(class_weights_dict).values
train_data = lgb.Dataset(X_train, label=y_train, weight=train_weights)
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Training parameters
params = {
    'objective': 'multiclass',
    'num_class': len(classes),  # number of distinct sleep_stage classes
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'min_data_in_leaf': 20,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    'is_unbalance': False  # weights are supplied explicitly, so leave this off
}

# Train the model.
# Early stopping and per-iteration logging are passed as callbacks: the `early_stopping_rounds=`
# keyword of lgb.train() is deprecated in LightGBM 3.3 and removed in 4.x, while the callbacks
# work in both.
gbm = lgb.train(
    params,
    train_data,
    num_boost_round=100,
    valid_sets=[train_data, valid_data],
    callbacks=[
        lgb.early_stopping(stopping_rounds=10),
        lgb.log_evaluation(period=1),
    ],
)

# Predict: take the index of the highest class probability for every row (vectorised argmax)
y_pred = np.argmax(gbm.predict(X_test, num_iteration=gbm.best_iteration), axis=1)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")




You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1508
[LightGBM] [Info] Number of data points in the train set: 4397658, number of used features: 29
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[1]	training's multi_logloss: 1.60464	valid_1's multi_logloss: 1.60648
Training until validation scores don't improve for 10 rounds
[2]	training's multi_logloss: 1.60032	valid_1's multi_logloss: 1.6038
[3]	training's multi_logloss: 1.59632	valid_1's multi_logloss: 1.60136
[4]	training's multi_logloss: 1.59253	valid_1's multi_logloss: 1.59907
[5]	training's multi_logloss: 1.58898	valid_1's multi_logloss: 1.59701
[6]	training's multi_logloss: 1.5855	valid_1's multi_logloss: 1.59503
[7]	training's multi_logloss: 1.58273	valid_1's multi_logl

# 모델의 복잡도를 감소시키면서 class 가중치 추가

In [None]:
# Compute 'balanced' class weights.
# The dict is keyed by the actual class labels (not by enumerate position), so the mapping below
# stays correct even if the labels are not exactly 0..n-1.
classes = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
class_weights_dict = dict(zip(classes, class_weights))

# Build the LightGBM datasets; every training row gets the weight of its class
train_weights = y_train.map(class_weights_dict).values
train_data = lgb.Dataset(X_train, label=y_train, weight=train_weights)
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Training parameters
params = {
    'objective': 'multiclass',
    'num_class': len(classes),
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 15,  # fewer leaves -> lower model complexity
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'min_data_in_leaf': 50,  # require more samples per leaf
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    'is_unbalance': False  # weights are supplied explicitly, so leave this off
}

# Train the model.
# Early stopping and per-iteration logging are passed as callbacks: the `early_stopping_rounds=`
# keyword of lgb.train() is deprecated in LightGBM 3.3 and removed in 4.x, while the callbacks
# work in both.
gbm = lgb.train(
    params,
    train_data,
    num_boost_round=100,
    valid_sets=[train_data, valid_data],
    callbacks=[
        lgb.early_stopping(stopping_rounds=10),
        lgb.log_evaluation(period=1),
    ],
)

# Predict: take the index of the highest class probability for every row (vectorised argmax)
y_pred = np.argmax(gbm.predict(X_test, num_iteration=gbm.best_iteration), axis=1)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")




You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1508
[LightGBM] [Info] Number of data points in the train set: 4397658, number of used features: 29
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[1]	training's multi_logloss: 1.60611	valid_1's multi_logloss: 1.60718
Training until validation scores don't improve for 10 rounds
[2]	training's multi_logloss: 1.60309	valid_1's multi_logloss: 1.60517
[3]	training's multi_logloss: 1.60036	valid_1's multi_logloss: 1.60334
[4]	training's multi_logloss: 1.59756	valid_1's multi_logloss: 1.60156
[5]	training's multi_logloss: 1.59507	valid_1's multi_logloss: 1.59995
[6]	training's multi_logloss: 1.59291	valid_1's multi_logloss: 1.59847
[7]	training's multi_logloss: 1.59101	valid_1's multi_lo

# 언더샘플링과 오버샘플링을 합친 기법

In [None]:
# NOTE(review): `test_data` is not created in any cell visible here -- it must be loaded before this
# cell runs, otherwise a fresh Restart & Run All fails with NameError.

# Apply the same preprocessing as for training: log1p, then the scaler already fitted on the training data
test_data.loc[:, 'log_pulse'] = np.log1p(test_data['pulse'])
test_data.loc[:, 'normalized_log_pulse'] = scaler.transform(test_data[['log_pulse']])

# Build per-subject lag features.
# NOTE(review): create_lagged_features drops every row containing a NaN (including the leading rows of
# each subject that have no full lag history), so there are fewer predictions than rows in test_data.
# Confirm that this matches the number of rows expected in the submission.
test_lagged_data = test_data.groupby('subjectID').apply(create_lagged_features).reset_index(drop=True)
X_test_final = test_lagged_data.drop('sleep_stage', axis=1)

# Predict class probabilities with the most recently trained `gbm` and pick the most likely class
# per row (vectorised argmax instead of a per-row Python loop).
test_pred = gbm.predict(X_test_final, num_iteration=gbm.best_iteration)
test_pred_classes = np.argmax(test_pred, axis=1)


In [None]:
# Write the predicted classes into the submission frame.
# NOTE(review): `solution_sample` is not created in any cell visible here -- confirm it is loaded first,
# and that it has exactly as many rows as there are predictions (pandas raises ValueError otherwise).
solution_sample['sleep_stage'] = test_pred_classes


In [None]:
from pathlib import Path

# Destination of the submission file
output_file_path = '/mnt/data/solution.csv'

# Make sure the target directory exists; to_csv does not create missing directories and would fail otherwise
Path(output_file_path).parent.mkdir(parents=True, exist_ok=True)

# Write the submission without the index column
solution_sample.to_csv(output_file_path, index=False)
