In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the events table from the competition input directory.
train_events = pd.read_csv(
    '/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv'
)

In [3]:
# Load the per-step series table (parquet) from the competition input directory.
train_series = pd.read_parquet(
    '/kaggle/input/child-mind-institute-detect-sleep-states/train_series.parquet'
)

In [4]:
# Load the sample submission file; it shows the column layout a submission uses.
sample_submission = pd.read_csv(
    '/kaggle/input/child-mind-institute-detect-sleep-states/sample_submission.csv'
)

In [5]:
# Preview the first ten event rows.
train_events.head(n=10)

Unnamed: 0,series_id,night,event,step,timestamp
0,038441c925bb,1,onset,4992.0,2018-08-14T22:26:00-0400
1,038441c925bb,1,wakeup,10932.0,2018-08-15T06:41:00-0400
2,038441c925bb,2,onset,20244.0,2018-08-15T19:37:00-0400
3,038441c925bb,2,wakeup,27492.0,2018-08-16T05:41:00-0400
4,038441c925bb,3,onset,39996.0,2018-08-16T23:03:00-0400
5,038441c925bb,3,wakeup,44400.0,2018-08-17T05:10:00-0400
6,038441c925bb,4,onset,57240.0,2018-08-17T23:00:00-0400
7,038441c925bb,4,wakeup,62856.0,2018-08-18T06:48:00-0400
8,038441c925bb,5,onset,,
9,038441c925bb,5,wakeup,,


In [6]:
# Preview the first ten series rows.
train_series.head(n=10)

Unnamed: 0,series_id,step,timestamp,anglez,enmo
0,038441c925bb,0,2018-08-14T15:30:00-0400,2.6367,0.0217
1,038441c925bb,1,2018-08-14T15:30:05-0400,2.6368,0.0215
2,038441c925bb,2,2018-08-14T15:30:10-0400,2.637,0.0216
3,038441c925bb,3,2018-08-14T15:30:15-0400,2.6368,0.0213
4,038441c925bb,4,2018-08-14T15:30:20-0400,2.6368,0.0215
5,038441c925bb,5,2018-08-14T15:30:25-0400,2.6367,0.0217
6,038441c925bb,6,2018-08-14T15:30:30-0400,2.6367,0.0217
7,038441c925bb,7,2018-08-14T15:30:35-0400,2.6367,0.0218
8,038441c925bb,8,2018-08-14T15:30:40-0400,2.798,0.0223
9,038441c925bb,9,2018-08-14T15:30:45-0400,3.0847,0.0217


In [7]:
# Preview the sample submission (at most ten rows).
sample_submission.head(n=10)

Unnamed: 0,row_id,series_id,step,event,score
0,0,038441c925bb,100,onset,0.0
1,1,038441c925bb,105,wakeup,0.0
2,2,03d92c9f6f8a,80,onset,0.5
3,3,03d92c9f6f8a,110,wakeup,0.5
4,4,0402a003dae9,90,onset,1.0
5,5,0402a003dae9,120,wakeup,1.0


In [None]:
# Count missing values per column before any cleaning so the tally can be inspected.
missing_values = train_series.isnull().sum()

# Drop rows that contain any missing value.
# The original followed this with `fillna(train_series.mean())`. That call was
# removed: once incomplete rows are dropped there is nothing left to fill, and
# `DataFrame.mean()` over the string columns (`series_id`, `timestamp`) raises
# TypeError on pandas >= 2.0.
# Rebinding instead of `inplace=True` keeps the cell safe to re-run.
train_series = train_series.dropna()

# Show the per-column tally as the cell output.
missing_values

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the signal and store it as `accelerometer_data`.
# `train_series` has no `accelerometer_data` column to read from (its columns are
# series_id, step, timestamp, anglez, enmo), so the original lookup raised
# KeyError. The standardised values are now derived from `enmo`.
# NOTE(review): choosing `enmo` (rather than `anglez`) as the source signal is an
# assumption -- confirm which one was intended.
scaler = StandardScaler()
# `fit_transform` needs a 2-D input and returns shape (n, 1); flatten it so it is
# assigned as an ordinary 1-D column.
train_series['accelerometer_data'] = scaler.fit_transform(train_series[['enmo']]).ravel()

In [None]:
# --- Rolling mean of the signal, computed separately for every series ---------
# Use `accelerometer_data` when an earlier cell has created it; otherwise fall
# back to the raw `enmo` column, which is always present in `train_series`.
# NOTE(review): the `enmo` fallback is an assumption -- confirm the intended signal.
signal_column = 'accelerometer_data' if 'accelerometer_data' in train_series.columns else 'enmo'

# Group by `series_id` so a 10-step window never mixes the tail of one recording
# with the head of the next (a plain `.rolling()` over the whole column did).
# `reset_index(level=0, drop=True)` removes the group key again so the result
# aligns with `train_series` by its original row index.
train_series['mean_acceleration'] = (
    train_series.groupby('series_id')[signal_column]
    .rolling(window=10)
    .mean()
    .reset_index(level=0, drop=True)
)

# --- Sleep duration per (series_id, night) -------------------------------------
# `train_events` is long-format: one row per event with `event` in
# {'onset', 'wakeup'} and a single `timestamp` column. The columns
# `wakeup_timestamp` / `onset_timestamp` used originally do not exist.
# Timestamps carry a UTC offset, so parse them to UTC before subtracting.
event_time = pd.to_datetime(train_events['timestamp'], utc=True)
night_key = [train_events['series_id'], train_events['night']]

# Broadcast each night's onset / wakeup time onto both rows of that night.
onset_time = event_time.where(train_events['event'] == 'onset').groupby(night_key).transform('min')
wakeup_time = event_time.where(train_events['event'] == 'wakeup').groupby(night_key).transform('max')

# Seconds between onset and wakeup; NaN for nights with a missing timestamp.
train_events['sleep_duration'] = (wakeup_time - onset_time).dt.total_seconds()

In [None]:
# One-hot encode the `event` column; the indicator columns are prefixed with `event`.
train_events_encoded = pd.get_dummies(
    data=train_events,
    prefix=['event'],
    columns=['event'],
)

In [None]:
# Attach event labels to the sensor readings recorded at the same step.
# Joining on `series_id` alone is a many-to-many join: every sensor row of a
# series gets paired with every event of that series, which multiplies the row
# count and yields meaningless pairs. Joining on (series_id, step) matches each
# event to the single reading it refers to.
# Events without a recorded step cannot be matched, so they are dropped; `step`
# is then cast to the dtype used in `train_series` so the join keys compare equal.
labelled_events = (
    train_events
    .dropna(subset=['step'])
    .astype({'step': train_series['step'].dtype})
)

# `how='inner'` is kept from the original, so only steps that carry an event
# remain. Both inputs have a `timestamp` column; they come out as
# `timestamp_x` (series) and `timestamp_y` (events).
# NOTE(review): switch to `how='left'` if unlabelled steps should be kept.
merged_data = pd.merge(train_series, labelled_events, on=['series_id', 'step'], how='inner')

In [None]:
# Split features and labels into training and validation parts (80 / 20).
# The original passed `train_series` and `train_events` directly. Those are two
# different tables with different row counts, so `train_test_split` raised
# ValueError, and they are not a feature/label pairing in any case.
# Features and labels are now taken from `merged_data`, where each row holds
# sensor values together with its event.
# NOTE(review): the feature set (`anglez`, `enmo`) and the binary target
# (1 = 'wakeup', 0 = 'onset') are assumptions chosen so that the later cells,
# which compare predictions against 0 and 1, have a 0/1 label -- confirm.
feature_columns = ['anglez', 'enmo']
features = merged_data[feature_columns]
target = (merged_data['event'] == 'wakeup').astype(int)

X_train, X_val, y_train, y_val = train_test_split(
    features, target, test_size=0.2, random_state=42
)

In [None]:
# Fit a random forest on the training split.
# The forest is stochastic (bootstrap samples, random feature subsets). Seeding
# it with the same value used for the train/validation split makes a
# Restart & Run All reproduce the same model and the same scores.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [None]:
# Predicted class labels for the validation split.
y_pred = model.predict(X=X_val)

In [None]:
# Compute one average-precision score per entry of `event_tolerance_groups` and
# average them into `final_score`.
# NOTE(review): `event_tolerance_groups` is not defined in any cell visible in
# this notebook -- confirm where it is supposed to come from.
ap_scores = []

# Each group is used as an index into both `y_val` and `y_pred`.
# The loop variable `event_tolerance_group` (and `ap_score`) stay bound after the
# loop; later cells read `event_tolerance_group`, so they see the LAST group.
# NOTE(review): `y_pred` comes from `model.predict`, i.e. class labels rather
# than scores -- confirm that is what should be passed here.
for event_tolerance_group in event_tolerance_groups:
    ap_score = average_precision_score(y_val[event_tolerance_group], y_pred[event_tolerance_group])
    ap_scores.append(ap_score)

# Unweighted mean over all groups.
final_score = np.mean(ap_scores)

In [None]:
# Build the submission frame from the test series ids and the test predictions,
# then write it to `submission.csv` without the index column.
# NOTE(review): `test_series` and `test_predictions` are not defined in any cell
# visible in this notebook -- confirm where they come from.
# NOTE(review): the sample submission shown earlier has the columns row_id,
# series_id, step, event, score; this frame only has series_id and event --
# confirm the required layout before submitting.
submission = pd.DataFrame(
    data={'series_id': test_series['series_id'], 'event': test_predictions}
)

submission.to_csv(path_or_buf='submission.csv', index=False)

In [None]:
from sklearn.metrics import average_precision_score, precision_recall_curve
import matplotlib.pyplot as plt

# Labels and predictions for the group currently bound to `event_tolerance_group`
# (the variable is whatever an earlier cell left it as).
group_truth = y_val[event_tolerance_group]
group_pred = y_pred[event_tolerance_group]

# Precision/recall pairs for the curve and the summary AP shown in the title.
precision, recall, _ = precision_recall_curve(group_truth, group_pred)
ap_score = average_precision_score(group_truth, group_pred)

# Step curve with a shaded area underneath.
fig, ax = plt.subplots()
ax.step(recall, precision, color='b', alpha=0.2, where='post')
ax.fill_between(recall, precision, step='post', alpha=0.2, color='b')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title(f'Precision-Recall Curve (AP={ap_score:.2f})')
plt.show()

In [None]:
# Horizontal bar chart of the fitted model's feature importances, least
# important at the bottom.
feature_importances = model.feature_importances_

# Label the bars with the names the model was actually fitted on.
# `train_series.columns` (the original placeholder) lists every column of the
# raw table, which need not match the model's features in number or order.
# scikit-learn records the real names in `feature_names_in_` when the model is
# fitted on a DataFrame with string column names.
feature_names = getattr(model, 'feature_names_in_', None)
if feature_names is None:
    # Fitted without named features: keep the original labels only when the
    # counts agree, otherwise use neutral positional names instead of wrong ones.
    fallback_names = list(train_series.columns)
    if len(fallback_names) == len(feature_importances):
        feature_names = fallback_names
    else:
        feature_names = [f'feature_{i}' for i in range(len(feature_importances))]

# Ascending order, so the most important feature ends up at the top of the chart.
sorted_idx = np.argsort(feature_importances)

plt.figure(figsize=(10, 6))
plt.barh(range(len(sorted_idx)), feature_importances[sorted_idx], align='center')
plt.yticks(range(len(sorted_idx)), [feature_names[i] for i in sorted_idx])
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('Feature Importance')
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Confusion matrix for the group currently bound to `event_tolerance_group`
# (the variable is whatever an earlier cell left it as).
cm = confusion_matrix(y_val[event_tolerance_group], y_pred[event_tolerance_group])

# Annotated heatmap drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()


In [None]:
# Validation rows the model got wrong, split by error type.
# The original masks (`y_pred == 1` / `y_pred == 0`) selected ALL predicted
# positives / negatives. A false positive or false negative also requires the
# true label to disagree with the prediction.
# NOTE(review): assumes `y_val` holds 0/1 labels, one per row of `X_val`, in the
# same order -- confirm.
y_true = np.asarray(y_val).ravel()

false_positives = X_val[(y_pred == 1) & (y_true == 0)]  # predicted 1, actually 0
false_negatives = X_val[(y_pred == 0) & (y_true == 1)]  # predicted 0, actually 1
