In [None]:
# Delete all kaggle json files
!rm -f kaggle*.json

# Delete the dataset zip
!rm -f cmi-detect-behavior-with-sensor-data.zip

# Delete the folders
!rm -rf cmi_data sample_data


In [None]:
from google.colab import files
files.upload()  # Upload your kaggle.json again if runtime was reset

!mkdir -p ~/.config/kaggle
!cp kaggle.json ~/.config/kaggle/
!chmod 600 ~/.config/kaggle/kaggle.json


Saving kaggle.json to kaggle.json


In [None]:
# Step 1: Download & unzip
!kaggle competitions download -c cmi-detect-behavior-with-sensor-data
!unzip -q cmi-detect-behavior-with-sensor-data.zip -d cmi_data/

# Step 2: Load the data
import pandas as pd
train_df = pd.read_csv("cmi_data/train.csv")

# Step 3: Show the shape
print("Train shape:", train_df.shape)

# Step 4: Preview sensor structure
sensor_start_col = train_df.columns.get_loc('acc_x')
sensor_cols = train_df.columns[sensor_start_col:]
sequence_lengths = train_df.groupby('sequence_id').size()
num_features = len(sensor_cols)
max_timesteps = sequence_lengths.max()
min_timesteps = sequence_lengths.min()

print("\nTime steps per sequence:")
print(sequence_lengths.describe())
print(f"\nTotal sensor features: {num_features}")
print(f"Sample sensor columns: {sensor_cols[:5].tolist()}")


Downloading cmi-detect-behavior-with-sensor-data.zip to /content
 62% 110M/178M [00:00<00:00, 1.15GB/s]
100% 178M/178M [00:00<00:00, 1.16GB/s]
Train shape: (574945, 341)

Time steps per sequence:
count    8151.000000
mean       70.536744
std        35.389879
min        29.000000
25%        51.000000
50%        59.000000
75%        78.000000
max       700.000000
dtype: float64

Total sensor features: 332
Sample sensor columns: ['acc_x', 'acc_y', 'acc_z', 'rot_w', 'rot_x']


In [None]:
# # STEP 1: Install and import dependencies
# import torch
# import torch.nn as nn
# from torch.utils.data import Dataset, DataLoader
# from sklearn.preprocessing import LabelEncoder, StandardScaler
# from sklearn.model_selection import train_test_split
# import numpy as np
# import pandas as pd
# from torch.nn.utils.rnn import pad_sequence
# from sklearn.metrics import accuracy_score, classification_report

# # STEP 2: Load data
# train_df = pd.read_csv("cmi_data/train.csv")

# # STEP 3: Sensor & label columns
# sensor_start_col = train_df.columns.get_loc('acc_x')
# sensor_cols = train_df.columns[sensor_start_col:]
# label_df = train_df[['sequence_id', 'behavior']].drop_duplicates()

# # STEP 4: Encode labels
# le = LabelEncoder()
# label_df['label'] = le.fit_transform(label_df['behavior'])

# # STEP 5: Build sequences and normalize
# sequences = []
# labels = []

# # Collect all sequences for scaling
# raw_sequences = []
# for seq_id in label_df['sequence_id']:
#     seq_data = train_df[train_df['sequence_id'] == seq_id][sensor_cols].values
#     raw_sequences.append(seq_data)
#     labels.append(label_df[label_df['sequence_id'] == seq_id]['label'].values[0])

# # Fit scaler on all sensor data
# scaler = StandardScaler()
# scaler.fit(np.vstack(raw_sequences))

# # Normalize each sequence
# normalized_sequences = [scaler.transform(seq) for seq in raw_sequences]

# # Convert to tensor and pad
# tensor_sequences = [torch.tensor(seq, dtype=torch.float32) for seq in normalized_sequences]
# padded_sequences = pad_sequence(tensor_sequences, batch_first=True)  # [N, T, F]
# labels_tensor = torch.tensor(labels, dtype=torch.long)

# # STEP 6: Train/val split
# X_train, X_val, y_train, y_val = train_test_split(
#     padded_sequences, labels_tensor, test_size=0.2, stratify=labels_tensor, random_state=42
# )

# # STEP 7: Custom Dataset
# class SensorDataset(Dataset):
#     def __init__(self, X, y):
#         self.X = X
#         self.y = y
#     def __len__(self):
#         return len(self.X)
#     def __getitem__(self, idx):
#         return self.X[idx], self.y[idx]

# train_ds = SensorDataset(X_train, y_train)
# val_ds = SensorDataset(X_val, y_val)
# train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)
# val_dl = DataLoader(val_ds, batch_size=32)

# # STEP 8: Define LSTM Model
# class LSTMClassifier(nn.Module):
#     def __init__(self, input_size, hidden_size, num_classes):
#         super().__init__()
#         self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
#         self.fc = nn.Linear(hidden_size, num_classes)

#     def forward(self, x):
#         _, (h_n, _) = self.lstm(x)
#         out = self.fc(h_n[-1])
#         return out

# # STEP 9: Train
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = LSTMClassifier(input_size=padded_sequences.shape[2], hidden_size=128, num_classes=len(le.classes_)).to(device)

# loss_fn = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# for epoch in range(5):  # Increase if needed
#     model.train()
#     total_loss = 0
#     for xb, yb in train_dl:
#         xb, yb = xb.to(device), yb.to(device)
#         optimizer.zero_grad()
#         preds = model(xb)
#         loss = loss_fn(preds, yb)

#         # NaN check
#         if torch.isnan(loss):
#             print("NaN loss detected! Aborting training.")
#             break

#         loss.backward()
#         torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
#         optimizer.step()
#         total_loss += loss.item()
#     else:
#         print(f"Epoch {epoch+1} - Loss: {total_loss/len(train_dl):.4f}")
#         continue
#     break  # if NaN occurred

# # STEP 10: Evaluate
# model.eval()
# all_preds = []
# all_labels = []

# with torch.no_grad():
#     for xb, yb in val_dl:
#         xb = xb.to(device)
#         outputs = model(xb)
#         preds = torch.argmax(outputs, dim=1).cpu()
#         all_preds.extend(preds.numpy())
#         all_labels.extend(yb.numpy())

# print("Validation Accuracy:", accuracy_score(all_labels, all_preds))
# print(classification_report(all_labels, all_preds, target_names=le.classes_))


In [None]:
# STEP 1: Install and import dependencies
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# STEP 2: Load data
train_df = pd.read_csv("cmi_data/train.csv")

In [None]:
# STEP 3: Sensor & label columns
sensor_start_col = train_df.columns.get_loc('acc_x')
sensor_cols = train_df.columns[sensor_start_col:]
label_df = train_df[['sequence_id', 'behavior']].drop_duplicates()

In [None]:
# STEP 4: Encode labels
le = LabelEncoder()
label_df['label'] = le.fit_transform(label_df['behavior'])

In [None]:
# STEP 5: Build sequences and normalize
sequences = []
labels = []

# Collect all sequences for scaling
raw_sequences = []
for seq_id in label_df['sequence_id']:
    seq_data = train_df[train_df['sequence_id'] == seq_id][sensor_cols].values
    raw_sequences.append(seq_data)
    labels.append(label_df[label_df['sequence_id'] == seq_id]['label'].values[0])

# Fit scaler on all sensor data
scaler = StandardScaler()
scaler.fit(np.vstack(raw_sequences))

# Normalize each sequence
normalized_sequences = [scaler.transform(seq) for seq in raw_sequences]

# Convert to tensor and pad
tensor_sequences = [torch.tensor(seq, dtype=torch.float32) for seq in normalized_sequences]
padded_sequences = pad_sequence(tensor_sequences, batch_first=True)  # [N, T, F]
labels_tensor = torch.tensor(labels, dtype=torch.long)

In [None]:
# STEP 6: Train/val split
X_train, X_val, y_train, y_val = train_test_split(
    padded_sequences, labels_tensor, test_size=0.2, stratify=labels_tensor, random_state=42
)

In [None]:
# STEP 7: Custom Dataset
class SensorDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_ds = SensorDataset(X_train, y_train)
val_ds = SensorDataset(X_val, y_val)
train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=32)

In [None]:
# STEP 8: Define LSTM Model
class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        _, (h_n, _) = self.lstm(x)
        out = self.fc(h_n[-1])
        return out

In [None]:
# STEP 9: Train
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMClassifier(input_size=padded_sequences.shape[2], hidden_size=128, num_classes=len(le.classes_)).to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(5):  # Increase if needed
    model.train()
    total_loss = 0
    for xb, yb in train_dl:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        preds = model(xb)
        loss = loss_fn(preds, yb)

        # NaN check
        if torch.isnan(loss):
            print("NaN loss detected! Aborting training.")
            break

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
        optimizer.step()
        total_loss += loss.item()
    else:
        print(f"Epoch {epoch+1} - Loss: {total_loss/len(train_dl):.4f}")
        continue
    break  # if NaN occurred

NaN loss detected! Aborting training.


In [None]:
torch.save(model.state_dict(), "lstm_model.pth")


In [None]:
from google.colab import files
files.download("lstm_model.pth")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# STEP 10: Evaluate
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for xb, yb in val_dl:
        xb = xb.to(device)
        outputs = model(xb)
        preds = torch.argmax(outputs, dim=1).cpu()
        all_preds.extend(preds.numpy())
        all_labels.extend(yb.numpy())

print("Validation Accuracy:", accuracy_score(all_labels, all_preds))
print(classification_report(all_labels, all_preds, target_names=le.classes_))

Validation Accuracy: 0.4653445103250869


ValueError: Number of classes, 3, does not match size of target_names, 4. Try specifying the labels parameter

In [4]:
# Save the notebook as a Python file
# IMPORTANT: Replace THIS_IS_WHERE_YOU_PUT_YOUR_NOTEBOOK_PATH with the actual path to your notebook file in Google Drive.
# Example: '/content/drive/MyDrive/my_notebook.ipynb'
notebook_path = 'THIS_IS_WHERE_YOU_PUT_YOUR_NOTEBOOK_PATH'

!jupyter nbconvert --to script "$notebook_path"

# Get the name of the generated Python file (it will have the same name as the notebook but with a .py extension)
import os
notebook_name = os.path.basename(notebook_path)
python_file_name = os.path.splitext(notebook_name)[0] + '.py'

from google.colab import files
# Download the generated Python file
files.download(python_file_name)

This application is used to convert notebook files (*.ipynb)
        to various other formats.


Options
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
    <cmd> --help-all

--debug
    set log level to logging.DEBUG (maximize logging output)
    Equivalent to: [--Application.log_level=10]
--show-config
    Show the application's configuration (human-readable format)
    Equivalent to: [--Application.show_config=True]
--show-config-json
    Show the application's configuration (json format)
    Equivalent to: [--Application.show_config_json=True]
--generate-config
    generate default config file
    Equivalent to: [--JupyterApp.generate_config=True]
-y
    Answer yes to any questions instead of prompting.
    Equivalent to: [--JupyterApp.answer_yes=True]
--execute
    Execute the notebook prior to export.
    Equivalent to: [--ExecutePr

FileNotFoundError: Cannot find file: THIS_IS_WHERE_YOU_PUT_YOUR_NOTEBOOK_PATH.py