<a href="https://colab.research.google.com/github/ratnesh003/IIT-Gandhinagar-Assignment/blob/main/Gandinagar_4th_task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Deep Learning on Raw Inertial Sensor Data

## Installing Dependencies and Importing methods

In [39]:
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install scikit-learn
!pip install google-colab
!pip install seaborn
!pip install tensorflow
!pip install keras



In [40]:
import numpy as np
import pandas as pd
import seaborn as sns
from google.colab import drive
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, LSTM, Input
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, LeaveOneGroupOut, KFold
from sklearn.preprocessing import LabelEncoder

## Mounting the google drive
Mounting the google drive to load data into the python script <br>
the dataset break down as follows according to the file structure

In [41]:
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [42]:
!ls /content/drive/MyDrive/ML/human+activity+recognition+using+smartphones/UCI_HAR_Dataset/UCI_HAR_Dataset

activity_labels.txt  features_info.txt	features.txt  README.txt  test	train


In [43]:
common_path = '/content/drive/MyDrive/ML/human+activity+recognition+using+smartphones/UCI_HAR_Dataset/UCI_HAR_Dataset/'

features = pd.read_csv(common_path + 'features.txt', sep='\s+', header=None, names=['index', 'feature_name'])
feature_names = features['feature_name'].tolist()

print(f"Feature names: {feature_names}")
print(f"Number of Features: {len(feature_names)}")

Feature names: ['tBodyAcc-mean()-X', 'tBodyAcc-mean()-Y', 'tBodyAcc-mean()-Z', 'tBodyAcc-std()-X', 'tBodyAcc-std()-Y', 'tBodyAcc-std()-Z', 'tBodyAcc-mad()-X', 'tBodyAcc-mad()-Y', 'tBodyAcc-mad()-Z', 'tBodyAcc-max()-X', 'tBodyAcc-max()-Y', 'tBodyAcc-max()-Z', 'tBodyAcc-min()-X', 'tBodyAcc-min()-Y', 'tBodyAcc-min()-Z', 'tBodyAcc-sma()', 'tBodyAcc-energy()-X', 'tBodyAcc-energy()-Y', 'tBodyAcc-energy()-Z', 'tBodyAcc-iqr()-X', 'tBodyAcc-iqr()-Y', 'tBodyAcc-iqr()-Z', 'tBodyAcc-entropy()-X', 'tBodyAcc-entropy()-Y', 'tBodyAcc-entropy()-Z', 'tBodyAcc-arCoeff()-X,1', 'tBodyAcc-arCoeff()-X,2', 'tBodyAcc-arCoeff()-X,3', 'tBodyAcc-arCoeff()-X,4', 'tBodyAcc-arCoeff()-Y,1', 'tBodyAcc-arCoeff()-Y,2', 'tBodyAcc-arCoeff()-Y,3', 'tBodyAcc-arCoeff()-Y,4', 'tBodyAcc-arCoeff()-Z,1', 'tBodyAcc-arCoeff()-Z,2', 'tBodyAcc-arCoeff()-Z,3', 'tBodyAcc-arCoeff()-Z,4', 'tBodyAcc-correlation()-X,Y', 'tBodyAcc-correlation()-X,Z', 'tBodyAcc-correlation()-Y,Z', 'tGravityAcc-mean()-X', 'tGravityAcc-mean()-Y', 'tGravityAcc

## Data Loading and Merging
This code loads the training and test data from text files, then combines the subject, activity, and sensor data into one dataset for each (train and test) to prepare for analysis.

In [44]:
X_train = pd.read_csv(common_path + 'train/X_train.txt', sep='\s+', header=None)
y_train = pd.read_csv(common_path + 'train/y_train.txt', sep='\s+', header=None, names=['activity'])
subject_train = pd.read_csv(common_path + 'train/subject_train.txt', sep='\s+', header=None, names=['subject'])

X_test = pd.read_csv(common_path + 'test/X_test.txt', sep='\s+', header=None)
y_test = pd.read_csv(common_path + 'test/y_test.txt', sep='\s+', header=None, names=['activity'])
subject_test = pd.read_csv(common_path + 'test/subject_test.txt', sep='\s+', header=None, names=['subject'])

train_data = pd.concat([subject_train, y_train, X_train], axis=1)
test_data = pd.concat([subject_test, y_test, X_test], axis=1)

X_train_reshaped = X_train.values.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test_reshaped = X_test.values.reshape(X_test.shape[0], X_test.shape[1], 1)

encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train['activity'])
y_test_encoded = encoder.transform(y_test['activity'])

print(f"Train data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")
train_data.head()

Train data shape: (7352, 563)
Test data shape: (2947, 563)


Unnamed: 0,subject,activity,0,1,2,3,4,5,6,7,...,551,552,553,554,555,556,557,558,559,560
0,1,5,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,...,-0.074323,-0.298676,-0.710304,-0.112754,0.0304,-0.464761,-0.018446,-0.841247,0.179941,-0.058627
1,1,5,0.278419,-0.016411,-0.12352,-0.998245,-0.9753,-0.960322,-0.998807,-0.974914,...,0.158075,-0.595051,-0.861499,0.053477,-0.007435,-0.732626,0.703511,-0.844788,0.180289,-0.054317
2,1,5,0.279653,-0.019467,-0.113462,-0.99538,-0.967187,-0.978944,-0.99652,-0.963668,...,0.414503,-0.390748,-0.760104,-0.118559,0.177899,0.100699,0.808529,-0.848933,0.180637,-0.049118
3,1,5,0.279174,-0.026201,-0.123283,-0.996091,-0.983403,-0.990675,-0.997099,-0.98275,...,0.404573,-0.11729,-0.482845,-0.036788,-0.012892,0.640011,-0.485366,-0.848649,0.181935,-0.047663
4,1,5,0.276629,-0.01657,-0.115362,-0.998139,-0.980817,-0.990482,-0.998321,-0.979672,...,0.087753,-0.351471,-0.699205,0.12332,0.122542,0.693578,-0.615971,-0.847865,0.185151,-0.043892


##  Evaluation Metrics
Use the following metrics to evaluate each model:

- Accuracy
- Precision
- Recall
- F1 Score

Define these metrics as scorers for use with cross-validation:

In [45]:
scorers = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average='weighted', zero_division=0),
    'recall': make_scorer(recall_score, average='weighted'),
    'f1': make_scorer(f1_score, average='weighted')
}

## Training and Evaluating Models

### Training the Classical Machine Learning Models

- Random Forest classifier
- Decision Tree classifier
- Logistic Regression
- AdaBoost classifier

Training four classic machine learning models on the training dataset (X_train and y_train['activity']). Each model is configured with specific parameters to ensure consistent results during evaluation.

The random_state=42 ensures reproducibility by fixing the random number generation used for the model's internal operations.

In [46]:
CNN_Model = Sequential([
    Input(shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2])),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(6, activation='softmax')
])

CNN_Model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

CNN_Model.fit(
    X_train_reshaped,
    y_train_encoded,
    epochs=20,
    batch_size=32,
    validation_split=0.2
)

Epoch 1/20
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 104ms/step - accuracy: 0.6252 - loss: 0.9355 - val_accuracy: 0.9347 - val_loss: 0.1713
Epoch 2/20
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 51ms/step - accuracy: 0.9107 - loss: 0.2449 - val_accuracy: 0.9409 - val_loss: 0.1277
Epoch 3/20
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 73ms/step - accuracy: 0.9436 - loss: 0.1538 - val_accuracy: 0.9524 - val_loss: 0.1111
Epoch 4/20
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 58ms/step - accuracy: 0.9569 - loss: 0.1176 - val_accuracy: 0.9517 - val_loss: 0.1052
Epoch 5/20
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 47ms/step - accuracy: 0.9680 - loss: 0.0897 - val_accuracy: 0.9517 - val_loss: 0.1082
Epoch 6/20
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 57ms/step - accuracy: 0.9641 - loss: 0.0958 - val_accuracy: 0.9497 - val_loss: 0.1159
Epoch 7/20
[1m1

<keras.src.callbacks.history.History at 0x79570b8eaa10>

In [47]:
MLP_Model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(6, activation='softmax')
])

MLP_Model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

MLP_Model.fit(
    X_train,
    y_train_encoded,
    epochs=20,
    batch_size=32,
    validation_split=0
)

Epoch 1/20
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.5816 - loss: 1.0033
Epoch 2/20
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8672 - loss: 0.3333
Epoch 3/20
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.8930 - loss: 0.2530
Epoch 4/20
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.9100 - loss: 0.2216
Epoch 5/20
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.9308 - loss: 0.1820
Epoch 6/20
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.9291 - loss: 0.1755
Epoch 7/20
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9314 - loss: 0.1595
Epoch 8/20
[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9342 - loss: 0.1645
Epoch 9/20
[1m230/230[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x79570b73f7f0>

In [48]:
LSTM_Model = Sequential([
    Input(shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2])),
    LSTM(64, activation='tanh', return_sequences=True),
    LSTM(64, activation='tanh'),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(6, activation='softmax')
])

LSTM_Model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

LSTM_Model.fit(
    X_train_reshaped,
    y_train_encoded,
    epochs=20,
    batch_size=32,
    validation_split=0.2
)

Epoch 1/20
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 711ms/step - accuracy: 0.2886 - loss: 1.4842 - val_accuracy: 0.4623 - val_loss: 1.0367
Epoch 2/20
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 717ms/step - accuracy: 0.4202 - loss: 1.1223 - val_accuracy: 0.3419 - val_loss: 1.3218
Epoch 3/20
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 651ms/step - accuracy: 0.2934 - loss: 1.4429 - val_accuracy: 0.3270 - val_loss: 1.4022
Epoch 4/20
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 651ms/step - accuracy: 0.2955 - loss: 1.4537 - val_accuracy: 0.2998 - val_loss: 1.4033
Epoch 5/20
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 667ms/step - accuracy: 0.3022 - loss: 1.4392 - val_accuracy: 0.3297 - val_loss: 1.3777
Epoch 6/20
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 642ms/step - accuracy: 0.3036 - loss: 1.4086 - val_accuracy: 0.3392 - val_loss: 1.2777
Epoc

<keras.src.callbacks.history.History at 0x79570b3f93f0>

In [49]:
trained_model = {}

trained_model['CNN'] = CNN_Model
trained_model['MLP'] = MLP_Model
trained_model['LSTM'] = LSTM_Model

### K-Fold Cross Validation
performing K-Fold Cross-Validation (K-Fold CV) on the trained models to evaluate their performance on the test dataset.

In [51]:
kfold_results = {}

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

print("K-Fold Cross-Validation:")
for name, model in trained_model.items():
    temp = X_test_reshaped if name == 'CNN' or name == 'LSTM' else X_test
    cv_results = cross_validate(model, temp, y_test_encoded, cv=kfold, scoring=scorers, return_train_score=False)
    kfold_results[name] = {metric: cv_results[f'test_{metric}'].mean() for metric in scorers.keys()}
    print(f"{name}: {kfold_results[name]}")

K-Fold Cross-Validation:


TypeError: Cannot clone object '<Sequential name=sequential_18, built=True>' (type <class 'keras.src.models.sequential.Sequential'>): it does not seem to be a scikit-learn estimator as it does not implement a 'get_params' method.

### LOSO Cross Validation
performing LOSO Cross-Validation (LOSO CV) on the trained models to evaluate their performance on the test dataset.

In [None]:
loso_results = {}

loso = LeaveOneGroupOut()
print("\nLeave-One-Subject-Out Cross-Validation:")

for name, model in trained_model.items():
    temp = X_test_reshaped if name == 'CNN' or name == 'LSTM' else X_test
    cv_results = cross_validate( model, temp, y_test_encoded, cv=loso.split(X_test, y_test['activity'], groups=subject_test['subject']), scoring=scorers)
    loso_results[name] = {metric: cv_results[f'test_{metric}'].mean() for metric in scorers.keys()}
    print(f"{name} (LOSO-CV): {loso_results[name]}")

## Plotting The Performance metrics of the models

### Preformance Evaluation according to K-fold Cross Validation

The Evaluation Metric of the 4 models i.e. Random Forest classifier,  Decision Tree classifier, Logistic Regression, AdaBoost classifier form K-fold Cross Validation technique.

In [None]:
import matplotlib.pyplot as plt
import numpy as np

metrics = ['accuracy', 'precision', 'recall', 'f1']
x = np.arange(len(metrics))
width = 0.2

fig, ax = plt.subplots(figsize=(6, 4))

for i, (name, res) in enumerate(kfold_results.items()):
    ax.bar(x + i * width, [res[m] for m in metrics], width, label=name)

ax.set_xlabel('Metrics')
ax.set_ylabel('Score')
ax.set_title('Performance Comparison (K-Fold CV)')
ax.set_xticks(x + width)
ax.set_xticklabels(metrics)
ax.legend()

plt.tight_layout()
plt.show()

### Preformance Evaluation according to LOSO Cross Validation

The Evaluation Metric of the 4 models i.e. Random Forest classifier,  Decision Tree classifier, Logistic Regression, AdaBoost classifier form LOSO Cross Validation technique.

In [None]:
import matplotlib.pyplot as plt
import numpy as np

metrics = ['accuracy', 'precision', 'recall', 'f1']
x = np.arange(len(metrics))
width = 0.2

fig, ax = plt.subplots(figsize=(6, 4))

for i, (name, res) in enumerate(loso_results.items()):
    ax.bar(x + i * width, [res[m] for m in metrics], width, label=name)

ax.set_xlabel('Metrics')
ax.set_ylabel('Score')
ax.set_title('Performance Comparison (LOSO CV)')
ax.set_xticks(x + width)
ax.set_xticklabels(metrics)
ax.legend()

plt.tight_layout()
plt.show()