In [4]:
import os
print(os.getcwd())

C:\Users\rimag


In [5]:
os.chdir('/Users/rimag/OneDrive/Documents')

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler

# Load the accelerometer dataset from a CSV file
df = pd.read_csv('full_accelerometer_data.csv')
df.head()





Unnamed: 0,_id,timestamp,activity,x,y,z
0,6708d19115ad73d8693f5684,2024-10-11T07:19:45.106944,running,0.862985,-9.453611,0.0
1,6708d19115ad73d8693f5685,2024-10-11T07:19:45.488485,running,-1.019892,-9.532064,-1.922103
2,6708d19115ad73d8693f5686,2024-10-11T07:19:45.614679,running,1.451384,-5.962443,-1.333704
3,6708d19115ad73d8693f5687,2024-10-11T07:19:45.741773,running,2.19669,-12.081793,4.079566
4,6708d19115ad73d8693f5688,2024-10-11T07:19:45.871725,running,2.039783,-9.767423,-0.196133


In [7]:
df.shape

(19302, 6)

In [8]:
# Drop the 'id' column
if '_id' in df.columns:
    df = df.drop('_id', axis=1)

In [9]:
# Convert activity labels to lowercase to standardise them
df.loc[:,'activity'] = df['activity'].str.lower()
# remove whitespace from activity labels using .loc
df.loc[:, 'activity'] = df['activity'].str.strip()
# Verify the unique labels after cleaning
print(df['activity'].unique())



['running' 'sitting' 'walking']


In [10]:
df

Unnamed: 0,timestamp,activity,x,y,z
0,2024-10-11T07:19:45.106944,running,0.862985,-9.453611,0.000000
1,2024-10-11T07:19:45.488485,running,-1.019892,-9.532064,-1.922103
2,2024-10-11T07:19:45.614679,running,1.451384,-5.962443,-1.333704
3,2024-10-11T07:19:45.741773,running,2.196690,-12.081793,4.079566
4,2024-10-11T07:19:45.871725,running,2.039783,-9.767423,-0.196133
...,...,...,...,...,...
19297,2024-10-11T13:19:08.660695,running,0.313813,0.039227,-0.078453
19298,2024-10-11T13:19:08.792901,running,14.631522,17.024344,14.827655
19299,2024-10-11T13:19:08.925240,running,1.725970,-0.039227,1.804424
19300,2024-10-11T13:19:09.058342,running,-0.156906,-1.294478,0.196133


In [11]:
# Feature extraction: calculating magnitude and mean of each axis (x, y, z) with .loc to avoid the warning
df.loc[:, 'magnitude'] = np.sqrt(df['x']**2 + df['y']**2 + df['z']**2)
df.loc[:, 'mean_x'] = df['x'].rolling(window=10).mean()
df.loc[:, 'mean_y'] = df['y'].rolling(window=10).mean()
df.loc[:, 'mean_z'] = df['z'].rolling(window=10).mean()

# Standard deviation, variance, and range
df.loc[:, 'std_x'] = df['x'].rolling(window=10).std()
df.loc[:, 'std_y'] = df['y'].rolling(window=10).std()
df.loc[:, 'std_z'] = df['z'].rolling(window=10).std()

df.loc[:, 'var_x'] = df['x'].rolling(window=10).var()
df.loc[:, 'var_y'] = df['y'].rolling(window=10).var()
df.loc[:, 'var_z'] = df['z'].rolling(window=10).var()

df.loc[:, 'range_x'] = df['x'].rolling(window=10).apply(lambda x: x.max() - x.min())
df.loc[:, 'range_y'] = df['y'].rolling(window=10).apply(lambda x: x.max() - x.min())
df.loc[:, 'range_z'] = df['z'].rolling(window=10).apply(lambda x: x.max() - x.min())


# Drop NaN values 
df = df.dropna()

# View the extracted features
print(df.head())


                     timestamp activity         x          y         z  \
9   2024-10-11T07:19:46.519314  running  5.962443 -17.220477  1.372931   
10  2024-10-11T07:19:46.648153  running  0.745305  -2.706635  0.627626   
11  2024-10-11T07:19:46.774390  running  1.804424  -9.335931 -1.725970   
12  2024-10-11T07:19:46.901388  running  0.980665 -19.848660 -0.392266   
13  2024-10-11T07:19:47.027923  running  1.608291  -1.686744  0.823759   

    magnitude    mean_x     mean_y    mean_z     std_x     std_y     std_z  \
9   18.275134  1.965253  -9.814495 -0.023536  1.748356  5.754804  1.781174   
10   2.876677  1.953485  -9.139798  0.039227  1.756974  6.181506  1.793113   
11   9.664084  2.235916  -9.120185  0.058840  1.420728  6.180434  1.770202   
12  19.876742  2.188844 -10.508806  0.152984  1.456945  6.909141  1.711991   
13   2.471898  2.130004  -9.469301 -0.172597  1.468430  7.410014  1.072385   

       var_x      var_y     var_z   range_x    range_y   range_z  
9   3.056748  33.11

In [12]:
df

Unnamed: 0,timestamp,activity,x,y,z,magnitude,mean_x,mean_y,mean_z,std_x,std_y,std_z,var_x,var_y,var_z,range_x,range_y,range_z
9,2024-10-11T07:19:46.519314,running,5.962443,-17.220477,1.372931,18.275134,1.965253,-9.814495,-0.023536,1.748356,5.754804,1.781174,3.056748,33.117764,3.172580,6.982335,18.240369,6.197803
10,2024-10-11T07:19:46.648153,running,0.745305,-2.706635,0.627626,2.876677,1.953485,-9.139798,0.039227,1.756974,6.181506,1.793113,3.086958,38.211015,3.215254,6.982335,18.318822,6.197803
11,2024-10-11T07:19:46.774390,running,1.804424,-9.335931,-1.725970,9.664084,2.235916,-9.120185,0.058840,1.420728,6.180434,1.770202,2.018467,38.197765,3.133616,5.217138,18.318822,6.197803
12,2024-10-11T07:19:46.901388,running,0.980665,-19.848660,-0.392266,19.876742,2.188844,-10.508806,0.152984,1.456945,6.909141,1.711991,2.122690,47.736226,2.930914,5.217138,18.318822,6.197803
13,2024-10-11T07:19:47.027923,running,1.608291,-1.686744,0.823759,2.471898,2.130004,-9.469301,-0.172597,1.468430,7.410014,1.072385,2.156285,54.908314,1.150010,5.217138,19.338714,3.491167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19297,2024-10-11T13:19:08.660695,running,0.313813,0.039227,-0.078453,0.325841,5.546641,6.515538,4.503214,7.598476,8.090205,8.263464,57.736835,65.451409,68.284837,23.104467,19.966339,27.184034
19298,2024-10-11T13:19:08.792901,running,14.631522,17.024344,14.827655,26.902957,4.718960,6.417472,3.353874,5.714384,7.940008,5.076415,32.654180,63.043730,25.769987,14.827655,19.574073,15.690640
19299,2024-10-11T13:19:08.925240,running,1.725970,-0.039227,1.804424,2.497290,4.499291,5.934985,3.353874,5.790116,8.192759,5.076415,33.525440,67.121303,25.769987,14.827655,19.574073,15.690640
19300,2024-10-11T13:19:09.058342,running,-0.156906,-1.294478,0.196133,1.318621,4.503214,5.868299,3.393101,5.786593,8.254592,5.047368,33.484664,68.138282,25.475919,14.788428,19.574073,15.690640


In [13]:
# Define feature columns and target column (activity label)
feature_columns = ['magnitude', 'mean_x', 'mean_y', 'mean_z', 'std_x', 'std_y', 'std_z', 
                   'var_x', 'var_y', 'var_z', 'range_x', 'range_y', 'range_z']
X = df[feature_columns]
y = df['activity']  # Activity column should contain labels 'sitting', 'walking', 'running'.


In [14]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [15]:
# Standardise features for better model performance
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [16]:

# Initialise classifiers
dt_model = DecisionTreeClassifier(random_state=42)
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
knn_model = KNeighborsClassifier(n_neighbors=5)


In [17]:
# Train Decision Tree model
dt_model.fit(X_train, y_train)


In [18]:
y_pred_dt = dt_model.predict(X_test)

In [19]:
# Evaluate Decision Tree

print("Decision Tree Classifier:")
print(classification_report(y_test, y_pred_dt))
print(f"Accuracy: {accuracy_score(y_test, y_pred_dt)}\n")

Decision Tree Classifier:
              precision    recall  f1-score   support

     running       0.99      0.99      0.99      1529
     sitting       0.99      0.99      0.99      1202
     walking       0.98      0.98      0.98      1128

    accuracy                           0.99      3859
   macro avg       0.99      0.99      0.99      3859
weighted avg       0.99      0.99      0.99      3859

Accuracy: 0.9880798134231666



In [20]:
# Train Random Forest model
rf_model.fit(X_train, y_train)



In [21]:
y_pred_rf = rf_model.predict(X_test)


In [22]:
# Evaluate Random Forest
print("Random Forest Classifier:")
print(classification_report(y_test, y_pred_rf))
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf)}\n")

Random Forest Classifier:
              precision    recall  f1-score   support

     running       1.00      0.99      1.00      1529
     sitting       1.00      0.99      0.99      1202
     walking       0.98      1.00      0.99      1128

    accuracy                           0.99      3859
   macro avg       0.99      0.99      0.99      3859
weighted avg       0.99      0.99      0.99      3859

Accuracy: 0.9914485618035761



In [23]:

# Train K-Nearest Neighbors model
knn_model.fit(X_train, y_train)



In [24]:
y_pred_knn = knn_model.predict(X_test)

In [25]:
# Evaluate K-Nearest Neighbors
print("K-Nearest Neighbors Classifier:")
print(classification_report(y_test, y_pred_knn))
print(f"Accuracy: {accuracy_score(y_test, y_pred_knn)}\n")

K-Nearest Neighbors Classifier:
              precision    recall  f1-score   support

     running       1.00      0.99      1.00      1529
     sitting       1.00      0.99      0.99      1202
     walking       0.97      1.00      0.99      1128

    accuracy                           0.99      3859
   macro avg       0.99      0.99      0.99      3859
weighted avg       0.99      0.99      0.99      3859

Accuracy: 0.9922259652759783



## Cross-validation
Cross validation is a technique used to assess how well a machine learning model will generalize to unseen data. Instead of splitting the dataset into just one training set and one test set, cross-validation divides the dataset into multiple training and testing splits, and the model is trained and evaluated on each split. This helps prevent overfitting and gives a better idea of model performance.

#### Purpose of Cross-Validation:

- Reliable Estimate of Model Performance: By evaluating the model multiple times across different splits of the data, the bias associated with relying on a single test set is reduced.

- Reduced Overfitting: This technique prevents the model from being overfit, but rather ensures it generalizes smoothly across different data points without overfitting.

- Model Selection: Averaging results over multiple folds helps compare the performance of different models or hyperparameters.

In [26]:
from sklearn.model_selection import cross_val_score

# Initialise classifiers
dt_model = DecisionTreeClassifier(random_state=42)
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
knn_model = KNeighborsClassifier(n_neighbors=5)

# Use 5-fold cross-validation for each model
cv = 5  # 5-fold cross-validation (most common)

# Cross-validation for Decision Tree
dt_scores = cross_val_score(dt_model, X, y, cv=cv, scoring='accuracy')
print(f"Decision Tree Cross-Validation Accuracy: {dt_scores.mean()} ± {dt_scores.std()}")

# Cross-validation for Random Forest
rf_scores = cross_val_score(rf_model, X, y, cv=cv, scoring='accuracy')
print(f"Random Forest Cross-Validation Accuracy: {rf_scores.mean()} ± {rf_scores.std()}")

# Cross-validation for K-Nearest Neighbors
knn_scores = cross_val_score(knn_model, X, y, cv=cv, scoring='accuracy')
print(f"K-Nearest Neighbors Cross-Validation Accuracy: {knn_scores.mean()} ± {knn_scores.std()}")


Decision Tree Cross-Validation Accuracy: 0.8886590844640073 ± 0.04305132616920244
Random Forest Cross-Validation Accuracy: 0.915249829695308 ± 0.0357996457427287
K-Nearest Neighbors Cross-Validation Accuracy: 0.9023481158208927 ± 0.0698042575989596


## Deploy model using pickle package

In [27]:
import pickle

In [28]:

# Save the Decision Tree model
with open('dt_model.pkl', 'wb') as file:
    pickle.dump(dt_model, file)

# Save the Random Forest model
with open('rf_model.pkl', 'wb') as file:
    pickle.dump(rf_model, file)

# Save the KNN model
with open('knn_model.pkl', 'wb') as file:
    pickle.dump(knn_model, file)
    
print("Models have been saved to separate files.")



Models have been saved to separate files.


In [29]:
# run this cell to check the version of scikit-learn on your PC

import sklearn
sklearn.__version__


'1.5.2'

You can find the two .pkl files in the same directory as this Notebook. You can send the files to the Raspberry Pi using scp (secure file copy over SSH, use normal cmd promt)






(now back to pi)On your Raspberry Pi, you can load the files back into Python objects, provided that the scikit-learn library is of the same verision. You can check this on your Pi using the following command in a terminal:



Check that the two versions match. If not, you might need to upgrade scikit-learn on your Pi:




Once comepleted, you can move on to loading the model into your Pi.

##### Note: executing the following cell may not cause error in Jupyter Notebook, but you should continue on your Raspberry Pi to deploy the model.

Create a new Python script in the directory where you saved the two .pkl files (i.e. ~\Documents).

In [31]:
# Run this on your Pi

import pickle

with open('dt_model.pkl', 'rb') as file:
    model = pickle.load(file)



print(model) # check that we got the  model back correct

DecisionTreeClassifier(random_state=42)


In [None]:
import pickle
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Load the K-Nearest Neighbors model
with open('/home/pi/Documents/knn_model.pkl', 'rb') as f: #change the pathway as needed
    knn_model = pickle.load(f)

# Load the Random Forest model
with open('/home/pi/Documents/rf_model.pkl', 'rb') as f:
    rf_model = pickle.load(f)

# Load the Decision Tree model
with open('/home/pi/Documents/dt_model.pkl', 'rb') as f:
    dt_model = pickle.load(f)

# Now you can use `knn_model`, `rf_model`, and `dt_model`

## Attempt at deploying


run this on pi using py file


In [None]:
import time
import board
import busio
import pickle
import numpy as np
import pandas as pd
from adafruit_adxl34x import ADXL343

# Load the pre-trained models
with open('dt_model.pkl', 'rb') as model_file:
    dt_model = pickle.load(model_file)

with open('knn_model.pkl', 'rb') as model_file:
    knn_model = pickle.load(model_file)

with open('rf_model.pkl', 'rb') as model_file:
    rf_model = pickle.load(model_file)

# Setup I2C connection and ADXL343 sensor
i2c = busio.I2C(board.SCL, board.SDA)
accelerometer = ADXL343(i2c)
accelerometer.range = ADXL343.Range.RANGE_4_G


# Feature extraction function
def extract_features(data):
    if len(data) > 0:
        df = pd.DataFrame(data, columns=['x', 'y', 'z'])
        df['magnitude'] = np.sqrt(df['x']**2 + df['y']**2 + df['z']**2)
        
        # Extract features
        features = {
            'mean_x': df['x'].mean(),
            'mean_y': df['y'].mean(),
            'mean_z': df['z'].mean(),
            'magnitude': df['magnitude'].mean(),
            'std_x': df['x'].std(),
            'std_y': df['y'].std(),
            'std_z': df['z'].std(),
            'var_x': df['x'].var(),
            'var_y': df['y'].var(),
            'var_z': df['z'].var(),
            'range_x': df['x'].max() - df['x'].min(),
            'range_y': df['y'].max() - df['y'].min(),
            'range_z': df['z'].max() - df['z'].min()
        }
        
        return pd.DataFrame([features])
    return None

# Collect data and make predictions
data_buffer = []
try:
    while True:
        x, y, z = accelerometer.acceleration
        data_buffer.append((x, y, z))

        if len(data_buffer) >= 10:  # Adjust this number based on your needs
            features_df = extract_features(data_buffer)
            if features_df is not None:
                # Predict using all models
                dt_prediction = dt_model.predict(features_df)
                knn_prediction = knn_model.predict(features_df)
                rf_prediction = rf_model.predict(features_df)

                # Print predictions from all models
                print(f'Decision Tree Prediction: {dt_prediction[0]}')
                print(f'KNN Prediction: {knn_prediction[0]}')
                print(f'Random Forest Prediction: {rf_prediction[0]}')

            # Clear the buffer
            data_buffer = []

        time.sleep(0.1)  # Adjust sleep for desired sampling rate

except KeyboardInterrupt:
    print("Stopping data collection.")
finally:
    print("Data collection complete.")





In [None]:
#FOR CONFUSION MATRIX (DRAFT)
from sklearn.metrics import confusion_matrix, classification_report

# Generate confusion matrix for each model
dt_cm = confusion_matrix(y_true, dt_predictions)
knn_cm = confusion_matrix(y_true, knn_predictions)
rf_cm = confusion_matrix(y_true, rf_predictions)

# Print classification reports
print("Decision Tree Classification Report:")
print(classification_report(y_true, dt_predictions))

print("KNN Classification Report:")
print(classification_report(y_true, knn_predictions))

print("Random Forest Classification Report:")
print(classification_report(y_true, rf_predictions))



In [None]:

#VISUALISING CONFUSION MATRIX (I think pick which one is most accurate?idk)
import matplotlib.pyplot as plt
import seaborn as sns

def plot_confusion_matrix(cm, model_name):
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['walking', 'sitting', 'running'], yticklabels=['walking', 'sitting', 'running'])
    plt.title(f'Confusion Matrix for {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

# Plot confusion matrices for each model
plot_confusion_matrix(dt_cm, 'Decision Tree')
plot_confusion_matrix(knn_cm, 'KNN')
plot_confusion_matrix(rf_cm, 'Random Forest')