In [1]:
try:
    from google.colab import drive

    drive.mount("/content/drive")
    import zipfile

    with zipfile.ZipFile("/content/drive/MyDrive/Bigdata/Dataset.zip", "r") as zip_ref:
        zip_ref.extractall("./")
    datasetFolder = "/content/Dataset/Protocol/"
except:
    datasetFolder = "/content/Dataset/Protocol/"
    print("Using Local Machine")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
import pickle
from datetime import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    roc_curve,
)
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    VotingClassifier,
)

In [4]:
ModelDir = "./Models/" + datetime.now().strftime("%Y%m%d-%H") + "/"

if not os.path.exists("./Models/"):
    os.mkdir("./Models/")

if not os.path.exists(ModelDir):
    os.mkdir(ModelDir)

In [5]:
def HeartRatePreprocess(dataFrame: pd.DataFrame) -> pd.DataFrame:
    heartRateStartIndex = 0
    heartRates = dataFrame["heart rate (bpm)"].values
    heartRate = dataFrame["heart rate (bpm)"].median()
    for index in range(len(heartRates)):
        if heartRates[index] > 0:
            heartRate = heartRates[index]
            heartRateStartIndex = index
            break
    newHeartRates = []
    dataFrame = dataFrame[:][heartRateStartIndex:]
    for index in range(dataFrame.shape[0]):
        if heartRates[index] > 0:
            heartRate = heartRates[index]
            newHeartRates.append(heartRate)
        else:
            newHeartRates.append(heartRate)
    dataFrame["heart rate (bpm)"] = newHeartRates
    return dataFrame

In [6]:
def PreprocessDataset(dataFrame: pd.DataFrame) -> pd.DataFrame:
    dataFrame = dataFrame.drop(columns=["timestamp (s)"])
    columns = dataFrame.columns
    preprocessedDataFrame = []
    uniqueActivities = list(dataFrame["activityID"].unique())
    for uniqueActivity in uniqueActivities:
        subDataFrame = pd.DataFrame(
            dataFrame[dataFrame["activityID"] == uniqueActivity]
        )
        for column in columns:
            if column == "heart rate (bpm)":
                subDataFrame = HeartRatePreprocess(subDataFrame)
            else:
                columnMean = dataFrame[column].mean()
                subDataFrame[column] = subDataFrame[column].fillna(columnMean)
                # columnMedian = dataFrame[column].median()
                # subDataFrame[column] = subDataFrame[column].fillna(columnMedian)

        preprocessedDataFrame.append(subDataFrame)
    preprocessedDataFrame = pd.concat(preprocessedDataFrame, ignore_index=True)
    return preprocessedDataFrame

In [7]:
def DescribeDataFrame(dataFrame: pd.DataFrame) -> None:
    for column in dataFrame.columns:
        NaNCount = dataFrame.shape[0] - dataFrame[column].count()
        print("NaN Count in", column, ":", NaNCount)
        print(dataFrame[column].describe())
        print("-" * 30)

In [8]:
def LoadDataSet(folderPath):
    if os.path.isdir(folderPath) and folderPath[-1] != "/":
        print("enter a valid folderPath")
        return None
    dataFrames = []
    csvFiles = os.listdir(folderPath)
    for csvFile in csvFiles:
        print("Reading DataSet from", csvFile)
        dataFrame = pd.read_csv(
            folderPath + csvFile, index_col=None, header=0, dtype=float
        )
        dataFrame = PreprocessDataset(dataFrame)
        dataFrames.append(dataFrame)
    dataFrames = pd.concat(dataFrames, ignore_index=True)
    return dataFrames


dataFrame = LoadDataSet("Dataset/Protocol/")

Reading DataSet from subject102.csv
Reading DataSet from subject104.csv


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Reading DataSet from subject101.csv
Reading DataSet from subject107.csv
Reading DataSet from subject106.csv
Reading DataSet from subject103.csv
Reading DataSet from subject108.csv
Reading DataSet from subject109.csv
Reading DataSet from subject105.csv


In [9]:
dataFrame = dataFrame.dropna()
DescribeDataFrame(dataFrame)

NaN Count in activityID : 0
count    2.872047e+06
mean     5.465736e+00
std      6.331087e+00
min      0.000000e+00
25%      0.000000e+00
50%      3.000000e+00
75%      7.000000e+00
max      2.400000e+01
Name: activityID, dtype: float64
------------------------------
NaN Count in heart rate (bpm) : 0
count    2.872047e+06
mean     1.098783e+02
std      2.587579e+01
min      5.700000e+01
25%      9.000000e+01
50%      1.080000e+02
75%      1.250000e+02
max      2.020000e+02
Name: heart rate (bpm), dtype: float64
------------------------------
NaN Count in hand_temperature (°C)  : 0
count    2.872047e+06
mean     3.265261e+01
std      1.840998e+00
min      2.475000e+01
25%      3.143750e+01
50%      3.312500e+01
75%      3.400000e+01
max      3.550000e+01
Name: hand_temperature (°C) , dtype: float64
------------------------------
NaN Count in hand_acc_16g_X : 0
count    2.872047e+06
mean    -4.961409e+00
std      5.971751e+00
min     -1.453670e+02
25%     -9.021390e+00
50%     -5.760690e

In [10]:
dataFrame.head()

Unnamed: 0,activityID,heart rate (bpm),hand_temperature (°C),hand_acc_16g_X,hand_acc_16g_Y,hand_acc_16g_Z,hand_acc_6g_X,hand_acc_6g_Y,hand_acc_6g_Z,hand_gyro_X,...,ankle_acc_16g_Z,ankle_acc_6g_X,ankle_acc_6g_Y,ankle_acc_6g_Z,ankle_gyro_X,ankle_gyro_Y,ankle_gyro_Z,ankle_mag_X,ankle_mag_Y,ankle_mag_Z
0,0.0,100.0,33.0,2.80997,7.0943,-7.52376,2.79373,7.43545,-7.07744,1.14829,...,-1.76432,9.73032,-0.756157,-1.35749,-0.001695,-0.034529,-0.040074,-47.5393,-2.38909,60.9623
1,0.0,100.0,33.0,2.77138,6.98077,-7.52396,2.80627,7.29929,-7.24326,1.24888,...,-1.76223,9.70051,-0.816589,-1.29659,0.055437,-0.01156,0.015434,-47.4246,-2.72082,60.7103
2,0.0,100.0,33.0,2.7671,7.13119,-7.67848,2.78766,7.11797,-7.48445,1.27178,...,-2.03407,9.73043,-0.81662,-1.32702,0.017184,-0.034663,0.007239,-47.5412,-2.60153,60.3412
3,0.0,100.0,33.0,2.60926,7.13068,-7.87328,2.8163,7.17794,-7.72623,1.30517,...,-1.7627,9.71546,-0.846854,-1.31171,-0.015741,-0.053576,0.026668,-47.5325,-3.15502,60.459
4,0.0,100.0,33.0,2.27397,7.05877,-7.72411,2.64914,7.10356,-7.8165,1.26933,...,-1.83944,9.70044,-0.861982,-1.31158,0.01429,-0.019795,0.019187,-47.6507,-2.92537,60.091


In [11]:
dataFrame.columns

Index(['activityID', 'heart rate (bpm)', 'hand_temperature (°C) ',
       'hand_acc_16g_X', 'hand_acc_16g_Y', 'hand_acc_16g_Z', 'hand_acc_6g_X',
       'hand_acc_6g_Y', 'hand_acc_6g_Z', 'hand_gyro_X', 'hand_gyro_Y',
       'hand_gyro_Z', 'hand_mag_X', 'hand_mag_Y', 'hand_mag_Z',
       'chest_temperature (°C) ', 'chest_acc_16g_X', 'chest_acc_16g_Y',
       'chest_acc_16g_Z', 'chest_acc_6g_X', 'chest_acc_6g_Y', 'chest_acc_6g_Z',
       'chest_gyro_X', 'chest_gyro_Y', 'chest_gyro_Z', 'chest_mag_X',
       'chest_mag_Y', 'chest_mag_Z', 'ankle_temperature (°C) ',
       'ankle_acc_16g_X', 'ankle_acc_16g_Y', 'ankle_acc_16g_Z',
       'ankle_acc_6g_X', 'ankle_acc_6g_Y', 'ankle_acc_6g_Z', 'ankle_gyro_X',
       'ankle_gyro_Y', 'ankle_gyro_Z', 'ankle_mag_X', 'ankle_mag_Y',
       'ankle_mag_Z'],
      dtype='object')

In [12]:
X = dataFrame.drop(columns=["activityID"])
y = dataFrame["activityID"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [13]:
RFClassifer = RandomForestClassifier(
    n_estimators=100, random_state=42, n_jobs=-1, verbose=True
)
GBClassifer = GradientBoostingClassifier(
    n_estimators=100, random_state=42, verbose=True
)
SVMClassifer = make_pipeline(
    StandardScaler(), SVC(probability=True, random_state=42, verbose=True)
)
# KNNClassifer = KNeighborsClassifier(n_jobs=-1)
# MPClassifer = make_pipeline(StandardScaler(), MLPClassifier(random_state=42, verbose= True))
# XGClassifer = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42, n_jobs=-1)

EnsembleClassifer = VotingClassifier(
    estimators=[
        ("RF", RFClassifer),
        ("GB", GBClassifer)
        # , ('SVM', SVMClassifer),
        # ("KNN", KNNClassifer),('MLP', MPClassifer),("XGB", XGClassifer)
    ],
    voting="soft",
)

In [14]:
EnsembleClassifer.fit(X_train, y_train)
pklFileName = "RF-GB"
with open(ModelDir + pklFileName + ".pkl", "wb") as pklFile:
    pickle.dump(EnsembleClassifer, pklFile)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 13.6min finished


      Iter       Train Loss   Remaining Time 
         1           1.8024         1221.78m
         2           1.6266         1208.92m
         3           1.5001         1195.06m
         4           1.3998         1179.74m
         5           1.3178         1166.12m
         6           1.2444         1152.76m
         7           1.1817         1140.03m
         8           1.1259         1126.68m
         9           1.0802         1113.51m
        10           1.0345         1101.13m
        20           0.7235          979.07m
        30           0.5678          860.97m
        40           0.4680          740.74m
        50           0.4003          616.83m
        60           0.3489          494.49m
        70           0.3085          371.62m
        80           0.2769          248.03m
        90           0.2483          124.12m
       100           0.2245            0.00s


In [None]:
y_pred = EnsembleClassifer.predict(X_test)
y_pred_proba = EnsembleClassifer.predict_proba(X_test)[:, 1]
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
print("Model Metrics:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


print("Plotting Confusion Matrix")
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt="d")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix")
plt.show()
