In [1]:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import math
from scipy import stats
import h5py
from sklearn.preprocessing import MinMaxScaler, StandardScaler

activityIDdict = {0: 'transient',
              1: 'lying',# no change in index
              2: 'sitting',# no change in index
              3: 'standing',# no change in index
              4: 'walking',# no change in index
              5: 'running',# no change in index
              6: 'cycling',# no change in index
              7: 'Nordic_walking',# no change in index
              9: 'watching_TV', # not in dataset
              10: 'computer_work',# not in dataset
              11: 'car driving', # not in dataset
              12: 'ascending_stairs', # new index:8
              13: 'descending_stairs', # new index:9
              16: 'vacuum_cleaning', # new index:10
              17: 'ironing', # new index:11
              18: 'folding_laundry',# not in dataset
              19: 'house_cleaning', # not in dataset
              20: 'playing_soccer', # not in dataset
              24: 'rope_jumping' # new index: 0 
              }
#{24:0,1:1,2:2,3:3,4:4,5:5,6:6,7:7,12:8,13:9,16:10,17:11}

In [None]:
df= pd.read_table('C:/Users/dixit/Desktop/PAMAP2_Dataset/Protocol/subject101.dat', header=None,sep='\s+')

In [None]:
['rope_jumping','lying','sitting','standing','walking','running','cycling','Nordic_walking','ascending_stairs','descending_stairs','vacuum_cleaning','ironing']

In [None]:
df.head()

In [None]:
def read_files():
    list_of_files = ['C:/Users/dixit/Desktop/PAMAP2_Dataset/Protocol/subject101.dat',
                     'C:/Users/dixit/Desktop/PAMAP2_Dataset/Protocol/subject102.dat',
                     'C:/Users/dixit/Desktop/PAMAP2_Dataset/Protocol/subject103.dat',
                     'C:/Users/dixit/Desktop/PAMAP2_Dataset/Protocol/subject104.dat',
                     'C:/Users/dixit/Desktop/PAMAP2_Dataset/Protocol/subject105.dat',
                     'C:/Users/dixit/Desktop/PAMAP2_Dataset/Protocol/subject106.dat',
                     'C:/Users/dixit/Desktop/PAMAP2_Dataset/Protocol/subject107.dat',
                     'C:/Users/dixit/Desktop/PAMAP2_Dataset/Protocol/subject108.dat',
                     'C:/Users/dixit/Desktop/PAMAP2_Dataset/Protocol/subject109.dat' ]
    
    subjectID = [1,2,3,4,5,6,7,8,9]
    

    
    colNames = ["timestamp", "activityID","heartrate"]
    
    IMUhand = ['handTemperature', 
               'handAcc16_1', 'handAcc16_2', 'handAcc16_3', 
               'handAcc6_1', 'handAcc6_2', 'handAcc6_3', 
               'handGyro1', 'handGyro2', 'handGyro3', 
               'handMagne1', 'handMagne2', 'handMagne3',
               'handOrientation1', 'handOrientation2', 'handOrientation3', 'handOrientation4']
    
    IMUchest = ['chestTemperature', 
               'chestAcc16_1', 'chestAcc16_2', 'chestAcc16_3', 
               'chestAcc6_1', 'chestAcc6_2', 'chestAcc6_3', 
               'chestGyro1', 'chestGyro2', 'chestGyro3', 
               'chestMagne1', 'chestMagne2', 'chestMagne3',
               'chestOrientation1', 'chestOrientation2', 'chestOrientation3', 'chestOrientation4']
    
    
    IMUankle = ['ankleTemperature', 
               'ankleAcc16_1', 'ankleAcc16_2', 'ankleAcc16_3', 
               'ankleAcc6_1', 'ankleAcc6_2', 'ankleAcc6_3', 
               'ankleGyro1', 'ankleGyro2', 'ankleGyro3', 
               'ankleMagne1', 'ankleMagne2', 'ankleMagne3',
               'ankleOrientation1', 'ankleOrientation2', 'ankleOrientation3', 'ankleOrientation4']
    
    columns = colNames + IMUhand + IMUchest + IMUankle
    
    dataCollection = pd.DataFrame()
    for file in list_of_files:
        print(file," is reading...")
        procData = pd.read_table(file, header=None, sep='\s+')
        procData.columns = columns
        procData['subject_id'] = int(file[-5])
        dataCollection = dataCollection.append(procData, ignore_index=True)
        
    print("all files have been read...")
        
    dataCollection.reset_index(drop=True, inplace=True)
    
    return dataCollection

In [None]:
data = read_files()

In [None]:
data.head()
data[data['subject_id']==1]

In [None]:
data['heartrate'].isnull().sum()/len(data['heartrate']) *100
## the feature 'heartrate' has over 90% of NAN values

In [None]:
def dataCleaning(dataCollection):
    dataCollection = dataCollection.drop(['timestamp', 'handOrientation1', 'handOrientation2', 'handOrientation3', 'handOrientation4',
                                         'chestOrientation1', 'chestOrientation2', 'chestOrientation3', 'chestOrientation4',
                                         'ankleOrientation1', 'ankleOrientation2', 'ankleOrientation3', 'ankleOrientation4'],
                                         axis = 1)  # removal of orientation columns as they are not needed
    dataCollection = dataCollection.drop(dataCollection[dataCollection.activityID == 0].index) #removal of any row of activity 0 as it is transient activity which it is not used
    dataCollection = dataCollection.apply(pd.to_numeric, errors = 'coerce') #removal of non numeric data in cells
    dataCollection = dataCollection.drop('heartrate',axis=1)#as 98% of rows has null value
    dataCollection = dataCollection.dropna()
    print("data cleaned!")
    return dataCollection

In [None]:
cleaned_data = dataCleaning(data)

In [None]:
cleaned_data.head()

In [15]:
def reset_label(dataCollection): 
    # Convert original labels {1, 2, 3, 4, 5, 6, 7, 12, 13, 16, 17, 24} to new labels. 
    mapping = {24:0,1:1,2:2,3:3,4:4,5:5,6:6,7:7,12:8,13:9,16:10,17:11} # old activity Id to new activity Id 
    for i in [24,12,13,16,17]:
        dataCollection.loc[dataCollection.activityID == i, 'activityID'] = mapping[i]

    return dataCollection

In [16]:
data_reset = reset_label(cleaned_data)

In [17]:
def class_breakdown(data,col):
    # group data by the class value and calculate the number of rows
    counts = data.groupby(col).size()
    # retrieve raw rows
    counts = counts.values
    # summarize
    for i in range(len(counts)):
        percent = counts[i] / len(data) * 100
        print('Class=%d, total=%d, percentage=%.3f' % (i+1, counts[i], percent))

In [18]:
class_breakdown(data_reset,'activityID')

Class=1, total=47579, percentage=2.476
Class=2, total=192290, percentage=10.008
Class=3, total=184645, percentage=9.610
Class=4, total=188984, percentage=9.836
Class=5, total=229709, percentage=11.955
Class=6, total=95641, percentage=4.978
Class=7, total=163302, percentage=8.499
Class=8, total=184444, percentage=9.599
Class=9, total=117094, percentage=6.094
Class=10, total=104865, percentage=5.458
Class=11, total=174976, percentage=9.107
Class=12, total=237902, percentage=12.382


In [19]:
X=data_reset.drop(['activityID'],axis=1)
y=data_reset['activityID']

In [20]:
from imblearn.under_sampling import NearMiss
from sklearn.preprocessing import LabelEncoder
def sampling(X,y):
    y_encoded = LabelEncoder().fit_transform(y)
    undersample = NearMiss(version=1)
    X_sampled,y_sampled=undersample.fit_resample(X,y_encoded)
    return X_sampled,y_sampled

In [21]:
X_sampled,y_sampled=sampling(X,y)

In [22]:
X_=X_sampled.drop('subject_id',axis=1)
X_subID=X_sampled['subject_id']

In [23]:
def scale(df):#pandas dataframe 
    features=df.columns[0:39]
    scaler = MinMaxScaler(feature_range=(-1,1))
    #scaler = StandardScaler()
    #df.iloc[:,[1,-1]] = scaler.fit_transform(df.iloc[:,[1,-1]])
    df[features]=scaler.fit_transform(df[features])
    #df=scaler.fit_transform(df)
    return df

In [24]:
data_scaled =scale(X_)

In [25]:
data_scaled.shape

(570948, 39)

In [26]:
X_sampled_scaled=pd.concat([pd.DataFrame(y_sampled,columns = ['activityID']),pd.DataFrame(data_scaled)],axis=1)

In [27]:
X_sampled_scaled=pd.concat([pd.DataFrame(X_sampled_scaled),pd.DataFrame(X_subID,columns = ['subject_id'])],axis=1)

In [28]:
X_sampled_scaled[X_sampled_scaled['subject_id']==1]

Unnamed: 0,activityID,handTemperature,handAcc16_1,handAcc16_2,handAcc16_3,handAcc6_1,handAcc6_2,handAcc6_3,handGyro1,handGyro2,...,ankleAcc6_1,ankleAcc6_2,ankleAcc6_3,ankleGyro1,ankleGyro2,ankleGyro3,ankleMagne1,ankleMagne2,ankleMagne3,subject_id
0,0,0.000000,0.582383,0.212703,-0.561298,0.562925,0.103353,0.043748,0.132603,-0.126435,...,0.146034,-0.026100,0.155555,0.187146,0.140922,-0.170805,-0.189001,0.418305,-0.461998,1
1,0,0.000000,0.578270,0.214282,-0.558970,0.561514,0.107877,0.044565,0.129600,-0.126985,...,0.145298,-0.025856,0.156153,0.186261,0.142555,-0.170401,-0.206053,0.423275,-0.473034,1
2,0,0.000000,0.575302,0.210668,-0.563306,0.559691,0.108639,0.045814,0.126588,-0.128092,...,0.144808,-0.025611,0.156452,0.188107,0.143635,-0.166816,-0.200219,0.413122,-0.465169,1
3,0,0.000000,0.575302,0.195369,-0.560908,0.556952,0.100118,0.038355,0.123791,-0.127763,...,0.144804,-0.025612,0.155264,0.187273,0.140692,-0.167630,-0.200290,0.417363,-0.466728,1
4,0,0.000000,0.579624,0.203510,-0.562079,0.556112,0.089587,0.042141,0.128218,-0.127336,...,0.144806,-0.025856,0.155859,0.187279,0.139304,-0.168035,-0.194790,0.424258,-0.471513,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
570823,11,0.588235,0.644669,0.172532,-0.563923,0.634265,0.066371,0.040053,0.162919,-0.125323,...,0.162001,-0.008210,0.222824,0.083844,0.182750,-0.204519,-0.166470,-0.270121,0.310251,1
570832,11,0.564706,0.612712,0.210675,-0.548116,0.599067,0.110477,0.087806,0.087984,-0.085186,...,0.146593,-0.009239,0.173609,0.184505,0.139583,-0.168210,-0.213074,-0.165675,0.298939,1
570899,11,0.588235,0.439615,0.195959,-0.564177,0.416797,0.087256,0.034817,0.081955,-0.123723,...,0.134845,0.048618,0.182667,0.172377,0.156948,-0.172809,-0.199872,-0.429908,0.255842,1
570930,11,0.576471,0.574327,0.194468,-0.551084,0.559568,0.085570,0.067960,0.132166,-0.132640,...,0.146343,-0.010949,0.172726,0.185717,0.144283,-0.169214,-0.210238,-0.164432,0.303742,1


In [38]:
def segment_signal(data, window_size): # data is numpy array
    n = len(data)
    X = []
    y = []
    start = 0
    end = 0
    while start + window_size - 1 < n:
        end = start + window_size-1
        # if the frame contains the same activity and from the same object
        X.append(data[start:(end+1),1:-1])
        y.append(data[start][0])
        start += window_size//2 # 50% overlap
    print(np.asarray(X).shape, np.asarray(y).shape)
    return {'inputs' : np.asarray(X), 'labels': np.asarray(y,dtype=int)}

In [39]:
# data segmented
window_size=128

data_segmented=segment_signal(X_sampled_scaled.to_numpy(),window_size)

(8920, 128, 39) (8920,)


In [40]:
def save_data(data,file_name): # save the data in h5 format
    f = h5py.File(file_name,'w')
    for key in data:
        print(key)
        f.create_dataset(key,data = data[key])       
    f.close()
    print('Done.')    

In [41]:
file_name = 'pamap_scaled.h5'

In [1]:
file_name = 'pamap_sampled_scaled_segmented.h5'

In [48]:
save_data(data_segmented, file_name)

inputs
labels
Done.


In [49]:
import sys
from keras.models import Model
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import tensorflow as tf
from sklearn import metrics
import h5py
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from tensorflow.keras.utils import to_categorical

In [54]:
path = "C:/Users/dixit/Desktop/Ensem_HAR/Ensem_HAR-main/Implementation_on_PAMAP2/pamap_sampled_scaled_segmented.h5"

In [55]:
f = h5py.File(path, 'r')
X = f.get('inputs')
y = f.get('labels') 
print(type(X))
print(type(y))
X = np.array(X)
y = np.array(y)

<class 'h5py._hl.dataset.Dataset'>
<class 'h5py._hl.dataset.Dataset'>


In [56]:
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.3, random_state = 100)

In [57]:
X_train, X_val, y_train, y_val=train_test_split(X_train, y_train, test_size=0.3, random_state = 100)

In [58]:
trainy_one_hot = to_categorical(y_train)
testy_one_hot = to_categorical(y_test)

In [59]:
X_train.shape, X_test.shape

((4370, 128, 39), (2676, 128, 39))