### Import packages

In [1]:
import os
import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm
import glob
import time
import pandas as pd
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

Check if have cuda device (GPU)

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print("use", device)

use cuda


### ML model

In [3]:
class Model(nn.Module):
	def __init__(self, n_input = 90, n_hidden = 128, num_layers=1, n_classes = 7):
		super(Model, self).__init__()
			
		self.rnn = nn.LSTM(input_size=n_input, hidden_size=n_hidden, num_layers=num_layers, batch_first=True, bidirectional=False)
		self.classifier = nn.Sequential(
			nn.Linear(n_hidden, n_classes)
		)

	def forward(self, x):
		# (batch_size, n_steps, n_input)
		x, _ = self.rnn(x)
		x = self.classifier(x[:, -1, :])
		return x

### Dataset

Prepare dataset for training

In [4]:
class CSIDataset(Dataset):
	def __init__(
			self, 
			set, # fns for this split
			processed_path="./Dataset/Processed/", 
			use_processed=True, 
			split="train",
			window_size=1000, 
			threshold=60, 
			slide_size=200
			):
		self.window_size = window_size
		self.threshold = threshold
		self.slide_size = slide_size
		# fix seed for reproduce
		# np.random.seed(0)

		# create processed data directory
		if not os.path.isdir(os.path.join(processed_path)):
			os.makedirs(os.path.join(processed_path))

		# read and concat all data
		self.csi = []
		self.label = []
		for i, label in enumerate (["bed", "fall", "pickup", "run", "sitdown", "standup", "walk"]):
			output_csi_fn = os.path.join(processed_path, split + "_" + label + "_csi.csv")
			output_label_fn = os.path.join(processed_path, split + "_" + label + "_label.csv")
			# use processed, don't need to process again
			if use_processed and os.path.isfile(output_csi_fn) and os.path.isfile(output_label_fn):
				x = pd.read_csv(
					output_csi_fn, 
					header=None, 
					engine="c"
					).to_numpy().reshape((-1, self.window_size, 90))
				y = pd.read_csv(
					output_label_fn, 
					header=None, 
					engine="c"
					).to_numpy()
			else:
				# else, process now
				print("No processed", label, "found, start process")
				x = []
				y = []
				input_csv_files = set[label]['csi']
				annotation_csv_files = set[label]['label']
				for i in range(len(input_csv_files)):
					if not int(input_csv_files[i].split('input_')[1].split('.csv')[0].split('_')[-1]) == int(annotation_csv_files[i].split('annotation_')[1].split('.csv')[0].split('_')[-1]):
						# file name of input and annotation(label) not match
						raise "error"
					else:
						x_, y_ = self.process_data(input_csv_files[i], annotation_csv_files[i])
						x.append(x_)
						y.append(y_)
				x = np.concatenate(x, axis=0)
				y = np.concatenate(y, axis=0)

				# shuffle x and y together
				# idxs = np.arange(x.shape[0])
				# np.random.shuffle(idxs)
				# x = x[idxs]
				# y = y[idxs]

				# save to file for the future use
				np.savetxt(output_csi_fn, x.reshape((-1, self.window_size * 90)), delimiter = ",", fmt='%.4f')
				np.savetxt(output_label_fn, y, delimiter = ",", fmt='%d')
			self.csi.append(x)
			self.label.append(y)
			print("finish", label)
		self.csi = np.concatenate(self.csi, axis=0)
		self.label = np.concatenate(self.label, axis=0)
		self.label = np.squeeze(self.label)
		# some statistic information
		unique, counts = np.unique(self.label, return_counts=True)
		print(dict(zip(unique, counts)))

	def process_data(self, csi_fn, label_fn):
		csi_raw = pd.read_csv(
			csi_fn, 
			header=None, 
			engine="c"
			).to_numpy()
		#data import by slide window
		k = 0
		x_list = []
		while k <= (len(csi_raw) + 1 - 2 * self.window_size):
			# for each row
			# 0: time
			# 1~91: amplitude
			# 91: 181: phase
			single_window = np.array(csi_raw[k:k + self.window_size, 1:91])
			x_list.append(single_window)
			k += self.slide_size
		x = np.dstack(x_list)
		# (window_size, feature, # samples)
		x = np.transpose(x, [2, 0, 1])

		label_raw = pd.read_csv(
			label_fn, 
			header=None, 
			engine="c"
			).to_numpy()
		#data import by slide window
		k = 0
		y_list = []
		while k <= (len(label_raw) + 1 - 2 * self.window_size):
			single_window = label_raw[k:k + self.window_size]
			bed = 0
			fall = 0
			walk = 0
			pickup = 0
			run = 0
			sitdown = 0
			standup = 0
			noactivity = 0
			for j in range(self.window_size):
				if single_window[j] == "bed":
					bed += 1
				elif single_window[j] == "fall":
					fall += 1
				elif single_window[j] == "pickup":
					pickup += 1
				elif single_window[j] == "run":
					run += 1
				elif single_window[j] == "sitdown":
					sitdown += 1
				elif single_window[j] == "standup":
					standup += 1
				elif single_window[j] == "walk":
					walk += 1
				else:
					noactivity += 1

			if bed > self.window_size * self.threshold / 100:
				y_list.append(0)
			elif fall > self.window_size * self.threshold / 100:
				y_list.append(1)
			elif pickup > self.window_size * self.threshold / 100:
				y_list.append(2)
			elif run > self.window_size * self.threshold / 100:
				y_list.append(3)
			elif sitdown > self.window_size * self.threshold / 100:
				y_list.append(4)
			elif standup > self.window_size * self.threshold / 100:
				y_list.append(5)
			elif walk > self.window_size * self.threshold / 100:
				y_list.append(6)
			else:
				y_list.append(-1)
			k += self.slide_size
		y = np.array(y_list)

		# remove no activity
		mask = y != -1
		x = x[mask]
		y = y[mask]
		return x, y

	def __len__(self):
		return self.csi.shape[0]
	
	def __getitem__(self, idx):
		return torch.tensor(self.csi[idx], dtype=torch.float), \
			torch.tensor(self.label[idx], dtype=torch.long)

### Configs

In [5]:
# Parameters
input_path = "./Dataset/Data/"
processed_path = "./Dataset/Processed/"
use_processed = False
window_size = 500
threshold = 40
slide_size = 400

learning_rate = 0.00001
training_epos = 2000
batch_size = 128
display_epo = 50

model = Model()

### Split train/valid/test file-wise

In [6]:
train_ratio = 0.7
valid_ratio = 0.2
test_ratio = 0.1

train_set_fns = {}
valid_set_fns = {}
test_set_fns = {}

def shuffle_and_split(csi_files, label_files, train_ratio, valid_ratio, test_ratio):
	num_files = len(csi_files)
	idxs = np.arange(num_files)
	np.random.shuffle(idxs)
	idx1 = int(num_files * train_ratio)
	idx2 = int(num_files * (train_ratio + valid_ratio))
	train_csi_fns = csi_files[:idx1]
	train_label_fns = label_files[:idx1]
	valid_csi_fns = csi_files[idx1:idx2]
	valid_label_fns = label_files[idx1:idx2]
	test_csi_fns = csi_files[idx2:]
	test_label_fns = label_files[idx2:]
	return train_csi_fns, valid_csi_fns, test_csi_fns, \
		train_label_fns, valid_label_fns, test_label_fns

for i, label in enumerate (["bed", "fall", "pickup", "run", "sitdown", "standup", "walk"]):
	train_set_fns[label] = {'csi': [], 'label': []}
	valid_set_fns[label] = {'csi': [], 'label': []}
	test_set_fns[label] = {'csi': [], 'label': []}
	filepath1 = os.path.join(input_path, "input_*" + str(label) + "*.csv")
	filepath2 = os.path.join(input_path, "annotation_*" + str(label) + "*.csv")
	input_csv_files = sorted(glob.glob(filepath1))
	annotation_csv_files = sorted(glob.glob(filepath2))
	# Because naming in the data are inconsistent, we process three part separately.
	input_csv_files_sankalp = [fn for fn in input_csv_files if "sankalp" in fn]
	annotation_csv_files_sankalp = [fn for fn in annotation_csv_files if "sankalp" in fn]
	train_csi_fns, valid_csi_fns, test_csi_fns, train_label_fns, valid_label_fns, test_label_fns \
		= shuffle_and_split(input_csv_files_sankalp, annotation_csv_files_sankalp, train_ratio, valid_ratio, test_ratio)
	train_set_fns[label]['csi'] += train_csi_fns
	train_set_fns[label]['label'] += train_label_fns
	valid_set_fns[label]['csi'] += valid_csi_fns
	valid_set_fns[label]['label'] += valid_label_fns
	test_set_fns[label]['csi'] += test_csi_fns
	test_set_fns[label]['label'] += test_label_fns
	input_csv_files_siamak = [fn for fn in input_csv_files if "siamak" in fn]
	annotation_csv_files_siamak = [fn for fn in annotation_csv_files if "siamak" in fn]
	train_csi_fns, valid_csi_fns, test_csi_fns, train_label_fns, valid_label_fns, test_label_fns \
		= shuffle_and_split(input_csv_files_siamak, annotation_csv_files_siamak, train_ratio, valid_ratio, test_ratio)
	train_set_fns[label]['csi'] += train_csi_fns
	train_set_fns[label]['label'] += train_label_fns
	valid_set_fns[label]['csi'] += valid_csi_fns
	valid_set_fns[label]['label'] += valid_label_fns
	test_set_fns[label]['csi'] += test_csi_fns
	test_set_fns[label]['label'] += test_label_fns
	input_csv_files_date = [fn for fn in input_csv_files if ("siamak" not in fn and "sankalp" not in fn)]
	annotation_csv_files_date = [fn for fn in annotation_csv_files if ("siamak" not in fn and "sankalp" not in fn)]
	train_csi_fns, valid_csi_fns, test_csi_fns, train_label_fns, valid_label_fns, test_label_fns \
		= shuffle_and_split(input_csv_files_date, annotation_csv_files_date, train_ratio, valid_ratio, test_ratio)
	train_set_fns[label]['csi'] += train_csi_fns
	train_set_fns[label]['label'] += train_label_fns
	valid_set_fns[label]['csi'] += valid_csi_fns
	valid_set_fns[label]['label'] += valid_label_fns
	test_set_fns[label]['csi'] += test_csi_fns
	test_set_fns[label]['label'] += test_label_fns
	# print(label)
	# print("csi  : train:", len(train_set[label]['csi']), ", valid:", len(valid_set[label]['csi']), ", test:", len(test_set[label]['csi']))
	# print("label: train:", len(train_set[label]['label']), ", valid:", len(valid_set[label]['label']), ", test:", len(test_set[label]['label']))
# print

Dataset instance

training set and testing set

In [7]:
train_set = CSIDataset(
split="train", 
window_size=window_size, 
threshold=threshold, 
slide_size=slide_size, 
use_processed=use_processed, 
set=train_set_fns,
processed_path=processed_path
)
valid_set = CSIDataset(
split="valid", 
window_size=window_size, 
threshold=threshold, 
slide_size=slide_size, 
use_processed=use_processed, 
set=valid_set_fns,
processed_path=processed_path
)
test_set = CSIDataset(
split="test", 
window_size=window_size, 
threshold=threshold, 
slide_size=slide_size, 
use_processed=use_processed, 
set=test_set_fns,
processed_path=processed_path
)

No processed bed found, start process


finish bed
No processed fall found, start process
finish fall
No processed pickup found, start process
finish pickup
No processed run found, start process
finish run
No processed sitdown found, start process
finish sitdown
No processed standup found, start process
finish standup
No processed walk found, start process
finish walk
{0: 484, 1: 351, 2: 355, 3: 823, 4: 348, 5: 252, 6: 1066}
No processed bed found, start process
finish bed
No processed fall found, start process
finish fall
No processed pickup found, start process
finish pickup
No processed run found, start process
finish run
No processed sitdown found, start process
finish sitdown
No processed standup found, start process
finish standup
No processed walk found, start process
finish walk
{0: 150, 1: 101, 2: 117, 3: 278, 4: 79, 5: 70, 6: 280}
No processed bed found, start process
finish bed
No processed fall found, start process
finish fall
No processed pickup found, start process
finish pickup
No processed run found, start pr

### Dataloader, loss function and optimizer

In [8]:
model.to(device)
train_dataloader = DataLoader(train_set, batch_size=batch_size, num_workers=0, shuffle=True)
valid_dataloader = DataLoader(valid_set, batch_size=batch_size, num_workers=0, shuffle=False)
test_dataloader = DataLoader(test_set, batch_size=batch_size, num_workers=0, shuffle=False)

criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


### Training main loop

In [9]:
train_losses = []
valid_losses = []
for epoch_index in range(0, training_epos):
    start_time = time.time()

    train_loss = 0
    model.train()
    correct = 0
    for batch in tqdm(train_dataloader, leave=False):
        csi, label = batch
        csi = csi.to(device)
        label = label.to(device)

        outputs = model(csi)
        loss = criterion(outputs, label)
        # calculate prediction accuracy
        correct += torch.sum(outputs.argmax(dim=1) == label).cpu().item()

        train_loss += loss.item()
        loss.backward()
        optimizer.step()

    train_loss = train_loss / len(train_set)
    train_accuracy = 100 * correct / len(train_set)
    train_losses.append(train_loss)

    valid_loss = 0
    model.eval()
    correct = 0
    for batch in tqdm(valid_dataloader, leave=False):
        csi, label = batch
        csi = csi.to(device)
        label = label.to(device)

        outputs = model(csi)
        loss = criterion(outputs, label)
        # calculate prediction accuracy
        correct += torch.sum(outputs.argmax(dim=1) == label).cpu().item()

        valid_loss += loss.item()

    valid_loss = valid_loss / len(valid_set)
    valid_accuracy = 100 * correct / len(valid_set)
    valid_losses.append(valid_loss)

    
    end_time = time.time()
    if epoch_index % display_epo == 0:
        print('epo: ', epoch_index, 
            ', train:', train_loss, ', acc: ', round(train_accuracy, 2),
            ', valid:', valid_loss, ', acc: ', round(valid_accuracy, 2),
            ', time:', round((end_time - start_time) / 60, 3))


                                               

epo:  900 , train: 0.002572270041900214 , acc:  91.25 , valid: 0.01251925321512444 , acc:  62.05 , time: 0.01


                                               

epo:  950 , train: 0.002409834107908097 , acc:  91.36 , valid: 0.01252647599508596 , acc:  61.02 , time: 0.009


                                               

epo:  1000 , train: 0.002152598347311339 , acc:  92.28 , valid: 0.012333669246629228 , acc:  62.6 , time: 0.011


                                               

epo:  1050 , train: 0.002081197928753713 , acc:  92.69 , valid: 0.012297694655351861 , acc:  62.51 , time: 0.012


                                               

epo:  1100 , train: 0.0018946209309314314 , acc:  93.5 , valid: 0.012024847074996594 , acc:  63.35 , time: 0.011


                                               

epo:  1150 , train: 0.0017998462062421199 , acc:  93.56 , valid: 0.011993593781493431 , acc:  64.47 , time: 0.011


                                               

epo:  1200 , train: 0.0016566261337256426 , acc:  94.4 , valid: 0.011958065199297528 , acc:  64.65 , time: 0.011


                                               

epo:  1250 , train: 0.0016007170154044537 , acc:  94.43 , valid: 0.012135524929955949 , acc:  65.49 , time: 0.011


                                               

epo:  1300 , train: 0.0014804769778225726 , acc:  94.75 , valid: 0.012064341833425123 , acc:  65.02 , time: 0.011


                                               

epo:  1350 , train: 0.0014004166576971856 , acc:  95.08 , valid: 0.012245115513025328 , acc:  63.53 , time: 0.011


                                               

epo:  1400 , train: 0.0013160739852443083 , acc:  95.35 , valid: 0.012600038689236308 , acc:  64.09 , time: 0.011


                                               

epo:  1450 , train: 0.001239715704502506 , acc:  95.68 , valid: 0.012612746870794962 , acc:  64.56 , time: 0.011


                                               

epo:  1500 , train: 0.001177699605094638 , acc:  95.9 , valid: 0.012914636620255405 , acc:  63.44 , time: 0.011


                                               

epo:  1550 , train: 0.0011155155203076087 , acc:  96.0 , valid: 0.012892038849897163 , acc:  64.84 , time: 0.013


                                               

epo:  1600 , train: 0.0010391216078070916 , acc:  96.28 , valid: 0.013300603822220203 , acc:  64.19 , time: 0.012


                                               

epo:  1650 , train: 0.000980447769189341 , acc:  96.71 , valid: 0.013145226545112078 , acc:  64.28 , time: 0.01


                                               

epo:  1700 , train: 0.0009349568033872917 , acc:  96.87 , valid: 0.013394672482512719 , acc:  63.53 , time: 0.01


                                               

epo:  1750 , train: 0.0008863368661191743 , acc:  97.12 , valid: 0.012970451975977699 , acc:  65.86 , time: 0.01


                                               

epo:  1800 , train: 0.0008382629537329656 , acc:  97.17 , valid: 0.013400132046189419 , acc:  63.63 , time: 0.01


                                               

epo:  1850 , train: 0.0007888513198767903 , acc:  97.47 , valid: 0.013554447390312372 , acc:  63.91 , time: 0.01


                                               

epo:  1900 , train: 0.0007500155664601706 , acc:  97.64 , valid: 0.013882947882940602 , acc:  64.09 , time: 0.01


                                               

epo:  1950 , train: 0.0007095190737681015 , acc:  97.83 , valid: 0.01385824716368387 , acc:  64.28 , time: 0.009


                                               

### Testing (on testing set)

In [12]:
test_label = []
test_prediction = []
test_loss = 0
model.eval()
start_time = time.time()
for batch in tqdm(test_dataloader, leave=False):
    csi, label = batch
    csi = csi.to(device)
    label = label.to(device)

    outputs = model(csi)
    loss = criterion(outputs, label)
    test_prediction.append(outputs.argmax(dim=1).cpu().detach().numpy())
    test_label.append(label.cpu().detach().numpy())

    test_loss += loss.item()
end_time = time.time()
test_label = np.concatenate(test_label, axis=0)
test_prediction = np.concatenate(test_prediction, axis=0)
accuracy = np.sum(test_label == test_prediction) / test_label.shape[0]
print('test:', test_loss / len(test_set),
    ', acc: ', round(accuracy, 2),
    ', time:', round((end_time - start_time) / 60, 3))
confusion_matrix = np.zeros((7, 7))
for i in range(test_label.shape[0]):
    confusion_matrix[test_label[i], test_prediction[i]] += 1

                                             

test: 0.01769778216784855 , acc:  0.54 , time: 0.005




### Confusion matrix

In [13]:
print(confusion_matrix)

[[ 56.   0.  12.   2.   3.   2.   3.]
 [  0.  34.   0.  19.   0.   0.   0.]
 [ 17.  10.   8.  15.   0.  10.  14.]
 [  0.  12.   8. 133.   0.   3.  13.]
 [ 26.   1.   1.   0.  12.   4.   2.]
 [  7.   4.   1.   0.   0.  21.   3.]
 [ 14.   4.   1.  58.  12.  12.  79.]]
