# Synthetic Data Motion Classification (Spencer or Nate?)

## Imports

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd '/content/drive/Shareddrives/209AS MobSec'

/content/drive/Shareddrives/209AS MobSec


In [3]:
import numpy as np
import pandas as pd
import os
import json

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score
#from sklearn.metrics import plot_confusion_matrix

## Helper Functions

In [4]:
def sliding_window(frame_length, step, Xsampleslist, ysampleslist):
    """
    Splits time series in ysampleslist and Xsampleslist
    into segments by applying a sliding overlapping window
    of size equal to frame_length with steps equal to step
    it does this for all the samples and appends all the output together.
    So, the participant distinction is not kept
    Parameters
    ----------
    frame_length : int
        Length of sliding window
    step : int
        Stepsize between windows
    Xsamples : list
        Existing list of window fragments
    ysamples : list
        Existing list of window fragments
    Xsampleslist : list
        Samples to take sliding windows from
    ysampleslist
        Samples to take sliding windows from
    """
    Xsamples = []
    ysamples = []
    for j in range(len(Xsampleslist)):
        X = Xsampleslist[j]
        y = ysampleslist[j]
        for i in range(0, X.shape[0] - frame_length, step):
            xsub = X[i:i + frame_length, :]
            ysub = 0
            for z in range(i,i + frame_length + 1, 1):
              if y[z] == 1:
                ysub = 1
            # ysub = y
            Xsamples.append(xsub)
            ysamples.append(ysub)
    return Xsamples, ysamples

## Data Preprocessing (real)

In [5]:
datadir = os.path.join('/content/drive/Shareddrives/209AS MobSec/Data');
filenames = os.listdir(datadir)
filenames.sort()
datasets = [pd.read_csv(os.path.join(datadir, fn), header=0) for fn in filenames if fn.endswith(".csv")]

datadir = os.path.join('/content/drive/Shareddrives/209AS MobSec/Nathan_Iphone_Data');
filenames = os.listdir(datadir)
filenames.sort()
datasets.extend(pd.read_csv(os.path.join(datadir, fn), header=0) for fn in filenames)
            
columns_to_use = ["accelerometerAccelerationX(G)", "accelerometerAccelerationY(G)", "accelerometerAccelerationZ(G)", "gyroRotationX(rad/s)", "gyroRotationY(rad/s)", "gyroRotationZ(rad/s)"]
columns_to_use2 = ["accelerometerAccelerationX", "accelerometerAccelerationY", "accelerometerAccelerationZ", "gyroRotationX", "gyroRotationY", "gyroRotationZ"] # some of the datasets exclude a unit of measurement

# populate x datasets and y datasets
x_data = []
y_data = []
for i in range(0, len(datasets)):
  try:
    x_data.append(datasets[i][columns_to_use])
    y_data.append(datasets[i]["label(N)"])
  except KeyError:
    x_data.append(datasets[i][columns_to_use2])
    y_data.append(datasets[i]["label"])

# fix naming conventions to use columns_to_use2 for each dataset
for df in x_data:
  df.rename({columns_to_use[i]: columns_to_use2[i] for i in range(len(columns_to_use))}, axis=1, inplace=True)
for df in y_data:
  df.rename("label", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename({columns_to_use[i]: columns_to_use2[i] for i in range(len(columns_to_use))}, axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename({columns_to_use[i]: columns_to_use2[i] for i in range(len(columns_to_use))}, axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename({columns_to_use[i]: columns_to_use2[i] for i in range(len(columns_to_use))}, axis=1, inplace=True)
A value is trying to be set on a copy of a slice 

## Data Preprocessing (synthetic)

In [6]:
# Add Spencer's synthesized data
datasets_s = []
datadir = os.path.join('/content/drive/Shareddrives/209AS MobSec/Spencer_Spoofed_Data')
datasets_s.append(pd.read_csv(os.path.join(datadir + "/Trial_ACTGan", "spencer_GAN_data.csv")))
datasets_s.append(pd.read_csv(os.path.join(datadir + "/Trial_LSTM", "Spencer_LSTM_data.csv")))
datasets_s.append(pd.read_csv(os.path.join(datadir, "spencer_data_100k_phone.csv")))

# Add label column (1=nate, 0=spencer)
for df in datasets_s:
  df["label"] = [0 for i in range(len(df))]

# Add Nate's synthesized data
datasets_n = []
datadir = os.path.join('/content/drive/Shareddrives/209AS MobSec/Nate_Spoofed_Data')
datasets_n.append(pd.read_csv(os.path.join(datadir + "/Trial 2_Data_ACTGan", "GAN_Trial2.csv")))
datasets_n.append(pd.read_csv(os.path.join(datadir + "/Trial 3_Data_LSTM", "data_lstm.csv")))
datasets_n.append(pd.read_csv(os.path.join(datadir, "nathan_data_100k_phone.csv")))

# Add label column (1=nate, 0=spencer)
for df in datasets_n:
  df["label"] = [1 for i in range(len(df))]

datasets_synth = datasets_s + datasets_n
columns_to_use = ["accelerometerAccelerationX(G)", "accelerometerAccelerationY(G)", "accelerometerAccelerationZ(G)", "gyroRotationX(rad/s)", "gyroRotationY(rad/s)", "gyroRotationZ(rad/s)"]
columns_to_use2 = ["accelerometerAccelerationX", "accelerometerAccelerationY", "accelerometerAccelerationZ", "gyroRotationX", "gyroRotationY", "gyroRotationZ"] # some of the datasets exclude a unit of measurement
# populate x datasets and y datasets
x_data_synth = []
y_data_synth = []
for i in range(0, len(datasets_synth)):
  try:
    x_data_synth.append(datasets_synth[i][columns_to_use])
    y_data_synth.append(datasets_synth[i]["label"])
  except KeyError:
    x_data_synth.append(datasets_synth[i][columns_to_use2])
    y_data_synth.append(datasets_synth[i]["label"])

# fix naming conventions to use columns_to_use2 for each dataset
for df in x_data_synth:
  df.rename({columns_to_use[i]: columns_to_use2[i] for i in range(len(columns_to_use))}, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename({columns_to_use[i]: columns_to_use2[i] for i in range(len(columns_to_use))}, axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename({columns_to_use[i]: columns_to_use2[i] for i in range(len(columns_to_use))}, axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename({columns_to_use[i]: columns_to_use2[i] for i in range(len(columns_to_use))}, axis=1, inplace=True)
A value is trying to be set on a copy of a slice 

In [12]:
xall = [np.array(df) for df in x_data]
yall = [np.array(df) for df in y_data]
X_train, X_test, y_train, y_test = train_test_split(xall, yall, test_size=0.20)

# window with 100 samples, each sample has 6 measurements (acc x,y,z & gyro x,y,z), for 600 data points per window
window_size = 100
x_tr, y_tr = sliding_window(window_size, 1, X_train, y_train)
x_te, y_te = sliding_window(window_size, 1, X_test, y_test)

x_tr = np.reshape(x_tr, (-1, window_size*6))
x_te = np.reshape(x_te, (-1, window_size*6))

In [8]:
xall_synth = [np.array(df) for df in x_data_synth]
yall_synth = [np.array(df) for df in y_data_synth]
X_train_synth, X_test_synth, y_train_synth, y_test_synth = train_test_split(xall_synth, yall_synth, test_size=0.20)

# window with 100 samples, each sample has 6 measurements (acc x,y,z & gyro x,y,z), for 600 data points per window
window_size = 100
x_tr_s, y_tr_s = sliding_window(window_size, 1, X_train_synth, y_train_synth)
x_te_s, y_te_s = sliding_window(window_size, 1, X_test_synth, y_test_synth)

x_tr_s = np.reshape(x_tr_s, (-1, window_size*6))
x_te_s = np.reshape(x_te_s, (-1, window_size*6))

## Model Selection and Fitting

In [13]:
# model = BaggingClassifier(base_estimator=LinearSVC(),n_estimators=10)
# model = KNeighborsClassifier()
# model = GaussianNB()
model = RandomForestClassifier(max_depth=10)
# model = DecisionTreeClassifier()
# model = LogisticRegression(max_iter=1000)
# model = LinearSVC()

model.fit(x_tr, y_tr)

## Model Predictions

In [14]:
predictions = model.predict(x_te_s)
cm = confusion_matrix(y_te_s, predictions)
display(cm)

accuracy = accuracy_score(y_te_s, predictions)
precision = precision_score(y_te_s, predictions)
recall = recall_score(y_te_s, predictions)
print('Accuracy = {:0.5f}, Precision = {:0.5f}, Recall = {:0.5f}'.format(accuracy,precision,recall))

array([[43378,    87],
       [ 3589, 25741]])

Accuracy = 0.94950, Precision = 0.99663, Recall = 0.87763


BaggingClassifier: Accuracy = 0.78143, Precision = 0.78328, Recall = 0.77816

KNN: Accuracy = 0.66092, Precision = 0.68496, Recall = 0.59592

GaussianNB: Accuracy = 0.95143, Precision = 1.00000, Recall = 0.90286

Random Forest: Accuracy = 0.51918, Precision = 0.50978, Recall = 1.00000

Decision Tree: Accuracy = 0.59980, Precision = 0.58884, Recall = 0.66143

Logistic Regression: Accuracy = 0.86714, Precision = 0.80773, Recall = 0.96367

Linear SVC: Accuracy = 0.72316, Precision = 0.68556, Recall = 0.82449