# Spoofed or Real Data Classification

## Imports

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd '/content/drive/Shareddrives/209AS MobSec'

/content/drive/Shareddrives/209AS MobSec


In [3]:
import numpy as np
import pandas as pd
import os
import json

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score
#from sklearn.metrics import plot_confusion_matrix

## Helper Functions

In [4]:
def sliding_window(frame_length, step, Xsampleslist, ysampleslist):
    """
    Splits time series in ysampleslist and Xsampleslist
    into segments by applying a sliding overlapping window
    of size equal to frame_length with steps equal to step
    it does this for all the samples and appends all the output together.
    So, the participant distinction is not kept
    Parameters
    ----------
    frame_length : int
        Length of sliding window
    step : int
        Stepsize between windows
    Xsamples : list
        Existing list of window fragments
    ysamples : list
        Existing list of window fragments
    Xsampleslist : list
        Samples to take sliding windows from
    ysampleslist
        Samples to take sliding windows from
    """
    Xsamples = []
    ysamples = []
    for j in range(len(Xsampleslist)):
        X = Xsampleslist[j]
        y = ysampleslist[j]
        for i in range(0, X.shape[0] - frame_length, step):
            xsub = X[i:i + frame_length, :]
            ysub = 0
            for z in range(i,i + frame_length + 1, 1):
              if y[z] == 1:
                ysub = 1
            # ysub = y
            Xsamples.append(xsub)
            ysamples.append(ysub)
    return Xsamples, ysamples

## Data Preprocessing

In [5]:
datasets = []
# Add Spencer's synthesized data
datadir = os.path.join('/content/drive/Shareddrives/209AS MobSec/Spencer_Spoofed_Data')
datasets.append(pd.read_csv(os.path.join(datadir + "/Trial_ACTGan", "spencer_GAN_data.csv")))
datasets.append(pd.read_csv(os.path.join(datadir + "/Trial_LSTM", "Spencer_LSTM_data.csv")))
datasets.append(pd.read_csv(os.path.join(datadir, "spencer_data_100k_phone.csv")))
# Add Nate's synthesized data
datadir = os.path.join('/content/drive/Shareddrives/209AS MobSec/Nate_Spoofed_Data')
datasets.append(pd.read_csv(os.path.join(datadir + "/Trial 2_Data_ACTGan", "GAN_Trial2.csv")))
datasets.append(pd.read_csv(os.path.join(datadir + "/Trial 3_Data_LSTM", "data_lstm.csv")))
datasets.append(pd.read_csv(os.path.join(datadir, "nathan_data_100k_phone.csv")))
# Add synthesized column (1=synthesized, 0=real)
for df in datasets:
  df["synthesized"] = [1 for i in range(len(df))]

columns_to_use = ["accelerometerAccelerationX(G)", "accelerometerAccelerationY(G)", "accelerometerAccelerationZ(G)", "gyroRotationX(rad/s)", "gyroRotationY(rad/s)", "gyroRotationZ(rad/s)"]
columns_to_use2 = ["accelerometerAccelerationX", "accelerometerAccelerationY", "accelerometerAccelerationZ", "gyroRotationX", "gyroRotationY", "gyroRotationZ"] # some of the datasets exclude a unit of measurement
# populate x datasets and y datasets
x_data = []
y_data = []
for i in range(0, len(datasets)):
  try:
    x_data.append(datasets[i][columns_to_use])
    y_data.append(datasets[i]["synthesized"])
  except KeyError:
    x_data.append(datasets[i][columns_to_use2])
    y_data.append(datasets[i]["synthesized"])

# Add real data
datasets_real = []
datadir = os.path.join('/content/drive/Shareddrives/209AS MobSec/Data');
filenames = os.listdir(datadir)
filenames.sort()
datasets_real = [pd.read_csv(os.path.join(datadir, fn), header=0) for fn in filenames if fn.endswith(".csv")]

datadir = os.path.join('/content/drive/Shareddrives/209AS MobSec/Nathan_Iphone_Data');
filenames = os.listdir(datadir)
filenames.sort()
datasets_real.extend(pd.read_csv(os.path.join(datadir, fn), header=0) for fn in filenames)

# Add synthesized column (1=synthesized, 0=real)
for df in datasets_real:
  df["synthesized"] = [0 for i in range(len(df))]

# populate x datasets and y datasets
x_data_real = []
y_data_real = []
for i in range(0, len(datasets_real)):
  try:
    x_data_real.append(datasets_real[i][columns_to_use])
    y_data_real.append(datasets_real[i]["synthesized"])
  except KeyError:
    x_data_real.append(datasets_real[i][columns_to_use2])
    y_data_real.append(datasets_real[i]["synthesized"])

# fix naming conventions to use columns_to_use2 for each dataset
for df in x_data_real:
  df.rename({columns_to_use[i]: columns_to_use2[i] for i in range(len(columns_to_use))}, axis=1, inplace=True)
for df in x_data:
  df.rename({columns_to_use[i]: columns_to_use2[i] for i in range(len(columns_to_use))}, axis=1, inplace=True)

x_data.extend(x_data_real)
y_data.extend(y_data_real)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename({columns_to_use[i]: columns_to_use2[i] for i in range(len(columns_to_use))}, axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename({columns_to_use[i]: columns_to_use2[i] for i in range(len(columns_to_use))}, axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename({columns_to_use[i]: columns_to_use2[i] for i in range(len(columns_to_use))}, axis=1, inplace=True)
A value is trying to be set on a copy of a slice 

In [6]:
s_0 = 0
s_1 = 0
for df in y_data:
  s_0 += (df == 0).sum()
  s_1 += (df == 1).sum()
print(f"Real Samples (0): {s_0}, Synthesized Samples (1): {s_1}")

Real Samples (0): 345375, Synthesized Samples (1): 220000


In [7]:
xall = [np.array(df) for df in x_data]
yall = [np.array(df) for df in y_data]
X_train, X_test, y_train, y_test = train_test_split(xall, yall, test_size=0.20)

# window with 100 samples, each sample has 6 measurements (acc x,y,z & gyro x,y,z), for 600 data points per window
window_size = 100
x_tr, y_tr = sliding_window(window_size, 1, X_train, y_train)
x_te, y_te = sliding_window(window_size, 1, X_test, y_test)

x_tr = np.reshape(x_tr, (-1, window_size*6))
x_te = np.reshape(x_te, (-1, window_size*6))

## Model Selection and Fitting

In [8]:
# model = BaggingClassifier(base_estimator=LinearSVC(),n_estimators=10)
# model = KNeighborsClassifier()
# model = GaussianNB()
# model = RandomForestClassifier(max_depth=10)
# model = DecisionTreeClassifier()
# model = LogisticRegression(max_iter=1000)
model = LinearSVC()

model.fit(x_tr, y_tr)



## Model Predictions

In [9]:
predictions = model.predict(x_te)
cm = confusion_matrix(y_te, predictions)
display(cm)

accuracy = accuracy_score(y_te, predictions)
precision = precision_score(y_te, predictions)
recall = recall_score(y_te, predictions)
print('Accuracy = {:0.5f}, Precision = {:0.5f}, Recall = {:0.5f}'.format(accuracy,precision,recall))

array([[39458, 23312],
       [ 3242,  1658]])

Accuracy = 0.60760, Precision = 0.06640, Recall = 0.33837


BaggingClassifier: Accuracy = 0.53753, Precision = 0.04040, Recall = 0.20020

KNN: Accuracy = 0.91675, Precision = 0.00000, Recall = 0.00000

GaussianNB: Accuracy = 0.40183, Precision = 0.09804, Recall = 0.75429

Random Forest: Accuracy = 0.94682, Precision = 0.66851, Recall = 0.71653

Decision Tree: Accuracy = 0.85453, Precision = 0.33424, Recall = 0.75347

Logistic Regression: Accuracy = 0.55707, Precision = 0.06275, Recall = 0.31000

Linear SVC: Accuracy = 0.60760, Precision = 0.06640, Recall = 0.33837