<a href="https://colab.research.google.com/github/sadhanasharma26/Intrusion-Detection-Using-GPS-Spoofing/blob/main/fake_gps_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ML-based Detection of Illegal GPS Spoofing using XGBoost Classifier Model

In [56]:


import numpy as np
import pandas as pd


import os
# Print the full path of every file found under the Kaggle input directory
# (prints nothing when that directory does not exist).
input_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)


In [57]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import seaborn as sns
import missingno as msno
import folium

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate, train_test_split, StratifiedKFold
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
from sklearn.metrics import roc_curve, auc
from xgboost import XGBClassifier

!pip -q install utm
import utm

%config InlineBackend.figure_format = 'retina'


# 1. Read training data

In [58]:
# Load the raw training data (one row per GPS ping) from the Colab working directory.
train = pd.read_csv('/content/train.csv')

# Display the frame as the cell output.
train

Unnamed: 0,order_id,service_type,driver_status,date,hour,seconds,latitude,longitude,altitude_in_meters,accuracy_in_meters,label
0,RB193,GO_RIDE,UNAVAILABLE,2018-02-05,6,1548890667,-6.922910,107.631301,,23.027,0
1,RB193,GO_RIDE,AVAILABLE,2018-02-05,6,1548890680,-6.923039,107.631250,712.000000,9.577,0
2,RB193,GO_RIDE,AVAILABLE,2018-02-05,6,1548890690,-6.923039,107.631250,712.000000,9.577,0
3,RB193,GO_RIDE,AVAILABLE,2018-02-05,6,1548890700,-6.923048,107.631230,713.000000,8.139,0
4,RB193,GO_RIDE,AVAILABLE,2018-02-05,6,1548890710,-6.922968,107.631253,713.000000,7.029,0
...,...,...,...,...,...,...,...,...,...,...,...
567540,RB261,GO_RIDE,OTW_DROPOFF,2018-03-22,13,1552803198,-6.889018,107.595516,798.799988,21.381,1
567541,RB261,GO_RIDE,OTW_DROPOFF,2018-03-22,13,1552803208,-6.889020,107.595516,798.799988,19.621,1
567542,RB261,GO_RIDE,OTW_DROPOFF,2018-03-22,13,1552803218,-6.889017,107.595526,798.799988,18.012,1
567543,RB261,GO_RIDE,OTW_DROPOFF,2018-03-22,13,1552803228,-6.889017,107.595526,798.799988,18.012,1


**Features description:**

* order_id - an anonymous id unique to a given order number
* service_type - service type, either GO_RIDE or GO_FOOD
* driver_status - status of the driver PING, can be AVAILABLE, UNAVAILABLE, OTW_PICKUP, OTW_DROPOFF
* hour - hour
* seconds - Unix timestamp (seconds since the epoch)
* latitude - GPS latitude
* longitude - GPS longitude
* altitude_in_meters - GPS Altitude
* accuracy_in_meters - GPS Accuracy, the smaller the more accurate

**Target:**

label - label describing whether GPS is true (1) or fake (0)

In [59]:
# Summarise column dtypes and non-null counts to spot columns with missing data.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 567545 entries, 0 to 567544
Data columns (total 11 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   order_id            567545 non-null  object 
 1   service_type        567545 non-null  object 
 2   driver_status       567545 non-null  object 
 3   date                567545 non-null  object 
 4   hour                567545 non-null  int64  
 5   seconds             567545 non-null  int64  
 6   latitude            567545 non-null  float64
 7   longitude           567545 non-null  float64
 8   altitude_in_meters  413142 non-null  float64
 9   accuracy_in_meters  567545 non-null  float64
 10  label               567545 non-null  int64  
dtypes: float64(4), int64(3), object(4)
memory usage: 47.6+ MB


In [60]:
# Count missing values per column.
train.isnull().sum()

Unnamed: 0,0
order_id,0
service_type,0
driver_status,0
date,0
hour,0
seconds,0
latitude,0
longitude,0
altitude_in_meters,154403
accuracy_in_meters,0


In [61]:
# Count, per column, how many cells are exactly zero.
train.isin([0]).sum()

Unnamed: 0,0
order_id,0
service_type,0
driver_status,0
date,0
hour,7233
seconds,0
latitude,0
longitude,0
altitude_in_meters,0
accuracy_in_meters,0


In [62]:
from datetime import datetime

# Decode the Unix timestamps (seconds since the epoch, UTC) in one vectorized
# call instead of a Python-level loop over every row.
train['linux_date'] = pd.to_datetime(train['seconds'], unit='s')
train['date'] = pd.to_datetime(train['date'])

# Compare calendar days only: normalize() resets the time of day to midnight so
# both sides are datetime64 values. Prints True only if every row agrees.
dates_match = train['linux_date'].dt.normalize() == train['date']
print(dates_match.all())

False


# 2. Data exploration

In [None]:
# Bar chart of GPS accuracy by driver status, coloured by label, one panel per service type.
sns.catplot(data=train, x='driver_status', y='accuracy_in_meters', hue='label', col='service_type', kind='bar')

In [None]:
def plot_folium(df, order_id, lat_column, lon_column, location, zoom_start=10):
  """Display a folium map with one circle marker per ping of a single order.

  Args:
    df: DataFrame with `order_id`, `driver_status` and the two coordinate columns.
    order_id: value of `order_id` whose rows are plotted.
    lat_column: name of the latitude column.
    lon_column: name of the longitude column.
    location: [lat, lon] pair passed to `folium.Map` as the map location.
    zoom_start: initial zoom level passed to `folium.Map`.

  Returns:
    None. The map is rendered with `display`.
  """
  # Marker colour per driver status. A status outside this table gets a neutral
  # colour instead of raising NameError or reusing the previous row's colour.
  status_colors = {
      'UNAVAILABLE': 'green',
      'AVAILABLE': 'red',
      'OTW_PICKUP': 'black',
      'OTW_DROPOFF': 'blue',
  }
  fallback_color = 'gray'

  order_pings = df[df.order_id == order_id]

  my_map = folium.Map(location=location, zoom_start=zoom_start)

  # Iterate only over the three columns that are needed instead of building a
  # full row Series per ping.
  for status, lat, lon in zip(order_pings['driver_status'],
                              order_pings[lat_column],
                              order_pings[lon_column]):
    folium.CircleMarker([lat, lon],
                        radius=5, color=status_colors.get(status, fallback_color),
                        fill=True).add_to(my_map)

  display(my_map)

In [None]:
# Map the pings of order RB193, with the map located at [-6.920, 107.630].
plot_folium(train, 'RB193', 'latitude', 'longitude', [-6.920, 107.630], zoom_start=16)

In [None]:
# Map the pings of order F842, with the map located at [-6.920, 107.670].
plot_folium(train, 'F842', 'latitude', 'longitude', [-6.920, 107.670], zoom_start=14)

# 3. Feature engineering

In [None]:
# Change of each raw signal relative to the previous row of the same order.
# Any undefined difference (first row of an order, or a missing value on either
# side) is replaced by 0.
# NOTE(review): assumes rows are already in time order within each order - confirm.
diff_sources = {
    'longitude_diff': 'longitude',
    'latitude_diff': 'latitude',
    'seconds_diff': 'seconds',
    'accuracy_diff': 'accuracy_in_meters',
    'altitude_diff': 'altitude_in_meters',
}
for diff_column, source_column in diff_sources.items():
  train[diff_column] = train.groupby('order_id')[source_column].diff().fillna(0)

train

In [None]:
# Convert latitude/longitude to UTM coordinates; the first two items of the
# result are stored as the UTMX and UTMY columns.
utm_coords = utm.from_latlon(train['latitude'].values, train['longitude'].values)

train['UTMX'], train['UTMY'] = utm_coords[0], utm_coords[1]

train

In [None]:
def distance(x_dif, y_dif):
  """Return the Euclidean length sqrt(x_dif**2 + y_dif**2), element-wise for array-like inputs."""
  squared_length = x_dif**2 + y_dif**2
  return np.sqrt(squared_length)

In [None]:
# Step between consecutive rows of the same order along each UTM axis
# (0 where there is no previous row), then the straight-line step length.
for utm_axis in ('UTMX', 'UTMY'):
  train[utm_axis + '_diff'] = train.groupby('order_id')[utm_axis].diff().fillna(0)

train['distance'] = distance(train['UTMX_diff'], train['UTMY_diff'])

train['distance']

In [None]:
# One row per order: the maximum service_type and the maximum label over the
# order's rows (so an order gets label 1 if any of its rows has label 1).
df_grouped1 = train.groupby('order_id')[['service_type', 'label']].max()

df_grouped1

In [None]:
# Share of orders per label value, as a pie chart with percentages.
df_grouped1.label.value_counts().plot.pie(autopct='%.2f %%')

In [None]:

# Per order, derive two timing features from the `seconds` column:
#   avail_sec_diff  - seconds between the first two AVAILABLE rows
#   pickup_sec_diff - seconds between the first and last OTW_PICKUP rows
# Both are NaN when the order has fewer than two AVAILABLE rows or no
# OTW_PICKUP row.
avail_diffs = {}
pickup_diffs = {}

# Iterate the groups directly rather than re-filtering the whole frame once per
# order; the group keys are the same order ids that index df_grouped1.
for num_id, (order_id, df_id) in enumerate(train.groupby('order_id')):
  avail_seconds = df_id.loc[df_id.driver_status == 'AVAILABLE', 'seconds'].values
  pickup_seconds = df_id.loc[df_id.driver_status == 'OTW_PICKUP', 'seconds'].values

  # Explicit length check instead of a bare `except:` that would also hide
  # unrelated errors.
  if len(avail_seconds) >= 2 and len(pickup_seconds) >= 1:
    # NOTE(review): index 1 is the *second* AVAILABLE row, not the last one
    # (the pickup span below uses -1). Kept as-is because hons_transform
    # computes the feature the same way; change both together if "last" was meant.
    avail_diffs[order_id] = avail_seconds[1] - avail_seconds[0]
    pickup_diffs[order_id] = pickup_seconds[-1] - pickup_seconds[0]
  else:
    avail_diffs[order_id] = np.nan
    pickup_diffs[order_id] = np.nan

  if num_id % 100 == 0:
    print('Finish ID:', num_id)

# Assign each column once; the Series align with df_grouped1 on order id.
df_grouped1['avail_sec_diff'] = pd.Series(avail_diffs, dtype=float)
df_grouped1['pickup_sec_diff'] = pd.Series(pickup_diffs, dtype=float)

In [None]:
# Inspect the per-order frame, now including avail_sec_diff and pickup_sec_diff.
df_grouped1

In [None]:
# Keep only the columns used downstream; `label` stays last because a later
# cell drops the final column by position.
model_columns = [
    'order_id', 'service_type', 'driver_status', 'distance', 'hour',
    'accuracy_in_meters', 'accuracy_diff', 'altitude_in_meters',
    'altitude_diff', 'longitude_diff', 'latitude_diff', 'UTMX_diff',
    'UTMY_diff', 'seconds_diff', 'label',
]
train = train[model_columns]

train

In [None]:
import numpy as np
import pandas as pd

# Spread statistics used alongside the numpy reducers below. They are kept as
# lambdas on purpose: pandas labels them '<lambda_0>' and '<lambda_1>' in the
# aggregated column names, which the renaming cell that follows relies on.
iqr = lambda x: np.percentile(x, 75) - np.percentile(x, 25)
# Not called `range`: that would shadow the builtin for every later cell.
value_range = lambda x: np.max(x) - np.min(x)

# Drop the last column (label) so it is not aggregated as a feature.
df_grouped2 = train.iloc[:, :-1]

# Only numeric columns can be summarised with mean/std/percentiles.
numeric_cols = df_grouped2.select_dtypes(include=[np.number]).columns

# One row per order, six summary statistics per numeric column.
df_grouped2 = df_grouped2.groupby('order_id')[numeric_cols].agg([np.mean, np.min, np.max, np.std, iqr, value_range])

# Flatten the (column, statistic) MultiIndex into '<column>_<statistic>' names.
df_grouped2.columns = ['_'.join(col).strip() for col in df_grouped2.columns.values]

df_grouped2


In [None]:
# Give the two lambda aggregates readable names:
# '<lambda_0>' is the interquartile range, '<lambda_1>' the max-min range.
df_grouped2.columns = [
    column_name.replace('<lambda_0>', 'IQR').replace('<lambda_1>', 'range')
    for column_name in df_grouped2.columns
]

df_grouped2

In [None]:
# One-hot encode driver_status into driver_status_<STATUS> indicator columns.
train = pd.get_dummies(train, columns=['driver_status'])

train

In [None]:
# Number of rows per order in each driver status (sum of the one-hot indicators).
status_count_columns = [
    'driver_status_AVAILABLE',
    'driver_status_OTW_DROPOFF',
    'driver_status_OTW_PICKUP',
    'driver_status_UNAVAILABLE',
]
df_grouped3 = train.groupby('order_id')[status_count_columns].sum()

df_grouped3

In [None]:
# Number of rows per order whose altitude is missing. The null mask is grouped
# directly, so no column is assigned on a slice of `train`
# (which triggers SettingWithCopyWarning).
df_grouped4 = (
    train['altitude_in_meters'].isnull()
    .groupby(train['order_id'])
    .sum()
    .to_frame('altitude_isnan')
)

df_grouped4

In [None]:
# Integer codes for the two service types.
service_label = {'service_type': {'GO_FOOD': 0, 'GO_RIDE': 1}}

# Join the four per-order frames side by side (aligned on order id), then
# encode service_type.
per_order_frames = [df_grouped1, df_grouped2, df_grouped3, df_grouped4]
df = pd.concat(per_order_frames, axis=1).replace(service_label)

df

In [None]:
# Correlation of each column with the label, sorted ascending. [2:] skips the
# first two entries, which with the column order built above are service_type
# and label itself.
df.corr()['label'][2:].sort_values(ascending=True).plot.bar(figsize=(14,5))

# Machine learning - training and evaluation

An XGBoost classifier model is built to classify if the GPS is true (1) or fake (0).

In [None]:
# Features: every per-order column except the target.
X = df.drop(columns=['label'])
y = df.label

# Hold out 30% of the orders as a test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Pipeline: standardise the features, then fit an XGBoost classifier with default settings.
pipe = make_pipeline(StandardScaler(), XGBClassifier())

# Metrics computed on every cross-validation fold (macro = unweighted mean over classes).
scoring = {
    'acc': 'accuracy',
    'prec_macro': 'precision_macro',
    'rec_macro': 'recall_macro',
    'f1_macro': 'f1_macro'
}

# 5-fold stratified split, shuffled with a fixed seed.
stratkfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validate on the training split only. The original note says any warning
# emitted here can be ignored.
cv_scores = cross_validate(pipe, X_train, y_train, cv=stratkfold, scoring=scoring)

The cross-validation scores show a mean macro-averaged precision, recall, and F1-score of about 79%.

In [None]:
# For every entry returned by cross_validate, print its per-fold values and
# their mean, rounded to 4 decimals.
for metric_name, metric_value in cv_scores.items():
    mean = np.mean(metric_value)
    print(f'{metric_name}: {np.round(metric_value, 4)}, Mean: {np.round(mean, 4)}')

In [None]:
# Fit the pipeline on the full training split.
pipe.fit(X_train, y_train)

# Predict class labels for the held-out split.
y_pred = pipe.predict(X_test)

In [None]:
# Persist the fitted pipeline (scaler + classifier) to the working directory.
# NOTE: joblib files are pickle-based - only load them from trusted sources.
import joblib
joblib.dump(pipe, './hons_xgboost.pkl')

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Predict on the held-out split (recomputed here so the cell runs on its own).
y_pred = pipe.predict(X_test)
# Confusion matrix of true vs. predicted labels.
cm = confusion_matrix(y_test, y_pred)
# Label the axes with the classes the pipeline was fitted on.
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=pipe.classes_)
disp.plot(values_format='.5g')
plt.show()


In [None]:
# Per-class precision, recall and F1 on the held-out split.
print(classification_report(y_test, y_pred))

In [None]:
# Class-membership probabilities; the columns follow the order of pipe.classes_.
y_pred_probs = pipe.predict_proba(X_test)

# Take the classes from the fitted pipeline so each probability column is
# paired with the class it belongs to.
classes = pipe.classes_

# One ROC curve per class, treating that class as the positive label.
for i, clas in enumerate(classes):
  # False Positive Rate and True Positive Rate at every decision threshold
  fpr, tpr, thresholds = roc_curve(y_test, y_pred_probs[:,i],
                                   pos_label = clas)

  # Area under this class's ROC curve
  auroc = auc(fpr, tpr)

  plt.plot(fpr, tpr, label=f'{clas}, AUC: {auroc:.2f}')

# Chance-level diagonal, drawn once rather than once per class.
plt.plot([0, 1], [0, 1], 'k--')

plt.title('ROC AUC')
plt.xlabel('FPR'); plt.ylabel('TPR')
plt.xlim(0,1); plt.ylim(0,1)
plt.legend()
plt.show()

In [None]:
# Feature importances of the classifier (the last pipeline step), labelled
# with the training column names.
classifier_importances = pipe.steps[-1][1].feature_importances_
importances = pd.Series(classifier_importances, index=X_train.columns)

# The 15 largest importances, in ascending order so the biggest bar is on top.
importances_sorted = importances.sort_values().tail(15)

# Horizontal bar chart of the selected importances.
importances_sorted.plot(kind='barh', color='red')
plt.title('Features Importances')
plt.show()

# Predict on test set

Create a function that transforms the test data by grouping it by order ID and engineering 74 new features.

In [None]:
def _hons_status_durations(df):
  """Return per-order `avail_sec_diff` and `pickup_sec_diff`, indexed by order id.

  avail_sec_diff is the number of seconds between the first two AVAILABLE rows
  of an order; pickup_sec_diff is the number of seconds between its first and
  last OTW_PICKUP rows. Both are NaN when the order has fewer than two
  AVAILABLE rows or no OTW_PICKUP row.
  """
  avail_diffs = {}
  pickup_diffs = {}

  # Iterate the groups directly instead of re-filtering the frame per order.
  for order_id, df_id in df.groupby('order_id'):
    avail_seconds = df_id.loc[df_id.driver_status == 'AVAILABLE', 'seconds'].values
    pickup_seconds = df_id.loc[df_id.driver_status == 'OTW_PICKUP', 'seconds'].values

    # Explicit length check instead of a bare `except:` that hides other errors.
    if len(avail_seconds) >= 2 and len(pickup_seconds) >= 1:
      # NOTE(review): index 1 is the *second* AVAILABLE row, not the last one.
      # Kept as-is because the training-time loop computes the feature the
      # same way; change both together if "last" was intended.
      avail_diffs[order_id] = avail_seconds[1] - avail_seconds[0]
      pickup_diffs[order_id] = pickup_seconds[-1] - pickup_seconds[0]
    else:
      avail_diffs[order_id] = np.nan
      pickup_diffs[order_id] = np.nan

  return pd.DataFrame({
      'avail_sec_diff': pd.Series(avail_diffs, dtype=float),
      'pickup_sec_diff': pd.Series(pickup_diffs, dtype=float),
  })


def hons_transform(df):
  """Turn raw per-row GPS data into one feature row per order.

  Mirrors the feature engineering applied to the training data: per-order
  differences, UTM step distance, status timing, six summary statistics of
  every numeric column, per-status row counts and the count of missing
  altitudes.

  Args:
    df: DataFrame with columns order_id, service_type, driver_status, hour,
      seconds, latitude, longitude, altitude_in_meters and accuracy_in_meters.
      The input frame is not modified.

  Returns:
    DataFrame indexed by order id with service_type (encoded GO_FOOD=0,
    GO_RIDE=1), avail_sec_diff, pickup_sec_diff, the summary statistics,
    the four driver_status_* counts and altitude_isnan.
  """
  # Work on a copy so the caller's frame does not gain the helper columns.
  df = df.copy()

  # Change of each signal relative to the previous row of the same order;
  # undefined differences (first row, missing values) become 0.
  diff_sources = (
      ('longitude', 'longitude_diff'),
      ('latitude', 'latitude_diff'),
      ('seconds', 'seconds_diff'),
      ('accuracy_in_meters', 'accuracy_diff'),
      ('altitude_in_meters', 'altitude_diff'),
  )
  for source_column, diff_column in diff_sources:
    df[diff_column] = df.groupby('order_id')[source_column].diff().fillna(0)

  # Convert lat/lon to UTM coordinates and measure the step between rows.
  utm_coords = utm.from_latlon(df.latitude.values, df.longitude.values)
  df['UTMX'] = utm_coords[0]
  df['UTMY'] = utm_coords[1]

  df['UTMX_diff'] = df.groupby('order_id').UTMX.diff().fillna(0)
  df['UTMY_diff'] = df.groupby('order_id').UTMY.diff().fillna(0)

  # Straight-line step length.
  df['distance'] = np.sqrt(df.UTMX_diff**2 + df.UTMY_diff**2)

  # Service type per order (there is no label column at prediction time),
  # plus the two status timing features.
  df_grouped1 = df.groupby('order_id')[['service_type']].max().join(_hons_status_durations(df))

  df = df[['order_id', 'service_type', 'driver_status', 'distance', 'hour',
            'accuracy_in_meters', 'accuracy_diff', 'altitude_in_meters',
            'altitude_diff', 'longitude_diff', 'latitude_diff', 'UTMX_diff',
            'UTMY_diff', 'seconds_diff']]

  # Interquartile range and max-min range. Kept as lambdas so pandas labels
  # them '<lambda_0>' / '<lambda_1>', as in the training features; the second
  # one is not called `range`, to avoid shadowing the builtin.
  iqr = lambda x: np.percentile(x, 75) - np.percentile(x, 25)
  value_range = lambda x: np.max(x) - np.min(x)

  # Summarise numeric columns only, matching the training features; string
  # columns such as service_type cannot be averaged.
  numeric_cols = df.select_dtypes(include=[np.number]).columns
  df_grouped2 = df.groupby('order_id')[numeric_cols].agg(
      [np.mean, np.min, np.max, np.std, iqr, value_range])

  # Flatten the MultiIndex and give the lambda aggregates readable names.
  df_grouped2.columns = [
      '_'.join(col).strip().replace('<lambda_0>', 'IQR').replace('<lambda_1>', 'range')
      for col in df_grouped2.columns.values
  ]

  # Count rows per order in each driver status. reindex guarantees all four
  # columns exist even when a status never occurs in `df`.
  status_columns = ['driver_status_AVAILABLE', 'driver_status_OTW_DROPOFF',
                    'driver_status_OTW_PICKUP', 'driver_status_UNAVAILABLE']
  status_dummies = (
      pd.get_dummies(df['driver_status'], prefix='driver_status')
      .reindex(columns=status_columns, fill_value=0)
  )
  df_grouped3 = status_dummies.groupby(df['order_id']).sum()

  # Count rows per order with a missing altitude, grouping the null mask
  # directly so no column is assigned on a slice.
  df_grouped4 = (
      df['altitude_in_meters'].isnull()
      .groupby(df['order_id'])
      .sum()
      .to_frame('altitude_isnan')
  )

  # Merge all per-order frames side by side.
  features = pd.concat((df_grouped1, df_grouped2, df_grouped3, df_grouped4), axis=1)

  # Encode service_type
  service_label = {'service_type': {'GO_FOOD': 0, 'GO_RIDE': 1}}
  return features.replace(service_label)

In [None]:
# Load the raw test data from the Colab working directory.
test = pd.read_csv('/content/test.csv')

test

After transformation, the shape of the test set is (500, 74), where 500 is the number of order IDs and 74 is the number of engineered features.

In [None]:
def hons_data_transform(data):
    """Return the per-order feature frame for `data`.

    Delegates to `hons_transform` on a copy of `data`, so the caller's frame is
    left untouched. Previously this returned an unmodified copy of the raw rows,
    which is not the transformed feature set the surrounding text describes.
    """
    transformed_data = hons_transform(data.copy())
    return transformed_data

# Build the model-ready test frame.
test_ready = hons_data_transform(test)

# Report its shape and preview the first rows.
print(f"Shape of transformed test set: {test_ready.shape}")
print(test_ready.head())