In [1]:
import pandas as pd
from scaling import standardize, normalize
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

In [2]:
# One CSV export per year, listed in chronological order.
csv_paths = [
    '../data/kaasa/kaasa_2015.csv',
    '../data/kaasa/kaasa_2016.csv',
    '../data/kaasa/kaasa_2017.csv',
    '../data/kaasa/kaasa_2018.csv',
    '../data/kaasa/kaasa_2019.csv',
    '../data/kaasa/kaasa_2020.csv',
    '../data/kaasa/kaasa_2021.csv',
]

# Read every file with the same options: first row is the header and no
# column is used as the index.
all_files = [pd.read_csv(path, index_col=None, header=0) for path in csv_paths]

# Keep the per-year frames available under their individual names.
df2015, df2016, df2017, df2018, df2019, df2020, df2021 = all_files

# Stack the yearly frames row-wise and renumber the index from 0.
df = pd.concat(all_files, axis=0, ignore_index=True)

In [3]:
# Drop the columns that are not used as model inputs.
df = df.drop(columns=['source_id', 'individual', 'owner_id', 'name',  'distance', 'sin_time', 'cos_time', 'date_time',
                      'longitude', 'latitude', 'temperature', 'altitude'])

# NOTE(review): scaling is applied to the whole data set before the train/test
# split. If `standardize`/`normalize` derive their parameters from the frame
# they receive (TODO confirm in the `scaling` module), statistics from the
# held-out rows leak into the training features.
df = standardize(df, ['velocity', 'angle'])  # standardize the data
df = normalize(df, ['velocity', 'angle'], 0, 1)  # normalize the data

X = df.drop(['attack'], axis=1)  # Features
y = df['attack']  # Target variable

# Split the data into training and testing sets: 10% is held out, the split is
# seeded for reproducibility and stratified on the target so both parts keep
# the same class ratio. X and y are reused instead of being recomputed from df.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y)

# Cross-validation

### With no oversampling and no hyperparameter tuning:

In [10]:
# Baseline model: random forest with library-default hyperparameters and a
# fixed seed.
rf = RandomForestClassifier(random_state=42)

# Metrics computed on the validation part of every fold
scoring = ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')

# Five stratified folds; shuffling is left at its default
cv = StratifiedKFold(n_splits=5)

# Run the cross-validation on the training split only; the per-fold scores of
# each metric are returned under the key 'test_<metric>'
scores = cross_validate(estimator=rf, X=X_train, y=y_train, scoring=scoring, cv=cv)

# Report each metric as its mean over the folds with +/- two standard deviations
for metric in scoring:
    fold_scores = scores['test_' + metric]
    print("%s: %0.4f (+/- %0.4f)" % (metric, fold_scores.mean(), fold_scores.std() * 2))

accuracy: 0.9793 (+/- 0.0003)
precision: 0.0152 (+/- 0.0227)
recall: 0.0075 (+/- 0.0119)
f1: 0.0101 (+/- 0.0156)
roc_auc: 0.5105 (+/- 0.0101)


### With hyperparameter tuning and no oversampling: 

In [11]:
# Random forest with explicitly chosen hyperparameters: 300 trees, depth capped
# at 100, class weights rebalanced per bootstrap sample, fixed seed, all cores.
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=100,
    class_weight='balanced_subsample',
    random_state=42,
    n_jobs=-1,
)

# Metrics computed on the validation part of every fold
scoring = ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')

# Five stratified folds; shuffling is left at its default
cv = StratifiedKFold(n_splits=5)

# Run the cross-validation on the training split only; the per-fold scores of
# each metric are returned under the key 'test_<metric>'
scores = cross_validate(estimator=rf, X=X_train, y=y_train, scoring=scoring, cv=cv)

# Report each metric as its mean over the folds with +/- two standard deviations
for metric in scoring:
    fold_scores = scores['test_' + metric]
    print("%s: %0.4f (+/- %0.4f)" % (metric, fold_scores.mean(), fold_scores.std() * 2))

accuracy: 0.9796 (+/- 0.0007)
precision: 0.0159 (+/- 0.0179)
recall: 0.0075 (+/- 0.0094)
f1: 0.0102 (+/- 0.0124)
roc_auc: 0.5077 (+/- 0.0080)


### With oversampling and no hyperparameter tuning:

In [14]:
# Define the pipeline with SMOTE oversampling and random forest classifier.
# Using the imblearn Pipeline keeps the oversampling inside the fitting step,
# so it is applied to the training part of each fold only. SMOTE is seeded so
# that the synthetic samples, and therefore the scores, are reproducible.
pipeline = Pipeline([
    ('sampling', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42))
])

# Define the stratified k-fold cross-validation object in this cell instead of
# relying on whatever `cv` another cell happened to leave in the kernel; the
# seed makes the shuffled folds identical on every run.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics computed on the validation part of every fold
scoring = ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')

# Perform K-fold cross-validation and get the scores on each fold
scores = cross_validate(pipeline, X_train, y_train, cv=cv, scoring=scoring)

# Print the average scores and their spread (+/- two standard deviations)
for metric in scoring:
    print("%s: %0.4f (+/- %0.4f)" % (metric, scores['test_'+metric].mean(), scores['test_'+metric].std() * 2))

accuracy: 0.8778 (+/- 0.0065)
precision: 0.0133 (+/- 0.0018)
recall: 0.1047 (+/- 0.0137)
f1: 0.0237 (+/- 0.0031)
roc_auc: 0.5126 (+/- 0.0074)


### With oversampling and hyperparameter tuning:

In [13]:
# Define the pipeline with SMOTE oversampling and random forest classifier.
# `Pipeline` comes from the notebook's import cell (imblearn.pipeline), so it
# is not re-imported here. SMOTE is seeded so that the synthetic samples, and
# therefore the scores, are reproducible.
pipeline = Pipeline([
    ('sampling', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(n_estimators=300, max_depth=100, class_weight='balanced_subsample', random_state=42, n_jobs=-1))
])

# Define the stratified k-fold cross-validation object; shuffling is seeded so
# the folds are identical on every run.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics computed on the validation part of every fold
scoring = ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')

# Perform K-fold cross-validation and get the scores on each fold
scores = cross_validate(pipeline, X_train, y_train, cv=cv, scoring=scoring)

# Print the average scores and their spread (+/- two standard deviations)
for metric in scoring:
    print("%s: %0.4f (+/- %0.4f)" % (metric, scores['test_'+metric].mean(), scores['test_'+metric].std() * 2))

accuracy: 0.8777 (+/- 0.0058)
precision: 0.0156 (+/- 0.0031)
recall: 0.1231 (+/- 0.0227)
f1: 0.0277 (+/- 0.0054)
roc_auc: 0.5135 (+/- 0.0224)
