In [1]:
import pandas as pd
import numpy as np
from scaling import standardize, normalize
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [2]:

# Load one CSV per year and stack them into a single frame.
# The years are listed once here instead of repeating the read_csv call per file.
years = range(2015, 2022)
all_files = [
    pd.read_csv(f'../data/kaasa/kaasa_{year}.csv', index_col=None, header=0)
    for year in years
]

# Keep the per-year frames reachable under their original names in case
# other cells refer to a single year.
df2015, df2016, df2017, df2018, df2019, df2020, df2021 = all_files

df = pd.concat(all_files, axis=0, ignore_index=True)

# Columns that are not used as model inputs. 'date_time' is dropped here as
# well, so it is not parsed to datetime beforehand (that work was discarded).
unused_columns = [
    'source_id', 'individual', 'owner_id', 'name',
    'distance', 'sin_time', 'cos_time', 'date_time',
    'longitude', 'latitude', 'temperature', 'altitude',
]
df = df.drop(columns=unused_columns)

# Scale the two numeric features with the helpers from the local `scaling` module.
# NOTE(review): presumably z-score standardization followed by min-max scaling
# into [0, 1] -- confirm against scaling.py. If so, the statistics are computed
# on the full data set, i.e. before any train/test split.
df = standardize(df, ['velocity', 'angle']) # standardize the data
df = normalize(df, ['velocity', 'angle'], 0, 1) # normalize the data

X = df.drop(['attack'], axis=1) # Features
y = df['attack'] # Target variable

In [3]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Hold out 20% of the rows. The hyperparameter searches must only see the
# training part, otherwise the held-out rows influence model selection.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Broad search space for the first, coarse pass.
rs_space = {
    'random_state': np.array([42]),  # single value: pins the forest's seed for every candidate
    'n_estimators': np.arange(50, 500, step=50),
    'max_depth': list(np.arange(10, 100, step=10)) + [None],
}

# Every entry above is a finite list, so the space is a grid with a fixed
# number of combinations. Asking for more iterations than that only makes
# scikit-learn warn and fall back to the grid size, so cap n_iter explicitly.
n_candidates = int(np.prod([len(values) for values in rs_space.values()]))

# Base estimator; its hyperparameters (including the seed) come from rs_space.
rf = RandomForestClassifier()

# First do a random search on a broad range of values.
# random_state seeds the candidate sampling so the search stays reproducible
# if the space is ever widened beyond the n_iter cap.
rf_random = RandomizedSearchCV(
    rf,
    rs_space,
    n_iter=min(200, n_candidates),
    scoring='recall',
    n_jobs=-1,
    cv=3,
    random_state=42,
)
model_random = rf_random.fit(X_train, y_train)

print('Best hyperparameters are: '+ str(model_random.best_params_))
print('Best score is: '+ str(model_random.best_score_))

In [None]:
# Second pass: exhaustive grid search over a narrower space.
# NOTE(review): the grid below is hard-coded and does not include max_depth;
# confirm it reflects the values the random search actually returned.
# NOTE(review): including several random_state values makes the search pick
# the best-scoring seed rather than a better configuration -- consider fixing
# the seed and searching only over real hyperparameters.
param_grid = {
    'random_state': [42, 123, 456],
    'n_estimators': [50, 100, 200],
}

grid_search = GridSearchCV(rf, param_grid, cv=3, scoring='recall', n_jobs=-1)
# Fit on the training split only so the held-out test rows stay unseen
# during model selection.
grid_search.fit(X_train, y_train)

print("Optimal parameters: ", grid_search.best_params_)
print("Best recall score: ", grid_search.best_score_)