In [1]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score, recall_score, precision_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from Scaling import standardize, normalize

In [2]:
# Read one CSV per year (2015-2021) from the kaasa data directory.
all_files = [
    pd.read_csv(f'../data/kaasa/kaasa_{year}.csv', index_col=None, header=0)
    for year in range(2015, 2022)
]
# Keep the per-year names bound in case other cells refer to a single year.
df2015, df2016, df2017, df2018, df2019, df2020, df2021 = all_files

# Columns removed before modelling; everything else except 'attack' is a feature.
DROPPED_COLUMNS = ['source_id', 'individual', 'owner_id', 'name', 'distance', 'sin_time', 'cos_time', 'date_time',
                   'longitude', 'latitude', 'temperature', 'altitude']

# Stack the yearly frames into one table with a fresh 0..n-1 index, then
# discard the columns listed above.
df = pd.concat(all_files, axis=0, ignore_index=True).drop(columns=DROPPED_COLUMNS)

# Rescale the two columns below with the helpers from the local Scaling module.
# NOTE(review): the trailing (0, 1) passed to normalize is presumably the target
# range - confirm against Scaling.normalize.
# NOTE(review): scaling happens before the train/test split; if these helpers
# derive their statistics from the data, the test rows influence the scaling -
# confirm whether that is acceptable here.
SCALED_COLUMNS = ('velocity', 'angle')
df = standardize(df, list(SCALED_COLUMNS))
df = normalize(df, list(SCALED_COLUMNS), 0, 1)

# Separate the target column from the feature matrix.
y = df['attack']
X = df.drop(['attack'], axis=1)

In [3]:
# Hold out 10% of the rows for testing. The split is stratified on the target
# so both parts keep the same 'attack' class proportions, and the seed is fixed
# so the split is reproducible. X and y are the feature matrix and target
# built from df in the preprocessing cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.1, random_state=42)

In [4]:
# Base random forest whose hyperparameters are tuned by the search cells.
rf = RandomForestClassifier(random_state=42)

# SMOTE oversampler with a fixed seed so the synthetic samples are reproducible.
smote = SMOTE(random_state=42)

# Oversample the training split only; the test split is left untouched.
# NOTE(review): cross-validating on this already-resampled set lets synthetic
# points leak between folds and inflates the CV score - resample inside the CV
# loop (e.g. an imblearn Pipeline) when tuning hyperparameters.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Randomized Grid Search

In [5]:
# Candidate hyperparameters for the random forest. The keys carry the 'rf__'
# prefix because the forest is tuned as the 'rf' step of a pipeline (below).
rs_space = {
    'rf__n_estimators': [100, 150, 200],
    'rf__max_depth': [30, 50, 80, 100],
    # Earlier runs also searched class weights:
    # 'rf__class_weight': ['balanced', 'balanced_subsample', {0: 1, 1: 5}, {0: 1, 1: 25}, {0: 1, 1: 50}, {0: 1, 1: 100}]
}

# SMOTE is a pipeline step so that every CV split oversamples only its own
# training folds. Searching on a pre-resampled training set would place
# synthetic points (interpolated from training-fold neighbours) in the
# validation folds and make the recall estimate optimistic.
rs_pipeline = Pipeline(steps=[('smote', smote), ('rf', rf)])

# Sample 10 of the 12 combinations above, scored by recall with 3-fold CV.
# random_state fixes which combinations are drawn so the search is reproducible.
rf_random = RandomizedSearchCV(rs_pipeline, rs_space, n_iter=10, scoring='recall',
                               n_jobs=-1, cv=3, random_state=42)
# Fit on the raw training split; the pipeline does the resampling per fold.
rf_random.fit(X_train, y_train)

# Report the winning forest parameters without the pipeline step prefix, so the
# output reads like plain RandomForestClassifier keyword arguments.
# (rf_random.best_params_ itself keeps the 'rf__' prefix.)
best_rf_params = {name.split('__', 1)[1]: value for name, value in rf_random.best_params_.items()}
print("Best parameters: ", best_rf_params)
print("Best score: ", rf_random.best_score_)

KeyboardInterrupt: 

First run (første):
Best parameters:  {'n_estimators': 200, 'max_depth': 120, 'class_weight': 'balanced_subsample'}
Best score:  0.500330577445653
Best estimator:  RandomForestClassifier(class_weight='balanced_subsample', max_depth=120,
                       n_estimators=200, random_state=42)

Second run (andre):
Best parameters:  {'n_estimators': 150, 'max_depth': 120, 'class_weight': {0: 1, 1: 100}}

# Grid Search

In [5]:
# Exhaustive grid around the values suggested by the randomized search. The
# keys carry the 'rf__' prefix because the forest is tuned as the 'rf' step of
# a pipeline (below).
param_grid = {
    'rf__n_estimators': [100],
    'rf__max_depth': [100, 120],
    'rf__class_weight': ['balanced_subsample', {0: 1, 1: 100}]
}

# Metrics recorded for every candidate. ROC AUC uses the built-in 'roc_auc'
# scorer, which ranks by predicted scores; make_scorer(roc_auc_score) would
# feed it hard class labels instead.
scoring = {'accuracy': make_scorer(accuracy_score),
           'recall': make_scorer(recall_score),
           'precision': make_scorer(precision_score),
           'roc_auc': 'roc_auc'}

# SMOTE is a pipeline step so that every CV split oversamples only its own
# training folds. Searching on a pre-resampled training set would place
# synthetic points in the validation folds and make the scores optimistic.
gs_pipeline = Pipeline(steps=[('smote', smote), ('rf', rf)])

# All metrics above are computed; refit='recall' keeps recall as the criterion
# that selects (and refits) the best candidate.
grid_search = GridSearchCV(estimator=gs_pipeline, param_grid=param_grid, scoring=scoring,
                           refit='recall', cv=3, n_jobs=-1)

# Fit on the raw training split; the pipeline does the resampling per fold.
grid_search.fit(X_train, y_train)

# Report the winning forest parameters without the pipeline step prefix.
# (grid_search.best_params_ itself keeps the 'rf__' prefix.)
best_rf_params = {name.split('__', 1)[1]: value for name, value in grid_search.best_params_.items()}
print("Best parameters: ", best_rf_params)
# With refit='recall', best_score_ is the mean cross-validated recall.
print("Best score: ", grid_search.best_score_)

# Mean cross-validated value of every recorded metric for the selected candidate.
for metric in scoring:
    print(f"Mean CV {metric}: ", grid_search.cv_results_[f'mean_test_{metric}'][grid_search.best_index_])