# Training

## Importing libraries

In [None]:
import pandas as pd
import time
from joblib import dump

In [None]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

## Selecting the dataset

In [None]:
# Identifiers of the preprocessed datasets this notebook can train on
datasets = [
    'binary_cic_ids_2017',
    'cic_ids_2017',
    'binary_nsl_kdd',
    'nsl_kdd',
    'binary_unsw_nb15',
    'unsw_nb15',
]

# Change the index to train on a different dataset from the list
dataset = datasets[0]

## Selecting the classifier

In [None]:
# Short codes of the supported classifiers; each model cell further down
# compares `classifier` against one position of this list
classifiers = [
    'AB',
    'KNN',
    'LDA',
    'LR',
    'NB',
    'RF',
]

# Change the index to train a different classifier from the list
classifier = classifiers[5]

## Loading the data

In [None]:
# Read the normalized training CSV of the selected dataset
train_path = f"../datasets-tratados/{dataset}_train_normalized.csv"
train = pd.read_csv(train_path)

## Splitting into X and y

In [None]:
# Every column except the last becomes the feature matrix (X);
# the last column is taken as the target (y)
X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]

## Adaboost

In [None]:
# Only runs when the first entry of `classifiers` was selected
if classifiers[0] == classifier:
    # Weak learner: a decision tree limited to depth 1
    base_model = DecisionTreeClassifier(max_depth=1)

    # AdaBoost ensemble built on the weak learner, with a fixed seed
    model = AdaBoostClassifier(
        base_model,
        n_estimators=50,
        random_state=42,
    )


## k-NN

In [None]:
# Only runs when the second entry of `classifiers` was selected
if classifiers[1] == classifier:
    # Number of neighbours consulted for each prediction
    k_value = 3
    model = KNeighborsClassifier(
        n_neighbors=k_value,
    )

## Linear Discriminant Analysis

In [None]:
# Only runs when the third entry of `classifiers` was selected
if classifiers[2] == classifier:
    # Linear Discriminant Analysis with the library's default settings
    model = LinearDiscriminantAnalysis()

## Logistic Regression

In [None]:
# Only runs when the fourth entry of `classifiers` was selected
if classifiers[3] == classifier:
    # Logistic Regression with the library's default settings
    model = LogisticRegression()

## Naive Bayes

In [None]:
# Only runs when the fifth entry of `classifiers` was selected
if classifiers[4] == classifier:
    # Multinomial Naive Bayes with the library's default settings
    # NOTE(review): MultinomialNB rejects negative feature values - confirm
    # that the normalization applied to the dataset keeps features >= 0
    model = MultinomialNB()

## Random Forest

In [None]:
# Only runs when the sixth entry of `classifiers` was selected
if classifiers[5] == classifier:
    # Random Forest of 100 trees with a fixed seed
    model = RandomForestClassifier(
        n_estimators=100,
        random_state=42,
    )

## Cross Validation

In [None]:
# Number of folds (k) used for k-fold cross-validation
num_folds = 5

# Shuffled k-fold splitter; the fixed seed keeps the fold assignment
# identical from one run to the next
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Time the cross-validation with perf_counter(): it is a monotonic,
# high-resolution clock meant for measuring durations, whereas time.time()
# follows the wall clock and can jump if the system time is adjusted
start_time_cv = time.perf_counter()

# One score per fold; no `scoring` is passed, so the estimator's own
# default scorer is used
cv_scores = cross_val_score(model, X_train, y_train, cv=kf)

end_time_cv = time.perf_counter()
cv_time = end_time_cv - start_time_cv

print("Cross-validation scores:", cv_scores)
print("Mean of the scores:", cv_scores.mean())
print("Standard deviation of the scores:", cv_scores.std())

print(f"Cross-Validation Time: {cv_time} seconds")

## Training

In [None]:
# Time the fit with perf_counter(): it is a monotonic, high-resolution clock
# meant for measuring durations, whereas time.time() follows the wall clock
# and can jump if the system time is adjusted
start_time_train = time.perf_counter()

# Fit the selected model on the full training set
model.fit(X_train, y_train)

end_time_train = time.perf_counter()
training_time = end_time_train - start_time_train

print(f"Training Time: {training_time} seconds")

## Adding the cross-validation scores to a DataFrame

In [None]:
# Load the cross-validation results saved by earlier runs for this dataset so
# the current run can be appended to them
try:
    cv_results = pd.read_csv(f"../results/{dataset}_cross_validation.csv")
except (FileNotFoundError, pd.errors.EmptyDataError):
    # No results yet (file missing or without any content): start from an
    # empty DataFrame. Any other failure (e.g. a malformed file) is left to
    # propagate; swallowing it would let the save step overwrite the
    # existing results with only the new row.
    cv_results = pd.DataFrame()

In [None]:
# One 'ScoreN' column per fold, derived from cv_scores itself so the row stays
# correct if the number of folds is changed (five folds still produce
# Score1..Score5, in the same column order as before)
score_columns = {
    f'Score{fold}': round(score, 4)
    for fold, score in enumerate(cv_scores, start=1)
}

# Single-row DataFrame for this run: classifier code, per-fold scores, then
# their mean and standard deviation, all rounded to 4 decimal places
new_line = pd.DataFrame({
    'Classifier': [f'{classifier}'],
    **score_columns,
    'Mean': round(cv_scores.mean(), 4),
    'Standard Deviation': round(cv_scores.std(), 4),
})

# Append the row to the results loaded so far, renumbering the index
cv_results = pd.concat([cv_results, new_line], ignore_index=True)

In [None]:
# Bare expression as the cell's last statement: the notebook renders the
# accumulated cross-validation table
cv_results

In [None]:
# Write the updated cross-validation table back to the results CSV,
# leaving out the DataFrame index
cv_results_path = f"../results/{dataset}_cross_validation.csv"
cv_results.to_csv(cv_results_path, index=False)

## Adding the elapsed times to a DataFrame

In [None]:
# Load the timing results saved by earlier runs for this dataset so the
# current run can be appended to them
try:
    times = pd.read_csv(f"../results/{dataset}_times.csv")
except (FileNotFoundError, pd.errors.EmptyDataError):
    # No timings yet (file missing or without any content): start from an
    # empty DataFrame. Any other failure (e.g. a malformed file) is left to
    # propagate; swallowing it would let the save step overwrite the
    # existing timings with only the new row.
    times = pd.DataFrame()

In [None]:
# Single-row record for this run: classifier code plus the cross-validation
# and training durations, rounded to 4 decimal places
timing_row = {
    'Classifier': [f'{classifier}'],
    'Cross-Validation': round(cv_time, 4),
    'Training': round(training_time, 4),
}
new_line = pd.DataFrame(timing_row)

# Append the row to the timings loaded so far, renumbering the index
times = pd.concat(
    [times, new_line],
    ignore_index=True,
)

In [None]:
# Bare expression as the cell's last statement: the notebook renders the
# accumulated timing table
times

In [None]:
# Write the updated timing table back to the results CSV,
# leaving out the DataFrame index
times_path = f"../results/{dataset}_times.csv"
times.to_csv(times_path, index=False)

## Saving the trained model to a file

In [None]:
# Serialize the fitted model with joblib; the file name combines the dataset
# identifier and the classifier code
model_path = f"../trained-models/{dataset}_{classifier}.joblib"
dump(model, model_path)