In [1]:
import pandas as pd
from glob import glob
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, f1_score, precision_score, make_scorer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from imblearn.over_sampling import SMOTE
from imblearn.ensemble import BalancedRandomForestClassifier
import seaborn as sns
import warnings


%matplotlib inline
#Ignore all warnings
warnings.filterwarnings('ignore', category=FutureWarning)

In [5]:
# Read in the dataset, combine the individual files and contenate the files into one
csv_files = glob("../data/raw/MachineLearningCSV/MachineLearningCVE/*.csv")
dataframes = [pd.read_csv(file) for file in csv_files]      
combined_df = pd.concat(dataframes, ignore_index=True)

In [6]:
# Replace all the inf values with NaNs before replacing them
combined_df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Drop NaN values
combined_df.dropna(inplace=True)

# Check if there are duplicates in the dataframe.  
combined_df.duplicated().sum()
combined_df.drop_duplicates(inplace=True)

combined_df.to_pickle('../data/processed/combined_df.pkl')

In [2]:
combined_df = pd.read_pickle("../data/processed/combined_df.pkl")

In [8]:
numerical_columns = combined_df.select_dtypes(include=['number']).columns.tolist()
categorical_columns = combined_df.select_dtypes(include=['object', 'category']).columns.to_list()
num_categories = len(categorical_columns)

print(f"Categorical Columns: {categorical_columns}")
print(f"Number of Categorical Columns: {num_categories}")

Categorical Columns: [' Label']
Number of Categorical Columns: 1


In [None]:
labels  = combined_df[' Label']
indicies = range(len(labels))

plt.figure(figsize=(10, 6))
plt.scatter(indicies, labels, alpha=0.5, color="skyblue")
plt.title("Scatter Plot of Class Distribution")
plt.xlabel("Index")
plt.ylabel('Label')
plt.xticks(rotation=45)
plt.show()

In [None]:
class_counts = combined_df[' Label'].value_counts()

sorted_class_counts = class_counts.sort_values(ascending=True)

# Plot the class distribution
plt.figure(figsize=(10, 10))
sorted_class_counts.plot(kind='barh', color='skyblue')
plt.title('Class Distribution')
plt.xlabel('Count')
plt.ylabel('Label')
plt.xticks(rotation=45)
plt.show()

In [None]:
class_counts

In [3]:
le = LabelEncoder()
combined_df[' Label'] = le.fit_transform(combined_df[' Label'])

In [4]:
decoded_labels = le.inverse_transform(combined_df[' Label'])

print(f"Encoded Labels: {combined_df[' Label']}")
print(f"Decoded Labels: {decoded_labels}")

Encoded Labels: 0          0
1          0
2          0
3          0
4          0
          ..
2830737    0
2830738    0
2830739    0
2830740    0
2830742    0
Name:  Label, Length: 2520798, dtype: int64
Decoded Labels: ['BENIGN' 'BENIGN' 'BENIGN' ... 'BENIGN' 'BENIGN' 'BENIGN']


In [None]:
scaler = StandardScaler()
combined_df['numerical_columns'] = scaler.fit_transform(combined_df[numerical_columns])

In [5]:
# Split the data

X = combined_df.drop(' Label', axis=1)
y = combined_df[' Label']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, train_size=0.8)

# Print the shapes of the training and testing sets
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (2016638, 78)
X_test shape: (504160, 78)
y_train shape: (2016638,)
y_test shape: (504160,)


In [7]:
from collections import Counter

class_counts = Counter(y_train)
print(class_counts)

Counter({0: 1675906, 4: 138335, 2: 102458, 10: 72627, 3: 8191, 7: 4755, 6: 4312, 5: 4150, 11: 2585, 1: 1568, 12: 1162, 14: 525, 9: 35, 13: 20, 8: 9})


In [8]:
total_samples = sum(class_counts.values())
number_classes = len(class_counts)

class_weights = {cls: total_samples / (number_classes * count) for cls, count in class_counts.items()}

print(class_weights)

{0: 0.08022080792916389, 4: 0.9718620257587258, 2: 1.3121721420809829, 10: 1.8511370885942326, 3: 16.413445651731575, 7: 28.273929197336138, 12: 115.69925415949513, 11: 52.00871695680206, 6: 31.1786951144094, 5: 32.39579116465863, 1: 85.74141156462585, 14: 256.0810158730159, 8: 14938.059259259258, 9: 3841.215238095238, 13: 6722.126666666667}


In [9]:
# Define the parameter grid

param_grid = {
    'n_estimators': [50, 75, 100],
    'max_samples': [0.25, 0.5, 0.75],
    'max_depth': [2, 4, 6],
    'criterion': ['gini', 'entropy']
}

# Define the scoring metrics
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'recall': make_scorer(recall_score, average='weighted'),
    'precision': make_scorer(precision_score, average='weighted'),
    'f1': make_scorer(f1_score, average='weighted')
}

In [10]:
clf = RandomForestClassifier(verbose=1, class_weight=class_weights)

grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=scoring, refit='f1', cv=3, n_jobs=-1)

grid_search.fit(X_train, y_train)

best_clf = grid_search.best_estimator_

y_pred = best_clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")
print(f"F1 Score: {f1}")


: 

In [None]:
clf = BalancedRandomForestClassifier(sampling_strategy="all", criterion="entropy", random_state=42)
clf_random = RandomForestClassifier()
# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=scoring, refit='f1', cv=3, n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best estimator
best_clf = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")
print(f"F1 Score: {f1}")

# Calculate the count of each predicted class label
predicted_class_counts = pd.Series(y_pred).value_counts()

# Plot the class distribution as a horizontal bar chart
# plt.figure(figsize=(10, 6))
# predicted_class_counts.sort_values(ascending=True).plot(kind='barh', color='skyblue')
# plt.title('Class Distribution After Balanced Random Forest')
# plt.xlabel('Count')
# plt.ylabel('Class Label')
# plt.show()


In [11]:
importances = best_clf.feature_importances_

In [12]:
features = X.columns
importance_df = pd.DataFrame({'Features':features, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)


In [None]:
plt.figure(figsize=(10, 12))
plt.barh(importance_df["Features"], importance_df['Importance'], color='skyblue')
plt.xlabel("importance")
plt.ylabel('Feature')
plt.title('Feature Importances')
plt.gca().invert_yaxis()
plt.show()

In [6]:
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)

: 

In [10]:
label_counts = combined_df[' Label'].value_counts()

In [11]:
label_percentages = (label_counts / label_counts.sum()) * 100

In [None]:
plt.figure(figsize=(10, 6))
label_percentages.plot(kind='bar')
plt.title("Label Counts as Percentage of Total Labels")
plt.xlabel('Labels')
plt.ylabel("Percentage")

plt.show()

In [None]:
plt.plot(combined_df[' Label'])

In [37]:
combined_df[' Label'] = combined_df[' Label'].apply(lambda x: 0 if x == 'BENIGN' else 1)
