# Wahzu Classifier

## Pre-processing

In [58]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [59]:
safe_df = pd.read_csv('safe.csv')
threat_df = pd.read_csv('threat.csv')

In [60]:
import pandas as pd

# Select 6 random samples from safe_df
safe_sample = safe_df.sample(6)

# Select 4 random samples from threat_df
threat_sample = threat_df.sample(4)

# Concatenate the two samples
combined_df = pd.concat([safe_sample, threat_sample])

# Save to a CSV file
combined_df.to_csv("combined_sample.csv", index=False)

In [61]:
safe_df.replace(' ', np.nan, inplace=True)

In [62]:
threat_df.replace(' ', np.nan, inplace=True)

In [63]:
safe_df['Flag'] = 0
threat_df['Flag'] = 1

In [64]:
extra_columns = [
    "_source.data.win.eventdata.originalFileName",
    "_source.data.win.eventdata.image",
    "_source.data.win.eventdata.description",
    "_source.data.win.eventdata.parentCommandLine",
    "_source.data.win.eventdata.parentImage",
    "_source.data.win.eventdata.commandLine",
    "_source.data.win.eventdata.integrityLevel",
    "_source.data.win.eventdata.user",
    "_source.data.win.eventdata.targetFilename",
    "_source.data.win.eventdata.appName",
    "_source.data.win.eventdata.moduleName",
    "_source.data.win.eventdata.appPath"
]

In [65]:
safe_df[extra_columns] = np.nan

In [66]:
final_cols = [
    'agent.name',
    'data.win.system.eventID',
    'data.win.system.channel',
    'data.win.system.severityValue',
    'data.win.system.providerName',
    'rule.firedtimes',#
    'rule.level',
    'rule.groups',
    'location',
    'data.win.eventdata.logonProcessName',
    'data.win.eventdata.elevatedToken',
    'data.win.eventdata.processName',#
    'data.win.eventdata.targetDomainName',
    'data.win.eventdata.logonType',#
    'rule.mitre.technique',
    'rule.mitre.tactic',
    'syscheck.path',
    'syscheck.event',
    'syscheck.value_name',
    'syscheck.win_perm_after',
    'data.win.eventdata.p1',
    'data.win.eventdata.serviceType',#
    'data.vulnerability.severity',#
    'data.vulnerability.cve',#
    'data.vulnerability.cvss.cvss3.base_score',# scale
    'data.win.eventdata.originalFileName',
    'data.win.eventdata.image',
    #'data.win.eventdata.integrityLevel',
    'Flag'
]

print(len(final_cols))

28


In [67]:
safe_df.columns = safe_df.columns.str.replace('^_source.', '', regex=True)
threat_df.columns = threat_df.columns.str.replace('^_source.', '', regex=True)

In [68]:
safe_final_df = safe_df[final_cols]
threat_final_df = threat_df[final_cols]

In [69]:
combined_df = pd.concat([safe_final_df, threat_final_df], axis=0)

In [70]:
filtering_df = combined_df

In [71]:
flagged_logs = []

In [72]:
combined_df['data.vulnerability.cve'].value_counts()

data.vulnerability.cve
CVE-2024-29061    2
CVE-2024-21343    2
CVE-2024-21407    2
CVE-2024-30066    2
CVE-2024-30099    2
                 ..
CVE-2024-38043    2
CVE-2024-38057    2
CVE-2024-38101    2
CVE-2024-38060    2
CVE-2024-21305    2
Name: count, Length: 187, dtype: int64

In [73]:
# Flag rows where 'data.win.eventdata.serviceType' is 'kernel mode driver'
flagged_rows = combined_df[combined_df['data.win.eventdata.serviceType'] == 'kernel mode driver']

# Add these flagged rows to the flagged_logs list
new_flagged_logs = flagged_rows.to_dict('records')  # Convert the rows to dictionary format for easier logging

flagged_logs = flagged_logs + new_flagged_logs

# Remove the flagged rows from the original dataframe
combined_df = combined_df[combined_df['data.win.eventdata.serviceType'] != 'kernel mode driver']

In [74]:
# Flag rows where 'data.win.eventdata.serviceType' is 'kernel mode driver'
flagged_rows = combined_df[combined_df['data.vulnerability.severity'] == 'Critical']

# Add these flagged rows to the flagged_logs list
new_flagged_logs = flagged_rows.to_dict('records')  # Convert the rows to dictionary format for easier logging

flagged_logs = flagged_logs + new_flagged_logs

# Remove the flagged rows from the original dataframe
combined_df = combined_df[combined_df['data.vulnerability.severity'] != 'Critical']

In [75]:
combined_df['data.vulnerability.cve'] = combined_df['data.vulnerability.cve'].astype(str) 

In [76]:
# Flag rows where 'data.win.eventdata.serviceType' is 'kernel mode driver'
flagged_rows = combined_df[combined_df['data.vulnerability.cve'] != 'nan']

# Add these flagged rows to the flagged_logs list
new_flagged_logs = flagged_rows.to_dict('records')  # Convert the rows to dictionary format for easier logging

flagged_logs = flagged_logs + new_flagged_logs

# Remove the flagged rows from the original dataframe
combined_df = combined_df[combined_df['data.vulnerability.cve'] == 'nan']

In [77]:
#print(flagged_logs)
print(len(flagged_logs))

377


In [78]:
# Replace NaN values with 'unmodified' in the 'syscheck.event' column
combined_df['syscheck.event'] = combined_df['syscheck.event'].fillna('unmodified')

In [79]:
combined_df['data.win.eventdata.image'] = combined_df['data.win.eventdata.image'].apply(lambda x: 1 if isinstance(x, str) and x.startswith('C:\\Windows\\System32\\') else 0)

In [80]:
# Columns to One-Hot Encode
one_hot_columns = ['agent.name', 'data.win.system.eventID', 'data.win.system.channel', 'location', 'data.win.system.providerName', 'rule.mitre.technique', 'rule.mitre.tactic', 'syscheck.event']

# Columns to Log Transform
log_transform_columns = ['rule.firedtimes']

# Columns to Label Encode
# Structured dictionary to store column names and their respective orderings
label_encoding_column_order_mapping = {
    'data.win.system.severityValue': [np.nan, 'INFORMATION', 'AUDIT_SUCCESS', 'WARNING', 'ERROR', 'AUDIT_FAILURE'],
}

# Dictionary with acceptable values for specific columns
acceptable_values = {
    'data.win.eventdata.targetDomainName': ['NT AUTHORITY'],  # Only 'NT AUTHORITY' is acceptable for column 'data.win.eventdata.targetDomainName'
    'data.win.eventdata.originalFileName': ['wannacry.exe', 'notpetya.exe', 'trickbot.exe', 'emotet.exe', 'ryuk.exe', 'locky.exe', 'cryptolocker.exe', 'keylogger.exe', 'winspy.exe', 'darkcomet.exe', 'nanocore.exe', 'teamviewer.exe', 'anydesk.exe', 'radmin.exe', 'vncserver.exe', 'remcmd.exe', 'mimikatz.exe', 'procdump.exe', 'dumpert.exe', 'pwdump.exe', 'nmap.exe', 'angryipscanner.exe', 'metasploit.exe', 'sqlmap.exe', 'xmrig.exe', 'minerd.exe', 'cryptonight.exe', 'ccminer.exe', 'svchost.exe', 'explorer.exe', 'lsass.exe', 'csrss.exe', 'winlogon.exe', 'wscript.exe', 'cscript.exe', 'powershell.exe', 'mshta.exe', 'regsvr32.exe', 'installutil.exe', 'msiexec.exe', 'schtasks.exe', 'certutil.exe', 'bitsadmin.exe', 'ftp.exe', 'sc.exe', 'driverquery.exe', 'rundll32.exe', 'taskeng.exe', 'conhost.exe'] ,
    'data.win.eventdata.logonType': ['wannacry.exe', 'notpetya.exe'], # New list required
    'data.win.eventdata.processName': ['wannacry.exe', 'notpetya.exe'] # New list required
    
}

# List of columns for missing value indicator
missing_value_columns = ['data.win.eventdata.logonProcessName ', 'data.win.eventdata.elevatedToken', 'syscheck.path', 'syscheck.value_name', 'syscheck.win_perm_after', 'data.win.eventdata.p1']

# List of columns for scaling
scaling_columns = []

In [81]:
combined_df['data.win.system.severityValue'].unique()

       'AUDIT_FAILURE'], dtype=object)

In [82]:
import joblib
from sklearn.preprocessing import OneHotEncoder

# Fit the encoder on the specified columns
one_hot_encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
encoded = one_hot_encoder.fit_transform(combined_df[one_hot_columns])

# Create a DataFrame from the encoded data
encoded_df = pd.DataFrame(encoded, columns=one_hot_encoder.get_feature_names_out(one_hot_columns))

# Drop the original categorical columns from combined_df and concatenate the encoded columns
combined_df = combined_df.drop(columns=one_hot_columns).reset_index(drop=True)
combined_df = pd.concat([combined_df, encoded_df.reset_index(drop=True)], axis=1)

# Save the fitted OneHotEncoder for use in wahzu_classifier.py
joblib.dump(one_hot_encoder, 'one_hot_encoder.pkl')
print("OneHotEncoder saved as 'one_hot_encoder.pkl'")


OneHotEncoder saved as 'one_hot_encoder.pkl'


In [83]:
# Log Transformation
for column in log_transform_columns:
    # Check for non-positive values to avoid issues with log(0) or log of negative numbers
    if (combined_df[column] <= 0).any():
        raise ValueError(f"Column '{column}' contains non-positive values, cannot apply log transform.")
    combined_df[column] = np.log(combined_df[column])

In [84]:
# Function to map each column based on its order
def label_encode_with_custom_order(df, label_encoding_column_order_mapping):
    for column, order in label_encoding_column_order_mapping.items():
        # Create a dictionary mapping the custom order to numerical values
        order_mapping = {value: idx for idx, value in enumerate(order)}
        
        # Apply the mapping to the column
        df[column] = df[column].map(order_mapping)
    
    return df

# Apply the custom encoding
combined_df = label_encode_with_custom_order(combined_df, label_encoding_column_order_mapping)

In [85]:
# Acceptable Values Mapping
for column, acceptable in acceptable_values.items():
    if column in combined_df.columns:
        combined_df[column] = combined_df[column].apply(lambda x: 1 if x in acceptable else 0)

In [86]:
# Missing Value Indicator
for column in missing_value_columns:
    if column in combined_df.columns:
        combined_df[column] = combined_df[column].apply(lambda x: 0 if pd.isnull(x) else 1)

In [87]:
# Scale data.vulnerability.cvss.cvss3.base_score to get between 0-1
combined_df['data.vulnerability.cvss.cvss3.base_score'] = combined_df['data.vulnerability.cvss.cvss3.base_score'].fillna(0)  # Replace NaN with 0
combined_df['data.vulnerability.cvss.cvss3.base_score'] = combined_df['data.vulnerability.cvss.cvss3.base_score'].astype(float)  # Convert to float for division
combined_df['data.vulnerability.cvss.cvss3.base_score'] = combined_df['data.vulnerability.cvss.cvss3.base_score'].apply(lambda x: x / 10)  # Scale the values

In [88]:
print("Processed DataFrame:")
display(combined_df)

Processed DataFrame:


Unnamed: 0,data.win.system.severityValue,rule.firedtimes,rule.level,rule.groups,data.win.eventdata.logonProcessName,data.win.eventdata.elevatedToken,data.win.eventdata.processName,data.win.eventdata.targetDomainName,data.win.eventdata.logonType,syscheck.path,syscheck.value_name,syscheck.win_perm_after,data.win.eventdata.p1,data.win.eventdata.serviceType,data.vulnerability.severity,data.vulnerability.cve,data.vulnerability.cvss.cvss3.base_score,data.win.eventdata.originalFileName,data.win.eventdata.image,Flag,agent.name_ubuntu,data.win.system.eventID_1000,data.win.system.eventID_10000,data.win.system.eventID_10001,data.win.system.eventID_1001,data.win.system.eventID_10010,data.win.system.eventID_1003,data.win.system.eventID_1014,data.win.system.eventID_102,data.win.system.eventID_1026,data.win.system.eventID_1033,data.win.system.eventID_1035,data.win.system.eventID_1040,data.win.system.eventID_11,data.win.system.eventID_11707,data.win.system.eventID_12289,data.win.system.eventID_13,data.win.system.eventID_16384,data.win.system.eventID_17,data.win.system.eventID_1796,data.win.system.eventID_20,data.win.system.eventID_3,data.win.system.eventID_300,data.win.system.eventID_301,data.win.system.eventID_302,data.win.system.eventID_326,data.win.system.eventID_4004,data.win.system.eventID_4616,data.win.system.eventID_4624,data.win.system.eventID_4625,data.win.system.eventID_4634,data.win.system.eventID_4647,data.win.system.eventID_6000,data.win.system.eventID_7040,data.win.system.eventID_7045,data.win.system.eventID_8224,data.win.system.eventID_86,data.win.system.eventID_nan,data.win.system.channel_Microsoft-Windows-Sysmon/Operational,data.win.system.channel_Security,data.win.system.channel_System,data.win.system.channel_nan,location_/var/log/dpkg.log,location_/var/log/kern.log,location_/var/log/syslog,location_EventChannel,location_active-response\active-responses.log,location_netstat listening ports,location_sca,location_syscheck,location_wazuh-agent,location_wazuh-monitord,location_wazuh-remoted,data.win.system.providerName_Application Error,data.win.system.providerName_BTHUSB,data.win.system.providerName_ESENT,data.win.system.providerName_Microsoft-Windows-CertificateServicesClient-CertEnroll,data.win.system.providerName_Microsoft-Windows-DNS-Client,data.win.system.providerName_Microsoft-Windows-DistributedCOM,data.win.system.providerName_Microsoft-Windows-Search,data.win.system.providerName_Microsoft-Windows-Security-Auditing,data.win.system.providerName_Microsoft-Windows-Security-SPP,data.win.system.providerName_Microsoft-Windows-Sysmon,data.win.system.providerName_Microsoft-Windows-TPM-WMI,data.win.system.providerName_Microsoft-Windows-WindowsUpdateClient,data.win.system.providerName_Microsoft-Windows-Winlogon,data.win.system.providerName_MsiInstaller,data.win.system.providerName_Service Control Manager,data.win.system.providerName_VSS,data.win.system.providerName_Windows Error Reporting,data.win.system.providerName_nan,"rule.mitre.technique_[""Account Discovery"",""PowerShell""]","rule.mitre.technique_[""Account Discovery"",""Windows Command Shell""]","rule.mitre.technique_[""Account Discovery""]","rule.mitre.technique_[""Application Shimming""]","rule.mitre.technique_[""Command and Scripting Interpreter""]","rule.mitre.technique_[""Create Account""]","rule.mitre.technique_[""Deobfuscate/Decode Files or Information""]","rule.mitre.technique_[""Disable or Modify System Firewall""]","rule.mitre.technique_[""Disable or Modify Tools""]","rule.mitre.technique_[""File Deletion"",""Data Destruction"",""Modify Registry""]","rule.mitre.technique_[""File Deletion""]","rule.mitre.technique_[""Ingress Tool Transfer"",""Command and Scripting Interpreter""]","rule.mitre.technique_[""Ingress Tool Transfer""]","rule.mitre.technique_[""Lateral Tool Transfer""]","rule.mitre.technique_[""Local Account""]","rule.mitre.technique_[""Modify Registry""]","rule.mitre.technique_[""Network Share Discovery"",""Windows Command Shell""]","rule.mitre.technique_[""Network Share Discovery""]","rule.mitre.technique_[""Non-Application Layer Protocol""]","rule.mitre.technique_[""Obfuscated Files or Information"",""Modify Registry""]","rule.mitre.technique_[""PowerShell""]","rule.mitre.technique_[""Process Injection""]","rule.mitre.technique_[""Registry Run Keys / Startup Folder""]","rule.mitre.technique_[""Rename System Utilities"",""Deobfuscate/Decode Files or Information""]","rule.mitre.technique_[""Rundll32""]","rule.mitre.technique_[""Security Account Manager"",""PowerShell""]","rule.mitre.technique_[""Security Account Manager""]","rule.mitre.technique_[""Security Software Discovery""]","rule.mitre.technique_[""Service Stop""]","rule.mitre.technique_[""Stored Data Manipulation"",""Modify Registry""]","rule.mitre.technique_[""Sudo and Sudo Caching""]","rule.mitre.technique_[""System Information Discovery"",""Windows Management Instrumentation""]","rule.mitre.technique_[""System Owner/User Discovery""]","rule.mitre.technique_[""Valid Accounts"",""Account Access Removal""]","rule.mitre.technique_[""Valid Accounts"",""Remote Services""]","rule.mitre.technique_[""Valid Accounts""]","rule.mitre.technique_[""Visual Basic""]","rule.mitre.technique_[""Windows Command Shell"",""DLL Search Order Hijacking""]","rule.mitre.technique_[""Windows Command Shell""]","rule.mitre.technique_[""Windows Service""]",rule.mitre.technique_nan,"rule.mitre.tactic_[""Command and Control""]","rule.mitre.tactic_[""Credential Access"",""Execution""]","rule.mitre.tactic_[""Credential Access""]","rule.mitre.tactic_[""Defense Evasion"",""Impact""]","rule.mitre.tactic_[""Defense Evasion"",""Persistence"",""Privilege Escalation"",""Initial Access"",""Impact""]","rule.mitre.tactic_[""Defense Evasion"",""Persistence"",""Privilege Escalation"",""Initial Access"",""Lateral Movement""]","rule.mitre.tactic_[""Defense Evasion"",""Persistence"",""Privilege Escalation"",""Initial Access""]","rule.mitre.tactic_[""Defense Evasion"",""Privilege Escalation""]","rule.mitre.tactic_[""Defense Evasion""]","rule.mitre.tactic_[""Discovery"",""Execution""]","rule.mitre.tactic_[""Discovery""]","rule.mitre.tactic_[""Execution"",""Persistence"",""Privilege Escalation"",""Defense Evasion""]","rule.mitre.tactic_[""Execution""]","rule.mitre.tactic_[""Impact"",""Defense Evasion""]","rule.mitre.tactic_[""Impact""]","rule.mitre.tactic_[""Lateral Movement""]","rule.mitre.tactic_[""Persistence"",""Privilege Escalation""]","rule.mitre.tactic_[""Persistence""]","rule.mitre.tactic_[""Privilege Escalation"",""Defense Evasion"",""Execution""]","rule.mitre.tactic_[""Privilege Escalation"",""Defense Evasion""]","rule.mitre.tactic_[""Privilege Escalation"",""Persistence""]",rule.mitre.tactic_nan,syscheck.event_deleted,syscheck.event_modified,syscheck.event_unmodified
0,3,0.000000,5,"[""windows"",""windows_system""]",,0,0,0,0,0,0,0,0,,,,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,2,1.791759,3,"[""windows"",""windows_security"",""authentication_...",Advapi,1,0,1,0,0,0,0,0,,,,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0,0.000000,7,"[""syslog"",""dpkg"",""config_changed""]",,0,0,0,0,0,0,0,0,,,,0.0,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0,1.386294,7,"[""sca""]",,0,0,0,0,0,0,0,0,,,,0.0,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0,2.772589,7,"[""sca""]",,0,0,0,0,0,0,0,0,,,,0.0,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4756,1,0.000000,3,"[""sysmon"",""sysmon_eid1_detections"",""windows""]",,0,0,0,0,0,0,0,0,,,,0.0,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4757,1,0.693147,4,"[""sysmon"",""sysmon_eid1_detections"",""windows""]",,0,0,0,0,0,0,0,0,,,,0.0,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4758,1,2.197225,3,"[""sysmon"",""sysmon_eid1_detections"",""windows""]",,0,0,0,0,0,0,0,0,,,,0.0,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4759,1,2.197225,4,"[""sysmon"",""sysmon_eid1_detections"",""windows""]",,0,0,0,0,0,0,0,0,,,,0.0,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [89]:
# Convert 'rule.level' to numeric, forcing invalid parsing to NaN
combined_df['rule.level'] = pd.to_numeric(combined_df['rule.level'], errors='coerce')

# Define the bins and labels
bins = [0, 4, 10, 15]
labels = ['LOW', 'MED', 'HIGH']

# Binning the 'rule.level' column
combined_df['rule.level'] = pd.cut(
    combined_df['rule.level'],
    bins=bins,
    labels=labels,
    include_lowest=True,
    right=True
)

# Handle any NaN values that may result from out-of-bounds values
combined_df['rule.level'] = combined_df['rule.level'].fillna('LOW')

# Map the binned categories to specific numerical values
category_mapping = {'LOW': 0, 'MED': 1, 'HIGH': 2}
combined_df['rule.level'] = combined_df['rule.level'].map(category_mapping)

In [90]:
import ast

# Step 1: Parse the 'rule.groups' column from string representation of lists to actual lists
combined_df['rule.groups'] = combined_df['rule.groups'].apply(ast.literal_eval)

# Step 2: Extract all unique individual values from the 'rule.groups' column
unique_values = set()
for groups in combined_df['rule.groups']:
    unique_values.update(groups)

unique_values = sorted(unique_values)  # Optional: sort the unique values for consistency

# Step 3: Create a new column for each unique value and encode it
for value in unique_values:
    combined_df[value] = combined_df['rule.groups'].apply(lambda x: 1 if value in x else 0)

# Step 4: Remove the original 'rule.groups' column
combined_df = combined_df.drop('rule.groups', axis=1)

In [91]:
combined_df.head()

Unnamed: 0,data.win.system.severityValue,rule.firedtimes,rule.level,data.win.eventdata.logonProcessName,data.win.eventdata.elevatedToken,data.win.eventdata.processName,data.win.eventdata.targetDomainName,data.win.eventdata.logonType,syscheck.path,syscheck.value_name,syscheck.win_perm_after,data.win.eventdata.p1,data.win.eventdata.serviceType,data.vulnerability.severity,data.vulnerability.cve,data.vulnerability.cvss.cvss3.base_score,data.win.eventdata.originalFileName,data.win.eventdata.image,Flag,agent.name_ubuntu,data.win.system.eventID_1000,data.win.system.eventID_10000,data.win.system.eventID_10001,data.win.system.eventID_1001,data.win.system.eventID_10010,data.win.system.eventID_1003,data.win.system.eventID_1014,data.win.system.eventID_102,data.win.system.eventID_1026,data.win.system.eventID_1033,data.win.system.eventID_1035,data.win.system.eventID_1040,data.win.system.eventID_11,data.win.system.eventID_11707,data.win.system.eventID_12289,data.win.system.eventID_13,data.win.system.eventID_16384,data.win.system.eventID_17,data.win.system.eventID_1796,data.win.system.eventID_20,data.win.system.eventID_3,data.win.system.eventID_300,data.win.system.eventID_301,data.win.system.eventID_302,data.win.system.eventID_326,data.win.system.eventID_4004,data.win.system.eventID_4616,data.win.system.eventID_4624,data.win.system.eventID_4625,data.win.system.eventID_4634,data.win.system.eventID_4647,data.win.system.eventID_6000,data.win.system.eventID_7040,data.win.system.eventID_7045,data.win.system.eventID_8224,data.win.system.eventID_86,data.win.system.eventID_nan,data.win.system.channel_Microsoft-Windows-Sysmon/Operational,data.win.system.channel_Security,data.win.system.channel_System,data.win.system.channel_nan,location_/var/log/dpkg.log,location_/var/log/kern.log,location_/var/log/syslog,location_EventChannel,location_active-response\active-responses.log,location_netstat listening ports,location_sca,location_syscheck,location_wazuh-agent,location_wazuh-monitord,location_wazuh-remoted,data.win.system.providerName_Application Error,data.win.system.providerName_BTHUSB,data.win.system.providerName_ESENT,data.win.system.providerName_Microsoft-Windows-CertificateServicesClient-CertEnroll,data.win.system.providerName_Microsoft-Windows-DNS-Client,data.win.system.providerName_Microsoft-Windows-DistributedCOM,data.win.system.providerName_Microsoft-Windows-Search,data.win.system.providerName_Microsoft-Windows-Security-Auditing,data.win.system.providerName_Microsoft-Windows-Security-SPP,data.win.system.providerName_Microsoft-Windows-Sysmon,data.win.system.providerName_Microsoft-Windows-TPM-WMI,data.win.system.providerName_Microsoft-Windows-WindowsUpdateClient,data.win.system.providerName_Microsoft-Windows-Winlogon,data.win.system.providerName_MsiInstaller,data.win.system.providerName_Service Control Manager,data.win.system.providerName_VSS,data.win.system.providerName_Windows Error Reporting,data.win.system.providerName_nan,"rule.mitre.technique_[""Account Discovery"",""PowerShell""]","rule.mitre.technique_[""Account Discovery"",""Windows Command Shell""]","rule.mitre.technique_[""Account Discovery""]","rule.mitre.technique_[""Application Shimming""]","rule.mitre.technique_[""Command and Scripting Interpreter""]","rule.mitre.technique_[""Create Account""]","rule.mitre.technique_[""Deobfuscate/Decode Files or Information""]","rule.mitre.technique_[""Disable or Modify System Firewall""]","rule.mitre.technique_[""Disable or Modify Tools""]","rule.mitre.technique_[""File Deletion"",""Data Destruction"",""Modify Registry""]","rule.mitre.technique_[""File Deletion""]","rule.mitre.technique_[""Ingress Tool Transfer"",""Command and Scripting Interpreter""]","rule.mitre.technique_[""Ingress Tool Transfer""]","rule.mitre.technique_[""Lateral Tool Transfer""]","rule.mitre.technique_[""Local Account""]","rule.mitre.technique_[""Modify Registry""]","rule.mitre.technique_[""Network Share Discovery"",""Windows Command Shell""]","rule.mitre.technique_[""Network Share Discovery""]","rule.mitre.technique_[""Non-Application Layer Protocol""]","rule.mitre.technique_[""Obfuscated Files or Information"",""Modify Registry""]","rule.mitre.technique_[""PowerShell""]","rule.mitre.technique_[""Process Injection""]","rule.mitre.technique_[""Registry Run Keys / Startup Folder""]","rule.mitre.technique_[""Rename System Utilities"",""Deobfuscate/Decode Files or Information""]","rule.mitre.technique_[""Rundll32""]","rule.mitre.technique_[""Security Account Manager"",""PowerShell""]","rule.mitre.technique_[""Security Account Manager""]","rule.mitre.technique_[""Security Software Discovery""]","rule.mitre.technique_[""Service Stop""]","rule.mitre.technique_[""Stored Data Manipulation"",""Modify Registry""]","rule.mitre.technique_[""Sudo and Sudo Caching""]","rule.mitre.technique_[""System Information Discovery"",""Windows Management Instrumentation""]","rule.mitre.technique_[""System Owner/User Discovery""]","rule.mitre.technique_[""Valid Accounts"",""Account Access Removal""]","rule.mitre.technique_[""Valid Accounts"",""Remote Services""]","rule.mitre.technique_[""Valid Accounts""]","rule.mitre.technique_[""Visual Basic""]","rule.mitre.technique_[""Windows Command Shell"",""DLL Search Order Hijacking""]","rule.mitre.technique_[""Windows Command Shell""]","rule.mitre.technique_[""Windows Service""]",rule.mitre.technique_nan,"rule.mitre.tactic_[""Command and Control""]","rule.mitre.tactic_[""Credential Access"",""Execution""]","rule.mitre.tactic_[""Credential Access""]","rule.mitre.tactic_[""Defense Evasion"",""Impact""]","rule.mitre.tactic_[""Defense Evasion"",""Persistence"",""Privilege Escalation"",""Initial Access"",""Impact""]","rule.mitre.tactic_[""Defense Evasion"",""Persistence"",""Privilege Escalation"",""Initial Access"",""Lateral Movement""]","rule.mitre.tactic_[""Defense Evasion"",""Persistence"",""Privilege Escalation"",""Initial Access""]","rule.mitre.tactic_[""Defense Evasion"",""Privilege Escalation""]","rule.mitre.tactic_[""Defense Evasion""]","rule.mitre.tactic_[""Discovery"",""Execution""]","rule.mitre.tactic_[""Discovery""]","rule.mitre.tactic_[""Execution"",""Persistence"",""Privilege Escalation"",""Defense Evasion""]","rule.mitre.tactic_[""Execution""]","rule.mitre.tactic_[""Impact"",""Defense Evasion""]","rule.mitre.tactic_[""Impact""]","rule.mitre.tactic_[""Lateral Movement""]","rule.mitre.tactic_[""Persistence"",""Privilege Escalation""]","rule.mitre.tactic_[""Persistence""]","rule.mitre.tactic_[""Privilege Escalation"",""Defense Evasion"",""Execution""]","rule.mitre.tactic_[""Privilege Escalation"",""Defense Evasion""]","rule.mitre.tactic_[""Privilege Escalation"",""Persistence""]",rule.mitre.tactic_nan,syscheck.event_deleted,syscheck.event_modified,syscheck.event_unmodified,active_response,adduser,authentication_failed,authentication_success,config_changed,dpkg,local,ossec,pam,policy_changed,sca,sshd,su,sudo,syscheck,syscheck_entry_added,syscheck_entry_deleted,syscheck_entry_modified,syscheck_registry,syslog,sysmon,sysmon_eid11_detections,sysmon_eid13_detections,sysmon_eid1_detections,sysmon_eid3_detections,sysmon_process-anomalies,system_error,systemd,time_changed,usb,windows,windows_application,windows_security,windows_system
0,3,0.0,1,,0,0,0,0,0,0,0,0,,,,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
1,2,1.791759,0,Advapi,1,0,1,0,0,0,0,0,,,,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
2,0,0.0,1,,0,0,0,0,0,0,0,0,,,,0.0,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,1.386294,1,,0,0,0,0,0,0,0,0,,,,0.0,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,2.772589,1,,0,0,0,0,0,0,0,0,,,,0.0,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [92]:
combined_df.shape[1]

190

In [93]:
X_train.shape[1]

192

## Neural Network Training

In [94]:
import joblib

In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

# Assuming the preprocessed data is in a DataFrame 'data'
data = combined_df

# Separate features and target
X = data.drop(columns=['Flag'])
y = data['Flag']

X.shape[1]

combined_df.shape[1]

# Convert categorical features to one-hot encoded variables
X = pd.get_dummies(X)

X.shape[1]

# One-hot encode the target variable if it's a categorical classification problem
y = to_categorical(y)

# Split into training and test sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape[1]

# Standardize the features (mean=0, variance=1)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build the neural network model
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))  # First hidden layer
model.add(Dense(32, activation='relu'))  # Second hidden layer
model.add(Dense(y_train.shape[1], activation='softmax'))  # Output layer with softmax for multi-class classification

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

# Evaluate the model on test data
test_loss, test_accuracy = model.evaluate(X_test, y_test)

print(f'Test accuracy: {test_accuracy}')

Epoch 1/50
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8464 - loss: 0.3227 - val_accuracy: 0.9160 - val_loss: 0.1654
Epoch 2/50
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 479us/step - accuracy: 0.9046 - loss: 0.1666 - val_accuracy: 0.8976 - val_loss: 0.1654
Epoch 3/50
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 495us/step - accuracy: 0.9026 - loss: 0.1683 - val_accuracy: 0.9003 - val_loss: 0.1799
Epoch 4/50
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 488us/step - accuracy: 0.9001 - loss: 0.1591 - val_accuracy: 0.9081 - val_loss: 0.1682
Epoch 5/50
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 500us/step - accuracy: 0.9051 - loss: 0.1547 - val_accuracy: 0.9134 - val_loss: 0.1671
Epoch 6/50
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 482us/step - accuracy: 0.8984 - loss: 0.1573 - val_accuracy: 0.9094 - val_loss: 0.1707
Epoch 7/50
[1m96/96[0m [32m

In [39]:
!pip install optuna



In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split

# Assuming the preprocessed data is in a DataFrame 'combined_df'
data = combined_df

# Separate features and target
X = data.drop(columns=['Flag'])
y = data['Flag']

# Convert categorical features to one-hot encoded variables
X = pd.get_dummies(X)

# One-hot encode the target variable if it's a categorical classification problem
y = pd.get_dummies(y).values  # For compatibility with PyTorch, keep y as a NumPy array

# Split into training and test sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (mean=0, variance=1)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)  # Keeping it as float for cross-entropy
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Create DataLoader for batching
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Split into training and validation sets
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_dataset, val_dataset = random_split(train_dataset, [train_size, val_size])

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Define the neural network model
class NeuralNet(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, output_dim)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.softmax(self.fc3(x))
        return x

# Initialize the model
input_dim = X_train.shape[1]
output_dim = y_train.shape[1]  # Number of classes
model = NeuralNet(input_dim, output_dim)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop with validation
num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, torch.max(y_batch, 1)[1])  # Convert one-hot targets to class indices
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    
    # Validation loop
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            outputs = model(X_batch)
            loss = criterion(outputs, torch.max(y_batch, 1)[1])
            val_loss += loss.item()
    
    # Print loss for each epoch
    print(f'Epoch {epoch + 1}, Train Loss: {train_loss / len(train_loader)}, Val Loss: {val_loss / len(val_loader)}')

# Evaluate the model on test data
model.eval()
test_loss = 0.0
correct = 0
total = 0

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        outputs = model(X_batch)
        loss = criterion(outputs, torch.max(y_batch, 1)[1])
        test_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total += y_batch.size(0)
        correct += (predicted == torch.max(y_batch, 1)[1]).sum().item()

test_loss /= len(test_loader)
test_accuracy = correct / total
print(f'Test Loss: {test_loss}, Test Accuracy: {test_accuracy}')

Epoch 1, Train Loss: 0.47614954101542634, Val Loss: 0.3972092630962531
Epoch 2, Train Loss: 0.3950323440755407, Val Loss: 0.39943285783131915
Epoch 3, Train Loss: 0.3924840847030282, Val Loss: 0.39777419716119766
Epoch 4, Train Loss: 0.39140726315478486, Val Loss: 0.3988143975536029
Epoch 5, Train Loss: 0.3907882273197174, Val Loss: 0.39883766944209736
Epoch 6, Train Loss: 0.38953741764028865, Val Loss: 0.39765791470805806
Epoch 7, Train Loss: 0.38941666080305976, Val Loss: 0.3973461911082268
Epoch 8, Train Loss: 0.39052721206098795, Val Loss: 0.39726420988639194
Epoch 9, Train Loss: 0.39044651699562866, Val Loss: 0.39773157984018326
Epoch 10, Train Loss: 0.3893698963026206, Val Loss: 0.39836282034715015
Epoch 11, Train Loss: 0.38965664835025865, Val Loss: 0.3966613585750262
Epoch 12, Train Loss: 0.3890403714030981, Val Loss: 0.3988422577579816
Epoch 13, Train Loss: 0.3909934399028619, Val Loss: 0.39856184646487236
Epoch 14, Train Loss: 0.38863105482111376, Val Loss: 0.3975221688548724

In [41]:
#mix
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
import copy

# Assuming the preprocessed data is in a DataFrame 'combined_df'
data = combined_df

# Separate features and target
X = data.drop(columns=['Flag'])
y = data['Flag']

# Convert categorical features to one-hot encoded variables
X = pd.get_dummies(X)

# One-hot encode the target variable if it's a categorical classification problem
y = pd.get_dummies(y).values  # Convert target to one-hot encoding

# Split into training and test sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Create DataLoader for batching
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Split into training and validation sets
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_dataset, val_dataset = random_split(train_dataset, [train_size, val_size])

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Define the neural network model
class BestNeuralNet(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(BestNeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.fc2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, output_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.relu(self.bn1(self.fc1(x)))
        x = self.dropout(x)
        x = self.relu(self.bn2(self.fc2(x)))
        x = self.dropout(x)
        x = self.relu(self.fc3(x))
        x = self.fc4(x)
        return x

# Initialize the model
input_dim = X_train.shape[1]
output_dim = y_train.shape[1]  # Number of classes
model = BestNeuralNet(input_dim, output_dim)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)  # L2 regularization

# Learning rate scheduler
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=3, factor=0.5, verbose=True)

# Training loop with early stopping
num_epochs = 50
best_val_loss = float('inf')
patience = 5
early_stopping_counter = 0

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, torch.max(y_batch, 1)[1])
        loss.backward()
        
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        
        optimizer.step()
        train_loss += loss.item()
    
    # Validation step
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            outputs = model(X_batch)
            loss = criterion(outputs, torch.max(y_batch, 1)[1])
            val_loss += loss.item()

    # Average losses
    train_loss /= len(train_loader)
    val_loss /= len(val_loader)
    print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
    
    # Learning rate scheduler step
    scheduler.step(val_loss)

    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model_wts = copy.deepcopy(model.state_dict())
        early_stopping_counter = 0
    else:
        early_stopping_counter += 1
        if early_stopping_counter >= patience:
            print("Early stopping triggered")
            break

# Load best model weights after early stopping
model.load_state_dict(best_model_wts)

# Evaluate the model on test data
model.eval()
test_loss = 0.0
correct = 0
total = 0

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        outputs = model(X_batch)
        loss = criterion(outputs, torch.max(y_batch, 1)[1])
        test_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total += y_batch.size(0)
        correct += (predicted == torch.max(y_batch, 1)[1]).sum().item()

test_loss /= len(test_loader)
test_accuracy = correct / total
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')

Epoch 1, Train Loss: 0.3581, Val Loss: 0.1756
Epoch 2, Train Loss: 0.1932, Val Loss: 0.1565
Epoch 3, Train Loss: 0.1746, Val Loss: 0.1585
Epoch 4, Train Loss: 0.1617, Val Loss: 0.1513
Epoch 5, Train Loss: 0.1597, Val Loss: 0.1518
Epoch 6, Train Loss: 0.1551, Val Loss: 0.1572
Epoch 7, Train Loss: 0.1543, Val Loss: 0.1569
Epoch 8, Train Loss: 0.1517, Val Loss: 0.1604
Epoch 9, Train Loss: 0.1560, Val Loss: 0.1642
Early stopping triggered
Test Loss: 0.1508, Test Accuracy: 0.8877


In [95]:
#optune
import optuna
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
import copy

# Data Preparation (as before)
data = combined_df
X = pd.get_dummies(data.drop(columns=['Flag']))
y = pd.get_dummies(data['Flag']).values

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_dataset, val_dataset = random_split(train_dataset, [train_size, val_size])

# Optuna Objective Function
def objective(trial):
    # Suggest hyperparameters
    hidden_dim1 = trial.suggest_int('hidden_dim1', 64, 256, step=64)
    hidden_dim2 = trial.suggest_int('hidden_dim2', 32, 128, step=32)
    dropout_rate = trial.suggest_float('dropout_rate', 0.2, 0.5)
    lr = trial.suggest_loguniform('lr', 1e-5, 1e-2)
    weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64])

    # Define model with hyperparameters
    class TuningNeuralNet(nn.Module):
        def __init__(self, input_dim, output_dim):
            super(TuningNeuralNet, self).__init__()
            self.fc1 = nn.Linear(input_dim, hidden_dim1)
            self.bn1 = nn.BatchNorm1d(hidden_dim1)
            self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
            self.bn2 = nn.BatchNorm1d(hidden_dim2)
            self.fc3 = nn.Linear(hidden_dim2, output_dim)
            self.dropout = nn.Dropout(dropout_rate)
            self.relu = nn.ReLU()

        def forward(self, x):
            x = self.relu(self.bn1(self.fc1(x)))
            x = self.dropout(x)
            x = self.relu(self.bn2(self.fc2(x)))
            x = self.fc3(x)
            return x

    # Initialize model, criterion, optimizer, and scheduler
    input_dim = X_train.shape[1]
    output_dim = y_train.shape[1]
    model = TuningNeuralNet(input_dim, output_dim)
    
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=3, factor=0.5, verbose=True)
    
    # DataLoader with batch size from Optuna
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    # Training with early stopping
    num_epochs = 30
    best_val_loss = float('inf')
    patience = 5
    early_stopping_counter = 0
    best_model_wts = copy.deepcopy(model.state_dict())
    
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, torch.max(y_batch, 1)[1])
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
            optimizer.step()
            train_loss += loss.item()
        
        # Validation step
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                outputs = model(X_batch)
                loss = criterion(outputs, torch.max(y_batch, 1)[1])
                val_loss += loss.item()
        
        # Average validation loss
        val_loss /= len(val_loader)
        scheduler.step(val_loss)
        
        # Early stopping logic
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model_wts = copy.deepcopy(model.state_dict())
            early_stopping_counter = 0
        else:
            early_stopping_counter += 1
            if early_stopping_counter >= patience:
                break
    
    # Load the best model weights
    model.load_state_dict(best_model_wts)
    
    return best_val_loss

# Optuna study and optimization
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Output the best hyperparameters
print("Best hyperparameters:", study.best_params)

# Train final model with best hyperparameters
best_params = study.best_params
best_hidden_dim1 = best_params['hidden_dim1']
best_hidden_dim2 = best_params['hidden_dim2']
best_dropout_rate = best_params['dropout_rate']
best_lr = best_params['lr']
best_weight_decay = best_params['weight_decay']
best_batch_size = best_params['batch_size']

# Define final model with optimized hyperparameters
class FinalBestModel(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(FinalBestModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, best_hidden_dim1)
        self.bn1 = nn.BatchNorm1d(best_hidden_dim1)
        self.fc2 = nn.Linear(best_hidden_dim1, best_hidden_dim2)
        self.bn2 = nn.BatchNorm1d(best_hidden_dim2)
        self.fc3 = nn.Linear(best_hidden_dim2, output_dim)
        self.dropout = nn.Dropout(best_dropout_rate)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.bn1(self.fc1(x)))
        x = self.dropout(x)
        x = self.relu(self.bn2(self.fc2(x)))
        x = self.fc3(x)
        return x

# Initialize and train final model with best hyperparameters
final_model = FinalBestModel(input_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(final_model.parameters(), lr=best_lr, weight_decay=best_weight_decay)
final_train_loader = DataLoader(train_dataset, batch_size=best_batch_size, shuffle=True)

[I 2024-11-02 22:34:45,109] A new study created in memory with name: no-name-10a8ec47-f90f-4d98-85de-d4a55b986e58
[I 2024-11-02 22:34:46,284] Trial 0 finished with value: 0.1652400019296086 and parameters: {'hidden_dim1': 128, 'hidden_dim2': 32, 'dropout_rate': 0.24094089090340087, 'lr': 0.0018373546182579165, 'weight_decay': 9.262139302261187e-05, 'batch_size': 16}. Best is trial 0 with value: 0.1652400019296086.
[I 2024-11-02 22:34:46,617] Trial 1 finished with value: 0.15997035366793474 and parameters: {'hidden_dim1': 256, 'hidden_dim2': 128, 'dropout_rate': 0.2240844841432819, 'lr': 0.009137204830351858, 'weight_decay': 7.537180013062508e-06, 'batch_size': 64}. Best is trial 1 with value: 0.15997035366793474.
[I 2024-11-02 22:34:47,948] Trial 2 finished with value: 0.2253587394952774 and parameters: {'hidden_dim1': 64, 'hidden_dim2': 32, 'dropout_rate': 0.21327779841870634, 'lr': 3.7485814518314075e-05, 'weight_decay': 7.906997548757106e-06, 'batch_size': 64}. Best is trial 1 with 

Best hyperparameters: {'hidden_dim1': 128, 'hidden_dim2': 32, 'dropout_rate': 0.3740169188776487, 'lr': 0.004482084635692737, 'weight_decay': 0.006114339748032287, 'batch_size': 64}


In [99]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import copy
import torch.optim as optim

# Assuming X and y are your full dataset features and labels
# Step 1: Split the dataset into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)  # 70% train, 30% temp
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)  # 15% each for val/test

# Step 2: Fit the scaler on the training data and transform the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # Fit and transform on training data
X_val_scaled = scaler.transform(X_val)          # Transform validation data
X_test_scaled = scaler.transform(X_test)        # Transform test data

# Step 3: Save the scaler
joblib.dump(scaler, 'wahzu_scaler.pkl')
print("Scaler saved as 'wahzu_scaler.pkl'")

# Step 4: Prepare the scaled data for PyTorch DataLoader
train_dataset = TensorDataset(torch.tensor(X_train_scaled, dtype=torch.float32), torch.tensor(y_train, dtype=torch.float32))
val_dataset = TensorDataset(torch.tensor(X_val_scaled, dtype=torch.float32), torch.tensor(y_val, dtype=torch.float32))
test_dataset = TensorDataset(torch.tensor(X_test_scaled, dtype=torch.float32), torch.tensor(y_test, dtype=torch.float32))

# Updated hyperparameters
hidden_dim1 = 128
hidden_dim2 = 32
dropout_rate = 0.3740169188776487
lr = 0.004482084635692737
weight_decay = 0.006114339748032287
batch_size = 64

# Define the updated model class if not already done
class FinalBestModel(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim1, hidden_dim2, dropout_rate):
        super(FinalBestModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim1)
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.fc3 = nn.Linear(hidden_dim2, output_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

# Model initialization with updated parameters
input_dim = X_train.shape[1]
output_dim = y_train.shape[1]
final_model = FinalBestModel(input_dim, output_dim, hidden_dim1, hidden_dim2, dropout_rate)

# Define loss function and optimizer with updated parameters
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(final_model.parameters(), lr=lr, weight_decay=weight_decay)

# DataLoaders with updated batch size
final_train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
final_val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
final_test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Training loop with early stopping
num_epochs = 50
best_val_loss = float('inf')
patience = 5
early_stopping_counter = 0
best_model_wts = copy.deepcopy(final_model.state_dict())

for epoch in range(num_epochs):
    final_model.train()
    train_loss = 0.0
    for X_batch, y_batch in final_train_loader:
        optimizer.zero_grad()
        outputs = final_model(X_batch)
        loss = criterion(outputs, torch.max(y_batch, 1)[1])
        loss.backward()
        
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(final_model.parameters(), max_norm=1.0)
        
        optimizer.step()
        train_loss += loss.item()
    
    # Validation step
    final_model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for X_batch, y_batch in final_val_loader:
            outputs = final_model(X_batch)
            loss = criterion(outputs, torch.max(y_batch, 1)[1])
            val_loss += loss.item()

    # Average losses
    train_loss /= len(final_train_loader)
    val_loss /= len(final_val_loader)
    print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
    
    # Early stopping logic
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model_wts = copy.deepcopy(final_model.state_dict())
        early_stopping_counter = 0
    else:
        early_stopping_counter += 1
        if early_stopping_counter >= patience:
            print("Early stopping triggered")
            break

# Load the best model weights after early stopping
final_model.load_state_dict(best_model_wts)

# Save the final trained model weights
torch.save(final_model.state_dict(), "wahzu_model.pth")
print("Model weights saved as 'wahzu_model.pth'")

# Evaluate the model on test data
final_model.eval()
test_loss = 0.0
correct = 0
total = 0

with torch.no_grad():
    for X_batch, y_batch in final_test_loader:
        outputs = final_model(X_batch)
        loss = criterion(outputs, torch.max(y_batch, 1)[1])
        test_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total += y_batch.size(0)
        correct += (predicted == torch.max(y_batch, 1)[1]).sum().item()

test_loss /= len(final_test_loader)
test_accuracy = correct / total
print(f'Final Test Loss: {test_loss:.4f}, Final Test Accuracy: {test_accuracy:.4f}')

Scaler saved as 'wahzu_scaler.pkl'
Epoch 1, Train Loss: 0.2458, Val Loss: 0.1593
Epoch 2, Train Loss: 0.1530, Val Loss: 0.1542
Epoch 3, Train Loss: 0.1486, Val Loss: 0.1723
Epoch 4, Train Loss: 0.1447, Val Loss: 0.1926
Epoch 5, Train Loss: 0.1586, Val Loss: 0.2102
Epoch 6, Train Loss: 0.1428, Val Loss: 0.2329
Epoch 7, Train Loss: 0.1661, Val Loss: 0.2361
Early stopping triggered
Model weights saved as 'wahzu_model.pth'
Final Test Loss: 0.1456, Final Test Accuracy: 0.9231


In [44]:
import torch

# Save the model's state dictionary
torch.save(final_model.state_dict(), "wahzu_model.pth")

In [45]:
# Set pandas to display all columns without truncation
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.expand_frame_repr', False)  # Prevent breaking into new lines

In [46]:
test_df = X_train.sample(n=10, random_state=None)

In [47]:
test_df.head(10)

Unnamed: 0,data.win.system.severityValue,rule.firedtimes,data.win.eventdata.elevatedToken,data.win.eventdata.processName,data.win.eventdata.targetDomainName,data.win.eventdata.logonType,syscheck.path,syscheck.value_name,syscheck.win_perm_after,data.win.eventdata.p1,data.vulnerability.cvss.cvss3.base_score,data.win.eventdata.originalFileName,data.win.eventdata.image,agent.name_ubuntu,data.win.system.eventID_1000,data.win.system.eventID_10000,data.win.system.eventID_10001,data.win.system.eventID_1001,data.win.system.eventID_10010,data.win.system.eventID_1003,data.win.system.eventID_1014,data.win.system.eventID_102,data.win.system.eventID_1026,data.win.system.eventID_1033,data.win.system.eventID_1035,data.win.system.eventID_1040,data.win.system.eventID_11,data.win.system.eventID_11707,data.win.system.eventID_12289,data.win.system.eventID_13,data.win.system.eventID_16384,data.win.system.eventID_17,data.win.system.eventID_1796,data.win.system.eventID_20,data.win.system.eventID_3,data.win.system.eventID_300,data.win.system.eventID_301,data.win.system.eventID_302,data.win.system.eventID_326,data.win.system.eventID_4004,data.win.system.eventID_4616,data.win.system.eventID_4624,data.win.system.eventID_4625,data.win.system.eventID_4634,data.win.system.eventID_4647,data.win.system.eventID_6000,data.win.system.eventID_7040,data.win.system.eventID_7045,data.win.system.eventID_8224,data.win.system.eventID_86,data.win.system.eventID_nan,data.win.system.channel_Microsoft-Windows-Sysmon/Operational,data.win.system.channel_Security,data.win.system.channel_System,data.win.system.channel_nan,location_/var/log/dpkg.log,location_/var/log/kern.log,location_/var/log/syslog,location_EventChannel,location_active-response\active-responses.log,location_netstat listening ports,location_sca,location_syscheck,location_wazuh-agent,location_wazuh-monitord,location_wazuh-remoted,data.win.system.providerName_Application Error,data.win.system.providerName_BTHUSB,data.win.system.providerName_ESENT,data.win.system.providerName_Microsoft-Windows-CertificateServicesClient-CertEnroll,data.win.system.providerName_Microsoft-Windows-DNS-Client,data.win.system.providerName_Microsoft-Windows-DistributedCOM,data.win.system.providerName_Microsoft-Windows-Search,data.win.system.providerName_Microsoft-Windows-Security-Auditing,data.win.system.providerName_Microsoft-Windows-Security-SPP,data.win.system.providerName_Microsoft-Windows-Sysmon,data.win.system.providerName_Microsoft-Windows-TPM-WMI,data.win.system.providerName_Microsoft-Windows-WindowsUpdateClient,data.win.system.providerName_Microsoft-Windows-Winlogon,data.win.system.providerName_MsiInstaller,data.win.system.providerName_Service Control Manager,data.win.system.providerName_VSS,data.win.system.providerName_Windows Error Reporting,data.win.system.providerName_nan,"rule.mitre.technique_[""Account Discovery"",""PowerShell""]","rule.mitre.technique_[""Account Discovery"",""Windows Command Shell""]","rule.mitre.technique_[""Account Discovery""]","rule.mitre.technique_[""Application Shimming""]","rule.mitre.technique_[""Command and Scripting Interpreter""]","rule.mitre.technique_[""Create Account""]","rule.mitre.technique_[""Deobfuscate/Decode Files or Information""]","rule.mitre.technique_[""Disable or Modify System Firewall""]","rule.mitre.technique_[""Disable or Modify Tools""]","rule.mitre.technique_[""File Deletion"",""Data Destruction"",""Modify Registry""]","rule.mitre.technique_[""File Deletion""]","rule.mitre.technique_[""Ingress Tool Transfer"",""Command and Scripting Interpreter""]","rule.mitre.technique_[""Ingress Tool Transfer""]","rule.mitre.technique_[""Lateral Tool Transfer""]","rule.mitre.technique_[""Local Account""]","rule.mitre.technique_[""Modify Registry""]","rule.mitre.technique_[""Network Share Discovery"",""Windows Command Shell""]","rule.mitre.technique_[""Network Share Discovery""]","rule.mitre.technique_[""Non-Application Layer Protocol""]","rule.mitre.technique_[""Obfuscated Files or Information"",""Modify Registry""]","rule.mitre.technique_[""PowerShell""]","rule.mitre.technique_[""Process Injection""]","rule.mitre.technique_[""Registry Run Keys / Startup Folder""]","rule.mitre.technique_[""Rename System Utilities"",""Deobfuscate/Decode Files or Information""]","rule.mitre.technique_[""Rundll32""]","rule.mitre.technique_[""Security Account Manager"",""PowerShell""]","rule.mitre.technique_[""Security Account Manager""]","rule.mitre.technique_[""Security Software Discovery""]","rule.mitre.technique_[""Service Stop""]","rule.mitre.technique_[""Stored Data Manipulation"",""Modify Registry""]","rule.mitre.technique_[""Sudo and Sudo Caching""]","rule.mitre.technique_[""System Information Discovery"",""Windows Management Instrumentation""]","rule.mitre.technique_[""System Owner/User Discovery""]","rule.mitre.technique_[""Valid Accounts"",""Account Access Removal""]","rule.mitre.technique_[""Valid Accounts"",""Remote Services""]","rule.mitre.technique_[""Valid Accounts""]","rule.mitre.technique_[""Visual Basic""]","rule.mitre.technique_[""Windows Command Shell"",""DLL Search Order Hijacking""]","rule.mitre.technique_[""Windows Command Shell""]","rule.mitre.technique_[""Windows Service""]",rule.mitre.technique_nan,"rule.mitre.tactic_[""Command and Control""]","rule.mitre.tactic_[""Credential Access"",""Execution""]","rule.mitre.tactic_[""Credential Access""]","rule.mitre.tactic_[""Defense Evasion"",""Impact""]","rule.mitre.tactic_[""Defense Evasion"",""Persistence"",""Privilege Escalation"",""Initial Access"",""Impact""]","rule.mitre.tactic_[""Defense Evasion"",""Persistence"",""Privilege Escalation"",""Initial Access"",""Lateral Movement""]","rule.mitre.tactic_[""Defense Evasion"",""Persistence"",""Privilege Escalation"",""Initial Access""]","rule.mitre.tactic_[""Defense Evasion"",""Privilege Escalation""]","rule.mitre.tactic_[""Defense Evasion""]","rule.mitre.tactic_[""Discovery"",""Execution""]","rule.mitre.tactic_[""Discovery""]","rule.mitre.tactic_[""Execution"",""Persistence"",""Privilege Escalation"",""Defense Evasion""]","rule.mitre.tactic_[""Execution""]","rule.mitre.tactic_[""Impact"",""Defense Evasion""]","rule.mitre.tactic_[""Impact""]","rule.mitre.tactic_[""Lateral Movement""]","rule.mitre.tactic_[""Persistence"",""Privilege Escalation""]","rule.mitre.tactic_[""Persistence""]","rule.mitre.tactic_[""Privilege Escalation"",""Defense Evasion"",""Execution""]","rule.mitre.tactic_[""Privilege Escalation"",""Defense Evasion""]","rule.mitre.tactic_[""Privilege Escalation"",""Persistence""]",rule.mitre.tactic_nan,syscheck.event_deleted,syscheck.event_modified,syscheck.event_unmodified,active_response,adduser,authentication_failed,authentication_success,config_changed,dpkg,local,ossec,pam,policy_changed,sca,sshd,su,sudo,syscheck,syscheck_entry_added,syscheck_entry_deleted,syscheck_entry_modified,syscheck_registry,syslog,sysmon,sysmon_eid11_detections,sysmon_eid13_detections,sysmon_eid1_detections,sysmon_eid3_detections,sysmon_process-anomalies,system_error,systemd,time_changed,usb,windows,windows_application,windows_security,windows_system,rule.level_0,rule.level_1,rule.level_2,data.win.eventdata.logonProcessName_Advapi,data.win.eventdata.logonProcessName_NtLmSsp,data.win.eventdata.logonProcessName_User32,data.win.eventdata.serviceType_user mode service,data.vulnerability.cve_nan
1080,1,5.42495,0,0,0,0,0,0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,True,False,False,False,False,False,False,True
163,0,3.091042,0,0,0,0,0,0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,True,False,False,False,False,False,False,True
1776,2,3.135494,1,0,1,0,0,0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,True,False,False,True,False,False,False,True
4713,1,5.337538,0,0,0,0,0,0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,True,False,False,False,False,False,False,True
57,0,0.0,0,0,0,0,0,0,0,0,0.0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,True,False,False,False,False,False,False,True
265,2,3.135494,1,0,1,0,0,0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,True,False,False,True,False,False,False,True
2712,1,4.26268,0,0,0,0,0,0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,True,False,False,False,False,False,False,True
2733,1,4.26268,0,0,0,0,0,0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,True,False,False,False,False,False,False,True
3043,1,3.332205,0,0,0,0,0,0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,True,False,False,False,False,False,False,True
2635,1,4.762174,0,0,0,0,0,0,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,True,False,False,False,False,False,False,True


In [48]:
test_df.shape[1]

192

In [49]:
# Predict the class probabilities
predictions = model.predict(test_df)

# Convert predicted probabilities to class labels (e.g., class 0, class 1, etc.)
predicted_classes = np.argmax(predictions, axis=1)

AttributeError: 'BestNeuralNet' object has no attribute 'predict'

In [50]:
# Create an empty list to store flagged logs
flagged_logs = []

In [51]:
# Iterate over predicted classes and original logs
for i, pred_class in enumerate(predicted_classes):
    if pred_class == 1:  # If the predicted class is 1 (Flag = 1)
        flagged_logs.append(test_df.iloc[i].to_dict())  # Add the log to the flagged_logs array

# flagged_logs now contains logs where the predicted class is 1

NameError: name 'predicted_classes' is not defined

In [52]:
print(len(flagged_logs))

0


In [53]:
print(flagged_logs)

[]


In [54]:
filtering_df.head()

Unnamed: 0,agent.name,data.win.system.eventID,data.win.system.channel,data.win.system.severityValue,data.win.system.providerName,rule.firedtimes,rule.level,rule.groups,location,data.win.eventdata.logonProcessName,data.win.eventdata.elevatedToken,data.win.eventdata.processName,data.win.eventdata.targetDomainName,data.win.eventdata.logonType,rule.mitre.technique,rule.mitre.tactic,syscheck.path,syscheck.event,syscheck.value_name,syscheck.win_perm_after,data.win.eventdata.p1,data.win.eventdata.serviceType,data.vulnerability.severity,data.vulnerability.cve,data.vulnerability.cvss.cvss3.base_score,data.win.eventdata.originalFileName,data.win.eventdata.image,Flag
0,Windows,1014.0,System,WARNING,Microsoft-Windows-DNS-Client,1,5,"[""windows"",""windows_system""]",EventChannel,,,,,,,,,,,,,,,,,,,0
1,Windows,4624.0,Security,AUDIT_SUCCESS,Microsoft-Windows-Security-Auditing,6,3,"[""windows"",""windows_security"",""authentication_...",EventChannel,Advapi,%%1842,C:\\Windows\\System32\\services.exe,NT AUTHORITY,5.0,"[""Valid Accounts""]","[""Defense Evasion"",""Persistence"",""Privilege Es...",,,,,,,,,,,,0
2,ubuntu,,,,,1,7,"[""syslog"",""dpkg"",""config_changed""]",/var/log/dpkg.log,,,,,,,,,,,,,,,,,,,0
3,ubuntu,,,,,4,7,"[""sca""]",sca,,,,,,,,,,,,,,,,,,,0
4,ubuntu,,,,,16,7,"[""sca""]",sca,,,,,,,,,,,,,,,,,,,0


In [55]:
further_flagged_logs = filtering_df[filtering_df['data.win.eventdata.serviceType'].notnull()].apply(lambda row: row.to_dict(), axis=1).tolist()

In [56]:
len(further_flagged_logs)

9

In [57]:
flagged_logs = filtering_df[filtering_df['data.vulnerability.cve'].notnull()].apply(lambda row: row.to_dict(), axis=1).tolist()

## LLM

In [116]:
import requests
import json

In [118]:
url = "http://localhost:11434/api/generate"

In [119]:
headers = {
    "Content-Type": "application/json"
}

In [147]:
special_prompt = """You are a cybersecurity analyst specializing in incident response. You are tasked with analyzing a set of processed system logs for potential malicious activity. Based on these logs, generate a detailed Incident Response Report. Identify any suspicious or malicious events in the logs, explain their significance, and provide actionable recommendations for containment, remediation, and prevention of further attacks. Be sure to focus on any potential signs of command and control (C2) activity, tool transfers, file tampering, or privilege escalation.

Include in your report:

    Summary of the incident.
    Key suspicious activities and indicators (e.g., specific Event IDs, MITRE ATT&CK tactics, elevated privileges).
    Details on affected systems or devices involved.
    Clear recommendations for next steps, such as containment, remediation, and future mitigation.
    Conclusion on the potential impact and any further investigation needed.

Here are the logs for analysis:""" + str(flagged_logs)
print(special_prompt)

You are a cybersecurity analyst specializing in incident response. You are tasked with analyzing a set of processed system logs for potential malicious activity. Based on these logs, generate a detailed Incident Response Report. Identify any suspicious or malicious events in the logs, explain their significance, and provide actionable recommendations for containment, remediation, and prevention of further attacks. Be sure to focus on any potential signs of command and control (C2) activity, tool transfers, file tampering, or privilege escalation.

Include in your report:

    Summary of the incident.
    Key suspicious activities and indicators (e.g., specific Event IDs, MITRE ATT&CK tactics, elevated privileges).
    Details on affected systems or devices involved.
    Clear recommendations for next steps, such as containment, remediation, and future mitigation.
    Conclusion on the potential impact and any further investigation needed.

Here are the logs for analysis:[{'data.win.sys

In [148]:
data = {
    "model": "llama3.2",
    "prompt": special_prompt,
    "stream": False
}

In [149]:
response = requests.post(url, headers=headers, data=json.dumps(data))

In [150]:
if response.status_code == 200:
    response_text = response.text
    data = json.loads(response_text)
    actual_response = data["response"]
    print(actual_response)
else:
    print( "Error:", response.status_code, response.text)

This appears to be a JSON object containing various rules and events from a security monitoring system. Here's a breakdown of what each part represents:

**Security Rules:**

* `rule.mitre.technique_*`: These are specific techniques used in attacks, such as "Windows Command Shell" or "Visual Basic".
* `rule.mitre.tactic_*`: These are high-level tactics used by attackers, such as "Command and Control", "Credential Access", etc.
* `rule.level_0`, `rule.level_1`, `rule.level_2`: These represent the severity levels of each rule, with level 0 being the lowest (false) and level 2 being the highest (true).
* The other rules seem to be specific events or conditions that are checked during monitoring.

**Events:**

* `syscheck.event_deleted`, `syscheck.event_modified`, `syscheck.event_unmodified`: These indicate whether a system check event has been deleted, modified, or unchanged.
* `authentification_success` and `authentication_failed`: These indicate successful and failed authentication even

# Delete Later