In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from rerf.rerfClassifier import rerfClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score

# Load data and preprocess
df = pd.read_csv("Obfuscated-MalMem2022.csv")
df['Category'] = df['Category'].apply(lambda x: '-'.join(x.split('-', 2)[:2]))
df['Category'] = df['Category'].apply(lambda x: 1 if x == 'Benign' else 0)

# Encode labels
enc = LabelEncoder()
df['Category'] = enc.fit_transform(df['Category'])

# Define features and target
X = df.drop(['Class', 'Category', 'svcscan.interactive_process_services', 'pslist.nprocs64bit', 'handles.nport'], axis=1)
y = df['Category']

# Split into 80% training+validation and 20% testing
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split training+validation into 70% training and 10% validation (87.5% of 80% = 70%)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.125, random_state=42)

# Apply SMOTE oversampling on the training set
smote = SMOTE(random_state=42)
X_smote, Y_smote = smote.fit_resample(X_train, y_train)

# Train SPORF classifier
rf = rerfClassifier()
rf.fit(X_smote, Y_smote)

# Validation Set Evaluation
y_val_pred = rf.predict(X_val)
accuracy_val = accuracy_score(y_val, y_val_pred)
precision_val = precision_score(y_val, y_val_pred, average='weighted')
recall_val = recall_score(y_val, y_val_pred, average='weighted')
f1_val = f1_score(y_val, y_val_pred, average='weighted')

# ROC-AUC for Validation Set
y_val_pred_proba = rf.predict_proba(X_val)[:, 1]
roc_auc_val = roc_auc_score(y_val, y_val_pred_proba)

print("Validation Set Evaluation:")
print("Accuracy:", accuracy_val)
print("Precision:", precision_val)
print("Recall:", recall_val)
print("F1 Score:", f1_val)
print("ROC AUC:", roc_auc_val)

# Test Set Evaluation
y_test_pred = rf.predict(X_test)
accuracy_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred, average='weighted')
recall_test = recall_score(y_test, y_test_pred, average='weighted')
f1_test = f1_score(y_test, y_test_pred, average='weighted')

# ROC-AUC for Test Set
y_test_pred_proba = rf.predict_proba(X_test)[:, 1]
roc_auc_test = roc_auc_score(y_test, y_test_pred_proba)

print("\nTest Set Evaluation:")
print("Accuracy:", accuracy_test)
print("Precision:", precision_test)
print("Recall:", recall_test)
print("F1 Score:", f1_test)
print("ROC AUC:", roc_auc_test)


max depth: 60
avg leaf node depth: 20.2842
num leaf nodes: 990050
starting tree 1
Validation Set Evaluation:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
ROC AUC: 1.0

Test Set Evaluation:
Accuracy: 0.999914675767918
Precision: 0.9999146901540641
Recall: 0.999914675767918
F1 Score: 0.9999146756803191
ROC AUC: 1.0


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from rerf.rerfClassifier import rerfClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score

# Load data and preprocess
df = pd.read_csv("Obfuscated-MalMem2022.csv")
df['Category'] = df['Category'].apply(lambda x: '-'.join(x.split('-', 2)[:2]))
df['Category'] = df['Category'].apply(lambda x: 1 if x == 'Ransomware-Ako' else 0)

# Encode labels
enc = LabelEncoder()
df['Category'] = enc.fit_transform(df['Category'])

# Define features and target
X = df.drop(['Class', 'Category', 'svcscan.interactive_process_services', 'pslist.nprocs64bit', 'handles.nport'], axis=1)
y = df['Category']

# Split into 80% training+validation and 20% testing
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split training+validation into 70% training and 10% validation (87.5% of 80% = 70%)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.125, random_state=42)

# Apply SMOTE oversampling on the training set
smote = SMOTE(random_state=42)
X_smote, Y_smote = smote.fit_resample(X_train, y_train)

# Train SPORF classifier
rf = rerfClassifier()
rf.fit(X_smote, Y_smote)

# Validation Set Evaluation
y_val_pred = rf.predict(X_val)
accuracy_val = accuracy_score(y_val, y_val_pred)
precision_val = precision_score(y_val, y_val_pred, average='weighted')
recall_val = recall_score(y_val, y_val_pred, average='weighted')
f1_val = f1_score(y_val, y_val_pred, average='weighted')

# ROC-AUC for Validation Set
y_val_pred_proba = rf.predict_proba(X_val)[:, 1]
roc_auc_val = roc_auc_score(y_val, y_val_pred_proba)

print("Evaluating for Ransomware-Ako:")
print("Validation Set Evaluation:")
print("Accuracy:", accuracy_val)
print("Precision:", precision_val)
print("Recall:", recall_val)
print("F1 Score:", f1_val)
print("ROC AUC:", roc_auc_val)

# Test Set Evaluation
y_test_pred = rf.predict(X_test)
accuracy_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred, average='weighted')
recall_test = recall_score(y_test, y_test_pred, average='weighted')
f1_test = f1_score(y_test, y_test_pred, average='weighted')

# ROC-AUC for Test Set
y_test_pred_proba = rf.predict_proba(X_test)[:, 1]
roc_auc_test = roc_auc_score(y_test, y_test_pred_proba)

print("\nTest Set Evaluation:")
print("Accuracy:", accuracy_test)
print("Precision:", precision_test)
print("Recall:", recall_test)
print("F1 Score:", f1_test)
print("ROC AUC:", roc_auc_test)


max depth: 16
avg leaf node depth: 6.39994
num leaf nodes: 17730
starting tree 1
Validation Set Evaluation:
Accuracy: 0.9585324232081911
Precision: 0.9529942480857069
Recall: 0.9585324232081911
F1 Score: 0.9555244029877836
ROC AUC: 0.8986464818978439

Test Set Evaluation:
Accuracy: 0.9594709897610921
Precision: 0.9570536498732457
Recall: 0.9594709897610921
F1 Score: 0.9582100142801847
ROC AUC: 0.893632569294283


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from rerf.rerfClassifier import rerfClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score

# Load data and preprocess
df = pd.read_csv("Obfuscated-MalMem2022.csv")
df['Category'] = df['Category'].apply(lambda x: '-'.join(x.split('-', 2)[:2]))
df['Category'] = df['Category'].apply(lambda x: 1 if x == 'Ransomware-Maze' else 0)

# Encode labels
enc = LabelEncoder()
df['Category'] = enc.fit_transform(df['Category'])

# Define features and target
X = df.drop(['Class', 'Category', 'svcscan.interactive_process_services', 'pslist.nprocs64bit', 'handles.nport'], axis=1)
y = df['Category']

# Split into 80% training+validation and 20% testing
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split training+validation into 70% training and 10% validation (87.5% of 80% = 70%)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.125, random_state=42)

# Apply SMOTE oversampling on the training set
smote = SMOTE(random_state=42)
X_smote, Y_smote = smote.fit_resample(X_train, y_train)

# Train SPORF classifier
rf = rerfClassifier()
rf.fit(X_smote, Y_smote)

# Validation Set Evaluation
y_val_pred = rf.predict(X_val)
accuracy_val = accuracy_score(y_val, y_val_pred)
precision_val = precision_score(y_val, y_val_pred, average='weighted')
recall_val = recall_score(y_val, y_val_pred, average='weighted')
f1_val = f1_score(y_val, y_val_pred, average='weighted')

# ROC-AUC for Validation Set
y_val_pred_proba = rf.predict_proba(X_val)[:, 1]
roc_auc_val = roc_auc_score(y_val, y_val_pred_proba)

print("Evaluating for Ransomware-Maze:")
print("Validation Set Evaluation:")
print("Accuracy:", accuracy_val)
print("Precision:", precision_val)
print("Recall:", recall_val)
print("F1 Score:", f1_val)
print("ROC AUC:", roc_auc_val)

# Test Set Evaluation
y_test_pred = rf.predict(X_test)
accuracy_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred, average='weighted')
recall_test = recall_score(y_test, y_test_pred, average='weighted')
f1_test = f1_score(y_test, y_test_pred, average='weighted')

# ROC-AUC for Test Set
y_test_pred_proba = rf.predict_proba(X_test)[:, 1]
roc_auc_test = roc_auc_score(y_test, y_test_pred_proba)

print("\nTest Set Evaluation:")
print("Accuracy:", accuracy_test)
print("Precision:", precision_test)
print("Recall:", recall_test)
print("F1 Score:", f1_test)
print("ROC AUC:", roc_auc_test)


max depth: 52
avg leaf node depth: 19.6903
num leaf nodes: 1559072
starting tree 1
Evaluating for Ransomware-Maze:
Validation Set Evaluation:
Accuracy: 0.9696245733788396
Precision: 0.9696245733788396
Recall: 0.9696245733788396
F1 Score: 0.9696245733788396
ROC AUC: 0.9352544236881398

Test Set Evaluation:
Accuracy: 0.9656996587030716
Precision: 0.9648759317524391
Recall: 0.9656996587030716
F1 Score: 0.9652761737045134
ROC AUC: 0.9335125830621784


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from rerf.rerfClassifier import rerfClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score

# Load data and preprocess
df = pd.read_csv("Obfuscated-MalMem2022.csv")
df['Category'] = df['Category'].apply(lambda x: '-'.join(x.split('-', 2)[:2]))
df['Category'] = df['Category'].apply(lambda x: 1 if x == 'Ransomware-Conti' else 0)

# Encode labels
enc = LabelEncoder()
df['Category'] = enc.fit_transform(df['Category'])

# Define features and target
X = df.drop(['Class', 'Category', 'svcscan.interactive_process_services', 'pslist.nprocs64bit', 'handles.nport'], axis=1)
y = df['Category']

# Split into 80% training+validation and 20% testing
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split training+validation into 70% training and 10% validation (87.5% of 80% = 70%)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.125, random_state=42)

# Apply SMOTE oversampling on the training set
smote = SMOTE(random_state=42)
X_smote, Y_smote = smote.fit_resample(X_train, y_train)

# Train SPORF classifier
rf = rerfClassifier()
rf.fit(X_smote, Y_smote)

# Validation Set Evaluation
y_val_pred = rf.predict(X_val)
accuracy_val = accuracy_score(y_val, y_val_pred)
precision_val = precision_score(y_val, y_val_pred, average='weighted')
recall_val = recall_score(y_val, y_val_pred, average='weighted')
f1_val = f1_score(y_val, y_val_pred, average='weighted')

# ROC-AUC for Validation Set
y_val_pred_proba = rf.predict_proba(X_val)[:, 1]
roc_auc_val = roc_auc_score(y_val, y_val_pred_proba)

print("Evaluating for Ransomware-Conti:")
print("Validation Set Evaluation:")
print("Accuracy:", accuracy_val)
print("Precision:", precision_val)
print("Recall:", recall_val)
print("F1 Score:", f1_val)
print("ROC AUC:", roc_auc_val)

# Test Set Evaluation
y_test_pred = rf.predict(X_test)
accuracy_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred, average='weighted')
recall_test = recall_score(y_test, y_test_pred, average='weighted')
f1_test = f1_score(y_test, y_test_pred, average='weighted')

# ROC-AUC for Test Set
y_test_pred_proba = rf.predict_proba(X_test)[:, 1]
roc_auc_test = roc_auc_score(y_test, y_test_pred_proba)

print("\nTest Set Evaluation:")
print("Accuracy:", accuracy_test)
print("Precision:", precision_test)
print("Recall:", recall_test)
print("F1 Score:", f1_test)
print("ROC AUC:", roc_auc_test)


max depth: 56
avg leaf node depth: 18.8982
num leaf nodes: 1314106
starting tree 1
Evaluating for Ransomware-Conti:
Validation Set Evaluation:
Accuracy: 0.9641638225255973
Precision: 0.9595712851322491
Recall: 0.9641638225255973
F1 Score: 0.9616277688523823
ROC AUC: 0.9069490290388452

Test Set Evaluation:
Accuracy: 0.9607508532423208
Precision: 0.9558741006842884
Recall: 0.9607508532423208
F1 Score: 0.9580354227774346
ROC AUC: 0.9159837202207616


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from rerf.rerfClassifier import rerfClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score

# Load data and preprocess
df = pd.read_csv("Obfuscated-MalMem2022.csv")
df['Category'] = df['Category'].apply(lambda x: '-'.join(x.split('-', 2)[:2]))
df['Category'] = df['Category'].apply(lambda x: 1 if x == 'Ransomware-Pysa' else 0)

# Encode labels
enc = LabelEncoder()
df['Category'] = enc.fit_transform(df['Category'])

# Define features and target
X = df.drop(['Class', 'Category', 'svcscan.interactive_process_services', 'pslist.nprocs64bit', 'handles.nport'], axis=1)
y = df['Category']

# Split into 80% training+validation and 20% testing
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split training+validation into 70% training and 10% validation (87.5% of 80% = 70%)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.125, random_state=42)

# Apply SMOTE oversampling on the training set
smote = SMOTE(random_state=42)
X_smote, Y_smote = smote.fit_resample(X_train, y_train)

# Train SPORF classifier
rf = rerfClassifier()
rf.fit(X_smote, Y_smote)

# Validation Set Evaluation
y_val_pred = rf.predict(X_val)
accuracy_val = accuracy_score(y_val, y_val_pred)
precision_val = precision_score(y_val, y_val_pred, average='weighted')
recall_val = recall_score(y_val, y_val_pred, average='weighted')
f1_val = f1_score(y_val, y_val_pred, average='weighted')

# ROC-AUC for Validation Set
y_val_pred_proba = rf.predict_proba(X_val)[:, 1]
roc_auc_val = roc_auc_score(y_val, y_val_pred_proba)

print("Evaluating for Ransomware-Pysa:")
print("Validation Set Evaluation:")
print("Accuracy:", accuracy_val)
print("Precision:", precision_val)
print("Recall:", recall_val)
print("F1 Score:", f1_val)
print("ROC AUC:", roc_auc_val)

# Test Set Evaluation
y_test_pred = rf.predict(X_test)
accuracy_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred, average='weighted')
recall_test = recall_score(y_test, y_test_pred, average='weighted')
f1_test = f1_score(y_test, y_test_pred, average='weighted')

# ROC-AUC for Test Set
y_test_pred_proba = rf.predict_proba(X_test)[:, 1]
roc_auc_test = roc_auc_score(y_test, y_test_pred_proba)

print("\nTest Set Evaluation:")
print("Accuracy:", accuracy_test)
print("Precision:", precision_test)
print("Recall:", recall_test)
print("F1 Score:", f1_test)
print("ROC AUC:", roc_auc_test)


max depth: 59
avg leaf node depth: 21.7656
num leaf nodes: 1415871
starting tree 1
Evaluating for Ransomware-Pysa:
Validation Set Evaluation:
Accuracy: 0.9658703071672355
Precision: 0.9636228668278545
Recall: 0.9658703071672355
F1 Score: 0.9646921568477985
ROC AUC: 0.9057769539697251

Test Set Evaluation:
Accuracy: 0.9627133105802048
Precision: 0.9638908604789421
Recall: 0.9627133105802048
F1 Score: 0.9632891619272539
ROC AUC: 0.9120069679088637


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from rerf.rerfClassifier import rerfClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score

# Load data and preprocess
df = pd.read_csv("Obfuscated-MalMem2022.csv")
df['Category'] = df['Category'].apply(lambda x: '-'.join(x.split('-', 2)[:2]))
df['Category'] = df['Category'].apply(lambda x: 1 if x == 'Ransomware-Shade' else 0)

# Encode labels
enc = LabelEncoder()
df['Category'] = enc.fit_transform(df['Category'])

# Define features and target
X = df.drop(['Class', 'Category', 'svcscan.interactive_process_services', 'pslist.nprocs64bit', 'handles.nport'], axis=1)
y = df['Category']

# Split into 80% training+validation and 20% testing
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split training+validation into 70% training and 10% validation (87.5% of 80% = 70%)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.125, random_state=42)

# Apply SMOTE oversampling on the training set
smote = SMOTE(random_state=42)
X_smote, Y_smote = smote.fit_resample(X_train, y_train)

# Train SPORF classifier
rf = rerfClassifier()
rf.fit(X_smote, Y_smote)

# Validation Set Evaluation
y_val_pred = rf.predict(X_val)
accuracy_val = accuracy_score(y_val, y_val_pred)
precision_val = precision_score(y_val, y_val_pred, average='weighted')
recall_val = recall_score(y_val, y_val_pred, average='weighted')
f1_val = f1_score(y_val, y_val_pred, average='weighted')

# ROC-AUC for Validation Set
y_val_pred_proba = rf.predict_proba(X_val)[:, 1]
roc_auc_val = roc_auc_score(y_val, y_val_pred_proba)

print("Evaluating for Ransomware-Shade:")
print("Validation Set Evaluation:")
print("Accuracy:", accuracy_val)
print("Precision:", precision_val)
print("Recall:", recall_val)
print("F1 Score:", f1_val)
print("ROC AUC:", roc_auc_val)

# Test Set Evaluation
y_test_pred = rf.predict(X_test)
accuracy_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred, average='weighted')
recall_test = recall_score(y_test, y_test_pred, average='weighted')
f1_test = f1_score(y_test, y_test_pred, average='weighted')

# ROC-AUC for Test Set
y_test_pred_proba = rf.predict_proba(X_test)[:, 1]
roc_auc_test = roc_auc_score(y_test, y_test_pred_proba)

print("\nTest Set Evaluation:")
print("Accuracy:", accuracy_test)
print("Precision:", precision_test)
print("Recall:", recall_test)
print("F1 Score:", f1_test)
print("ROC AUC:", roc_auc_test)


max depth: 58
avg leaf node depth: 19.7387
num leaf nodes: 1406051
starting tree 1
Evaluating for Ransomware-Shade:
Validation Set Evaluation:
Accuracy: 0.9617747440273038
Precision: 0.959960425234265
Recall: 0.9617747440273038
F1 Score: 0.9608155681123557
ROC AUC: 0.9311253431508842

Test Set Evaluation:
Accuracy: 0.9607508532423208
Precision: 0.959143936768029
Recall: 0.9607508532423208
F1 Score: 0.9599182296053098
ROC AUC: 0.931839588386198


In [None]:
###############################################################################################

'Spyware-Transponder' 'Spyware-Gator' 'Spyware-180solutions' 'Spyware-CWS' 'Spyware-TIBS'

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from rerf.rerfClassifier import rerfClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score

# Load data and preprocess
df = pd.read_csv("Obfuscated-MalMem2022.csv")
df['Category'] = df['Category'].apply(lambda x: '-'.join(x.split('-', 2)[:2]))
df['Category'] = df['Category'].apply(lambda x: 1 if x == 'Spyware-Transponder' else 0)

# Encode labels
enc = LabelEncoder()
df['Category'] = enc.fit_transform(df['Category'])

# Define features and target
X = df.drop(['Class', 'Category', 'svcscan.interactive_process_services', 'pslist.nprocs64bit', 'handles.nport'], axis=1)
y = df['Category']

# Split into 80% training+validation and 20% testing
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split training+validation into 70% training and 10% validation (87.5% of 80% = 70%)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.125, random_state=42)

# Apply SMOTE oversampling on the training set
smote = SMOTE(random_state=42)
X_smote, Y_smote = smote.fit_resample(X_train, y_train)

# Train SPORF classifier
rf = rerfClassifier()
rf.fit(X_smote, Y_smote)

# Validation Set Evaluation
y_val_pred = rf.predict(X_val)
accuracy_val = accuracy_score(y_val, y_val_pred)
precision_val = precision_score(y_val, y_val_pred, average='weighted')
recall_val = recall_score(y_val, y_val_pred, average='weighted')
f1_val = f1_score(y_val, y_val_pred, average='weighted')

# ROC-AUC for Validation Set
y_val_pred_proba = rf.predict_proba(X_val)[:, 1]
roc_auc_val = roc_auc_score(y_val, y_val_pred_proba)

print("Evaluating for Spyware-Transponder:")
print("Validation Set Evaluation:")
print("Accuracy:", accuracy_val)
print("Precision:", precision_val)
print("Recall:", recall_val)
print("F1 Score:", f1_val)
print("ROC AUC:", roc_auc_val)

# Test Set Evaluation
y_test_pred = rf.predict(X_test)
accuracy_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred, average='weighted')
recall_test = recall_score(y_test, y_test_pred, average='weighted')
f1_test = f1_score(y_test, y_test_pred, average='weighted')

# ROC-AUC for Test Set
y_test_pred_proba = rf.predict_proba(X_test)[:, 1]
roc_auc_test = roc_auc_score(y_test, y_test_pred_proba)

print("\nTest Set Evaluation:")
print("Accuracy:", accuracy_test)
print("Precision:", precision_test)
print("Recall:", recall_test)
print("F1 Score:", f1_test)
print("ROC AUC:", roc_auc_test)


max depth: 58
avg leaf node depth: 20.7967
num leaf nodes: 1392787
starting tree 1
Evaluating for Spyware-Transponder:
Validation Set Evaluation:
Accuracy: 0.9501706484641638
Precision: 0.9512309232842097
Recall: 0.9501706484641638
F1 Score: 0.9506894641073352
ROC AUC: 0.9390146904960178

Test Set Evaluation:
Accuracy: 0.952815699658703
Precision: 0.9538862962507362
Recall: 0.952815699658703
F1 Score: 0.953340185258829
ROC AUC: 0.9396851682855606


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from rerf.rerfClassifier import rerfClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score

# Load data and preprocess
df = pd.read_csv("Obfuscated-MalMem2022.csv")
df['Category'] = df['Category'].apply(lambda x: '-'.join(x.split('-', 2)[:2]))
df['Category'] = df['Category'].apply(lambda x: 1 if x == 'Spyware-Gator' else 0)

# Encode labels
enc = LabelEncoder()
df['Category'] = enc.fit_transform(df['Category'])

# Define features and target
X = df.drop(['Class', 'Category', 'svcscan.interactive_process_services', 'pslist.nprocs64bit', 'handles.nport'], axis=1)
y = df['Category']

# Split into 80% training+validation and 20% testing
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split training+validation into 70% training and 10% validation (87.5% of 80% = 70%)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.125, random_state=42)

# Apply SMOTE oversampling on the training set
smote = SMOTE(random_state=42)
X_smote, Y_smote = smote.fit_resample(X_train, y_train)

# Train SPORF classifier
rf = rerfClassifier()
rf.fit(X_smote, Y_smote)

# Validation Set Evaluation
y_val_pred = rf.predict(X_val)
accuracy_val = accuracy_score(y_val, y_val_pred)
precision_val = precision_score(y_val, y_val_pred, average='weighted')
recall_val = recall_score(y_val, y_val_pred, average='weighted')
f1_val = f1_score(y_val, y_val_pred, average='weighted')

# ROC-AUC for Validation Set
y_val_pred_proba = rf.predict_proba(X_val)[:, 1]
roc_auc_val = roc_auc_score(y_val, y_val_pred_proba)

print("Evaluating for Spyware-Gator:")
print("Validation Set Evaluation:")
print("Accuracy:", accuracy_val)
print("Precision:", precision_val)
print("Recall:", recall_val)
print("F1 Score:", f1_val)
print("ROC AUC:", roc_auc_val)

# Test Set Evaluation
y_test_pred = rf.predict(X_test)
accuracy_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred, average='weighted')
recall_test = recall_score(y_test, y_test_pred, average='weighted')
f1_test = f1_score(y_test, y_test_pred, average='weighted')

# ROC-AUC for Test Set
y_test_pred_proba = rf.predict_proba(X_test)[:, 1]
roc_auc_test = roc_auc_score(y_test, y_test_pred_proba)

print("\nTest Set Evaluation:")
print("Accuracy:", accuracy_test)
print("Precision:", precision_test)
print("Recall:", recall_test)
print("F1 Score:", f1_test)
print("ROC AUC:", roc_auc_test)


max depth: 60
avg leaf node depth: 19.8829
num leaf nodes: 1490272
starting tree 1
Evaluating for Spyware-Gator:
Validation Set Evaluation:
Accuracy: 0.9679180887372013
Precision: 0.9717677257195612
Recall: 0.9679180887372013
F1 Score: 0.9695080887316588
ROC AUC: 0.9806701409093871

Test Set Evaluation:
Accuracy: 0.9703924914675768
Precision: 0.9723722051388362
Recall: 0.9703924914675768
F1 Score: 0.9712874380232651
ROC AUC: 0.9756259729608745


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from rerf.rerfClassifier import rerfClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score

# Load data and preprocess
df = pd.read_csv("Obfuscated-MalMem2022.csv")
df['Category'] = df['Category'].apply(lambda x: '-'.join(x.split('-', 2)[:2]))
df['Category'] = df['Category'].apply(lambda x: 1 if x == 'Spyware-180solutions' else 0)

# Encode labels
enc = LabelEncoder()
df['Category'] = enc.fit_transform(df['Category'])

# Define features and target
X = df.drop(['Class', 'Category', 'svcscan.interactive_process_services', 'pslist.nprocs64bit', 'handles.nport'], axis=1)
y = df['Category']

# Split into 80% training+validation and 20% testing
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split training+validation into 70% training and 10% validation (87.5% of 80% = 70%)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.125, random_state=42)

# Apply SMOTE oversampling on the training set
smote = SMOTE(random_state=42)
X_smote, Y_smote = smote.fit_resample(X_train, y_train)

# Train SPORF classifier
rf = rerfClassifier()
rf.fit(X_smote, Y_smote)

# Validation Set Evaluation
y_val_pred = rf.predict(X_val)
accuracy_val = accuracy_score(y_val, y_val_pred)
precision_val = precision_score(y_val, y_val_pred, average='weighted')
recall_val = recall_score(y_val, y_val_pred, average='weighted')
f1_val = f1_score(y_val, y_val_pred, average='weighted')

# ROC-AUC for Validation Set
y_val_pred_proba = rf.predict_proba(X_val)[:, 1]
roc_auc_val = roc_auc_score(y_val, y_val_pred_proba)

print("Evaluating for Spyware-180solutions:")
print("Validation Set Evaluation:")
print("Accuracy:", accuracy_val)
print("Precision:", precision_val)
print("Recall:", recall_val)
print("F1 Score:", f1_val)
print("ROC AUC:", roc_auc_val)

# Test Set Evaluation
y_test_pred = rf.predict(X_test)
accuracy_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred, average='weighted')
recall_test = recall_score(y_test, y_test_pred, average='weighted')
f1_test = f1_score(y_test, y_test_pred, average='weighted')

# ROC-AUC for Test Set
y_test_pred_proba = rf.predict_proba(X_test)[:, 1]
roc_auc_test = roc_auc_score(y_test, y_test_pred_proba)

print("\nTest Set Evaluation:")
print("Accuracy:", accuracy_test)
print("Precision:", precision_test)
print("Recall:", recall_test)
print("F1 Score:", f1_test)
print("ROC AUC:", roc_auc_test)


max depth: 58
avg leaf node depth: 20.8648
num leaf nodes: 1023929
starting tree 1
Evaluating for Spyware-180solutions:
Validation Set Evaluation:
Accuracy: 0.9631399317406143
Precision: 0.9595141915219564
Recall: 0.9631399317406143
F1 Score: 0.9611876337681562
ROC AUC: 0.8980176938677878

Test Set Evaluation:
Accuracy: 0.9580204778156997
Precision: 0.9515896125542623
Recall: 0.9580204778156997
F1 Score: 0.95432995558595
ROC AUC: 0.9038047285428965


In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from rerf.rerfClassifier import rerfClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score

# Load data and preprocess
df = pd.read_csv("Obfuscated-MalMem2022.csv")
df['Category'] = df['Category'].apply(lambda x: '-'.join(x.split('-', 2)[:2]))
df['Category'] = df['Category'].apply(lambda x: 1 if x == 'Spyware-CWS' else 0)

# Encode labels
enc = LabelEncoder()
df['Category'] = enc.fit_transform(df['Category'])

# Define features and target
X = df.drop(['Class', 'Category', 'svcscan.interactive_process_services', 'pslist.nprocs64bit', 'handles.nport'], axis=1)
y = df['Category']

# Split into 80% training+validation and 20% testing
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split training+validation into 70% training and 10% validation (87.5% of 80% = 70%)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.125, random_state=42)

# Apply SMOTE oversampling on the training set
smote = SMOTE(random_state=42)
X_smote, Y_smote = smote.fit_resample(X_train, y_train)

# Train SPORF classifier
rf = rerfClassifier()
rf.fit(X_smote, Y_smote)

# Validation Set Evaluation
y_val_pred = rf.predict(X_val)
accuracy_val = accuracy_score(y_val, y_val_pred)
precision_val = precision_score(y_val, y_val_pred, average='weighted')
recall_val = recall_score(y_val, y_val_pred, average='weighted')
f1_val = f1_score(y_val, y_val_pred, average='weighted')

# ROC-AUC for Validation Set
y_val_pred_proba = rf.predict_proba(X_val)[:, 1]
roc_auc_val = roc_auc_score(y_val, y_val_pred_proba)

print("Evaluating for Spyware-CWS:")
print("Validation Set Evaluation:")
print("Accuracy:", accuracy_val)
print("Precision:", precision_val)
print("Recall:", recall_val)
print("F1 Score:", f1_val)
print("ROC AUC:", roc_auc_val)

# Test Set Evaluation
y_test_pred = rf.predict(X_test)
accuracy_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred, average='weighted')
recall_test = recall_score(y_test, y_test_pred, average='weighted')
f1_test = f1_score(y_test, y_test_pred, average='weighted')

# ROC-AUC for Test Set
y_test_pred_proba = rf.predict_proba(X_test)[:, 1]
roc_auc_test = roc_auc_score(y_test, y_test_pred_proba)

print("\nTest Set Evaluation:")
print("Accuracy:", accuracy_test)
print("Precision:", precision_test)
print("Recall:", recall_test)
print("F1 Score:", f1_test)
print("ROC AUC:", roc_auc_test)


max depth: 61
avg leaf node depth: 21.7661
num leaf nodes: 1527196
starting tree 1
Evaluating for Spyware-CWS:
Validation Set Evaluation:
Accuracy: 0.9561433447098976
Precision: 0.9543213937969647
Recall: 0.9561433447098976
F1 Score: 0.9552079685468522
ROC AUC: 0.9166334786572529

Test Set Evaluation:
Accuracy: 0.9575085324232082
Precision: 0.9549964086960883
Recall: 0.9575085324232082
F1 Score: 0.956200971490113
ROC AUC: 0.9195829447496198


In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from rerf.rerfClassifier import rerfClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score

# Load data and preprocess
df = pd.read_csv("Obfuscated-MalMem2022.csv")
df['Category'] = df['Category'].apply(lambda x: '-'.join(x.split('-', 2)[:2]))
df['Category'] = df['Category'].apply(lambda x: 1 if x == 'Spyware-TIBS' else 0)

# Encode labels
enc = LabelEncoder()
df['Category'] = enc.fit_transform(df['Category'])

# Define features and target
X = df.drop(['Class', 'Category', 'svcscan.interactive_process_services', 'pslist.nprocs64bit', 'handles.nport'], axis=1)
y = df['Category']

# Split into 80% training+validation and 20% testing
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split training+validation into 70% training and 10% validation (87.5% of 80% = 70%)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.125, random_state=42)

# Apply SMOTE oversampling on the training set
smote = SMOTE(random_state=42)
X_smote, Y_smote = smote.fit_resample(X_train, y_train)

# Train SPORF classifier
rf = rerfClassifier()
rf.fit(X_smote, Y_smote)

# Validation Set Evaluation
y_val_pred = rf.predict(X_val)
accuracy_val = accuracy_score(y_val, y_val_pred)
precision_val = precision_score(y_val, y_val_pred, average='weighted')
recall_val = recall_score(y_val, y_val_pred, average='weighted')
f1_val = f1_score(y_val, y_val_pred, average='weighted')

# ROC-AUC for Validation Set
y_val_pred_proba = rf.predict_proba(X_val)[:, 1]
roc_auc_val = roc_auc_score(y_val, y_val_pred_proba)

print("Evaluating for Spyware-TIBS:")
print("Validation Set Evaluation:")
print("Accuracy:", accuracy_val)
print("Precision:", precision_val)
print("Recall:", recall_val)
print("F1 Score:", f1_val)
print("ROC AUC:", roc_auc_val)

# Test Set Evaluation
y_test_pred = rf.predict(X_test)
accuracy_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred, average='weighted')
recall_test = recall_score(y_test, y_test_pred, average='weighted')
f1_test = f1_score(y_test, y_test_pred, average='weighted')

# ROC-AUC for Test Set
y_test_pred_proba = rf.predict_proba(X_test)[:, 1]
roc_auc_test = roc_auc_score(y_test, y_test_pred_proba)

print("\nTest Set Evaluation:")
print("Accuracy:", accuracy_test)
print("Precision:", precision_test)
print("Recall:", recall_test)
print("F1 Score:", f1_test)
print("ROC AUC:", roc_auc_test)


max depth: 61
avg leaf node depth: 21.1798
num leaf nodes: 1451564
starting tree 1
Evaluating for Spyware-TIBS:
Validation Set Evaluation:
Accuracy: 0.9870307167235495
Precision: 0.986436773293487
Recall: 0.9870307167235495
F1 Score: 0.9866718648084275
ROC AUC: 0.9672661612642035

Test Set Evaluation:
Accuracy: 0.9877133105802047
Precision: 0.987057603687939
Recall: 0.9877133105802047
F1 Score: 0.9873008996467137
ROC AUC: 0.976234858891109


In [None]:
###################################################################################################

In [None]:
 # Trojan-Scar, Trojan-Zeus Trojan-Emotet Trojan-Refroso Trojan-Reconyc


In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from rerf.rerfClassifier import rerfClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score

# Load data and preprocess
df = pd.read_csv("Obfuscated-MalMem2022.csv")
df['Category'] = df['Category'].apply(lambda x: '-'.join(x.split('-', 2)[:2]))
df['Category'] = df['Category'].apply(lambda x: 1 if x == 'Trojan-Scar' else 0)

# Encode labels
enc = LabelEncoder()
df['Category'] = enc.fit_transform(df['Category'])

# Define features and target
X = df.drop(['Class', 'Category', 'svcscan.interactive_process_services', 'pslist.nprocs64bit', 'handles.nport'], axis=1)
y = df['Category']

# Split into 80% training+validation and 20% testing
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split training+validation into 70% training and 10% validation (87.5% of 80% = 70%)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.125, random_state=42)

# Apply SMOTE oversampling on the training set
smote = SMOTE(random_state=42)
X_smote, Y_smote = smote.fit_resample(X_train, y_train)

# Train SPORF classifier
rf = rerfClassifier()
rf.fit(X_smote, Y_smote)

# Validation Set Evaluation
y_val_pred = rf.predict(X_val)
accuracy_val = accuracy_score(y_val, y_val_pred)
precision_val = precision_score(y_val, y_val_pred, average='weighted')
recall_val = recall_score(y_val, y_val_pred, average='weighted')
f1_val = f1_score(y_val, y_val_pred, average='weighted')

# ROC-AUC for Validation Set
y_val_pred_proba = rf.predict_proba(X_val)[:, 1]
roc_auc_val = roc_auc_score(y_val, y_val_pred_proba)

print("Evaluating for Trojan-Scar:")
print("Validation Set Evaluation:")
print("Accuracy:", accuracy_val)
print("Precision:", precision_val)
print("Recall:", recall_val)
print("F1 Score:", f1_val)
print("ROC AUC:", roc_auc_val)

# Test Set Evaluation
y_test_pred = rf.predict(X_test)
accuracy_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred, average='weighted')
recall_test = recall_score(y_test, y_test_pred, average='weighted')
f1_test = f1_score(y_test, y_test_pred, average='weighted')

# ROC-AUC for Test Set
y_test_pred_proba = rf.predict_proba(X_test)[:, 1]
roc_auc_test = roc_auc_score(y_test, y_test_pred_proba)

print("\nTest Set Evaluation:")
print("Accuracy:", accuracy_test)
print("Precision:", precision_test)
print("Recall:", recall_test)
print("F1 Score:", f1_test)
print("ROC AUC:", roc_auc_test)


max depth: 49
avg leaf node depth: 17.8526
num leaf nodes: 753153
starting tree 1
Evaluating for Trojan-Scar:
Validation Set Evaluation:
Accuracy: 0.9658703071672355
Precision: 0.9638607747064853
Recall: 0.9658703071672355
F1 Score: 0.9647833746641898
ROC AUC: 0.9535453013063633

Test Set Evaluation:
Accuracy: 0.964419795221843
Precision: 0.9639387432298109
Recall: 0.964419795221843
F1 Score: 0.9641759246096717
ROC AUC: 0.9487162898170981


In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from rerf.rerfClassifier import rerfClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score

# Load data and preprocess
df = pd.read_csv("Obfuscated-MalMem2022.csv")
df['Category'] = df['Category'].apply(lambda x: '-'.join(x.split('-', 2)[:2]))
df['Category'] = df['Category'].apply(lambda x: 1 if x == 'Trojan-Zeus' else 0)

# Encode labels
enc = LabelEncoder()
df['Category'] = enc.fit_transform(df['Category'])

# Define features and target
X = df.drop(['Class', 'Category', 'svcscan.interactive_process_services', 'pslist.nprocs64bit', 'handles.nport'], axis=1)
y = df['Category']

# Split into 80% training+validation and 20% testing
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split training+validation into 70% training and 10% validation (87.5% of 80% = 70%)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.125, random_state=42)

# Apply SMOTE oversampling on the training set
smote = SMOTE(random_state=42)
X_smote, Y_smote = smote.fit_resample(X_train, y_train)

# Train SPORF classifier
rf = rerfClassifier()
rf.fit(X_smote, Y_smote)

# Validation Set Evaluation
y_val_pred = rf.predict(X_val)
accuracy_val = accuracy_score(y_val, y_val_pred)
precision_val = precision_score(y_val, y_val_pred, average='weighted')
recall_val = recall_score(y_val, y_val_pred, average='weighted')
f1_val = f1_score(y_val, y_val_pred, average='weighted')

# ROC-AUC for Validation Set
y_val_pred_proba = rf.predict_proba(X_val)[:, 1]
roc_auc_val = roc_auc_score(y_val, y_val_pred_proba)

print("Evaluating for Trojan-Zeus:")
print("Validation Set Evaluation:")
print("Accuracy:", accuracy_val)
print("Precision:", precision_val)
print("Recall:", recall_val)
print("F1 Score:", f1_val)
print("ROC AUC:", roc_auc_val)

# Test Set Evaluation
y_test_pred = rf.predict(X_test)
accuracy_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred, average='weighted')
recall_test = recall_score(y_test, y_test_pred, average='weighted')
f1_test = f1_score(y_test, y_test_pred, average='weighted')

# ROC-AUC for Test Set
y_test_pred_proba = rf.predict_proba(X_test)[:, 1]
roc_auc_test = roc_auc_score(y_test, y_test_pred_proba)

print("\nTest Set Evaluation:")
print("Accuracy:", accuracy_test)
print("Precision:", precision_test)
print("Recall:", recall_test)
print("F1 Score:", f1_test)
print("ROC AUC:", roc_auc_test)


max depth: 58
avg leaf node depth: 21.4011
num leaf nodes: 1328088
starting tree 1
Evaluating for Trojan-Zeus:
Validation Set Evaluation:
Accuracy: 0.9610921501706484
Precision: 0.9603217714214956
Recall: 0.9610921501706484
F1 Score: 0.9607005890202104
ROC AUC: 0.9349309639109882

Test Set Evaluation:
Accuracy: 0.9613481228668942
Precision: 0.9613935432894241
Recall: 0.9613481228668942
F1 Score: 0.9613708079662231
ROC AUC: 0.9399396174333094


In [16]:
 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from rerf.rerfClassifier import rerfClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score

# Load data and preprocess
df = pd.read_csv("Obfuscated-MalMem2022.csv")
df['Category'] = df['Category'].apply(lambda x: '-'.join(x.split('-', 2)[:2]))
df['Category'] = df['Category'].apply(lambda x: 1 if x == 'Trojan-Emotet' else 0)

# Encode labels
enc = LabelEncoder()
df['Category'] = enc.fit_transform(df['Category'])

# Define features and target
X = df.drop(['Class', 'Category', 'svcscan.interactive_process_services', 'pslist.nprocs64bit', 'handles.nport'], axis=1)
y = df['Category']

# Split into 80% training+validation and 20% testing
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split training+validation into 70% training and 10% validation (87.5% of 80% = 70%)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.125, random_state=42)

# Apply SMOTE oversampling on the training set
smote = SMOTE(random_state=42)
X_smote, Y_smote = smote.fit_resample(X_train, y_train)

# Train SPORF classifier
rf = rerfClassifier()
rf.fit(X_smote, Y_smote)

# Validation Set Evaluation
y_val_pred = rf.predict(X_val)
accuracy_val = accuracy_score(y_val, y_val_pred)
precision_val = precision_score(y_val, y_val_pred, average='weighted')
recall_val = recall_score(y_val, y_val_pred, average='weighted')
f1_val = f1_score(y_val, y_val_pred, average='weighted')

# ROC-AUC for Validation Set
y_val_pred_proba = rf.predict_proba(X_val)[:, 1]
roc_auc_val = roc_auc_score(y_val, y_val_pred_proba)

print("Evaluating for Trojan-Emotet:")
print("Validation Set Evaluation:")
print("Accuracy:", accuracy_val)
print("Precision:", precision_val)
print("Recall:", recall_val)
print("F1 Score:", f1_val)
print("ROC AUC:", roc_auc_val)

# Test Set Evaluation
y_test_pred = rf.predict(X_test)
accuracy_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred, average='weighted')
recall_test = recall_score(y_test, y_test_pred, average='weighted')
f1_test = f1_score(y_test, y_test_pred, average='weighted')

# ROC-AUC for Test Set
y_test_pred_proba = rf.predict_proba(X_test)[:, 1]
roc_auc_test = roc_auc_score(y_test, y_test_pred_proba)

print("\nTest Set Evaluation:")
print("Accuracy:", accuracy_test)
print("Precision:", precision_test)
print("Recall:", recall_test)
print("F1 Score:", f1_test)
print("ROC AUC:", roc_auc_test)


max depth: 62
avg leaf node depth: 19.4808
num leaf nodes: 1381374
starting tree 1
Evaluating for Trojan-Emotet:
Validation Set Evaluation:
Accuracy: 0.9687713310580205
Precision: 0.9652969143808937
Recall: 0.9687713310580205
F1 Score: 0.9667209489182529
ROC AUC: 0.9431777743317339

Test Set Evaluation:
Accuracy: 0.9701365187713311
Precision: 0.9681680432887321
Recall: 0.9701365187713311
F1 Score: 0.9690582671864466
ROC AUC: 0.9399655648597033


In [17]:
  
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from rerf.rerfClassifier import rerfClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score

# Load data and preprocess
df = pd.read_csv("Obfuscated-MalMem2022.csv")
df['Category'] = df['Category'].apply(lambda x: '-'.join(x.split('-', 2)[:2]))
df['Category'] = df['Category'].apply(lambda x: 1 if x == 'Trojan-Refroso' else 0)

# Encode labels
enc = LabelEncoder()
df['Category'] = enc.fit_transform(df['Category'])

# Define features and target
X = df.drop(['Class', 'Category', 'svcscan.interactive_process_services', 'pslist.nprocs64bit', 'handles.nport'], axis=1)
y = df['Category']

# Split into 80% training+validation and 20% testing
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split training+validation into 70% training and 10% validation (87.5% of 80% = 70%)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.125, random_state=42)

# Apply SMOTE oversampling on the training set
smote = SMOTE(random_state=42)
X_smote, Y_smote = smote.fit_resample(X_train, y_train)

# Train SPORF classifier
rf = rerfClassifier()
rf.fit(X_smote, Y_smote)

# Validation Set Evaluation
y_val_pred = rf.predict(X_val)
accuracy_val = accuracy_score(y_val, y_val_pred)
precision_val = precision_score(y_val, y_val_pred, average='weighted')
recall_val = recall_score(y_val, y_val_pred, average='weighted')
f1_val = f1_score(y_val, y_val_pred, average='weighted')

# ROC-AUC for Validation Set
y_val_pred_proba = rf.predict_proba(X_val)[:, 1]
roc_auc_val = roc_auc_score(y_val, y_val_pred_proba)

print("Evaluating for Trojan-Refroso:")
print("Validation Set Evaluation:")
print("Accuracy:", accuracy_val)
print("Precision:", precision_val)
print("Recall:", recall_val)
print("F1 Score:", f1_val)
print("ROC AUC:", roc_auc_val)

# Test Set Evaluation
y_test_pred = rf.predict(X_test)
accuracy_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred, average='weighted')
recall_test = recall_score(y_test, y_test_pred, average='weighted')
f1_test = f1_score(y_test, y_test_pred, average='weighted')

# ROC-AUC for Test Set
y_test_pred_proba = rf.predict_proba(X_test)[:, 1]
roc_auc_test = roc_auc_score(y_test, y_test_pred_proba)

print("\nTest Set Evaluation:")
print("Accuracy:", accuracy_test)
print("Precision:", precision_test)
print("Recall:", recall_test)
print("F1 Score:", f1_test)
print("ROC AUC:", roc_auc_test)


max depth: 54
avg leaf node depth: 20.5924
num leaf nodes: 1281554
starting tree 1
Evaluating for Trojan-Refroso:
Validation Set Evaluation:
Accuracy: 0.9831058020477815
Precision: 0.9828991717812176
Recall: 0.9831058020477815
F1 Score: 0.9829980868893456
ROC AUC: 0.9743497853446249

Test Set Evaluation:
Accuracy: 0.9820819112627986
Precision: 0.9812489444512041
Recall: 0.9820819112627986
F1 Score: 0.9815580868018089
ROC AUC: 0.9738829664360086


In [18]:
  
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from rerf.rerfClassifier import rerfClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score

# Load data and preprocess
df = pd.read_csv("Obfuscated-MalMem2022.csv")
df['Category'] = df['Category'].apply(lambda x: '-'.join(x.split('-', 2)[:2]))
df['Category'] = df['Category'].apply(lambda x: 1 if x == 'Trojan-Reconyc' else 0)

# Encode labels
enc = LabelEncoder()
df['Category'] = enc.fit_transform(df['Category'])

# Define features and target
X = df.drop(['Class', 'Category', 'svcscan.interactive_process_services', 'pslist.nprocs64bit', 'handles.nport'], axis=1)
y = df['Category']

# Split into 80% training+validation and 20% testing
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split training+validation into 70% training and 10% validation (87.5% of 80% = 70%)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.125, random_state=42)

# Apply SMOTE oversampling on the training set
smote = SMOTE(random_state=42)
X_smote, Y_smote = smote.fit_resample(X_train, y_train)

# Train SPORF classifier
rf = rerfClassifier()
rf.fit(X_smote, Y_smote)

# Validation Set Evaluation
y_val_pred = rf.predict(X_val)
accuracy_val = accuracy_score(y_val, y_val_pred)
precision_val = precision_score(y_val, y_val_pred, average='weighted')
recall_val = recall_score(y_val, y_val_pred, average='weighted')
f1_val = f1_score(y_val, y_val_pred, average='weighted')

# ROC-AUC for Validation Set
y_val_pred_proba = rf.predict_proba(X_val)[:, 1]
roc_auc_val = roc_auc_score(y_val, y_val_pred_proba)

print("Evaluating for Trojan-Reconyc:")
print("Validation Set Evaluation:")
print("Accuracy:", accuracy_val)
print("Precision:", precision_val)
print("Recall:", recall_val)
print("F1 Score:", f1_val)
print("ROC AUC:", roc_auc_val)

# Test Set Evaluation
y_test_pred = rf.predict(X_test)
accuracy_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred, average='weighted')
recall_test = recall_score(y_test, y_test_pred, average='weighted')
f1_test = f1_score(y_test, y_test_pred, average='weighted')

# ROC-AUC for Test Set
y_test_pred_proba = rf.predict_proba(X_test)[:, 1]
roc_auc_test = roc_auc_score(y_test, y_test_pred_proba)

print("\nTest Set Evaluation:")
print("Accuracy:", accuracy_test)
print("Precision:", precision_test)
print("Recall:", recall_test)
print("F1 Score:", f1_test)
print("ROC AUC:", roc_auc_test)


max depth: 62
avg leaf node depth: 19.0958
num leaf nodes: 836753
starting tree 1
Evaluating for Trojan-Reconyc:
Validation Set Evaluation:
Accuracy: 0.9738907849829351
Precision: 0.9744727561223879
Recall: 0.9738907849829351
F1 Score: 0.974174729993411
ROC AUC: 0.95162516849506

Test Set Evaluation:
Accuracy: 0.9752559726962458
Precision: 0.9747567548350398
Recall: 0.9752559726962458
F1 Score: 0.9749987298853933
ROC AUC: 0.9656420953981725


In [None]:
#####################################################################################################################

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from rerf.rerfClassifier import rerfClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score

# Load data and preprocess
df = pd.read_csv("Obfuscated-MalMem2022.csv")
target_strings = ["Ransomware-Ako", "Ransomware-Maze", "Ransomware-Conti", "Ransomware-Pysa", "Ransomware-Shade"]  # List of target strings
results = {}  # Dictionary to store results for each target

for target in target_strings:
    print(f"\nEvaluating for target: {target}")

    # Label the target class as 1 and others as 0
    df['Category'] = df['Category'].apply(lambda x: '-'.join(x.split('-', 2)[:2]))
    df['Category'] = df['Category'].apply(lambda x: 1 if x == target else 0)

    # Encode labels
    enc = LabelEncoder()
    df['Category'] = enc.fit_transform(df['Category'])

    # Define features and target
    X = df.drop(['Class', 'Category', 'svcscan.interactive_process_services', 'pslist.nprocs64bit', 'handles.nport'], axis=1)
    y = df['Category']

    # Split into 80% training+validation and 20% testing
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Further split training+validation into 70% training and 10% validation
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.125, random_state=42)

    # Apply SMOTE oversampling on the training set
    smote = SMOTE(random_state=42)
    X_smote, Y_smote = smote.fit_resample(X_train, y_train)

    # Train SPORF classifier
    rf = rerfClassifier()
    rf.fit(X_smote, Y_smote)

    # Validation Set Evaluation
    y_val_pred = rf.predict(X_val)
    accuracy_val = accuracy_score(y_val, y_val_pred)
    precision_val = precision_score(y_val, y_val_pred, average='weighted')
    recall_val = recall_score(y_val, y_val_pred, average='weighted')
    f1_val = f1_score(y_val, y_val_pred, average='weighted')
    y_val_pred_proba = rf.predict_proba(X_val)[:, 1]
    roc_auc_val = roc_auc_score(y_val, y_val_pred_proba)

    # Test Set Evaluation
    y_test_pred = rf.predict(X_test)
    accuracy_test = accuracy_score(y_test, y_test_pred)
    precision_test = precision_score(y_test, y_test_pred, average='weighted')
    recall_test = recall_score(y_test, y_test_pred, average='weighted')
    f1_test = f1_score(y_test, y_test_pred, average='weighted')
    y_test_pred_proba = rf.predict_proba(X_test)[:, 1]
    roc_auc_test = roc_auc_score(y_test, y_test_pred_proba)

    # Store results for the current target
    results[target] = {
        "Validation Accuracy": accuracy_val,
        "Validation Precision": precision_val,
        "Validation Recall": recall_val,
        "Validation F1 Score": f1_val,
        "Validation ROC AUC": roc_auc_val,
        "Test Accuracy": accuracy_test,
        "Test Precision": precision_test,
        "Test Recall": recall_test,
        "Test F1 Score": f1_test,
        "Test ROC AUC": roc_auc_test,
    }

    # Print results for the current target
    print(f"Validation Accuracy for {target}:", accuracy_val)
    print(f"Validation ROC AUC for {target}:", roc_auc_val)
    print(f"Test Accuracy for {target}:", accuracy_test)
    print(f"Test ROC AUC for {target}:", roc_auc_test)

# Display the results for all target strings
print("\nAll Results:")
for target, metrics in results.items():
    print(f"\nResults for {target}:")
    for metric_name, metric_value in metrics.items():
        print(f"{metric_name}: {metric_value}")


In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from rerf.rerfClassifier import rerfClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score

# Load data and preprocess
df = pd.read_csv("Obfuscated-MalMem2022.csv")
target_strings = ["Ransomware-Ako", "Ransomware-Maze", "Ransomware-Conti", "Ransomware-Pysa", "Ransomware-Shade"]  # List of target strings
results = {}  # Dictionary to store results for each target

for target in target_strings:
    print(f"\nEvaluating for target: {target}")

    # Ensure all entries in the Category column are strings before splitting
    df['Category'] = df['Category'].astype(str)
    df['Category'] = df['Category'].apply(lambda x: '-'.join(x.split('-', 2)[:2]))
    df['Category'] = df['Category'].apply(lambda x: 1 if x == target else 0)

    # Encode labels
    enc = LabelEncoder()
    df['Category'] = enc.fit_transform(df['Category'])

    # Define features and target
    X = df.drop(['Class', 'Category', 'svcscan.interactive_process_services', 'pslist.nprocs64bit', 'handles.nport'], axis=1)
    y = df['Category']

    # Split into 80% training+validation and 20% testing
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Further split training+validation into 70% training and 10% validation
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.125, random_state=42)

    # Apply SMOTE oversampling on the training set
    smote = SMOTE(random_state=42)
    X_smote, Y_smote = smote.fit_resample(X_train, y_train)

    # Train SPORF classifier
    rf = rerfClassifier()
    rf.fit(X_smote, Y_smote)

    # Validation Set Evaluation
    y_val_pred = rf.predict(X_val)
    accuracy_val = accuracy_score(y_val, y_val_pred)
    precision_val = precision_score(y_val, y_val_pred, average='weighted')
    recall_val = recall_score(y_val, y_val_pred, average='weighted')
    f1_val = f1_score(y_val, y_val_pred, average='weighted')
    y_val_pred_proba = rf.predict_proba(X_val)[:, 1]
    roc_auc_val = roc_auc_score(y_val, y_val_pred_proba)

    # Test Set Evaluation
    y_test_pred = rf.predict(X_test)
    accuracy_test = accuracy_score(y_test, y_test_pred)
    precision_test = precision_score(y_test, y_test_pred, average='weighted')
    recall_test = recall_score(y_test, y_test_pred, average='weighted')
    f1_test = f1_score(y_test, y_test_pred, average='weighted')
    y_test_pred_proba = rf.predict_proba(X_test)[:, 1]
    roc_auc_test = roc_auc_score(y_test, y_test_pred_proba)

    # Store results for the current target
    results[target] = {
        "Validation Accuracy": accuracy_val,
        "Validation Precision": precision_val,
        "Validation Recall": recall_val,
        "Validation F1 Score": f1_val,
        "Validation ROC AUC": roc_auc_val,
        "Test Accuracy": accuracy_test,
        "Test Precision": precision_test,
        "Test Recall": recall_test,
        "Test F1 Score": f1_test,
        "Test ROC AUC": roc_auc_test,
    }

    # Print results for the current target
    print(f"Validation Accuracy for {target}:", accuracy_val)
    print(f"Validation ROC AUC for {target}:", roc_auc_val)
    print(f"Test Accuracy for {target}:", accuracy_test)
    print(f"Test ROC AUC for {target}:", roc_auc_test)

# Display the results for all target strings
print("\nAll Results:")
for target, metrics in results.items():
    print(f"\nResults for {target}:")
    for metric_name, metric_value in metrics.items():
        print(f"{metric_name}: {metric_value}")
