In [1]:
# import statements
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, recall_score
import xgboost as xgb
import shap
import matplotlib.pyplot as plt
from sklearn.metrics import fbeta_score, make_scorer, classification_report, confusion_matrix, recall_score
import seaborn as sns

In [3]:
# Load the preprocessed malware dataset (path is relative to the notebook's directory)
df = pd.read_csv(filepath_or_buffer='../data/preprocessed/malware_preprocessed.csv')

#### Model Selection 

The goal of this classification task is predicting instances of malware. The dataset is imbalanced, meaning there are more positive cases (malware) compared to negative cases. This imbalance can cause challenges with traditional accuracy metrics, which tend to favor the majority class. We aim to prioritize the minimization of false negatives while also preserving as many benign apps as possible. We will use the 3 models below: 

1. CART: CART is a decision tree that uses the Gini index as criterion for splitting. It builds binary trees, meaning each node is split into two child nodes. It includes mechanisms for handling missing values and provides built-in pruning methods to avoid overfitting. It can handle non-linear relationships and interactions between features, making it suitable for capturing complex patterns. It is fairly easy to interpret and visualize.

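2. C5.0: This model is an optimized version of C4.5, with fast execution times and improved memory usage. It uses the gain ratio (GR) to decide on splits. It has advanced boosting techniques and can handle imbalanced datasets. Note that the implementation below uses XGBoost (gradient-boosted decision trees) as the boosted-tree model rather than C5.0 itself.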

3. KNN: KNN is a simple yet effective model for classification. It works by classifying an instance based on the majority class of its neighbors. For imbalanced datasets, KNN can perform well if appropriate distance metrics and weighting schemes are applied. KNN has no explicit training phase (it simply stores the training data) and can capture complex decision boundaries. Because we use it with its default settings and do not tune its hyperparameters, it provides a useful benchmark. 

#### Evaluation Metrics

The goal of our algorithm is to capture as many malware cases as possible (minimizing false negatives) while preserving as many benign apps as possible. False negatives (missed instances of malware) are more harmful than false positives (falsely flagging benign apps as malware); however, given the imbalance of the dataset, we want to account for both precision and recall. Thus, we will use the $F_\beta$ score as our scoring metric. The $F_\beta$ score is a weighted harmonic mean of precision and recall: $F_\beta = (1+\beta^2)\,\frac{\text{precision}\cdot\text{recall}}{\beta^2\cdot\text{precision}+\text{recall}}$. We will use $\beta > 1$ (specifically $\beta = 2$) since we want to prioritize recall. This will allow us to ensure that the model is capturing as many instances of malware as possible without excessively flagging benign apps.  

In [5]:
# Scorer object for model selection: F-beta with beta=2 (recall weighted above precision)
f2_scorer = make_scorer(score_func=fbeta_score, beta=2)

#### Split data

Given that the dataset is imbalanced (with fewer benign samples than malicious ones), stratification prevents the training set from being skewed toward the majority class. Without stratification, the model might underrepresent the minority class (benign apps) in the training set, leading to poor performance in preserving benign apps.

The training set will preserve the original class proportions, and the validation set will accurately reflect the overall class distribution, allowing for better hyperparameter tuning. The test set will also reflect the real-world distribution, making performance evaluation more realistic. Since our goal is to maximize recall (minimize false negatives), keeping the class proportions consistent across the training, validation, and test sets is critical.

We will also implement K-Fold cross validation since we have a relatively small dataset (n=4464), with stratification. 

In [14]:
# Separate the target column from the feature matrix
y = df['Label']
X = df.drop(columns='Label')

# First cut: 70% training / 30% hold-out, stratified on the label
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=7,
)
# Second cut: divide the hold-out evenly into validation and test (15% / 15% overall)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=7,
)

In [None]:
# Stratified 5-fold cross-validation, used because the dataset is relatively small.
# Shuffling with a fixed seed (the same seed as the train/val/test split) keeps the
# folds from depending on the row order of the CSV and makes them reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)

#### Model 1: CART (Decision Tree)

CART is a simple, interpretable algorithm that can handle imbalanced data. It splits data based on the Gini index, making it effective for classification tasks. 

In [18]:
# Tune a CART decision tree with a grid search scored by F2.
# random_state pins the tie-breaking between equally good splits so reruns match.
cart_model = DecisionTreeClassifier(random_state=7)
param_grid = {
    'max_depth': [3, 5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5], 
    'class_weight': ['balanced', None]
}
grid_search_cart = GridSearchCV(cart_model, param_grid, scoring=f2_scorer, cv=skf)
# Use the validation set for hyperparameter tuning
# NOTE(review): GridSearchCV already cross-validates internally, so searching on
# X_train would use far more data than the validation split - confirm this is intended.
grid_search_cart.fit(X_val, y_val)

best_cart_model = grid_search_cart.best_estimator_

# Evaluate the best model using cross-validation on the training data.
# The scoring metric is F2 (f2_scorer), so the printed labels say F2, not ROC AUC.
cross_val_scores = cross_val_score(best_cart_model, X_train, y_train, cv=skf, scoring=f2_scorer)
print(f"Cross-validation F2 scores (TRAINING): {cross_val_scores}")
print(f"Mean F2 (TRAINING): {cross_val_scores.mean():.3f}")

# Evaluate CART: refit the tuned configuration on the training split,
# then score it on the held-out test split.
best_cart_model.fit(X_train, y_train)

cart_preds = best_cart_model.predict(X_test)
print("CART Classification Report:\n", classification_report(y_test, cart_preds))
print("CART Confusion Matrix:\n", confusion_matrix(y_test, cart_preds))

# Calculate and print recall (positive class = label 1 by recall_score's default)
cart_recall = recall_score(y_test, cart_preds)
print(f"CART Recall: {cart_recall:.3f}")

#### Model 2: C5.0 (XGBoost)

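XGBoost, used here in place of C5.0, is a gradient-boosted tree ensemble known for high accuracy, and it can account for class imbalance through class weighting (`scale_pos_weight`). It also supports regularization and boosting, reducing overfitting. 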

In [22]:
# Gradient-boosted trees. scale_pos_weight is the negative/positive ratio of the
# training labels, which reweights the positive class to offset the imbalance.
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    scale_pos_weight=len(y_train[y_train==0])/len(y_train[y_train==1]),
    random_state=7,
)
param_grid_xgb = {
    'max_depth': [5, 10, 15],
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [100, 200, 300]
}
grid_search_xgb = GridSearchCV(xgb_model, param_grid_xgb, scoring=f2_scorer, cv=skf)
# Use the validation set for hyperparameter tuning
grid_search_xgb.fit(X_val, y_val)

best_xgb_model = grid_search_xgb.best_estimator_

# Evaluate the best model using cross-validation on the training data.
# This must score the XGBoost estimator; it previously re-scored best_cart_model.
# The metric is F2 (f2_scorer), so the printed labels say F2, not ROC AUC.
cross_val_scores = cross_val_score(best_xgb_model, X_train, y_train, cv=skf, scoring=f2_scorer)
print(f"Cross-validation F2 scores (TRAINING): {cross_val_scores}")
print(f"Mean F2 (TRAINING): {cross_val_scores.mean():.3f}")

# Evaluate XGBoost: refit the tuned configuration on the training split,
# then score it on the held-out test split.
best_xgb_model.fit(X_train, y_train)

xgb_preds = best_xgb_model.predict(X_test)
print("XGBoost Classification Report:\n", classification_report(y_test, xgb_preds))
print("XGBoost Confusion Matrix:\n", confusion_matrix(y_test, xgb_preds))

# Calculate and print recall (positive class = label 1 by recall_score's default)
xgb_recall = recall_score(y_test, xgb_preds)
print(f"XGBoost Recall: {xgb_recall:.3f}")


#### Model 3: K-Nearest Neighbors (KNN)

KNN is a simple and effective algorithm that has no explicit training phase (it simply stores the training data) and can capture complex decision boundaries. Because it is used here with its default hyperparameters and no tuning, it provides a useful benchmark. 

In [26]:
# Benchmark model: KNN with default settings, fitted on the training split
knn_model = KNeighborsClassifier().fit(X_train, y_train)

# Score the benchmark on the held-out test split
knn_preds = knn_model.predict(X_test)
knn_recall = recall_score(y_test, knn_preds)

print("KNN Classification Report:\n", classification_report(y_test, knn_preds))
print("KNN Confusion Matrix:\n", confusion_matrix(y_test, knn_preds))
print(f"KNN Recall: {knn_recall:.3f}")

#### Model Interpretability (SHAP)

I used SHAP (SHapley Additive exPlanations) for model interpretability because it provides clear and consistent explanations for the predictions made by complex machine learning models. 

SHAP calculates the contribution of each feature to the prediction (positive or negative). If a feature significantly increases the likelihood of an app being classified as malicious, it will have a strong positive SHAP value.

Plots: 
- Force Plot: This visualizes how much each feature pushes the model's output toward predicting either "benign" or "malicious."
- Summary Plot: It shows the average impact of each feature across all predictions, helping identify the most influential predictors in the model.

This approach ensures that security analysts can understand which app characteristics are most indicative of malware, helping improve threat detection and prevention strategies.

In [30]:
# Explain the CART model
explainer_cart = shap.Explainer(best_cart_model, X_test)
shap_values_cart = explainer_cart(X_test)
# Depending on the installed SHAP version, a scikit-learn tree classifier can yield
# one set of SHAP values per class (samples x features x classes). Keep the
# positive class (index 1) so the summary plot receives a 2-D matrix.
if len(shap_values_cart.shape) == 3:
    shap_values_cart = shap_values_cart[:, :, 1]
shap.summary_plot(shap_values_cart, X_test)

# Explain the XGBoost model
explainer_xgb = shap.Explainer(best_xgb_model, X_test)
shap_values_xgb = explainer_xgb(X_test)
shap.summary_plot(shap_values_xgb, X_test)

# Explain the KNN model
# KernelExplainer is model-agnostic and its cost grows with
# (background rows x explained rows), so using the full test set for both is
# impractically slow. Subsample both with a fixed seed instead.
knn_background = shap.sample(X_train, 100, random_state=7)
knn_explained = shap.sample(X_test, 100, random_state=7)
# Explain the predicted probability of the positive class rather than hard 0/1
# labels, which gives smoother and more informative attributions.
explainer_knn = shap.KernelExplainer(
    lambda data: knn_model.predict_proba(data)[:, 1],
    knn_background,
)
shap_values_knn = explainer_knn.shap_values(knn_explained)
shap.summary_plot(shap_values_knn, knn_explained)


In [None]:
# Visualize the fitted CART tree using sklearn.
# Uses X_train (the name defined by the split cell; `Xtrain`/`ytest` do not exist).
# Class names come from the fitted model's classes_, so they are strings in the same
# order the tree uses; a set of the labels gave no ordering guarantee.
plt.figure(figsize=(12, 12))
plot_tree(best_cart_model, feature_names=list(X_train.columns),
          class_names=[str(label) for label in best_cart_model.classes_])
plt.show()

In [45]:
# Bar chart of the tuned CART model's feature importances.
# `Xtrain` and `cart` are not defined anywhere in this notebook; the split cell
# defines X_train and the tuned tree is best_cart_model (assumed to be the intended model).
plt.bar(X_train.columns, best_cart_model.feature_importances_)
plt.xticks(rotation=90)  # keep feature names legible when there are many columns
plt.xlabel('Feature')
plt.ylabel('Importance')  # feature_importances_ are importances, not probabilities
plt.title('Feature Importances')
plt.show()

#### Results and Conclusion 

In [34]:
# Compare performance - F2 (beta=2) on the held-out test set.
# f2_scorer is a make_scorer object whose call signature is (estimator, X, y), so it
# cannot be called with (y_true, scores). F-beta also needs hard class predictions,
# not probabilities, so call fbeta_score directly on each model's predicted labels.
cart_f2 = fbeta_score(y_test, best_cart_model.predict(X_test), beta=2)
xgb_f2 = fbeta_score(y_test, best_xgb_model.predict(X_test), beta=2)
knn_f2 = fbeta_score(y_test, knn_model.predict(X_test), beta=2)

print(f"CART F-METRIC: {cart_f2:.3f}")
print(f"XGBoost F-METRIC: {xgb_f2:.3f}")
print(f"KNN F-METRIC: {knn_f2:.3f}")


In [43]:
# Side-by-side confusion-matrix heatmaps for the three models on the test set
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
panels = [
    (cart_preds, 'CART Confusion Matrix'),
    (xgb_preds, 'XGBoost Confusion Matrix'),
    (knn_preds, 'KNN Confusion Matrix'),
]
for ax, (preds, title) in zip(axes, panels):
    sns.heatmap(confusion_matrix(y_test, preds), annot=True, fmt='d', ax=ax)
    ax.set_title(title)
plt.show()