# **MACHINE LEARNING FOR MALWARE DETECTION IN ANDROID**

# Introduction


Done by Ananiya Anantharaman, Siddhi Aadekar, Raj Shedge

## **Training Section**

In [None]:
# Install required packages
!pip install --upgrade androguard pyaxmlparser opendatasets

# Import libraries
from pyaxmlparser import APK
import pandas as pd
import pickle
import opendatasets as od
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Download the dataset from Kaggle
od.download('https://www.kaggle.com/datasets/ankit1743/android-malware-detection-dataset')

# Load the dataset
data = pd.read_csv('/content/android-malware-detection-dataset/Android_Malware.csv')

# Prepare features and labels: every column except 'Result' is a feature
X = data.drop('Result', axis=1)
y = data['Result']
# Keep the training column order; feature vectors built from APKs later must follow it
PERMISSION_LIST = list(X.columns)

# Split dataset into training and testing sets for proper evaluation (70% / 30%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the Random Forest model on training data
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Save the trained model
with open('rf_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Load the model back through a context manager so the file handle is closed.
# NOTE: pickle.load executes arbitrary code; only load pickle files you created or trust.
with open('rf_model.pkl', 'rb') as f:
    model = pickle.load(f)

Collecting androguard
  Downloading androguard-4.1.3-py3-none-any.whl.metadata (5.3 kB)
Collecting pyaxmlparser
  Downloading pyaxmlparser-0.3.31-py3-none-any.whl.metadata (2.0 kB)
Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl.metadata (9.2 kB)
Collecting apkInspector>=1.1.7 (from androguard)
  Downloading apkinspector-1.3.5-py3-none-any.whl.metadata (6.9 kB)
Collecting asn1crypto>=0.24.0 (from androguard)
  Downloading asn1crypto-1.5.1-py2.py3-none-any.whl.metadata (13 kB)
Collecting colorama>=0.4.1 (from androguard)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting dataset (from androguard)
  Downloading dataset-1.6.2-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting frida (from androguard)
  Downloading frida-17.3.2-cp37-abi3-manylinux_2_5_x86_64.whl.metadata (2.3 kB)
Collecting loguru (from androguard)
  Downloading loguru-0.7.3-py3-none-any.whl.metadata (22 kB)
Collecting lxml>=4.3.0 (from androguard)
  Downloading lxml-6.0.

100%|██████████| 250k/250k [00:00<00:00, 773MB/s]







## **Evaluation Section**

In [None]:
# Run the fitted classifier over the held-out split
y_pred = model.predict(X_test)

# Report each scalar metric, then the confusion matrix and the per-class report
print("=== Model Evaluation on Test Data ===")
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-score:", f1_score),
):
    print(label, metric(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

report = classification_report(y_test, y_pred)
print("\nDetailed Classification Report:\n", report)



=== Model Evaluation on Test Data ===
Accuracy: 0.97125
Precision: 0.9764542936288089
Recall: 0.9655329833371377
F1-score: 0.9709629289567313
Confusion Matrix:
 [[4317  102]
 [ 151 4230]]

Detailed Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.98      0.97      4419
           1       0.98      0.97      0.97      4381

    accuracy                           0.97      8800
   macro avg       0.97      0.97      0.97      8800
weighted avg       0.97      0.97      0.97      8800



## **Testing Section**

In [None]:
# Function to extract permissions from APK and build feature vector
def extract_permissions_pyaxml(apk_path):
    """Build a 0/1 feature vector from the permissions reported for an APK.

    Args:
        apk_path: Path to the APK file; handed to pyaxmlparser's ``APK``.

    Returns:
        list[int]: One entry per name in ``PERMISSION_LIST``, in the same
        order: 1 if that name is among the APK's permissions, otherwise 0.
    """
    apk = APK(apk_path)
    # A set makes each membership test constant-time instead of rescanning the list
    declared = set(apk.get_permissions())
    # NOTE(review): matching is by exact string equality -- confirm the dataset's
    # column names use the same form as the strings returned by get_permissions().
    return [1 if perm in declared else 0 for perm in PERMISSION_LIST]

# Updated scan function with adjustable malware probability threshold
def scan_apk_pyaxml(apk_path, malware_probability_threshold=0.2):
    """Classify one APK with the trained model and print the verdict.

    The APK's permission vector is wrapped in a single-row DataFrame whose
    columns are ``PERMISSION_LIST`` and passed to ``model.predict_proba``.
    The app is reported as malware when the second probability is at least
    ``malware_probability_threshold``; otherwise it is reported as benign.
    Any exception raised along the way is caught and printed.

    Args:
        apk_path: Path to the APK file to scan.
        malware_probability_threshold: Minimum malware probability that
            triggers a "malware detected" verdict (default 0.2).
    """
    try:
        sample = pd.DataFrame(
            [extract_permissions_pyaxml(apk_path)],
            columns=PERMISSION_LIST,
        )
        prob = model.predict_proba(sample)[0]
        print("Probability (Benign, Malware):", prob)

        malware_prob = prob[1]
        flagged = malware_prob >= malware_probability_threshold
        if flagged:
            print(f"Malware detected! Malware probability {malware_prob:.2f} ≥ threshold {malware_probability_threshold}")
        else:
            print(f"App is benign. Malware probability {malware_prob:.2f} < threshold {malware_probability_threshold}")
    except Exception as e:
        print("Error scanning APK:", e)

# For uploading APK file in Colab, run this in the notebook:
from google.colab import files
uploaded = files.upload()  # Upload your APK file here

# Scan what was actually uploaded: files.upload() returns a dict keyed by file name
# and stores the files in the current working directory. If nothing was uploaded,
# fall back to the example path.
apk_paths = list(uploaded) or ["/content/wildfire-test-apk-file.apk"]
for apk_path in apk_paths:
    scan_apk_pyaxml(apk_path, malware_probability_threshold=0.2)

# Evaluate on test data
print("Test accuracy:", model.score(X_test, y_test))

KeyboardInterrupt: 

In [None]:
# Scan the APK at /content/testmalware.apk; it is reported as malware when the
# model's malware probability is at least 0.2.
scan_apk_pyaxml("/content/testmalware.apk", malware_probability_threshold=0.2)

Probability (Benign, Malware): [0.78907362 0.21092638]
Malware detected! Malware probability 0.21 ≥ threshold 0.2


# Conclusion


## Thank You!