# Ransomware Prediction using Machine Learning
This notebook trains and compares five machine-learning classifiers (Random Forest, Gradient Boosting, Extra Trees, Logistic Regression, and Decision Tree) for ransomware detection.

## 1. Import Libraries

In [2]:
!pip install pandas numpy scikit-learn

Collecting pandas
  Downloading pandas-2.3.3-cp313-cp313-win_amd64.whl.metadata (19 kB)
Collecting numpy
  Downloading numpy-2.4.1-cp313-cp313-win_amd64.whl.metadata (6.6 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.8.0-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.3-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting scipy>=1.10.0 (from scikit-learn)
  Downloading scipy-1.17.0-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.3.0 (from scikit-learn)
  Downloading joblib-1.5.3-py3-none-any.whl.metadata (5.5 kB)
Collecting threadpoolctl>=3.2.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading pandas-2.3.3-cp313-cp313-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   --------------------------------------- 



In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score
)

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

Libraries imported successfully!


## 2. Load and Explore Data

In [2]:
# Read the raw event log from disk
df = pd.read_csv('data.csv')

# Report the size of the frame (the column count includes the 'label' column)
n_rows, n_cols = df.shape
print(f"Dataset Shape: {df.shape}")
print(f"Total Records: {n_rows}")
print(f"Total Features: {n_cols}")
df.head()

Dataset Shape: (10000, 22)
Total Records: 10000
Total Features: 22


Unnamed: 0,ts,host,pid,ppid,uid,user,exe,cmdline,syscall,path,...,ext,new_path,bytes_written,perm,owner,retval,minute,is_sensitive_path,is_backup_path,label
0,2026-01-19 10:50:45,app1,37029,87,806,backup,/usr/bin/python3,/usr/bin/python3 /tmp/enc.py,unlink,/srv/backup/glgmfmcr.bak,...,.xls,/var/tmp/scratch/evfgms.xls.encrypted,0,,,0,2026-01-19 10:50:00,1,0,1
1,2026-01-19 10:48:58,db1,41468,79,1179,app,/usr/local/bin/backupd,/usr/local/bin/backupd --target /srv/backup --...,open,/var/tmp/work/wyfnzvub.xls,...,.xls,,7002,,,0,2026-01-19 10:48:00,0,0,0
2,2026-01-19 09:52:01,app1,45129,71,-134,root,/usr/bin/vim,/usr/bin/vim,write,/var/tmp/scratch/vcwplqpx.db,...,.db,,17060,,,0,2026-01-19 09:52:00,0,0,0
3,2026-01-19 09:58:46,fs1,23269,57,-142,root,/usr/bin/mv,/usr/bin/mv,open,/var/tmp/work/clitdn.bak,...,.bak,,0,,,0,2026-01-19 09:58:00,0,0,0
4,2026-01-19 09:48:45,app1,26429,40,885,app,/usr/sbin/logrotate,/usr/sbin/logrotate /etc/logrotate.conf,write,/var/tmp/cache/wddoyizv.docx,...,.docx,,21813,,,0,2026-01-19 09:48:00,1,0,0


In [3]:
# Show the dtype pandas inferred for every column
print("Column Data Types:", df.dtypes, sep="\n")

Column Data Types:
ts                    object
host                  object
pid                    int64
ppid                   int64
uid                    int64
user                  object
exe                   object
cmdline               object
syscall               object
path                  object
dir                   object
filename              object
ext                   object
new_path              object
bytes_written          int64
perm                 float64
owner                 object
retval                 int64
minute                object
is_sensitive_path      int64
is_backup_path         int64
label                  int64
dtype: object


In [5]:
# Absolute and relative frequency of each value of the 'label' column
label_counts = df['label'].value_counts()
label_share = df['label'].value_counts(normalize=True).to_dict()

print("Target Distribution:")
print(label_counts)
print(f"\nClass Balance: {label_share}")

Target Distribution:
label
1    5015
0    4985
Name: count, dtype: int64

Class Balance: {1: 0.5015, 0: 0.4985}


## 3. Clean and Preprocess Data

In [6]:
# Columns excluded from modelling.
# IMPORTANT: features that perfectly correlate with the label are listed here
# as well, because keeping them would leak the answer to the model.
columns_to_drop = [
    'ts', 'minute', 'campaign_id', 'new_path', 'path', 'filename', 'cmdline',
    'is_attack',              # This IS the label - causes 100% accuracy
    'from_writable_exec',     # Perfect 1.0 correlation with label
    'is_exec_from_writable',  # Perfect 1.0 correlation with label
    'is_delete_backup',       # Derived feature, correlated
]

# Only names that are actually present in this dataset can be dropped
available_columns = set(df.columns)
existing_cols_to_drop = [col for col in columns_to_drop if col in available_columns]

# drop() returns a new frame, so the raw `df` is left untouched
df_clean = df.drop(columns=existing_cols_to_drop)

print(f"Dropped columns: {existing_cols_to_drop}")
print(f"Remaining columns: {list(df_clean.columns)}")

Dropped columns: ['ts', 'minute', 'new_path', 'path', 'filename', 'cmdline']
Remaining columns: ['host', 'pid', 'ppid', 'uid', 'user', 'exe', 'syscall', 'dir', 'ext', 'bytes_written', 'perm', 'owner', 'retval', 'is_sensitive_path', 'is_backup_path', 'label']


In [7]:
# Partition the remaining columns by dtype; the target is not a feature,
# so 'label' is left out of the numeric list.
numeric_cols = [
    col
    for col in df_clean.select_dtypes(include=[np.number]).columns
    if col != 'label'
]
categorical_cols = list(df_clean.select_dtypes(include=['object']).columns)

print(f"Numeric columns: {numeric_cols}")
print(f"Categorical columns: {categorical_cols}")

Numeric columns: ['pid', 'ppid', 'uid', 'bytes_written', 'perm', 'retval', 'is_sensitive_path', 'is_backup_path']
Categorical columns: ['host', 'user', 'exe', 'syscall', 'dir', 'ext', 'owner']


In [8]:
# Impute missing values column by column.
#
# The filled Series is assigned back to the frame instead of calling
# `df_clean[col].fillna(..., inplace=True)`: the in-place call operates on an
# intermediate object, which pandas 2.x flags with a FutureWarning and which
# leaves `df_clean` unchanged once Copy-on-Write is enabled.

# Numeric columns: replace NaN with the column median
for col in numeric_cols:
    if df_clean[col].isnull().any():
        median_val = df_clean[col].median()
        df_clean[col] = df_clean[col].fillna(median_val)
        print(f"Filled {col} with median: {median_val}")

# Categorical columns: replace NaN with the most frequent value
for col in categorical_cols:
    if df_clean[col].isnull().any():
        mode_val = df_clean[col].mode()
        if len(mode_val) > 0:
            df_clean[col] = df_clean[col].fillna(mode_val[0])
            print(f"Filled {col} with mode: {mode_val[0]}")
        else:
            # mode() is empty when the column holds no non-null value at all
            df_clean[col] = df_clean[col].fillna('unknown')
            print(f"Filled {col} with 'unknown'")

print("\nMissing values handled!")

Filled perm with median: 600.0
Filled owner with mode: root:root

Missing values handled!


In [9]:
# One LabelEncoder per categorical column; the fitted encoders are kept in
# `label_encoders` so the same string -> integer mapping can be reused later.
label_encoders = {col: LabelEncoder() for col in categorical_cols}

for col, le in label_encoders.items():
    # Values are cast to str before fitting the encoder
    df_clean[col] = le.fit_transform(df_clean[col].astype(str))
    print(f"Encoded {col}: {len(le.classes_)} unique values")

# Safety net: any value that is still missing becomes 0
df_clean = df_clean.fillna(0)
print(f"\nFinal dataset shape: {df_clean.shape}")

Encoded host: 3 unique values
Encoded user: 4 unique values
Encoded exe: 8 unique values
Encoded syscall: 7 unique values
Encoded dir: 12 unique values
Encoded ext: 11 unique values
Encoded owner: 4 unique values

Final dataset shape: (10000, 16)


## 4. Prepare Features for Training

In [10]:
# Separate features and target
X = df_clean.drop('label', axis=1)
y = df_clean['label']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeature columns: {list(X.columns)}")

Features shape: (10000, 15)
Target shape: (10000,)

Feature columns: ['host', 'pid', 'ppid', 'uid', 'user', 'exe', 'syscall', 'dir', 'ext', 'bytes_written', 'perm', 'owner', 'retval', 'is_sensitive_path', 'is_backup_path']


In [None]:
# Split first, then scale.
# The scaler is fitted on the training rows only so that the mean/variance of
# the held-out test rows never influence preprocessing (no train/test leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
scaler.fit(X_train_raw)

# Wrap the scaled arrays back into DataFrames so column names and the original
# row index are preserved for the cells that follow.
X_train = pd.DataFrame(
    scaler.transform(X_train_raw), columns=X.columns, index=X_train_raw.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index
)

# Scaled copy of the full feature matrix, kept under its previous name
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")

Training set: 8000 samples
Test set: 2000 samples


In [12]:
# Row counts of the target vectors for each split
print(f"Training set: {len(y_train)} samples")
print(f"Test set: {len(y_test)} samples")

Training set: 8000 samples
Test set: 2000 samples


## 5. ML Algorithms

In [13]:
# Candidate classifiers, keyed by display name.
# The three tree ensembles share the same size and seed.
tree_ensemble_params = dict(n_estimators=100, random_state=42)

models = {
    'Random Forest': RandomForestClassifier(**tree_ensemble_params),
    'Gradient Boosting': GradientBoostingClassifier(**tree_ensemble_params),
    'Extra Trees': ExtraTreesClassifier(**tree_ensemble_params),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
}

print("Algorithms:")
print("\n".join(f"  - {name}" for name in models))

Algorithms:
  - Random Forest
  - Gradient Boosting
  - Extra Trees
  - Logistic Regression
  - Decision Tree


## 6. Train and Evaluate Models

In [14]:
def evaluate_model(model, X_train, X_test, y_train, y_test, cv=5):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : estimator
        Unfitted classifier exposing ``fit`` and ``predict``.
    X_train, X_test : feature matrices for the two splits.
    y_train, y_test : target vectors for the two splits.
    cv : int, default 5
        Number of folds used for the cross-validated F1 score.

    Returns
    -------
    (metrics, model) : tuple
        ``metrics`` is a dict with the keys 'Accuracy', 'Precision', 'Recall',
        'F1-Score', 'ROC-AUC' and 'CV F1 (mean)'; ``model`` is the same
        estimator object after fitting on ``X_train``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # ROC-AUC needs a continuous score: prefer the positive-class probability,
    # fall back to the decision function for models without predict_proba.
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_score = model.decision_function(X_test)
    else:
        y_score = None

    metrics = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, zero_division=0),
        'Recall': recall_score(y_test, y_pred, zero_division=0),
        'F1-Score': f1_score(y_test, y_pred, zero_division=0),
        # NaN (not 0) marks "not computable"; 0 would read as a real AUC value
        'ROC-AUC': roc_auc_score(y_test, y_score) if y_score is not None else float('nan'),
    }

    # Cross-validated F1 on the training split only; the test split stays unseen
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='f1')
    metrics['CV F1 (mean)'] = cv_scores.mean()

    return metrics, model

In [15]:
# Fit and score every candidate; keep both the metrics and the fitted estimator
results, trained_models = {}, {}

for name, model in models.items():
    print(f"Training {name}...")
    metrics, trained_models[name] = evaluate_model(model, X_train, X_test, y_train, y_test)
    results[name] = metrics
    print(f"  Accuracy: {metrics['Accuracy']:.4f} | F1-Score: {metrics['F1-Score']:.4f} | ROC-AUC: {metrics['ROC-AUC']:.4f}")

print("\nAll models trained!")

Training Random Forest...
  Accuracy: 0.9165 | F1-Score: 0.9166 | ROC-AUC: 0.9141
Training Gradient Boosting...
  Accuracy: 0.9155 | F1-Score: 0.9156 | ROC-AUC: 0.9154
Training Extra Trees...
  Accuracy: 0.9130 | F1-Score: 0.9130 | ROC-AUC: 0.9165
Training Logistic Regression...
  Accuracy: 0.8555 | F1-Score: 0.8606 | ROC-AUC: 0.8918
Training Decision Tree...
  Accuracy: 0.8155 | F1-Score: 0.8145 | ROC-AUC: 0.8155

All models trained!


## 7. Compare Model Results

In [16]:
# One row per model, one column per metric, best F1-Score first
results_df = (
    pd.DataFrame(results)
    .transpose()
    .round(4)
    .sort_values(by='F1-Score', ascending=False)
)

print("Model Comparison (sorted by F1-Score):")
results_df

Model Comparison (sorted by F1-Score):


Unnamed: 0,Accuracy,Precision,Recall,F1-Score,ROC-AUC,CV F1 (mean)
Random Forest,0.9165,0.918,0.9153,0.9166,0.9141,0.9162
Gradient Boosting,0.9155,0.917,0.9143,0.9156,0.9154,0.9151
Extra Trees,0.913,0.9157,0.9103,0.913,0.9165,0.9125
Logistic Regression,0.8555,0.8336,0.8893,0.8606,0.8918,0.8621
Decision Tree,0.8155,0.8215,0.8076,0.8145,0.8155,0.8331


In [17]:
# Pick the model with the highest test-set F1-Score and show its headline metrics
best_model_name = results_df['F1-Score'].idxmax()
best_row = results_df.loc[best_model_name]

print(f"Best Model: {best_model_name}")
for metric in ('F1-Score', 'Accuracy', 'ROC-AUC'):
    print(f"  {metric}: {best_row[metric]:.4f}")

Best Model: Random Forest
  F1-Score: 0.9166
  Accuracy: 0.9165
  ROC-AUC: 0.9141


## 8. Detailed Report for Best Model

In [18]:
# Per-class precision / recall / F1 of the selected model on the test split
best_model = trained_models[best_model_name]
y_pred = best_model.predict(X_test)
report = classification_report(y_test, y_pred, target_names=['Benign', 'Ransomware'])

print(f"Classification Report for {best_model_name}:")
print(report)

Classification Report for Random Forest:
              precision    recall  f1-score   support

      Benign       0.92      0.92      0.92       997
  Ransomware       0.92      0.92      0.92      1003

    accuracy                           0.92      2000
   macro avg       0.92      0.92      0.92      2000
weighted avg       0.92      0.92      0.92      2000



In [19]:
# Confusion matrix: rows are the actual class, columns the predicted class
cm = confusion_matrix(y_test, y_pred)
benign_row, ransomware_row = cm[0], cm[1]

print("Confusion Matrix:")
print("                 Predicted")
print("                 Benign  Ransomware")
print(f"Actual Benign    {benign_row[0]:6d}  {benign_row[1]:6d}")
print(f"Actual Ransomware{ransomware_row[0]:6d}  {ransomware_row[1]:6d}")

Confusion Matrix:
                 Predicted
                 Benign  Ransomware
Actual Benign       915      82
Actual Ransomware    85     918


In [20]:
# Rank features by importance; only estimators exposing `feature_importances_` qualify
if hasattr(best_model, 'feature_importances_'):
    importance_table = {
        'Feature': X_test.columns,
        'Importance': best_model.feature_importances_,
    }
    feature_imp = pd.DataFrame(importance_table).sort_values(
        'Importance', ascending=False
    )

    print("Top 10 Important Features:")
    print(feature_imp.head(10).to_string(index=False))

Top 10 Important Features:
          Feature  Importance
              dir    0.245320
             host    0.174998
              exe    0.174080
          syscall    0.073401
              pid    0.059856
              uid    0.058290
    bytes_written    0.056576
             ppid    0.054705
              ext    0.045579
is_sensitive_path    0.025093


## 9. Make Prediction on New Data

In [23]:
def predict_ransomware(sample_dict):
    """Predict whether a single sample is ransomware or benign.

    Parameters
    ----------
    sample_dict : dict
        Maps feature name -> value for one sample. Categorical features may be
        given as the integer code produced during preprocessing or as the raw
        string, which is then encoded with the fitted ``label_encoders``.
        Keys that are not training features are ignored.

    Returns
    -------
    The predicted class label (1 is reported as RANSOMWARE, anything else as
    BENIGN). The label and its confidence are also printed.

    Raises
    ------
    ValueError
        If a feature seen during training is missing from ``sample_dict``, or
        if a string category was not seen when the encoders were fitted.
    """
    sample_df = pd.DataFrame([sample_dict])

    # Translate raw string categories with the encoders fitted during preprocessing
    for col, encoder in label_encoders.items():
        if col in sample_df.columns and isinstance(sample_df.at[0, col], str):
            sample_df[col] = encoder.transform(sample_df[col])

    # Align the columns with the order the scaler was fitted on; scikit-learn
    # rejects frames whose feature names or order differ from fit time.
    feature_names = getattr(scaler, 'feature_names_in_', None)
    if feature_names is not None:
        missing = [name for name in feature_names if name not in sample_df.columns]
        if missing:
            raise ValueError(f"Missing features: {missing}")
        sample_df = sample_df[list(feature_names)]

    # Keep the feature names on the scaled data so the model receives the same
    # column layout it was trained with.
    sample_scaled = pd.DataFrame(scaler.transform(sample_df), columns=sample_df.columns)

    prediction = best_model.predict(sample_scaled)
    proba = best_model.predict_proba(sample_scaled)

    result = "RANSOMWARE" if prediction[0] == 1 else "BENIGN"
    # Confidence is the probability of the most likely class
    confidence = max(proba[0]) * 100

    print(f"Prediction: {result}")
    print(f"Confidence: {confidence:.2f}%")
    return prediction[0]

# Example usage (uncomment and modify with your data):
# sample = {'host': 0, 'pid': 1234, 'ppid': 1, ...}
# predict_ransomware(sample)