In [10]:
import pandas as pd
path = "/home/sarthak/drone_project/my_master_dataset_full.csv"
df = pd.read_csv(path)

target = "label"
groups = df["flight_id"]
X = df.drop(columns=["label", "flight_id", "attack_type", "attack_params_json","timestamp"])
y = df[target].values

In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import GroupKFold

In [12]:

models = {
    "RandomForest": RandomForestClassifier(n_estimators=300, max_depth=14, class_weight='balanced',n_jobs=-1, random_state=42),
    "ExtraTrees": ExtraTreesClassifier(n_estimators=200, max_depth=12, n_jobs=-1, random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=500, class_weight='balanced'),
    "SVC-RBF": SVC(kernel="rbf", probability=True, class_weight='balanced'),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=42),
    "XGBoost": XGBClassifier(
        n_estimators=200, learning_rate=0.05, max_depth=8, n_jobs=-1, use_label_encoder=False, eval_metric='logloss'
    )
}

gkf = GroupKFold(n_splits=5)
results = []

for name, model in models.items():
    print(f"\nðŸ”¹ Training {name}")
    fold_metrics = []

    for train_idx, test_idx in gkf.split(X, y, groups):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        pipe = Pipeline([
            ("imputer", SimpleImputer(strategy="mean")),
            ("scaler", StandardScaler()),
            ("model", model)
        ])

        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)
        y_prob = (
            pipe.predict_proba(X_test)[:, 1]
            if hasattr(pipe.named_steps["model"], "predict_proba")
            else y_pred
        )

        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_prob)
        fold_metrics.append((acc, f1, auc))

    acc_mean = np.mean([m[0] for m in fold_metrics])
    f1_mean = np.mean([m[1] for m in fold_metrics])
    auc_mean = np.mean([m[2] for m in fold_metrics])

    results.append({
        "Model": name,
        "Accuracy": acc_mean,
        "F1": f1_mean,
        "AUC": auc_mean
    })


ðŸ”¹ Training RandomForest

ðŸ”¹ Training ExtraTrees

ðŸ”¹ Training LogisticRegression

ðŸ”¹ Training SVC-RBF

ðŸ”¹ Training GradientBoosting

ðŸ”¹ Training XGBoost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Model Comparison ===
                Model  Accuracy        F1       AUC
4    GradientBoosting  0.759494  0.622493  0.837373
5             XGBoost  0.760081  0.624371  0.832890
0        RandomForest  0.738618  0.662284  0.829851
1          ExtraTrees  0.766998  0.504097  0.805105
3             SVC-RBF  0.693928  0.527788  0.704970
2  LogisticRegression  0.648457  0.494576  0.642007


In [13]:
results_df = pd.DataFrame(results).sort_values("F1", ascending=False)
print("\n=== Model Comparison ===")
print(results_df)



=== Model Comparison ===
                Model  Accuracy        F1       AUC
0        RandomForest  0.738618  0.662284  0.829851
5             XGBoost  0.760081  0.624371  0.832890
4    GradientBoosting  0.759494  0.622493  0.837373
3             SVC-RBF  0.693928  0.527788  0.704970
1          ExtraTrees  0.766998  0.504097  0.805105
2  LogisticRegression  0.648457  0.494576  0.642007


In [5]:
import joblib
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Drop useless or categorical columns
feature_cols = [
    c for c in df.columns
    if c not in ["label", "flight_id", "attack_type", "attack_params_json","timestamp"]
]

best_model = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
    ("model", RandomForestClassifier(
        n_estimators=300,
        max_depth=14,
        n_jobs=-1,
        class_weight='balanced',
        random_state=42
    ))
])

best_model.fit(X, y)

joblib.dump(best_model, "drone_ids_model.pkl")
joblib.dump(feature_cols, "feature_list.pkl")
print("âœ… Model and feature list saved.")


âœ… Model and feature list saved.


In [8]:
import joblib
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Drop useless or categorical columns
feature_cols = [
    c for c in df.columns
    if c not in ["label", "flight_id", "attack_type", "attack_params_json","timestamp"]
]

gb_model = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
    ("model", GradientBoostingClassifier(
        n_estimators=300,
        max_depth=14,       
        learning_rate=0.05, 
        random_state=42
    ))
])

# Train the model on the FULL dataset
gb_model.fit(X, y)

print("âœ… Model training complete.")

# Save the model and feature list
joblib.dump(gb_model, "gb_ids_model.pkl")
joblib.dump(feature_cols, "gb_feature_list.pkl")

print("âœ… Gradient Boosting model saved to 'gb_ids_model.pkl'")
print("âœ… Feature list saved to 'feature_list.pkl'")


âœ… Model training complete.
âœ… Gradient Boosting model saved to 'gb_ids_model.pkl'
âœ… Feature list saved to 'feature_list.pkl'


In [9]:
import pandas as pd
import numpy as np
import joblib
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
import logging

feature_cols = [
    c for c in df.columns
    if c not in ["label", "flight_id", "attack_type", "attack_params_json", "timestamp"]
]

print("ðŸš€ Starting XGBoost model training...")

# Calculate 'scale_pos_weight' for imbalanced data
# This is the correct way to handle 'class_weight' in XGBoost
try:
    count_neg = (y == 0).sum()
    count_pos = (y == 1).sum()
    scale_pos_weight = count_neg / count_pos
    print(f"Calculated scale_pos_weight for imbalance: {scale_pos_weight:.2f}")
except ZeroDivisionError:
    print("Warning: No positive samples (label=1) found. Setting scale_pos_weight to 1.")
    scale_pos_weight = 1
except Exception as e:
    print(f"Error calculating scale_pos_weight: {e}. Defaulting to 1.")
    scale_pos_weight = 1


# Create the pipeline
xgb_model = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
    ("model", XGBClassifier(
        n_estimators=300,
        max_depth=14,           
        learning_rate=0.05,
        n_jobs=-1,
        random_state=42,
        scale_pos_weight=scale_pos_weight, 
        eval_metric='logloss'
    ))
])

# Train the model on the FULL dataset
xgb_model.fit(X, y)

print("âœ… Model training complete.")

# Save the model and feature list
joblib.dump(xgb_model, "xgb_ids_model.pkl")
joblib.dump(feature_cols, "xgb_feature_list.pkl")

print("âœ… XGBoost model saved to 'xgb_ids_model.pkl'")
print("âœ… Feature list saved to 'feature_list.pkl'")

ðŸš€ Starting XGBoost model training...
Calculated scale_pos_weight for imbalance: 1.74
âœ… Model training complete.
âœ… XGBoost model saved to 'xgb_ids_model.pkl'
âœ… Feature list saved to 'feature_list.pkl'
