In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from imblearn.over_sampling import SMOTE



In [2]:
# ==============================
#  Load and preprocess data
# ==============================
df = pd.read_csv("crime through social media.csv")
df.head()

Unnamed: 0,User_ID,Age_Group,Gender,Location,Device_Type,Social_Media_Platform,Two_Factor_Authentication,Saved_Password,Account_Privacy,Frequent_New_Device_Logging,...,Cross_Channel_Usage,Season,Suspicious_Activity,Clicked_Ad,Ad_Platform,Ad_Target_Domain,Domain_Reputation,Domain_Length,Domain_Entropy,Click_Leads_To_Malicious_Site
0,1,35-44,Female,Jharkhand,Mobile,Twitter,Yes,No,Private,No,...,Yes,Holi,1,Yes,Instagram,banking.example,Good,15,3.507,0
1,2,18-24,Female,Uttar Pradesh,Mobile,Facebook,Yes,Yes,Private,No,...,No,Diwali,1,No,,,,0,0.0,0
2,3,35-44,Other,Gujarat,Mobile,Instagram,Yes,Yes,Public,No,...,No,navaratri,0,No,,,,0,0.0,0
3,4,18-24,Female,Uttar Pradesh,Tablet,WhatsApp,Yes,Yes,Public,No,...,Yes,summer vacations,0,No,,,,0,0.0,0
4,5,25-34,Female,Rajasthan,Mobile,WhatsApp,Yes,Yes,Private,No,...,Yes,Holi,0,Yes,LinkedIn,ecommerce.com,Good,13,2.449,0


In [3]:
X = df.drop(columns=["Click_Leads_To_Malicious_Site", "User_ID"])
y = df["Click_Leads_To_Malicious_Site"]
num_cols = X.select_dtypes(include=["int64", "float64"]).columns
cat_cols = [col for col in X.columns if col not in num_cols]

In [4]:
# Handle missing values
X[num_cols] = X[num_cols].fillna(X[num_cols].median())
X[cat_cols] = X[cat_cols].fillna(X[cat_cols].mode().iloc[0])

# Scale numeric features
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

In [5]:
# Encode categorical features
encoder = LabelEncoder()
for col in cat_cols:
    X[col] = encoder.fit_transform(X[col])


In [6]:
# ==============================
#  Split into train, validation, and test sets
# ==============================
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4,
                                                    random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5,
                                                random_state=42, stratify=y_temp)

print(f"Train size: {len(X_train)}, Validation size: {len(X_val)}, Test size: {len(X_test)}")

Train size: 3000, Validation size: 1000, Test size: 1000


In [7]:
# ==============================
#  Handle class imbalance (SMOTE - optional experiment)
# ==============================
# smote = SMOTE(random_state=42)
# X_train, y_train = smote.fit_resample(X_train, y_train)

# ==============================

In [8]:
#  Logistic Regression (with 5-fold CV)
# ==============================
log_reg = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)

log_param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs']
}

log_grid = GridSearchCV(log_reg, log_param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
log_grid.fit(X_train, y_train)

best_log_model = log_grid.best_estimator_
print("\nBest Logistic Regression Params:", log_grid.best_params_)

# Evaluate on validation set
y_val_pred = best_log_model.predict(X_val)
y_val_proba = best_log_model.predict_proba(X_val)[:, 1]

print("\n Logistic Regression (Validation Set)")
print(f"Accuracy : {accuracy_score(y_val, y_val_pred):.3f}")
print(f"Precision: {precision_score(y_val, y_val_pred):.3f}")
print(f"Recall   : {recall_score(y_val, y_val_pred):.3f}")
print(f"F1 Score : {f1_score(y_val, y_val_pred):.3f}")
print(f"ROC AUC  : {roc_auc_score(y_val, y_val_proba):.3f}")



Best Logistic Regression Params: {'C': 0.01, 'solver': 'lbfgs'}

 Logistic Regression (Validation Set)
Accuracy : 0.912
Precision: 0.714
Recall   : 0.967
F1 Score : 0.821
ROC AUC  : 0.950


In [9]:
# ==============================
#  Random Forest (with 5-fold CV)
# ==============================
rf = RandomForestClassifier(class_weight='balanced', random_state=42)

rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

rf_grid = GridSearchCV(rf, rf_param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
rf_grid.fit(X_train, y_train)

best_rf_model = rf_grid.best_estimator_
print("\nBest Random Forest Params:", rf_grid.best_params_)

# Validation evaluation
y_val_pred = best_rf_model.predict(X_val)
y_val_proba = best_rf_model.predict_proba(X_val)[:, 1]

print("\n Random Forest (Validation Set)")
print(f"Accuracy : {accuracy_score(y_val, y_val_pred):.3f}")
print(f"Precision: {precision_score(y_val, y_val_pred):.3f}")
print(f"Recall   : {recall_score(y_val, y_val_pred):.3f}")
print(f"F1 Score : {f1_score(y_val, y_val_pred):.3f}")
print(f"ROC AUC  : {roc_auc_score(y_val, y_val_proba):.3f}")



Best Random Forest Params: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}

 Random Forest (Validation Set)
Accuracy : 0.941
Precision: 0.800
Recall   : 0.957
F1 Score : 0.871
ROC AUC  : 0.975


In [10]:
# ==============================
#  XGBoost (with 5-fold CV)
# ==============================
scale_pos_weight = sum(y_train == 0) / sum(y_train == 1)  # handle imbalance

xgb = XGBClassifier(
    eval_metric="logloss",
    random_state=42,
    use_label_encoder=False
)

xgb_param_grid = {
    'n_estimators': [100, 150, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'scale_pos_weight': [1, scale_pos_weight]
}

xgb_grid = GridSearchCV(xgb, xgb_param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
xgb_grid.fit(X_train, y_train)

best_xgb_model = xgb_grid.best_estimator_
print("\nBest XGBoost Params:", xgb_grid.best_params_)

# Validation evaluation
y_val_pred = best_xgb_model.predict(X_val)
y_val_proba = best_xgb_model.predict_proba(X_val)[:, 1]

print("\n XGBoost (Validation Set)")
print(f"Accuracy : {accuracy_score(y_val, y_val_pred):.3f}")
print(f"Precision: {precision_score(y_val, y_val_pred):.3f}")
print(f"Recall   : {recall_score(y_val, y_val_pred):.3f}")
print(f"F1 Score : {f1_score(y_val, y_val_pred):.3f}")
print(f"ROC AUC  : {roc_auc_score(y_val, y_val_proba):.3f}")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Best XGBoost Params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 150, 'scale_pos_weight': 3.792332268370607}

 XGBoost (Validation Set)
Accuracy : 0.942
Precision: 0.801
Recall   : 0.962
F1 Score : 0.874
ROC AUC  : 0.973


In [11]:
# ==============================
#  Final Test Evaluation (best models)
# ==============================
for name, model in {
    "Logistic Regression": best_log_model,
    "Random Forest": best_rf_model,
    "XGBoost": best_xgb_model
}.items():
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    print(f"\n {name} (Test Set)")
    print(f"Accuracy : {accuracy_score(y_test, y_pred):.3f}")
    print(f"Precision: {precision_score(y_test, y_pred):.3f}")
    print(f"Recall   : {recall_score(y_test, y_pred):.3f}")
    print(f"F1 Score : {f1_score(y_test, y_pred):.3f}")
    print(f"ROC AUC  : {roc_auc_score(y_test, y_proba):.3f}")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


 Logistic Regression (Test Set)
Accuracy : 0.902
Precision: 0.688
Recall   : 0.966
F1 Score : 0.804
ROC AUC  : 0.941
Confusion Matrix:
 [[701  91]
 [  7 201]]

 Random Forest (Test Set)
Accuracy : 0.934
Precision: 0.773
Recall   : 0.966
F1 Score : 0.859
ROC AUC  : 0.970
Confusion Matrix:
 [[733  59]
 [  7 201]]

 XGBoost (Test Set)
Accuracy : 0.934
Precision: 0.773
Recall   : 0.966
F1 Score : 0.859
ROC AUC  : 0.968
Confusion Matrix:
 [[733  59]
 [  7 201]]
