In [114]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import joblib
import re

In [115]:
df = pd.read_csv('/kaggle/input/car-hacking-dataset/DoS_dataset.csv')
df.columns = ['Timestamp', 'CAN ID', 'DLC', 'DATA0', 'DATA1', 'DATA2', 'DATA3', 'DATA4', 'DATA5', 'DATA6', 'DATA7', 'Flag']
df.head()

Unnamed: 0,Timestamp,CAN ID,DLC,DATA0,DATA1,DATA2,DATA3,DATA4,DATA5,DATA6,DATA7,Flag
0,1478198000.0,018f,8,fe,5b,00,00,0,3c,00,00,R
1,1478198000.0,0260,8,19,21,22,30,8,8e,6d,3a,R
2,1478198000.0,02a0,8,64,00,9a,1d,97,02,bd,00,R
3,1478198000.0,0329,8,40,bb,7f,14,11,20,00,14,R
4,1478198000.0,0545,8,d8,00,00,8a,0,00,00,00,R


In [123]:
# Convert Timestamp to datetime
df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')
df.head()

Unnamed: 0,Timestamp,CAN ID,DLC,DATA0,DATA1,DATA2,DATA3,DATA4,DATA5,DATA6,DATA7,Flag
0,1970-01-01 00:00:01.478198376,018f,8,fe,5b,00,00,0,3c,00,00,R
1,1970-01-01 00:00:01.478198376,0260,8,19,21,22,30,8,8e,6d,3a,R
2,1970-01-01 00:00:01.478198376,02a0,8,64,00,9a,1d,97,02,bd,00,R
3,1970-01-01 00:00:01.478198376,0329,8,40,bb,7f,14,11,20,00,14,R
4,1970-01-01 00:00:01.478198376,0545,8,d8,00,00,8a,0,00,00,00,R


In [124]:
df.nunique()

Timestamp    2813
CAN ID         27
DLC             2
DATA0         108
DATA1          71
DATA2          76
DATA3          26
DATA4         190
DATA5         256
DATA6          75
DATA7         256
Flag            2
dtype: int64

In [125]:
df_2 = df.copy()

In [138]:
df_2['CAN ID'].unique()

array(['018f', '0260', '02a0', '0329', '0545', '0002', '0153', '02c0',
       '0130', '0131', '0140', '0350', '043f', '0370', '0440', '0316',
       '04f0', '0430', '04b1', '01f1', '05f0', '00a0', '00a1', '0690',
       '05a0', '05a2', '0000'], dtype=object)

In [153]:
df_2['DATA2'].unique(), df_2.shape

(array(['00', '22', '9a', '7f', '10', '14', '60', '68', '27', '24', 'R',
        '34', '21', '44', '54', '64', '74', '84', '94', 'a4', 'b4', 'c4',
        'd4', 'e4', 'f4', '04', '6c', '5c', '78', '70', '01', 'a5', '7c',
        '58', '50', '80', '23', '88', '9b', 'a0', '9c', 'a8', 'ac', 'c0',
        '25', 'cc', 'd0', 'dc', 'd8', 'e8', 'e0', 'ec', 'fc', 'f0', 'f8',
        '0c', '08', '1c', '18', '20', '28', '26', '30', '2c', '38', '48',
        '3c', '40', '4c', 'c8', 'bc', '8c', '98', '90', 'b0', 'b8'],
       dtype=object),
 (3665770, 12))

In [146]:
df_2[df_2['DATA2'] =='R'].count()

Timestamp    31188
CAN ID       31188
DLC          31188
DATA0        31188
DATA1        31188
DATA2        31188
DATA3            0
DATA4            0
DATA5            0
DATA6            0
DATA7            0
Flag             0
dtype: int64

In [158]:
df_3 = df_2[df_2['DATA2']!='R']

In [159]:
df_3['DATA2'].unique(), df_3.shape

(array(['00', '22', '9a', '7f', '10', '14', '60', '68', '27', '24', '34',
        '21', '44', '54', '64', '74', '84', '94', 'a4', 'b4', 'c4', 'd4',
        'e4', 'f4', '04', '6c', '5c', '78', '70', '01', 'a5', '7c', '58',
        '50', '80', '23', '88', '9b', 'a0', '9c', 'a8', 'ac', 'c0', '25',
        'cc', 'd0', 'dc', 'd8', 'e8', 'e0', 'ec', 'fc', 'f0', 'f8', '0c',
        '08', '1c', '18', '20', '28', '26', '30', '2c', '38', '48', '3c',
        '40', '4c', 'c8', 'bc', '8c', '98', '90', 'b0', 'b8'], dtype=object),
 (3634582, 12))

In [160]:
# Define columns to check (DATA0-DATA7)
data_columns = ['DATA0', 'DATA1', 'DATA2', 'DATA3', 'DATA4', 'DATA5', 'DATA6', 'DATA7']

In [161]:
# Create regex pattern for valid hex
hex_pattern = r'^[0-9A-Fa-f]{2}$'

In [162]:
# Check for non-hex values
mask = df_3[data_columns].apply(lambda col: ~col.str.match(hex_pattern, na=False))

In [163]:
# Get rows with any invalid entries
invalid_rows = df_3[mask.any(axis=1)]

# Show results
print("Rows with non-hex values in DATA columns:")
print(invalid_rows if not invalid_rows.empty else "No non-hex values found")

Rows with non-hex values in DATA columns:
No non-hex values found


In [167]:
# Define validation patterns
canid_pattern = r'^[0-9A-Fa-f]{4}$'  # 4-character hex

# Check CAN ID column
canid_mask = ~df_3['CAN ID'].str.match(canid_pattern, na=False)
invalid_canid = df_3[canid_mask]

# Display results
print("=== Invalid CAN IDs ===")
print(invalid_canid if not invalid_canid.empty else "All CAN IDs are valid hex")

# Optional: Show specific invalid values
if not invalid_canid.empty:
    print("\nInvalid CAN ID entries:")
    print(df.loc[canid_mask, 'CAN ID'])


=== Invalid CAN IDs ===
All CAN IDs are valid hex


In [178]:
df_4 = df_3.copy()

In [179]:
df_4.head()

Unnamed: 0,Timestamp,CAN ID,DLC,DATA0,DATA1,DATA2,DATA3,DATA4,DATA5,DATA6,DATA7,Flag
0,1970-01-01 00:00:01.478198376,018f,8,fe,5b,00,00,0,3c,00,00,R
1,1970-01-01 00:00:01.478198376,0260,8,19,21,22,30,8,8e,6d,3a,R
2,1970-01-01 00:00:01.478198376,02a0,8,64,00,9a,1d,97,02,bd,00,R
3,1970-01-01 00:00:01.478198376,0329,8,40,bb,7f,14,11,20,00,14,R
4,1970-01-01 00:00:01.478198376,0545,8,d8,00,00,8a,0,00,00,00,R


In [180]:
# Function to convert hex to decimal
def hex_to_int(hex_str: str) -> int:
    try:
        return int(str(hex_str).strip(), 16)  # Convert hex to int
    except ValueError:
        return np.nan 

# Convert all DATA columns
for col in df_4.columns[1:-1]:  # Exclude 'Flag' column
    df_4[col] = df_4[col].apply(hex_to_int)

In [181]:
df_4.head()

Unnamed: 0,Timestamp,CAN ID,DLC,DATA0,DATA1,DATA2,DATA3,DATA4,DATA5,DATA6,DATA7,Flag
0,1970-01-01 00:00:01.478198376,399,8,254,91,0,0,0,60,0,0,R
1,1970-01-01 00:00:01.478198376,608,8,25,33,34,48,8,142,109,58,R
2,1970-01-01 00:00:01.478198376,672,8,100,0,154,29,151,2,189,0,R
3,1970-01-01 00:00:01.478198376,809,8,64,187,127,20,17,32,0,20,R
4,1970-01-01 00:00:01.478198376,1349,8,216,0,0,138,0,0,0,0,R


In [183]:
df_4.nunique()

Timestamp    2813
CAN ID         26
DLC             1
DATA0         108
DATA1          71
DATA2          75
DATA3          26
DATA4         190
DATA5         256
DATA6          75
DATA7         256
Flag            2
dtype: int64

In [182]:
df_4.isnull().sum()

Timestamp    0
CAN ID       0
DLC          0
DATA0        0
DATA1        0
DATA2        0
DATA3        0
DATA4        0
DATA5        0
DATA6        0
DATA7        0
Flag         0
dtype: int64

In [None]:
df['DLC'].unique()

In [None]:
df[df['DLC'] == 2].count()

In [None]:
df['CAN ID'].unique()

In [None]:
df['DATA0'].unique()

In [None]:
df['DATA1'].unique()

In [None]:
df['DATA2'].unique()

In [None]:
invalid_hex = [x for x in df['DATA2'] if not all(c in '0123456789abcdefABCDEF' for c in str(x))]
count = len(invalid_hex)
count


In [None]:
df.nunique()

In [None]:
df.shape

In [None]:
df = df[df['DATA2'].apply(lambda x: all(c in '0123456789abcdefABCDEF' for c in x))]


In [None]:
df['DATA2'].unique()

In [None]:
df['DATA3'].unique()

In [None]:
df['DATA4'].unique()

In [None]:
df['DATA5'].unique()

In [None]:
df['DATA6'].unique()

In [None]:
df['DATA7'].unique()

In [None]:
df.describe()

In [None]:
df_2 = df.copy()
# Convert 'Timestamp' to datetime
df_2['Timestamp'] = pd.to_datetime(df_2['Timestamp'], unit='ms')

# Set Timestamp as the index
df_2 = df_2.set_index('Timestamp')

# Calculate time intervals between messages
df_2['Time_delta'] = df_2.index.to_series().diff()

# Count occurrences of '0000' CAN IDs in a 300ms rolling window
df_2['0000_Count'] = (df_2['CAN ID'] == '0000').astype(int).rolling('300ms').sum()

df_2.head()


In [None]:
df_2['Time_delta'] = df_2['Time_delta'].fillna(pd.Timedelta(seconds=0))  # Replace NaT with 0 seconds
df_2.head()

In [None]:


# Filter out 0 values and make a copy to avoid SettingWithCopyWarning
df_count = df_2.copy()

# Use bins for better performance
num_bins = 20  # Adjust based on data size
df_count["0000_Count_Binned"] = pd.cut(df_count["0000_Count"], bins=num_bins)

# Count occurrences of each bin
binned_counts = df_count["0000_Count_Binned"].value_counts().sort_index()

# Plot
plt.figure(figsize=(12, 6))
sns.barplot(x=binned_counts.index.astype(str), y=binned_counts.values, palette="viridis")

# Labels and formatting
plt.xlabel("0000_Count Value Ranges")
plt.ylabel("Frequency")
plt.title("Distribution of 0000_Count (Binned for Performance)")
plt.xticks(rotation=45, ha='right')

# Show plot
plt.show()




In [None]:
# Drop unnecessary columns
df_3 = df_2.drop(["DLC"], axis=1).copy()
df_3.head()

In [None]:
df_3.dtypes, df_3.shape

In [None]:
df_2 = df.copy()

In [None]:
df_3.describe()

In [None]:
# Move 'Flag' column to the end
df_3 = df_3[[
    col for col in df_3.columns if col != 'Flag'
] + ['Flag']]
df_3.head()

In [None]:
df_4 = df_3.copy()
# Function to convert hex to decimal
def hex_to_int(hex_str: str) -> int:
    try:
        return int(str(hex_str).strip(), 16)  # Convert hex to int
    except ValueError:
        return np.nan 

# Convert all DATA columns
for col in df_4.columns[1:-3]:  # Exclude 'Flag' column
    df_4[col] = df_4[col].apply(hex_to_int)

In [None]:
df_4.head()

In [None]:
df_5 = df_4.copy()
df_5.head()

In [None]:
df_5["Flag"] = df_4["Flag"].map({"R": 0, "T": 1})
df_5.head()

In [None]:
print(df_4["Flag"].unique())

In [None]:
filtered_df2 = df_5[df_5["CAN ID"] == "0000"]
filtered_df2.head(), filtered_df2.shape

In [None]:
df_5.shape,df_5.dtypes

In [None]:
df_5.isnull().sum()

In [None]:
df_6 = df_5.dropna().copy()
df_6.shape

In [None]:
print(df_6["Flag"].unique())

In [None]:
df_6.isnull().sum(), df_6.info()

In [None]:
# Convert columns to int64 including timestamp mcolumn
df_6 = df_6.astype({"DATA0": "int64", "DATA1": "int64",
                    "DATA2": "int64", "DATA3": "int64", "DATA4": "int64",
                    "DATA5": "int64", "DATA6": "int64", "DATA7": "int64",
                    "Time_delta": "int64", "0000_Count": "int64", "Flag": "int64"})

In [None]:
df_6.head()

In [None]:
df_6.dtypes

In [None]:
df_6.describe()

In [None]:
df_7 = df_6.copy()
df_7.head()

In [None]:
filtered_df3 = df_7[(df_7["CAN ID"] == "0000")]
filtered_df3.shape

In [None]:
# Exclude the 'Flag' column before calculating correlation
correlation_matrix = df_7.drop(columns=['CAN ID','Flag', 'Time_delta', '0000_Count']).corr()
correlation_matrix

In [None]:
# Plot the correlation matrix as a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', linewidths=0.5)

In [None]:
filtered_df4 = df_7[(df_7["Flag"] == 1) & (df_7["CAN ID"] == "0000")]
filtered_df4.shape

In [None]:
df_8 = df_7.copy()
# Load the dataset (assuming df_7 is already a DataFrame)
df_8['CAN ID'] = df_8['CAN ID'].apply(lambda x: int(x, 16))  # Convert hex to integer
df_8['CAN ID'] = df_8['CAN ID'].astype(int)
df_8.head()

In [None]:
df_9 = df_8.drop(columns=['Time_delta', '0000_Count'], axis=1)
df_9 = df_9.reset_index(drop=True)
df_9.head(), df_9['Flag'].value_counts()

In [None]:
import os
import pandas as pd
import numpy as np

# ✅ Detect GPU availability
GPU_AVAILABLE = os.path.exists('/dev/nvidia0')

try:
    if GPU_AVAILABLE:
        import cudf
        import cupy as cp
        from cuml.ensemble import RandomForestClassifier as cuRF
        from cuml.model_selection import train_test_split
        from cuml.preprocessing import StandardScaler as cuScaler
        print("🚀 RAPIDS cuML available! Running on GPU")
    else:
        raise ImportError
except ImportError:
    GPU_AVAILABLE = False
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.ensemble import RandomForestClassifier
    print("⚡ RAPIDS not available. Running on CPU.")

# 📌 Load dataset directly to GPU
if GPU_AVAILABLE:
    df_10 = cudf.from_pandas(df_9)
else:
    df_10 = df_9.copy()

# Define features and target
X = df_10.drop(columns=['Flag'])
y = df_10['Flag']

# ✅ Handle class imbalance (GPU-compatible)
if GPU_AVAILABLE:
    # Simplified GPU oversampling (adjust as needed)
    X_1 = X[y == 1]
    X_0 = X[y == 0]
    X_0_resampled = X_0.sample(n=len(X_1), replace=True)
    X_resampled = cudf.concat([X_1, X_0_resampled])
    y_resampled = cudf.Series(cp.concatenate([cp.ones(len(X_1)), cp.zeros(len(X_0_resampled))]))
else:
    from imblearn.over_sampling import SMOTE
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)

# ✅ Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)

# Convert to float32
if GPU_AVAILABLE:
    X_train = X_train.astype('float32')
    X_test = X_test.astype('float32')
    y_train = y_train.astype('float32')
    y_test = y_test.astype('float32')
else:
    X_train = X_train.astype(np.float32)
    X_test = X_test.astype(np.float32)
    y_train = y_train.astype(np.float32)
    y_test = y_test.astype(np.float32)

# ✅ Standardize features
scaler = cuScaler() if GPU_AVAILABLE else StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ✅ Choose the model
if GPU_AVAILABLE:
    print("🚀 Using GPU-Accelerated Random Forest (cuML)")
    model = cuRF(n_estimators=100, random_state=42, n_streams=1)
else:
    print("⚡ Using CPU-based Random Forest")
    model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced", n_jobs=-1)

# ✅ Train the model
model.fit(X_train_scaled, y_train)

# ✅ Predictions and conversion to CPU
if GPU_AVAILABLE:
    y_pred = model.predict(X_test_scaled).to_numpy()
    y_test = y_test.to_numpy()
else:
    y_pred = model.predict(X_test_scaled)

# ✅ Evaluate the model
from sklearn.metrics import accuracy_score, classification_report
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Save model
joblib.dump(model, "rf_model.joblib")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_curve, auc

# 1. Confusion Matrix
plt.figure(figsize=(10, 8))
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# 2. ROC Curve & AUC Score
if GPU_AVAILABLE:
    try:
        y_pred_proba = model.predict_proba(X_test_scaled)
        if isinstance(y_pred_proba, cudf.DataFrame):
            y_pred_proba = y_pred_proba.iloc[:, 1].to_numpy()
        else:
            y_pred_proba = y_pred_proba[:, 1].to_numpy()
    except AttributeError:
        # If predict_proba is not available, use predict
        y_pred_proba = model.predict(X_test_scaled).to_numpy()
else:
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)


plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

# # 3. Feature Importance Plot
# if GPU_AVAILABLE:
#     feature_importance = model.feature_importances_.to_numpy()
# else:
#     feature_importance = model.feature_importances_

# feature_names = X.columns.to_numpy() if GPU_AVAILABLE else X.columns

# # Sort features by importance
# sorted_idx = np.argsort(feature_importance)
# sorted_features = feature_names[sorted_idx]
# sorted_importance = feature_importance[sorted_idx]

# # Plot top 20 features
# plt.figure(figsize=(12, 8))
# plt.barh(range(20), sorted_importance[-20:])
# plt.yticks(range(20), sorted_features[-20:])
# plt.xlabel('Feature Importance')
# plt.title('Top 20 Most Important Features')
# plt.tight_layout()
# plt.show()


In [None]:
# import cudf
# import cupy as cp
# import numpy as np
# import matplotlib.pyplot as plt
# from cuml.metrics import accuracy_score as cu_accuracy_score

# def permutation_importance(model, X, y, n_repeats=10, random_state=42):
#     cp.random.seed(random_state)
    
#     # Ensure y is a 1D array
#     y = y.squeeze() if isinstance(y, cudf.DataFrame) else y
    
#     # Get initial score
#     y_pred = model.predict(X)
#     y_pred = y_pred.squeeze() if isinstance(y_pred, cudf.DataFrame) else y_pred
#     baseline_score = cu_accuracy_score(y, y_pred)
    
#     importances = []
    
#     for col in X.columns:
#         scores = []
#         for _ in range(n_repeats):
#             # Create a copy and shuffle the feature
#             X_shuffled = X.copy()
#             X_shuffled[col] = X_shuffled[col].sample(frac=1.0, replace=False).reset_index(drop=True)
            
#             # Predict and calculate score
#             y_pred = model.predict(X_shuffled)
#             y_pred = y_pred.squeeze() if isinstance(y_pred, cudf.DataFrame) else y_pred
#             score = cu_accuracy_score(y, y_pred)
            
#             # Calculate importance
#             importance = baseline_score - score
#             scores.append(importance)
        
#         importances.append(np.mean(scores))
    
#     return cudf.Series(importances, index=X.columns)


# # Calculate permutation importance
# feature_importance = permutation_importance(model, X_test_scaled, y_test)

# # Convert to CPU for plotting
# feature_importance_cpu = feature_importance.to_pandas()

# # Sort features by importance
# sorted_idx = feature_importance_cpu.argsort()
# sorted_features = feature_importance_cpu.index[sorted_idx]
# sorted_importance = feature_importance_cpu.values[sorted_idx]

# # Determine the number of features to plot (all features if less than 20, otherwise top 20)
# n_features_to_plot = min(len(sorted_features), 20)

# # Plot features
# plt.figure(figsize=(12, 8))
# plt.barh(range(n_features_to_plot), sorted_importance[-n_features_to_plot:])
# plt.yticks(range(n_features_to_plot), sorted_features[-n_features_to_plot:])
# plt.xlabel('Feature Importance (Decrease in Accuracy)')
# plt.title(f'Top {n_features_to_plot} Most Important Features (Permutation Importance)')
# plt.tight_layout()
# plt.show()

# # Print feature importances
# print("Feature Importances:")
# for feature, importance in feature_importance_cpu.sort_values(ascending=False).items():
#     print(f"{feature}: {importance:.4f}")


In [None]:
import os
import pandas as pd
import numpy as np
import xgboost as xgb

# ✅ Detect GPU availability
GPU_AVAILABLE = os.path.exists('/dev/nvidia0')

try:
    if GPU_AVAILABLE:
        import cudf
        import cupy as cp
        from cuml.model_selection import train_test_split as cuml_train_test_split
        from cuml.preprocessing import StandardScaler as cuScaler
        print("🚀 RAPIDS cuML available! Running on GPU")
    else:
        raise ImportError
except ImportError:
    GPU_AVAILABLE = False
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    print("⚡ RAPIDS not available. Running on CPU.")

# 📌 Load dataset directly to GPU
if GPU_AVAILABLE:
    # Assuming df_9 is already loaded as a pandas DataFrame before this block.
    df_10 = cudf.from_pandas(df_9)
else:
    df_10 = df_9.copy()

# Define features and target
X = df_10.drop(columns=['Flag'])
y = df_10['Flag']

# ✅ Handle class imbalance (GPU-compatible)
if GPU_AVAILABLE:
    X_1 = X[y == 1]
    X_0 = X[y == 0]
    X_0_resampled = X_0.sample(n=len(X_1), replace=True)
    X_resampled = cudf.concat([X_1, X_0_resampled])
    y_resampled = cudf.Series(cp.concatenate([cp.ones(len(X_1)), cp.zeros(len(X_0_resampled))]))
else:
    from imblearn.over_sampling import SMOTE
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)

# ✅ Convert cudf DataFrame to NumPy arrays for train_test_split if using GPU
if GPU_AVAILABLE:
    X_resampled_cpu = X_resampled.to_pandas().values  # Convert cudf to pandas, then to NumPy
    y_resampled_cpu = y_resampled.to_pandas().values
else:
    X_resampled_cpu = X_resampled
    y_resampled_cpu = y_resampled

# ✅ Split into train/test sets (works for both CPU and GPU)
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled_cpu, y_resampled_cpu, test_size=0.2, random_state=42, stratify=y_resampled_cpu
)

# ✅ Convert back to cudf DataFrame if GPU is available
if GPU_AVAILABLE:
    X_train = cudf.DataFrame.from_pandas(pd.DataFrame(X_train))  # Convert back to cudf DataFrame
    X_test = cudf.DataFrame.from_pandas(pd.DataFrame(X_test))
    y_train = cudf.Series(y_train)
    y_test = cudf.Series(y_test)

# ✅ Convert to float32
if GPU_AVAILABLE:
    X_train = X_train.astype('float32')
    X_test = X_test.astype('float32')
    y_train = y_train.astype('float32')
    y_test = y_test.astype('float32')
else:
    X_train = X_train.astype(np.float32)
    X_test = X_test.astype(np.float32)
    y_train = y_train.astype(np.float32)
    y_test = y_test.astype(np.float32)

# ✅ Standardize features
scaler = cuScaler() if GPU_AVAILABLE else StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert columns to explicit string names (FIX HERE)
if GPU_AVAILABLE:
    # For cuDF
    X_train_scaled.columns = X_train_scaled.columns.astype('str')
    X_test_scaled.columns = X_test_scaled.columns.astype('str')
else:
    # For pandas
    X_train_scaled.columns = X_train_scaled.columns.astype(str)
    X_test_scaled.columns = X_test_scaled.columns.astype(str)


# ✅ Choose the model
params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "n_estimators": 100,
    "random_state": 42,
    "device": "cuda"
}

if GPU_AVAILABLE:
    print("🚀 Using GPU-Accelerated XGBoost")
    params["tree_method"] = "hist"  # Enable GPU
else:
    print("⚡ Using CPU-based XGBoost")
    params["tree_method"] = "hist"

model = xgb.XGBClassifier(**params)

# ✅ Train the model
model.fit(X_train_scaled, y_train)

# ✅ Predictions and conversion to CPU if necessary
if GPU_AVAILABLE:
    # Predictions are already in NumPy array format, no need for .to_numpy()
    y_pred = model.predict(X_test_scaled)
    # Only convert y_test if it's a cudf Series
    y_test = y_test.to_numpy()  # Convert cudf to NumPy
else:
    y_pred = model.predict(X_test_scaled)  # No conversion needed here either

# ✅ Evaluate the model
from sklearn.metrics import accuracy_score, classification_report
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Save model
joblib.dump(model, "xgb_model.joblib")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import xgboost as xgb
from sklearn.metrics import confusion_matrix, roc_curve, auc

# 1. Confusion Matrix
plt.figure(figsize=(10, 8))
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# 2. ROC Curve & AUC Score
y_pred_proba = model.predict_proba(X_test)[:, 1]

fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

# # 3. Feature Importance Plot
# feature_importance = model.feature_importances_
# feature_names = X_train.columns.to_numpy()

# # Sort features by importance
# sorted_idx = np.argsort(feature_importance)
# sorted_features = feature_names[sorted_idx]
# sorted_importance = feature_importance[sorted_idx]

# # Plot top 20 features
# plt.figure(figsize=(12, 8))
# plt.barh(range(20), sorted_importance[-20:])
# plt.yticks(range(20), sorted_features[-20:])
# plt.xlabel('Feature Importance')
# plt.title('Top 20 Most Important Features')
# plt.tight_layout()
# plt.show()
