In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from hpsvm import HPSVM
from mpi4py import MPI

In [9]:
# Read the preprocessed mushroom dataset from the relative ../data directory.
mushroom_csv = '../data/preprocessing_mushroom.csv'
df = pd.read_csv(mushroom_csv)

In [11]:
# Alias only (no copy): df_encoded is bound to the same DataFrame object as df.
df_encoded = df

In [55]:
# Grab the MPI world communicator, then record this process's rank and the
# total number of processes in that communicator.
comm = MPI.COMM_WORLD
rank, size = comm.Get_rank(), comm.Get_size()

In [56]:
# Separate the 'poisonous' target column from the remaining feature columns.
y = df['poisonous']
X = df.drop(columns='poisonous')

# Hold out 20% of the rows for testing. The split is stratified on the target
# and uses a fixed seed so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit the scaler on the training features only, then reuse those fitted
# statistics to transform the test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Report the split sizes and show the first five scaled training rows.
print(f"Generated dataset with {X_train.shape[0]} training samples and {X_test.shape[0]} test samples")
print("Sample of scaled training data:")
print(X_train_scaled[:5])

Generated dataset with 6499 training samples and 1625 test samples
Sample of scaled training data:
[[-0.8427199   0.95985383  1.76225743 -0.84410691 -1.01316068  0.15988792
  -0.43722499 -0.66216943 -0.79870133 -1.14183755 -0.9719434  -0.93385772
  -0.88530978  0.10240489 -3.00957552  0.          0.14091952 -0.26005365
  -0.16455926 -1.08468287  1.08101476 -0.29096328]
 [ 1.02608241  0.14802535  0.19169813  1.18468406  0.41525741  0.15988792
  -0.43722499 -0.66216943 -1.08119167 -1.14183755  1.19897886  0.68544359
   0.59225269 -1.98881066  0.63177073  0.          0.14091952  3.41405222
  -1.27685318  1.43729696 -2.11712564  2.61464572]
 [-2.08858811  0.14802535 -0.59358151 -0.84410691  0.41525741  0.15988792
   2.28715199 -0.66216943  1.46122137 -1.14183755  1.19897886 -0.93385772
   0.59225269  0.62520877  0.63177073  0.          0.14091952  3.41405222
   0.94773467  1.43729696 -0.51805544 -0.29096328]
 [-0.8427199   0.14802535 -0.20094169 -0.84410691  1.36753614  0.15988792
  -0.437

In [57]:
# Broadcast rank 0's scaled features and labels so that every MPI process
# ends up with the root's copies. The broadcasts run in the same order as
# the names on the left-hand side.
X_train_scaled, X_test_scaled, y_train, y_test = (
    comm.bcast(payload, root=0)
    for payload in (X_train_scaled, X_test_scaled, y_train, y_test)
)

In [58]:
# Instantiate the HPSVM classifier imported from hpsvm.py with explicit
# tau, tol and max_iter settings; all other options keep their defaults.
hpsvm_params = {"tau": 1.0, "tol": 1e-4, "max_iter": 50}
hpsvm = HPSVM(**hpsvm_params)

2025-03-12 19:29:46,642 - HPSVM - INFO - Initialized HPSVM with 1 nodes
2025-03-12 19:29:46,642 - HPSVM - INFO - Parameters: tau=1.0, tol=0.0001, max_iter=50, kernel=linear


In [59]:
# Train the HPSVM model on the scaled training features and their labels.
# NOTE(review): the log recorded for this cell shows the duality gap levelling
# off near 0.5247 with a step size of 0.000000 from iteration 15 onward —
# confirm that the solver actually converges before trusting the model.
hpsvm.fit(X_train_scaled, y_train)

2025-03-12 19:29:46,652 - HPSVM - INFO - Starting HPSVM training with 6499 samples and 22 features
2025-03-12 19:29:46,686 - HPSVM - INFO - Data distributed among 1 nodes
2025-03-12 19:29:46,785 - HPSVM - INFO - Iteration 0: duality gap = 0.596438, step size = 0.234427
2025-03-12 19:29:46,837 - HPSVM - INFO - Iteration 5: duality gap = 0.496654, step size = 0.000002
2025-03-12 19:29:46,876 - HPSVM - INFO - Iteration 10: duality gap = 0.524695, step size = 0.000002
2025-03-12 19:29:46,909 - HPSVM - INFO - Iteration 15: duality gap = 0.524743, step size = 0.000000
2025-03-12 19:29:46,959 - HPSVM - INFO - Iteration 20: duality gap = 0.524743, step size = 0.000000
2025-03-12 19:29:47,002 - HPSVM - INFO - Iteration 25: duality gap = 0.524743, step size = 0.000000
2025-03-12 19:29:47,042 - HPSVM - INFO - Iteration 30: duality gap = 0.524743, step size = 0.000000
2025-03-12 19:29:47,093 - HPSVM - INFO - Iteration 35: duality gap = 0.524743, step size = 0.000000
2025-03-12 19:29:47,126 - HPSVM

<hpsvm.HPSVM at 0x1ddda4c5880>

In [60]:
# Predict labels for the scaled test features with the trained model.
y_pred = hpsvm.predict(X_test_scaled)

In [61]:
# The model emits -1 for one class while the true labels use 0, so relabel
# every -1 prediction as 0 and leave all other predictions untouched.
negative_mask = y_pred == -1
y_pred = np.where(negative_mask, 0, y_pred)

In [72]:
# Score the held-out predictions: overall accuracy first, followed by the
# per-class precision / recall / F1 report.
accuracy = accuracy_score(y_test, y_pred)
print(f"Test accuracy: {accuracy:.4f}", 'Classification Report:', sep='\n')
print(classification_report(y_test, y_pred))

Test accuracy: 0.9532
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.96      0.96       842
           1       0.95      0.95      0.95       783

    accuracy                           0.95      1625
   macro avg       0.95      0.95      0.95      1625
weighted avg       0.95      0.95      0.95      1625



### This model was only partially correct at first: it predicts -1 for the "not poisonous" class, while the true labels use 0, so the predictions could not be compared with the labels directly and we were getting around 50% accuracy. To fix this we converted the predictions to binary (0/1) labels, which are also easier to read. The improvement comes from removing this label mismatch rather than from a change in the model itself: after changing the -1 values in y_pred to 0, the accuracy went up to 95%.