In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix

# Load the pre-split datasets. Both CSVs are expected to contain a 'strength'
# column, which is used as the classification target below.
train_df = pd.read_csv('training-password-data.csv')
test_df = pd.read_csv('testing-password-data.csv')

# Targets: cast to str so the class labels are handled as names, not numbers.
y_train = train_df['strength'].astype(str)
y_test = test_df['strength'].astype(str)

# Features: everything except the target, one-hot encoded by get_dummies.
# The test matrix is re-indexed onto the training columns so both matrices
# share one layout: dummy columns missing from the test set are filled with 0,
# and dummy columns that occur only in the test set are dropped.
X_train = pd.get_dummies(train_df.drop(columns=['strength']))
X_test = (
    pd.get_dummies(test_df.drop(columns=['strength']))
    .reindex(columns=X_train.columns, fill_value=0)
)

# Label encoder fitted on the training labels only; the evaluation cell uses
# it to turn true/predicted labels into integer codes.
le = LabelEncoder()
le.fit(y_train)

# Fit Gaussian naive Bayes on the training split and predict the test split.
nb = GaussianNB().fit(X_train, y_train)
y_pred = nb.predict(X_test)


In [2]:
# Evaluation summary for the naive Bayes predictions.
# Inputs from the training cell: y_test (true labels), y_pred (predicted
# labels) and le (LabelEncoder fitted on the training labels).

# Cell-local imports, placed before first use so the cell's dependencies are
# visible at a glance.
import numpy as np
from sklearn.metrics import (
    cohen_kappa_score,
    mean_absolute_error,
    mean_squared_error
)

# --- Accuracy counts -------------------------------------------------------
total      = len(y_test)
correct    = (y_pred == y_test).sum()
incorrect  = total - correct
pct_corr   = correct   / total * 100
pct_incorr = incorrect / total * 100

# --- Error metrics on integer label codes ----------------------------------
# le.transform maps each label to its position in le.classes_, so MAE / RMSE
# below measure the distance between class *codes*. That is only meaningful
# if the sorted class order is also the intended ordinal order.
# NOTE(review): the report layout resembles Weka's summary, where MAE/RMSE are
# derived from predicted class probabilities -- confirm which definition is
# wanted before comparing numbers across tools.
y_true_int = le.transform(y_test)
y_pred_int = le.transform(y_pred)

kappa = cohen_kappa_score(y_test, y_pred)
mae   = mean_absolute_error(y_true_int, y_pred_int)
rmse  = np.sqrt(mean_squared_error(y_true_int, y_pred_int))

# Relative errors compare the model against a baseline that always predicts
# the mean true code. They are ratios (1.0 == as bad as the baseline), not
# percentages.
mean_true     = np.mean(y_true_int)
baseline_mae  = np.mean(np.abs(y_true_int - mean_true))
baseline_rmse = np.sqrt(np.mean((y_true_int - mean_true)**2))

# When every true label is the same class the baseline error is 0 and the
# ratio is undefined; report NaN explicitly instead of dividing by zero.
rae  = mae  / baseline_mae  if baseline_mae  > 0 else float("nan")
rrse = rmse / baseline_rmse if baseline_rmse > 0 else float("nan")

# --- Report ----------------------------------------------------------------
print("=== Summary ===")
print(f"Correctly Classified Instances   {correct}   {pct_corr:.4f} %")
print(f"Incorrectly Classified Instances {incorrect}   {pct_incorr:.4f} %")
print(f"Kappa statistic                  {kappa:.3f}")
print(f"Mean absolute error              {mae:.4f}")
print(f"Root mean squared error          {rmse:.4f}")
print(f"Relative absolute error          {rae:.4f}")
print(f"Root relative squared error      {rrse:.4f}")
print(f"Total Number of Instances        {total}")

print("\n=== Detailed Accuracy By Class ===")
print(classification_report(y_test, y_pred, digits=4))

# Rows are true classes, columns are predicted classes, both in le.classes_
# order.
print("=== Confusion Matrix ===")
cm = confusion_matrix(y_test, y_pred, labels=le.classes_)
print(pd.DataFrame(
    cm,
    index=[f"actual={c}" for c in le.classes_],
    columns=[f"pred={c}"   for c in le.classes_]
))


=== Summary ===
Correctly Classified Instances   423831   79.7469 %
Incorrectly Classified Instances 107639   20.2531 %
Kappa statistic                  0.530
Mean absolute error              0.2030
Root mean squared error          0.4516
Relative absolute error          0.7640
Root relative squared error      0.8956
Total Number of Instances        531470

=== Detailed Accuracy By Class ===
              precision    recall  f1-score   support

           0     0.9945    0.3550    0.5232     71711
           1     0.8792    0.8451    0.8618    396208
           2     0.5082    1.0000    0.6739     63551

    accuracy                         0.7975    531470
   macro avg     0.7940    0.7333    0.6863    531470
weighted avg     0.8504    0.7975    0.7937    531470

=== Confusion Matrix ===
          pred=0  pred=1  pred=2
actual=0   25457   46000     254
actual=1     140  334824   61244
actual=2       0       1   63550
