In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.sparse import load_npz

# Modeling
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_curve, auc
)
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import label_binarize, StandardScaler

# Save model
import joblib


In [1]:
# Load the pre-computed TF-IDF feature matrices (SciPy sparse .npz files),
# one per data split.
X_train = load_npz("train_features_tfidf.npz")
X_val = load_npz("val_features_tfidf.npz")
X_test = load_npz("test_features_tfidf.npz")

# Load the target column ('medical_specialty') for each split.
y_train = pd.read_csv("train_labels.csv")['medical_specialty']
y_val = pd.read_csv("val_labels.csv")['medical_specialty']
y_test = pd.read_csv("test_labels.csv")['medical_specialty']

# Fail fast on misaligned inputs: every feature matrix must have exactly one
# row per label, and all splits must share the same feature dimension.
# Without these checks a mismatch only surfaces later (at fit/predict time),
# or never for a split that is loaded but not used.
assert X_train.shape[0] == y_train.shape[0], "train: feature/label row count mismatch"
assert X_val.shape[0] == y_val.shape[0], "val: feature/label row count mismatch"
assert X_test.shape[0] == y_test.shape[0], "test: feature/label row count mismatch"
assert X_train.shape[1] == X_val.shape[1] == X_test.shape[1], (
    "train/val/test feature matrices have different numbers of columns"
)

print("Shapes:")
print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("X_val:", X_val.shape, "y_val:", y_val.shape)
print("X_test:", X_test.shape, "y_test:", y_test.shape)

NameError: name 'load_npz' is not defined

In [None]:
# Softmax (multinomial) logistic regression.
# `multi_class` is deliberately not passed: the parameter is deprecated in
# recent scikit-learn releases, and with the lbfgs solver a target with more
# than two classes is already fit with the multinomial loss by default.
# NOTE(review): assumes 'medical_specialty' has more than two classes - confirm.
log_reg = LogisticRegression(solver="lbfgs", max_iter=500)

# Candidate hyper-parameters: inverse regularization strength C on a log
# scale, with the L2 penalty (the only value searched).
param_grid = {
    "C": [0.01, 0.1, 1, 10, 100],
    "penalty": ["l2"]
}

# Exhaustive search over param_grid, scored by accuracy.
grid_search = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    cv=3,   # reduced for speed, increase if needed
    scoring="accuracy",
    n_jobs=-1  # use all available CPU cores
)
grid_search.fit(X_train, y_train)

# Estimator refit on the full training split with the best parameter set.
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

: 

In [None]:
# Score the tuned model with 3-fold cross-validation on the training split.
cv_scores = cross_val_score(
    best_model,
    X_train,
    y_train,
    cv=3,
    scoring="accuracy",
)

print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

# Box plot of the per-fold accuracies, drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(x=cv_scores, ax=ax)
ax.set_title("Cross-Validation Accuracy Distribution")
ax.set_xlabel("Accuracy")
plt.show()

In [None]:
# Serialize the tuned estimator to disk with joblib.
joblib.dump(value=best_model, filename="softmax_regression_model.pkl")
print("✅ Model saved as softmax_regression_model.pkl")


In [None]:
# Predict labels for the test split with the tuned model.
y_pred = best_model.predict(X_test)

# Weighted averages weight each class's score by its support in y_test.
# zero_division=0 makes the handling of classes with no predicted (or no true)
# samples explicit: they score 0, exactly as with the default "warn" setting,
# but without an UndefinedMetricWarning being emitted per call.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average="weighted", zero_division=0))
print("Recall:", recall_score(y_test, y_pred, average="weighted", zero_division=0))
print("F1-score:", f1_score(y_test, y_pred, average="weighted", zero_division=0))

# Per-class precision / recall / F1 / support table.
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))


In [None]:
# Pin the row/column order of the matrix to the model's own class list.
# Without `labels=`, confusion_matrix uses only the labels that occur in
# y_test or y_pred, so a class that is missing from both would make the matrix
# smaller than `best_model.classes_` and the tick labels below would no longer
# line up with the cells.
# NOTE(review): test rows whose true label is not in `best_model.classes_`
# are left out of the matrix - confirm that no such labels exist.
class_labels = best_model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Annotated heatmap: rows are true classes, columns are predicted classes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=class_labels,
            yticklabels=class_labels,
            ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix Heatmap")
plt.show()