In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# ML Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier

In [2]:
# --- Read the dataset CSV into a DataFrame ---
df = pd.read_csv("random_augmented_balanced_dataset.csv")

# --- Stratified 80/10/10 partition on the 'intent' column ---
# Step 1: keep 80% for training and set the remaining 20% aside.
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['intent'],
    random_state=42,
)
# Step 2: cut the 20% remainder in half -> 10% validation, 10% test.
# The same seed is used for both calls so the partition is repeatable.
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['intent'],
    random_state=42,
)

In [3]:
# === TF-IDF feature extraction ===
# The vectorizer is fitted on the training text only; the test text is merely
# transformed with the already-fitted vectorizer, so nothing is learned from
# the held-out split.
vectorizer = TfidfVectorizer()

# Training split: feature matrix from 'text', labels from 'intent'.
X_train, y_train = vectorizer.fit_transform(train_df['text']), train_df['intent']

# Test split: same columns, transform only.
X_test, y_test = vectorizer.transform(test_df['text']), test_df['intent']

In [4]:
# === Models to train ===
# Maps a human-readable model name to an unfitted estimator. The name is used
# later for log lines and for the confusion-matrix file names.
#
# Every estimator that accepts a seed and uses randomness during fitting gets
# random_state=42 (the same seed as the train/val/test split) so that a fresh
# "Restart & Run All" reproduces the same metrics. The remaining estimators
# are left at their defaults.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Linear SVC": LinearSVC(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Naive Bayes": MultinomialNB(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Stochastic Gradient Descent": SGDClassifier(random_state=42)
}

In [5]:
# === Training and Evaluation with Confusion Matrices ===
# For each entry in `models`: fit on the training features, predict on the
# test features, print accuracy and macro-averaged precision/recall/F1, save a
# confusion-matrix heatmap as a PNG, and finally write all metrics to one CSV.
#
# NOTE(review): the models are compared on the test split; `val_df` is not
# used in the cells shown here. Confirm whether model selection was meant to
# use the validation split instead.

# One metrics row (dict) per model; rebuilt from scratch on every run of this cell.
results = []

# Sorted class labels fix the row/column order of every confusion matrix and
# its tick labels. They do not depend on the model, so compute them once
# instead of three times per loop iteration.
class_labels = sorted(y_test.unique())

for name, model in models.items():
    print(f"\n🧠 Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Macro averaging: per-class scores are averaged with equal weight.
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='macro')
    rec = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')

    print(f"📊 {name} Results:")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1-Score: {f1:.4f}")

    # Append results to the table
    results.append({
        'Model': name,
        'Accuracy': acc,
        'Precision': prec,
        'Recall': rec,
        'F1-Score': f1
    })

    # Create and save confusion matrix. An explicit Figure/Axes pair is used
    # so the plot never depends on pyplot's implicit "current figure", and the
    # exact figure is closed afterwards to release its memory.
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_labels,
                yticklabels=class_labels,
                ax=ax)
    ax.set_title(f'Confusion Matrix: {name}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    fig.tight_layout()
    # File name is the model name lower-cased with spaces replaced by underscores.
    fig.savefig(f"confusion_matrix_{name.replace(' ', '_').lower()}.png")
    plt.close(fig)

# Save the metrics to CSV
results_df = pd.DataFrame(results)
results_df.to_csv("ml_model_results_tfidf.csv", index=False)

print("\n✅ All metrics saved to 'ml_model_results_tfidf.csv'")
print("📊 Confusion matrices saved as PNG files (one per model)")


🧠 Training Logistic Regression...
📊 Logistic Regression Results:
Accuracy: 0.6141
Precision: 0.6070
Recall: 0.6081
F1-Score: 0.6058

🧠 Training Linear SVC...
📊 Linear SVC Results:
Accuracy: 0.6969
Precision: 0.6899
Recall: 0.6910
F1-Score: 0.6897

🧠 Training Random Forest...
📊 Random Forest Results:
Accuracy: 0.7898
Precision: 0.7884
Recall: 0.7851
F1-Score: 0.7845

🧠 Training Naive Bayes...
📊 Naive Bayes Results:
Accuracy: 0.5696
Precision: 0.5827
Recall: 0.5600
F1-Score: 0.5604

🧠 Training K-Nearest Neighbors...
📊 K-Nearest Neighbors Results:
Accuracy: 0.5504
Precision: 0.5462
Recall: 0.5433
F1-Score: 0.5374

🧠 Training Stochastic Gradient Descent...
📊 Stochastic Gradient Descent Results:
Accuracy: 0.6446
Precision: 0.6375
Recall: 0.6374
F1-Score: 0.6332

✅ All metrics saved to 'ml_model_results_tfidf.csv'
📊 Confusion matrices saved as PNG files (one per model)
