In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import (
    classification_report, accuracy_score, precision_score,
    recall_score, f1_score, confusion_matrix
)
import matplotlib.pyplot as plt
import seaborn as sns

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier

In [2]:
# === Load Dataset ===
# The CSV must provide an 'intent' column (label, used for stratification below)
# and a 'text' column (consumed by the vectorization cell).
df = pd.read_csv("random_augmented_balanced_dataset.csv")

# === Split Dataset (stratified 80/10/10) ===
# Step 1: keep 80% for training and hold out 20%, preserving intent proportions.
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["intent"],
    random_state=42,
)
# Step 2: halve the held-out 20% into validation (10%) and test (10%).
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df["intent"],
    random_state=42,
)
# NOTE(review): the file name suggests the data was augmented before splitting.
# If variants of one source utterance can land in both train and test, the test
# metrics will be optimistic -- TODO confirm how the augmentation was produced.


In [3]:
# === Vectorization using Bag of Words ===
# Targets come straight from the 'intent' column of each split.
y_train, y_test = train_df["intent"], test_df["intent"]

# The vocabulary is learned from the training texts only; the test texts are
# then encoded against that fixed vocabulary so no test information leaks
# into the features.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_df["text"])
X_test = vectorizer.transform(test_df["text"])

In [4]:
# === Models ===
# Seed handed to every estimator that accepts `random_state`, so that a
# Restart & Run All reproduces the same metrics. 42 mirrors the value the
# split cell passes to train_test_split.
MODEL_SEED = 42

# Display name -> unfitted estimator. The display name is what the training
# cell prints, stores in the results table and turns into the PNG file name.
models = {
    # random_state only matters for LogisticRegression's stochastic solvers;
    # passing it keeps the run reproducible whichever solver is in effect.
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=MODEL_SEED),
    "Linear SVC": LinearSVC(random_state=MODEL_SEED),
    "Random Forest": RandomForestClassifier(random_state=MODEL_SEED),
    # MultinomialNB and KNeighborsClassifier take no random_state argument.
    "Naive Bayes": MultinomialNB(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Stochastic Gradient Descent": SGDClassifier(random_state=MODEL_SEED),
}

In [5]:
# === Results List ===
# One metrics dict per model; converted to a DataFrame once after the loop.
results = []

# Class order shared by the rows, columns and tick labels of every confusion
# matrix. It depends only on the test labels, so it is computed once here
# rather than three times per model inside the loop.
class_labels = sorted(y_test.unique())

# NOTE(review): models are fitted on the training split and scored directly on
# the test split; val_df from the split cell is not used in this cell.

# === Training and Evaluation with Confusion Matrices ===
for name, model in models.items():
    print(f"\n🧠 Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Macro averaging gives every intent the same weight. zero_division=0
    # yields the same score sklearn would report for a class that is never
    # predicted, but without emitting an UndefinedMetricWarning per model.
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='macro', zero_division=0)
    rec = recall_score(y_test, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

    print(f"📊 {name} Results:")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1-Score: {f1:.4f}")

    # Store results
    results.append({
        'Model': name,
        'Accuracy': acc,
        'Precision': prec,
        'Recall': rec,
        'F1-Score': f1
    })

    # Confusion matrix: rows are actual intents, columns are predicted intents.
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)

    # Draw on an explicit figure/axes pair so the plot never lands on a
    # leftover pyplot figure, and close exactly that figure afterwards to keep
    # memory flat across models.
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Purples',
                xticklabels=class_labels,
                yticklabels=class_labels,
                ax=ax)
    ax.set_title(f'Confusion Matrix: {name} (BoW)')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    fig.tight_layout()
    # File name is the lower-cased model name with spaces replaced by '_'.
    fig.savefig(f"confusion_matrix_bow_{name.replace(' ', '_').lower()}.png")
    plt.close(fig)

# Save metrics to CSV
results_df = pd.DataFrame(results)
results_df.to_csv("ml_model_results_bow.csv", index=False)

print("\n✅ BoW Results saved to 'ml_model_results_bow.csv'")
print("📊 Confusion matrices saved for each model (BoW)")


🧠 Training Logistic Regression...
📊 Logistic Regression Results:
Accuracy: 0.6757
Precision: 0.6695
Recall: 0.6687
F1-Score: 0.6683

🧠 Training Linear SVC...
📊 Linear SVC Results:
Accuracy: 0.6890
Precision: 0.6840
Recall: 0.6812
F1-Score: 0.6816

🧠 Training Random Forest...
📊 Random Forest Results:
Accuracy: 0.7971
Precision: 0.7952
Recall: 0.7924
F1-Score: 0.7918

🧠 Training Naive Bayes...
📊 Naive Bayes Results:
Accuracy: 0.5716
Precision: 0.5712
Recall: 0.5644
F1-Score: 0.5645

🧠 Training K-Nearest Neighbors...
📊 K-Nearest Neighbors Results:
Accuracy: 0.5318
Precision: 0.5426
Recall: 0.5263
F1-Score: 0.5247

🧠 Training Stochastic Gradient Descent...
📊 Stochastic Gradient Descent Results:
Accuracy: 0.6737
Precision: 0.6613
Recall: 0.6662
F1-Score: 0.6612

✅ BoW Results saved to 'ml_model_results_bow.csv'
📊 Confusion matrices saved for each model (BoW)
