In [14]:
import pandas as pd
import numpy as np
import joblib
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# --- Configuration ---
# CSV files holding the train / validation / test splits.
TRAIN_PATH = "train.csv"
VAL_PATH = "val.csv"
TEST_PATH = "test.csv"

# Destination file for the serialized model artifact.
MODEL_SAVE_PATH = "sbert_logreg_model.pkl"

# SentenceTransformer checkpoint name; picked as a small, fast, effective model.
SBERT_MODEL_NAME = "all-MiniLM-L6-v2"

In [15]:
def load_data(path):
    """Read one labelled split from a CSV file.

    Parameters
    ----------
    path : str or path-like
        CSV file that must contain an ``email`` column (the text) and an
        ``intent`` column (the label).

    Returns
    -------
    tuple[list[str], list]
        ``(texts, labels)``: two lists of equal length, in file order.

    Raises
    ------
    KeyError
        If the ``email`` or ``intent`` column is absent from the file.

    Notes
    -----
    Rows where either field is empty are dropped (with a warning), because
    pandas reads empty cells as float NaN and a NaN is neither usable text
    nor a usable label. For files without missing values the result is the
    same as a plain column-to-list conversion.
    """
    import warnings

    required = ['email', 'intent']
    df = pd.read_csv(path)

    # Fail early with a message that names the file and the columns found.
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(
            f"{path}: missing required column(s) {missing}; found {list(df.columns)}"
        )

    complete = df.dropna(subset=required)
    n_dropped = len(df) - len(complete)
    if n_dropped:
        warnings.warn(
            f"{path}: dropped {n_dropped} row(s) with a missing email or intent"
        )

    # Cast to str so every element of the text list is a real string, even
    # if pandas inferred a non-string dtype for the column.
    return complete['email'].astype(str).tolist(), complete['intent'].tolist()

print("--- 1. Loading Data ---")
# Read the three splits in a fixed order (train, validation, test) and
# unpack each (texts, labels) pair into its own pair of names.
(X_train_text, y_train), (X_val_text, y_val), (X_test_text, y_test) = (
    load_data(split_path) for split_path in (TRAIN_PATH, VAL_PATH, TEST_PATH)
)

--- 1. Loading Data ---


In [16]:
print(f"--- 2. Generating Embeddings ({SBERT_MODEL_NAME}) ---")
# Instantiate the sentence encoder once; it is reused for every split.
encoder = SentenceTransformer(SBERT_MODEL_NAME)

# Each split gets its own encode() call (train, then validation, then test),
# each with its own progress bar. Keeping the calls separate is meant to
# mirror how unseen text would be embedded on its own.
X_train_emb, X_val_emb, X_test_emb = (
    encoder.encode(split_texts, show_progress_bar=True)
    for split_texts in (X_train_text, X_val_text, X_test_text)
)

--- 2. Generating Embeddings (all-MiniLM-L6-v2) ---


Batches:   0%|          | 0/123 [00:00<?, ?it/s]

Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Batches:   0%|          | 0/33 [00:00<?, ?it/s]

In [17]:
# Fit a logistic-regression classifier on the training embeddings only.
clf = LogisticRegression(C=1.0, max_iter=1000).fit(X_train_emb, y_train)

# Score the validation split: overall accuracy plus a per-class report.
val_predictions = clf.predict(X_val_emb)
val_accuracy = accuracy_score(y_val, val_predictions)
val_report = classification_report(y_val, val_predictions)
print("Validation Accuracy:", val_accuracy)
print("Validation Report:\n", val_report)

Validation Accuracy: 0.9005328596802842
Validation Report:
                 precision    recall  f1-score   support

 Account Issue       0.95      0.97      0.96        77
     Complaint       0.91      0.88      0.90        59
Delivery Issue       0.89      0.91      0.90        88
   Order Issue       0.89      0.91      0.90        75
        Others       0.92      0.75      0.83        64
 Payment Issue       0.90      0.89      0.90        73
 Product Issue       0.94      0.96      0.95        53
Refund Request       0.82      0.92      0.87        74

      accuracy                           0.90       563
     macro avg       0.90      0.90      0.90       563
  weighted avg       0.90      0.90      0.90       563



In [18]:
print("\n--- 4. Full Training (Train + Val) ---")
# Stack the train and validation embeddings row-wise, and join the two label
# lists in the same order so that rows and labels stay aligned.
X_full_emb = np.concatenate([X_train_emb, X_val_emb], axis=0)
y_full = [*y_train, *y_val]

# Fit the same `clf` object again, this time on the combined data.
clf.fit(X_full_emb, y_full)
print("Classifier retrained on combined Train + Validation sets.")


--- 4. Full Training (Train + Val) ---
Classifier retrained on combined Train + Validation sets.


In [19]:
print("\n--- 5. Testing (Test Set) ---")
# Score the test split with the current `clf`.
test_predictions = clf.predict(X_test_emb)
test_accuracy = accuracy_score(y_test, test_predictions)
test_report = classification_report(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)
print("Test Report:\n", test_report)

print("\n--- 6. Saving Model ---")
# The artifact bundles the fitted classifier with the *name* of the encoder,
# not the encoder itself. At inference time the SBERT model is loaded
# separately, used to encode the input, and the result is passed to the
# classifier.
artifact = dict(classifier=clf, encoder_name=SBERT_MODEL_NAME)
joblib.dump(artifact, MODEL_SAVE_PATH)
print(f"Model artifact saved to {MODEL_SAVE_PATH}")


--- 5. Testing (Test Set) ---
Test Accuracy: 0.9327014218009478
Test Report:
                 precision    recall  f1-score   support

 Account Issue       0.99      0.99      0.99       153
     Complaint       0.94      0.95      0.94       131
Delivery Issue       0.93      0.99      0.96       121
   Order Issue       0.92      0.92      0.92       131
        Others       0.96      0.87      0.92       119
 Payment Issue       0.97      0.85      0.91       131
 Product Issue       0.93      0.91      0.92       136
Refund Request       0.83      0.96      0.89       133

      accuracy                           0.93      1055
     macro avg       0.93      0.93      0.93      1055
  weighted avg       0.94      0.93      0.93      1055


--- 6. Saving Model ---
Model artifact saved to sbert_logreg_model.pkl
