In [1]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import RandomOverSampler
import joblib
import warnings
warnings.filterwarnings("ignore")


In [4]:
# Load Dataset
df = pd.read_csv('./data/cleaned_data.csv')  # replace with your actual file if needed
df.head()

Unnamed: 0,ticket_id,text,priority_label
0,1001,agent every development say opportunity behavi...,Low
1,1002,behavior benefit suggest page movie win need s...,Low
2,1003,relate animal direction eye bag law street cla...,High
3,1004,left establish understand read successful simp...,Low
4,1005,central cause seat much section investment you...,Low


In [6]:
# Label Encode
le = LabelEncoder()
df['encoded_priority'] = le.fit_transform(df['priority_label'])

X = df['text']
y = df['encoded_priority']

label_map = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label mapping:", label_map)

Label mapping: {'Critical': 0, 'High': 1, 'Low': 2, 'Medium': 3}


In [8]:
# Text Vectorization using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X_vec = vectorizer.fit_transform(X)

# Save vectorizer for use in Streamlit
joblib.dump(vectorizer, './models/tfidf_vectorizer.pkl')

['./models/tfidf_vectorizer.pkl']

In [9]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42, stratify=y)


In [10]:
# Handle Class Imbalance
ros = RandomOverSampler(random_state=42)
X_train_bal, y_train_bal = ros.fit_resample(X_train, y_train)

# Optional check
pd.Series(y_train_bal).value_counts()

encoded_priority
3    57
2    57
0    57
1    57
Name: count, dtype: int64

In [11]:
# Train & Calibrate RandomForest Classifier
rf = RandomForestClassifier(class_weight='balanced', n_estimators=150, random_state=42)
calibrated_model = CalibratedClassifierCV(estimator=rf, method='sigmoid', cv=5)
calibrated_model.fit(X_train_bal, y_train_bal)

In [12]:
# Evaluate Model
y_pred = calibrated_model.predict(X_test)
y_proba = calibrated_model.predict_proba(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

    Critical       0.00      0.00      0.00         3
        High       0.33      0.12      0.18         8
         Low       0.30      0.47      0.37        15
      Medium       0.29      0.29      0.29        14

    accuracy                           0.30        40
   macro avg       0.23      0.22      0.21        40
weighted avg       0.28      0.30      0.27        40

Confusion Matrix:
 [[0 0 3 0]
 [0 1 4 3]
 [0 1 7 7]
 [0 1 9 4]]


In [13]:
# Save the Model and Label Encoder
joblib.dump(calibrated_model, 'ticket_priority_model.pkl')
joblib.dump(le, './models/label_encoder.pkl')

['./models/label_encoder.pkl']