# Importing Necessary Libraries


In [1]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Load the dataset and clean it


In [2]:
# Steps 1-4 as a single chain, so `df` is assigned exactly once and
# re-running this cell always starts again from the CSV on disk.
df = (
    # 1. Load dataset
    pd.read_csv("/kaggle/input/itchatbot/all_tickets_processed_improved_v3.csv")
    # 2. Keep only required columns
    .loc[:, ["Document", "Topic_group"]]
    # 3. Rename for convenience
    .rename(columns={"Document": "text", "Topic_group": "category"})
    # 4. Drop missing values (any row with a NaN in either column)
    .dropna()
)

# Splitting data into X and y, then into train and test sets

In [3]:
# 5. Split into X (ticket text) and y (category label), both cast to str
X, y = (df[column].astype(str) for column in ("text", "category"))

# 6. Train-test split: hold out 20% for testing, stratified on the label,
#    with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Vectorization and training


In [4]:
# 7. Vectorize text: TF-IDF over unigrams and bigrams, English stop words
#    removed, vocabulary capped at 10,000 features
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2), max_features=10000, stop_words="english"
)

# Fit on the training split only; the test split is transformed with the
# already-fitted vectorizer.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# 8. Train classifier (fit() returns the fitted estimator itself)
model = LogisticRegression(max_iter=2000).fit(X_train_vec, y_train)

# Model Evaluation

In [5]:
# Predict categories for the held-out test vectors, then print the
# per-class precision / recall / F1 report against the true labels.
y_pred = model.predict(X_test_vec)
report = classification_report(y_test, y_pred)

print("Classification Report:\n")
print(report)


Classification Report:

                       precision    recall  f1-score   support

               Access       0.92      0.88      0.90      1425
Administrative rights       0.89      0.64      0.74       352
           HR Support       0.86      0.87      0.87      2183
             Hardware       0.79      0.89      0.84      2724
     Internal Project       0.91      0.80      0.85       424
        Miscellaneous       0.83      0.81      0.82      1412
             Purchase       0.98      0.87      0.92       493
              Storage       0.95      0.83      0.89       555

             accuracy                           0.85      9568
            macro avg       0.89      0.82      0.85      9568
         weighted avg       0.86      0.85      0.85      9568



# Save model and vectorizer


In [9]:
# Persist both fitted objects: the vectorizer is needed at inference time to
# turn raw text into the feature matrix the model was trained on.
vectorizer_path = "/kaggle/working/vectorizer.joblib"
model_path = "/kaggle/working/model.joblib"

joblib.dump(vectorizer, vectorizer_path)
joblib.dump(model, model_path)

['/kaggle/working/model.joblib']