In [20]:
import pandas as pd
# Load the preprocessed ticket dataset and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="../data/processed/clean_tickets.csv")
df.head(n=5)


Unnamed: 0,subject,body,answer,type,queue,priority,language,version,tag_1,tag_2,tag_3,tag_4,tag_5,tag_6,tag_7,tag_8,clean_subject,clean_body,text
0,Wesentlicher Sicherheitsvorfall,"Sehr geehrtes Support-Team,\n\nich möchte eine...",Vielen Dank für die Meldung des kritischen Sic...,Incident,Technical Support,high,de,51,Security,Outage,Disruption,Data Breach,,,,,wesentlicher sicherheitsvorfall,sehr geehrtes support team n nich m chte einen...,wesentlicher sicherheitsvorfall. sehr geehrtes...
1,Account Disruption,"Dear Customer Support Team,\n\nI am writing to...","Thank you for reaching out, <name>. We are awa...",Incident,Technical Support,high,en,51,Account,Disruption,Outage,IT,Tech Support,,,,account disruption,dear customer support team n ni am writing to ...,account disruption. dear customer support team...
2,Query About Smart Home System Integration Feat...,"Dear Customer Support Team,\n\nI hope this mes...",Thank you for your inquiry. Our products suppo...,Request,Returns and Exchanges,medium,en,51,Product,Feature,Tech Support,,,,,,query about smart home system integration feat...,dear customer support team n ni hope this mess...,query about smart home system integration feat...
3,Inquiry Regarding Invoice Details,"Dear Customer Support Team,\n\nI hope this mes...",We appreciate you reaching out with your billi...,Request,Billing and Payments,low,en,51,Billing,Payment,Account,Documentation,Feedback,,,,inquiry regarding invoice details,dear customer support team n ni hope this mess...,inquiry regarding invoice details. dear custom...
4,Question About Marketing Agency Software Compa...,"Dear Support Team,\n\nI hope this message reac...",Thank you for your inquiry. Our product suppor...,Problem,Sales and Pre-Sales,medium,en,51,Product,Feature,Feedback,Tech Support,,,,,question about marketing agency software compa...,dear support team n ni hope this message reach...,question about marketing agency software compa...


In [21]:
# X: the `text` column used as model input; y: the `type` column used as the class label.
X, y = df["text"], df["type"]


In [22]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is seeded for repeatability and
# stratified on y so both partitions keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)


In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Baseline: TF-IDF over unigrams and bigrams (vocabulary capped at 5000 terms),
# fitted on the training split only and then applied to the test split.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# `fit` returns the estimator itself, so fitting and binding can be chained.
model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
model


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Final feature extractor: a larger TF-IDF vocabulary than the baseline, with
# sublinear term-frequency scaling (tf is replaced by 1 + log(tf)).
tfidf = TfidfVectorizer(
    max_features=20000,        # larger vocabulary
    ngram_range=(1, 2),        # unigrams + bigrams
    sublinear_tf=True          # dampens the weight of very frequent terms
)

# Fit the vocabulary/IDF on the training split only, then reuse it for the test
# split so no information from the held-out rows leaks into the features.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# class_weight="balanced" reweights classes inversely to their frequency.
# random_state is pinned (same seed as the train/test split) so the fitted model
# stays reproducible even when dual="auto" selects the randomized dual solver.
model = LinearSVC(class_weight="balanced", random_state=42)
model.fit(X_train_tfidf, y_train)


0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,verbose,0


In [25]:
from sklearn.metrics import accuracy_score, classification_report

# Score the current `model` on the held-out split: overall accuracy first,
# then the per-class precision / recall / F1 table.
y_pred = model.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_pred)
per_class_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print(per_class_report)


Accuracy: 0.8490731024833857
              precision    recall  f1-score   support

      Change       0.96      0.97      0.97       584
    Incident       0.81      0.83      0.82      2293
     Problem       0.66      0.64      0.65      1203
     Request       0.99      0.99      0.99      1638

    accuracy                           0.85      5718
   macro avg       0.86      0.86      0.86      5718
weighted avg       0.85      0.85      0.85      5718



In [26]:
import joblib
import os

# Persist the fitted classifier together with the vectorizer it was trained on;
# both are needed at inference time. The target directory is created if absent.
os.makedirs("../models", exist_ok=True)
for artifact, target in (
    (model, "../models/ticket_model.pkl"),
    (tfidf, "../models/tfidf_vectorizer.pkl"),
):
    joblib.dump(artifact, target)

print("Model & vectorizer saved successfully!")


Model & vectorizer saved successfully!


In [27]:
import sys, os
# Make the parent directory importable from this notebook.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
parent_dir = os.path.abspath("..")
if parent_dir not in sys.path:
    sys.path.append(parent_dir)


In [28]:
from joblib import load
from src.preprocessing.text_cleaning import clean_text

# Reload the persisted vectorizer and classifier from disk so inference below
# runs against the saved artifacts rather than the in-memory training objects.
tfidf = load("../models/tfidf_vectorizer.pkl")
model = load("../models/ticket_model.pkl")

def predict_ticket(subject, body):
    """Predict the ticket type for a single subject/body pair.

    Both fields are passed through ``clean_text`` and joined as
    ``"<subject>. <body>"``. The result is vectorized with the module-level
    ``tfidf`` and classified with the module-level ``model``.

    Args:
        subject: Ticket subject, passed to ``clean_text``.
        body: Ticket body, passed to ``clean_text``.

    Returns:
        The first (and only) element of ``model.predict`` for this ticket.
    """
    cleaned_parts = [clean_text(subject), clean_text(body)]
    features = tfidf.transform([". ".join(cleaned_parts)])
    predictions = model.predict(features)
    return predictions[0]

# Smoke test: classify one hand-written example ticket and show the label.
sample_prediction = predict_ticket(
    "Payment issue",
    "Amount was debited but order failed",
)
print(sample_prediction)


Incident
