In [24]:
import pandas as pd
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [25]:
df = pd.read_csv("all_tickets_processed_improved_v3.csv")

df.head()


Unnamed: 0,Document,Topic_group
0,connection with icon icon dear please setup ic...,Hardware
1,work experience user work experience user hi w...,Access
2,requesting for meeting requesting meeting hi p...,Hardware
3,reset passwords for external accounts re expir...,Access
4,mail verification warning hi has got attached ...,Miscellaneous


In [26]:
def clean_text(text):
    text = text.lower()                     # lowercase
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # remove symbols/numbers
    return text
df["clean_text"] = df["Document"].apply(clean_text)


In [27]:
vectorizer = TfidfVectorizer(
    stop_words="english",
    max_features=5000
)

X = vectorizer.fit_transform(df["clean_text"])
y = df["Topic_group"]


In [28]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)


In [29]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.8458403010033445
                       precision    recall  f1-score   support

               Access       0.92      0.87      0.89      1455
Administrative rights       0.87      0.68      0.76       342
           HR Support       0.84      0.83      0.84      2107
             Hardware       0.79      0.88      0.83      2760
     Internal Project       0.91      0.80      0.85       451
        Miscellaneous       0.80      0.82      0.81      1400
             Purchase       0.97      0.88      0.92       497
              Storage       0.93      0.84      0.88       556

             accuracy                           0.85      9568
            macro avg       0.88      0.83      0.85      9568
         weighted avg       0.85      0.85      0.85      9568



In [30]:
def assign_priority(category):
    if category in ["Hardware", "Access"]:
        return "High"
    elif category in ["Miscellaneous"]:
        return "Low"
    else:
        return "Medium"


In [31]:
def predict_ticket(ticket_text):
    clean = clean_text(ticket_text)
    vector = vectorizer.transform([clean])
    category = model.predict(vector)[0]
    priority = assign_priority(category)

    return category, priority


In [32]:
ticket = "My laptop is not connecting to the network"
print(predict_ticket(ticket))


('Hardware', 'High')


In [33]:
df.shape
df.columns
df["Topic_group"].value_counts()


Topic_group
Hardware                 13617
HR Support               10915
Access                    7125
Miscellaneous             7060
Storage                   2777
Purchase                  2464
Internal Project          2119
Administrative rights     1760
Name: count, dtype: int64

In [34]:
import pandas as pd
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report