In [1]:
import pandas as pd
import numpy as np

In [2]:
df_emailData = pd.read_csv('Emails_data.csv', encoding='latin1')

In [3]:
df_emailData.columns

Index(['subject', 'body', 'answer', 'type', 'queue', 'priority', 'language',
       'version', 'tag_1', 'tag_2', 'tag_3', 'tag_4', 'tag_5', 'tag_6',
       'tag_7', 'tag_8'],
      dtype='object')

In [4]:
df_emailData.head()

Unnamed: 0,subject,body,answer,type,queue,priority,language,version,tag_1,tag_2,tag_3,tag_4,tag_5,tag_6,tag_7,tag_8
0,Wesentlicher Sicherheitsvorfall,"Sehr geehrtes Support-Team,\n\nich möchte eine...",Vielen Dank für die Meldung des kritischen Sic...,Incident,Technical Support,high,de,51,Security,Outage,Disruption,Data Breach,,,,
1,Account Disruption,"Dear Customer Support Team,\n\nI am writing to...","Thank you for reaching out, <name>. We are awa...",Incident,Technical Support,high,en,51,Account,Disruption,Outage,IT,Tech Support,,,
2,Query About Smart Home System Integration Feat...,"Dear Customer Support Team,\n\nI hope this mes...",Thank you for your inquiry. Our products suppo...,Request,Returns and Exchanges,medium,en,51,Product,Feature,Tech Support,,,,,
3,Inquiry Regarding Invoice Details,"Dear Customer Support Team,\n\nI hope this mes...",We appreciate you reaching out with your billi...,Request,Billing and Payments,low,en,51,Billing,Payment,Account,Documentation,Feedback,,,
4,Question About Marketing Agency Software Compa...,"Dear Support Team,\n\nI hope this message reac...",Thank you for your inquiry. Our product suppor...,Problem,Sales and Pre-Sales,medium,en,51,Product,Feature,Feedback,Tech Support,,,,


In [5]:
df_emailData.tail()

Unnamed: 0,subject,body,answer,type,queue,priority,language,version,tag_1,tag_2,tag_3,tag_4,tag_5,tag_6,tag_7,tag_8
28582,Performance Problem with Data Analytics Tool,The data analytics tool experiences sluggish p...,We are addressing the performance issue with t...,Incident,Technical Support,high,en,400,Performance,IT,Tech Support,,,,,
28583,Datensperrung in der Kundschaftsbetreuung,"Es gab einen Datensperrungsunfall, bei dem ung...",Ich kann Ihnen bei dem Datensperrungsunfall he...,Incident,Product Support,high,de,400,Security,IT,Tech Support,Bug,,,,
28584,Problem mit der Videokonferenz-Software heute,Wichtigere Sitzungen wurden unterbrochen durch...,"Sehr geehrte/r [Name], leider wurde das Proble...",Incident,Human Resources,low,de,400,Bug,Performance,Network,IT,Tech Support,,,
28585,Update Request for SaaS Platform Integration F...,Requesting an update on the integration featur...,Received your request for updates on the integ...,Change,IT Support,high,en,400,Feature,IT,Tech Support,,,,,
28586,Inquiry About Project Management Features,Looking for detailed information on the projec...,"Dear [Name], thank you for your email regardin...",Request,Technical Support,low,en,400,Feature,Documentation,Feedback,IT,,,,


In [6]:
df_emailData['queue'].unique()

array(['Technical Support', 'Returns and Exchanges',
       'Billing and Payments', 'Sales and Pre-Sales',
       'Service Outages and Maintenance', 'Product Support', 'IT Support',
       'Customer Service', 'Human Resources', 'General Inquiry'],
      dtype=object)

In [7]:
df_emailData = df_emailData[["subject", "body", "queue", "priority", "language"]]
df_emailData.dropna(inplace=True)
df_emailData.head()


Unnamed: 0,subject,body,queue,priority,language
0,Wesentlicher Sicherheitsvorfall,"Sehr geehrtes Support-Team,\n\nich möchte eine...",Technical Support,high,de
1,Account Disruption,"Dear Customer Support Team,\n\nI am writing to...",Technical Support,high,en
2,Query About Smart Home System Integration Feat...,"Dear Customer Support Team,\n\nI hope this mes...",Returns and Exchanges,medium,en
3,Inquiry Regarding Invoice Details,"Dear Customer Support Team,\n\nI hope this mes...",Billing and Payments,low,en
4,Question About Marketing Agency Software Compa...,"Dear Support Team,\n\nI hope this message reac...",Sales and Pre-Sales,medium,en


In [None]:
df_emailData = df_emailData[df_emailData["language"] == "en"]
df_emailData.head()


Unnamed: 0,subject,body,queue,priority,language
1,Account Disruption,"Dear Customer Support Team,\n\nI am writing to...",Technical Support,high,en
2,Query About Smart Home System Integration Feat...,"Dear Customer Support Team,\n\nI hope this mes...",Returns and Exchanges,medium,en
3,Inquiry Regarding Invoice Details,"Dear Customer Support Team,\n\nI hope this mes...",Billing and Payments,low,en
4,Question About Marketing Agency Software Compa...,"Dear Support Team,\n\nI hope this message reac...",Sales and Pre-Sales,medium,en
5,Feature Query,"Dear Customer Support,\n\nI hope this message ...",Technical Support,high,en


In [12]:
df_emailData["text"] = (
    df_emailData["subject"].fillna("") + " " +
    df_emailData["subject"].fillna("") + " " +
    df_emailData["body"].fillna("")
)


In [13]:
import re

In [14]:
def clean_text(text):
    text = str(text).lower()

    text = re.sub(r'dear\s+\w+', '', text)

    text = re.sub(r'http\S+', '', text)

    text = re.sub(r'\d+', '', text)

    text = re.sub(r'[^a-zA-Z ]', '', text)

    text = re.sub(r'\s+', ' ', text)

    text = text.strip()

    return text.strip()


In [15]:
df_emailData["text"] = df_emailData["text"].apply(clean_text)

In [16]:
df_emailData.rename(columns={
    "queue": "category",
    "priority": "urgency"
}, inplace=True)


In [17]:
df_emailData["category"] = df_emailData["category"].replace({
    "IT Support": "Technical Support",
    "Service Outages and Maintenance": "Technical Support",
    "Product Support": "Technical Support"
})


In [18]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [19]:
X = df_emailData["text"]
y = df_emailData["category"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [20]:
category_model = Pipeline([
    ('tfidf', TfidfVectorizer(
        stop_words="english",
        max_features=20000,
        ngram_range=(1,3),
        min_df=2,
        max_df=0.85
    )),
    ('clf', LinearSVC(class_weight="balanced")
)
])

In [21]:
category_model.fit(X_train, y_train)


In [22]:
print(classification_report(
    y_test,
    category_model.predict(X_test),
    zero_division=0
))


                       precision    recall  f1-score   support

 Billing and Payments       0.81      0.76      0.79       270
     Customer Service       0.55      0.54      0.55       409
      General Inquiry       0.73      0.58      0.65        38
      Human Resources       0.67      0.56      0.61        57
Returns and Exchanges       0.60      0.56      0.58       137
  Sales and Pre-Sales       0.53      0.56      0.54        82
    Technical Support       0.85      0.87      0.86      1754

             accuracy                           0.78      2747
            macro avg       0.68      0.63      0.65      2747
         weighted avg       0.77      0.78      0.78      2747



In [None]:
df_emailData["category"].value_counts()

category
Technical Support        8769
Customer Service         2045
Billing and Payments     1349
Returns and Exchanges     686
Sales and Pre-Sales       410
Human Resources           285
General Inquiry           187
Name: count, dtype: int64

In [29]:
errors = pd.DataFrame({
    "Text": X_test,
    "Actual": y_test,
    "Predicted": X_test
})

errors = errors[errors["Actual"] != errors["Predicted"]]
errors.head(20)


Unnamed: 0,Text,Actual,Predicted
19842,problem with network connection problem with n...,Technical Support,problem with network connection problem with n...
5124,billing system error report billing system err...,Billing and Payments,billing system error report billing system err...
17341,problem with investment analytics tools proble...,Technical Support,problem with investment analytics tools proble...
10117,reduce digital engagement levels reduce digita...,Technical Support,reduce digital engagement levels reduce digita...
16023,concern about medical data access delay concer...,Technical Support,concern about medical data access delay concer...
19929,major software integration issue continues maj...,Technical Support,major software integration issue continues maj...
12135,issues with investment optimization algorithms...,Technical Support,issues with investment optimization algorithms...
12975,problem with financial performance metrics pro...,Returns and Exchanges,problem with financial performance metrics pro...
28357,significant decline in digital engagement metr...,Technical Support,significant decline in digital engagement metr...
20608,request for ibm cloud support request for ibm ...,Customer Service,request for ibm cloud support request for ibm ...


In [30]:
Xu = df_emailData["text"]
yu = df_emailData["urgency"]

Xu_train, Xu_test, yu_train, yu_test = train_test_split(
    Xu, yu,
    test_size=0.2,
    random_state=42,
    stratify=yu
)


In [31]:
from sklearn.linear_model import LogisticRegression


urgency_model = Pipeline([
    ('tfidf', TfidfVectorizer(
        stop_words="english",
        max_features=8000,
        ngram_range=(1,2),
        min_df=2,
        max_df=0.9
    )),
    ('clf', LogisticRegression(
        max_iter=2000,
        class_weight="balanced"
    ))
])


In [32]:
urgent_keywords = [
    "urgent", "asap", "immediately", "critical",
    "priority", "server down", "not working",
    "system down", "cannot access", "failure",
    "blocked", "outage", "error", "failed"
]


def urgency_rule(text):
    text = text.lower()
    for word in urgent_keywords:
        if word in text:
            return "high"
    return None


In [33]:
urgency_model.fit(Xu_train, yu_train)

In [35]:
def predict_email(text):
    # Step 1: Clean the text
    text_clean = clean_text(text)

    # Step 2: Category prediction
    category = category_model.predict([text_clean])[0]

    # Step 3: Category confidence check (optional but recommended)
    prob = max(category_model.predict_proba([text_clean])[0])
    if prob < 0.45:
        category = "Other"

    # Step 4: Rule-based urgency first
    rule_result = urgency_rule(text_clean)

    # Step 5: ML urgency fallback
    if rule_result:
        urgency = rule_result
    else:
        urgency = urgency_model.predict([text_clean])[0]

    # Step 6: Return structured output
    return {
        "Input Text": text,
        "Predicted Category": category,
        "Urgency": urgency
    }


In [36]:
print(classification_report(
    yu_test,
    urgency_model.predict(Xu_test),
    zero_division=0
))


              precision    recall  f1-score   support

        high       0.61      0.58      0.59      1061
         low       0.41      0.55      0.47       567
      medium       0.59      0.52      0.56      1119

    accuracy                           0.55      2747
   macro avg       0.54      0.55      0.54      2747
weighted avg       0.56      0.55      0.55      2747



In [24]:
df_emailData.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13731 entries, 1 to 28586
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   subject   13731 non-null  object
 1   body      13731 non-null  object
 2   category  13731 non-null  object
 3   urgency   13731 non-null  object
 4   language  13731 non-null  object
 5   text      13731 non-null  object
dtypes: object(6)
memory usage: 750.9+ KB


In [23]:
df_emailData["urgency"].value_counts()

urgency
medium    5595
high      5304
low       2832
Name: count, dtype: int64

In [26]:
import pickle

with open("category_model.pkl", "wb") as f:
    pickle.dump(category_model, f)


In [27]:
with open("urgency_model.pkl", "wb") as f:
    pickle.dump(urgency_model, f)


