In [7]:
df.rename(columns={
    "Body": "Issue_Description",
    "Department": "Category"
}, inplace=True)


In [11]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"dear customer support team,", "", text)
    text = re.sub(r"[^a-z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

df["Issue_Description"] = df["Issue_Description"].astype(str).apply(clean_text)


In [12]:
df["Tags"] = df["Tags"].apply(ast.literal_eval)


In [14]:
priority_map = {
    "low": 1,
    "medium": 2,
    "high": 3
}

df["Priority_Score"] = df["Priority"].map(priority_map)


In [17]:
np.random.seed(42)

def generate_resolution_time(priority):
    if priority == 3:
        return np.random.randint(2, 6)
    elif priority == 2:
        return np.random.randint(6, 24)
    else:
        return np.random.randint(24, 72)

df["Resolution_Time_Hours"] = df["Priority_Score"].apply(generate_resolution_time)


In [19]:
le = LabelEncoder()
df["Category_Encoded"] = le.fit_transform(df["Category"])


In [21]:
mlb = MultiLabelBinarizer()
tags_encoded = mlb.fit_transform(df["Tags"])

tags_df = pd.DataFrame(
    tags_encoded,
    columns=mlb.classes_
)

df = pd.concat([df.reset_index(drop=True), tags_df.reset_index(drop=True)], axis=1)


In [23]:
tfidf = TfidfVectorizer(
    max_features=100,
    stop_words="english"
)

tfidf_matrix = tfidf.fit_transform(df["Issue_Description"])


In [24]:
kmeans = KMeans(n_clusters=5, random_state=42)
df["Cluster_ID"] = kmeans.fit_predict(tfidf_matrix)


In [27]:
similarity_matrix = cosine_similarity(tfidf_matrix)

df["Similarity_Score"] = similarity_matrix.mean(axis=1)


In [28]:
df.head()


Unnamed: 0.1,Unnamed: 0,Issue_Description,Category,Priority,Tags,Priority_Score,Resolution_Time_Hours,Category_Encoded,2019,AES,...,WritingExperience,Xero,Zapier,Zoho,Zoho Books,Zoom,iOS,macOS,Cluster_ID,Similarity_Score
0,0,i am writing to report a significant problem w...,Technical Support,high,"[Account, Disruption, Outage, IT, Tech Support]",3,4,9,0,0,...,0,0,0,0,0,0,0,0,0,0.105657
1,1,i hope this message reaches you well i am reac...,Returns and Exchanges,medium,"[Product, Feature, Tech Support]",2,20,6,0,0,...,0,0,0,0,0,0,0,0,2,0.093964
2,2,i hope this message finds you well i am reachi...,Billing and Payments,low,"[Billing, Payment, Account, Documentation, Fee...",1,66,0,0,0,...,0,0,0,0,0,0,0,0,2,0.06218
3,3,dear support teami hope this message reaches y...,Sales and Pre-Sales,medium,"[Product, Feature, Feedback, Tech Support]",2,13,7,0,0,...,0,0,0,0,0,0,0,0,2,0.101631
4,4,dear customer supporti hope this message reach...,Technical Support,high,"[Feature, Product, Documentation, Feedback]",3,2,9,0,0,...,0,0,0,0,0,0,0,0,2,0.101718


In [34]:
df = df[[
    "Issue_Description",
    "Category",
    "Priority",
    "Priority_Score",
    "Resolution_Time_Hours"
]]


In [38]:
df.to_csv(r"C:\Users\tejas\Downloads\clean_support_tickets.csv", index=False)


In [6]:
import pandas as pd
import numpy as np
df = pd.read_csv(r"C:\Users\tejas\Downloads\clean_support_tickets.csv")


In [12]:
df["ticket_id"] = range(1, len(df) + 1)


In [50]:
df["type"] = "Incident"


In [16]:
df["created_date"] = pd.to_datetime("2024-01-01") + pd.to_timedelta(
    np.random.randint(0, 30, size=len(df)), unit="D"
)


In [18]:
df["resolved_date"] = df["created_date"] + pd.to_timedelta(
    df["Resolution_Time_Hours"], unit="h"
)


In [20]:
df["country"] = "India"


In [22]:
df["status"] = "Resolved"


In [28]:
print(df.columns)

Index(['Issue_Description', 'Category', 'Priority', 'Priority_Score',
       'Resolution_Time_Hours', 'ticket_id', 'type', 'created_date',
       'resolved_date', 'country', 'status'],
      dtype='object')


In [30]:
df.rename(columns={
    "Issue_Description": "issue_description",
    "Category": "category",
    "Priority": "priority"
}, inplace=True)


In [32]:
df[[
    "ticket_id",
    "type",
    "category",
    "priority",
    "created_date",
    "resolved_date",
    "country",
    "issue_description",
    "status"
]].head()


Unnamed: 0,ticket_id,type,category,priority,created_date,resolved_date,country,issue_description,status
0,1,Incident,Technical Support,high,2024-01-27,2024-01-27 04:00:00,India,i am writing to report a significant problem w...,Resolved
1,2,Incident,Returns and Exchanges,medium,2024-01-09,2024-01-09 20:00:00,India,i hope this message reaches you well i am reac...,Resolved
2,3,Incident,Billing and Payments,low,2024-01-25,2024-01-27 18:00:00,India,i hope this message finds you well i am reachi...,Resolved
3,4,Incident,Sales and Pre-Sales,medium,2024-01-15,2024-01-15 13:00:00,India,dear support teami hope this message reaches y...,Resolved
4,5,Incident,Technical Support,high,2024-01-10,2024-01-10 02:00:00,India,dear customer supporti hope this message reach...,Resolved


In [52]:
df.to_csv(r"C:\Users\tejas\Downloads\support_tickets.csv", index=False)
