In [None]:
# Week 3: Text Vectorization and Clustering Preparation

In [None]:
# Week 3: Importing visualization libraries
# This cell loads libraries required for exploratory visualization

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Week 3: Ticket counts per priority level
# Bar chart with one bar per distinct value of the "Priority" column

priority_ax = sns.countplot(data=df, x="Priority")
priority_ax.set_title("Ticket Distribution by Priority")
plt.show()

In [None]:
# Week 3: Ticket counts per department
# "Department" is placed on the y-axis, so each department gets its own row

department_ax = sns.countplot(data=df, y="Department")
department_ax.set_title("Ticket Distribution by Department")
plt.show()

In [None]:
# Week 3: Tag count spread within each priority level
# Boxplot of the "Tag_Count" column grouped by "Priority"

complexity_ax = sns.boxplot(data=df, x="Priority", y="Tag_Count")
complexity_ax.set_title("Issue Complexity by Priority")
plt.show()

In [None]:
# Week 3: Importing libraries for text vectorization and clustering
# This cell prepares tools required for clustering tickets

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

In [None]:
# Week 3: Creating clusters using ticket tags
# This cell converts tags into text features and applies KMeans clustering

import ast


def tags_to_text(raw_tags):
    """Turn one stringified tag list into a single space-separated string.

    Parameters
    ----------
    raw_tags : str
        Text of a Python list literal, e.g. "['login', 'password']".
        (Assumes every tag inside the list is a string -- TODO confirm
        against the data.)

    Returns
    -------
    str
        The tags joined by single spaces.

    Raises
    ------
    ValueError, SyntaxError
        If raw_tags is not a valid Python literal.
    """
    # literal_eval only parses literals, so a malformed or malicious value in
    # the dataset cannot execute arbitrary code the way eval() would.
    return " ".join(ast.literal_eval(raw_tags))


df["Tags_Text"] = df["Tags"].apply(tags_to_text)

# Token-count matrix: one row per ticket, one column per distinct tag token
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df["Tags_Text"])

# random_state is fixed so the run is repeatable
kmeans = KMeans(n_clusters=3, random_state=42)
df["Cluster"] = kmeans.fit_predict(X)

In [None]:
### End of Week 3
# Exploratory visualizations were created, and support tickets were grouped into clusters using text-based features.

In [None]:
# Week 4: Cluster Interpretation and Insight Generation

In [None]:
# Week 4: Ticket counts per cluster
# Bar chart with one bar per value of the "Cluster" column

cluster_ax = sns.countplot(data=df, x="Cluster")
cluster_ax.set_title("Ticket Clusters Based on Issue Tags")
plt.show()

In [None]:
# Week 4: First-pass human-readable names for the clusters
# Cluster IDs 0, 1, 2 are paired with the names below in list order

initial_names = [
    "Authentication Issues",
    "Billing and Payment Issues",
    "System Performance Issues",
]
cluster_labels = dict(enumerate(initial_names))

# Cluster IDs absent from the mapping would become NaN in the new column
df["Cluster_Label"] = df["Cluster"].map(cluster_labels)

In [None]:
# Week 4: Clustering quality check
# Computes the silhouette score of the cluster assignments over the feature matrix X

from sklearn.metrics import silhouette_score

score = silhouette_score(X, labels=df["Cluster"])
print(score)

In [None]:
# Week 4: Visualizing cluster distribution by label
# This shows the relative size of each identified issue category

label_ax = df["Cluster_Label"].value_counts().plot(kind="bar")
# Title and axis labels let the figure stand alone, like the other charts here
label_ax.set_title("Ticket Distribution by Cluster Label")
label_ax.set_xlabel("Cluster Label")
label_ax.set_ylabel("Number of Tickets")
# Render the figure explicitly instead of echoing the Axes repr as cell output
plt.show()


In [None]:
# Week 4: Extracting top keywords per cluster
# This helps understand the dominant themes in each cluster

import numpy as np

# How many of the highest-weighted terms to print for each cluster
TOP_N_TERMS = 10

k = kmeans.n_clusters
terms = vectorizer.get_feature_names_out()

for i in range(k):
    # argsort is ascending, so reverse it to list the heaviest terms first
    top_terms = np.argsort(kmeans.cluster_centers_[i])[::-1][:TOP_N_TERMS]
    print(f"Cluster {i}:")
    print([terms[t] for t in top_terms])


In [None]:
# Week 4: Inspecting updated dataset structure
# Displays the DataFrame's column index so the columns added by earlier cells
# (Tags_Text, Cluster, Cluster_Label) can be confirmed to exist

df.columns


In [None]:
# Week 4: Revised, business-facing names for the clusters
# Overwrites the "Cluster_Label" column with the names below, paired with
# cluster IDs 0, 1, 2 in order

refined_names = (
    "Technical Support Issues",
    "Product and Security Issues",
    "Network and Security Alerts",
)
cluster_map = {cluster_id: name for cluster_id, name in enumerate(refined_names)}

df["Cluster_Label"] = df["Cluster"].map(cluster_map)


In [None]:
# Week 4: Validating clusters using sample ticket inspection
# Prints a short preview of a few randomly chosen ticket bodies per cluster
# so the assignments can be checked by eye

SAMPLES_PER_CLUSTER = 5   # maximum tickets previewed per cluster
PREVIEW_CHARS = 120       # leading characters of each body to print

for c in df["Cluster"].unique():
    print(f"\nCluster {c}:")
    members = df[df["Cluster"] == c]
    # sample() raises ValueError when asked for more rows than exist, so cap
    # the request at the cluster's size
    samples = members.sample(min(SAMPLES_PER_CLUSTER, len(members)), random_state=42)
    for text in samples["Body"]:
        # A non-string body (e.g. NaN for a missing value) cannot be sliced;
        # show it as an empty preview instead of failing
        preview = text[:PREVIEW_CHARS] if isinstance(text, str) else ""
        print("-", preview)



In [None]:
# Week 4: Ticket count per cluster label
# Tallies how many rows carry each value of "Cluster_Label"

label_counts = df["Cluster_Label"].value_counts()
label_counts

In [None]:
# Week 4: Cluster label vs. priority
# Contingency table with cluster labels as rows and priority levels as columns

pd.crosstab(index=df["Cluster_Label"], columns=df["Priority"])

In [None]:
# Week 4: Cluster label vs. department
# Contingency table with cluster labels as rows and departments as columns

pd.crosstab(index=df["Cluster_Label"], columns=df["Department"])

In [None]:
# End of Week 4
#Cluster analysis and interpretation were completed. Issue categories were identified, validated, and evaluated for quality.


In [None]:
# Milestone 2 Completed
# Exploratory visualization and cluster-based insights were successfully generated.