In [1]:
# imports
import pandas as pd 
import numpy as np
import plotly.express as px 
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans, DBSCAN
from umap.umap_ import UMAP
from sklearn.manifold import TSNE

  from .autonotebook import tqdm as notebook_tqdm
  @nb.jit
  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [2]:
# Display every column when a DataFrame is rendered (None removes pandas' column limit)
pd.set_option('display.max_columns', None)

# Function to read incident data and perform preprocessing
def read_inc_data(infile):
    """Load the incident CSV and apply basic cleaning.

    Parameters
    ----------
    infile : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The incidents with entirely-null columns removed, rows whose
        ``u_cause_code`` is 'Training issue' or 'User error' excluded, rows
        lacking a ``short_description_NER`` dropped, and a fresh 0..n-1 index.
    """
    raw_df = pd.read_csv(infile)

    # Keep only the columns that hold at least one non-null value
    has_data = raw_df.notnull().any(axis=0)
    populated_df = raw_df.loc[:, has_data]

    # Exclude incidents attributed to training issues or user error
    excluded_causes = ['Training issue', 'User error']
    is_excluded = populated_df['u_cause_code'].isin(excluded_causes)
    relevant_df = populated_df[~is_excluded]

    # A short description is required downstream; renumber the surviving rows
    has_description = relevant_df['short_description_NER'].notnull()
    return relevant_df[has_description].reset_index(drop=True)

# Load the cleaned incident table used by the cells below.
# NOTE(review): absolute, machine-specific path — the notebook only runs where the CSV exists at this exact location.
inc_df = read_inc_data('C:/Users/parth/Downloads/capstone_ff/UnsupervisedModel/ServiceNow_Incident.csv')

  inc_df = pd.read_csv(infile)


### Use ydata profiling for exploratory data analysis

In [4]:
# Function to get TF-IDF document-term matrix
def get_doc_term_matrix(corpus):
    """Build a TF-IDF document-term matrix for a collection of texts.

    Parameters
    ----------
    corpus : iterable of str
        One text per document.

    Returns
    -------
    The matrix returned by ``TfidfVectorizer.fit_transform`` (one row per
    document, at most 1000 term columns).
    """
    vectorizer = TfidfVectorizer(
        tokenizer=str.split,   # tokens are produced by plain whitespace splitting
        # A custom tokenizer makes the default token_pattern unused; passing None
        # states that explicitly and avoids scikit-learn's "token_pattern will
        # not be used" UserWarning.
        token_pattern=None,
        stop_words='english',
        max_features=1000,
    )
    return vectorizer.fit_transform(corpus)

In [5]:
# Function to run dimensionality reduction using TruncatedSVD
def run_dimreduction(doc_term_matrix, num_components, random_state=42):
    """Project a document-term matrix onto ``num_components`` LSI dimensions.

    Parameters
    ----------
    doc_term_matrix : array-like or sparse matrix
        Document vectors, one row per document.
    num_components : int
        Number of output dimensions for ``TruncatedSVD``.
    random_state : int or None, default 42
        Seed forwarded to ``TruncatedSVD`` so repeated runs of the notebook
        give the same projection. Pass ``None`` for an unseeded run.

    Returns
    -------
    The array returned by ``TruncatedSVD.fit_transform``.
    """
    # Latent Semantic Indexing: truncated SVD applied to the doc-term matrix
    lsi = TruncatedSVD(n_components=num_components, random_state=random_state)
    return lsi.fit_transform(doc_term_matrix)

In [6]:
# Function to get Word2Vec embeddings for incident descriptions
def get_word2vec_embeddings(corpus, embedding_dim=100, window=5, min_count=1, workers=4):
    """Fit a Word2Vec model on whitespace-tokenised texts.

    Parameters
    ----------
    corpus : iterable of str
        One text per document; each is split on whitespace.
    embedding_dim : int
        Passed to ``Word2Vec`` as ``vector_size``.
    window, min_count, workers
        Forwarded unchanged to ``Word2Vec``.

    Returns
    -------
    The constructed ``Word2Vec`` model.
    """
    tokenized_docs = []
    for document in corpus:
        tokenized_docs.append(document.split())

    training_options = dict(
        vector_size=embedding_dim,
        window=window,
        min_count=min_count,
        workers=workers,
    )
    return Word2Vec(tokenized_docs, **training_options)

# Function to get average word vectors for incident descriptions
def get_avg_word_vectors(model, corpus):
    """Represent each text by the mean of its known word vectors.

    Parameters
    ----------
    model
        Object exposing ``wv`` (supports ``word in model.wv`` and
        ``model.wv[word]``) and ``vector_size``.
    corpus : iterable of str
        One text per document; each is split on whitespace.

    Returns
    -------
    numpy.ndarray
        One row per text. A text with no word present in ``model.wv`` gets a
        zero vector of length ``model.vector_size``.
    """
    doc_vectors = []
    for text in corpus:
        known = [model.wv[token] for token in text.split() if token in model.wv]
        if not known:
            # No in-vocabulary token: fall back to a single zero vector
            known = [np.zeros(model.vector_size)]
        doc_vectors.append(np.mean(known, axis=0))
    return np.vstack(doc_vectors)

In [7]:
# Function to perform K-means clustering and create scatter plot
def run_clustering_and_visualization(doc_vectors, num_components, cluster_method='kmeans', dim_reduction='umap',
                                     n_clusters=None, eps=0.5, min_samples=5, random_state=42):
    """Reduce document vectors, cluster them, and plot the first two components.

    Parameters
    ----------
    doc_vectors : array-like or sparse matrix
        One row per incident, in the same row order as the module-level ``inc_df``.
    num_components : int
        Target dimensionality. Reduction is skipped when ``doc_vectors``
        already has exactly this many columns.
    cluster_method : {'kmeans', 'dbscan'}
        Clustering algorithm to fit (case-insensitive).
    dim_reduction : str
        'umap' or 'tsne' reduce through ``run_dimreduction_word2vec``; any
        other value (e.g. 'lsi') reduces through ``run_dimreduction``.
        Also used in the chart title.
    n_clusters : int, optional
        Number of K-means clusters. Defaults to ``num_components``, which is
        what this function used before the parameter existed.
    eps, min_samples
        DBSCAN parameters; ignored for K-means.
    random_state : int or None, default 42
        Seed for K-means so repeated runs give the same labels.

    Returns
    -------
    The plotly figure produced by ``px.scatter``.

    Raises
    ------
    ValueError
        If ``cluster_method`` is neither 'kmeans' nor 'dbscan'.

    Notes
    -----
    Side effect: the cluster labels are written to ``inc_df['cluster']`` on the
    module-level frame, so every call overwrites the labels of the previous one.
    """
    clusterer_name = cluster_method.lower()
    if clusterer_name not in ('kmeans', 'dbscan'):
        raise ValueError("Invalid clustering method. Use 'kmeans' or 'dbscan'.")

    # Reduce only when the input does not already have the requested width,
    # and use the reducer the caller asked for so the chart title is truthful.
    if doc_vectors.shape[1] != num_components:
        reducer_name = dim_reduction.lower()
        if reducer_name in ('umap', 'tsne'):
            doc_vectors = run_dimreduction_word2vec(doc_vectors, num_components, method=reducer_name)
        else:
            doc_vectors = run_dimreduction(doc_vectors, num_components)

    # Sparse input that skipped reduction cannot be stacked or framed directly
    if hasattr(doc_vectors, 'toarray'):
        doc_vectors = doc_vectors.toarray()
    doc_vectors = np.asarray(doc_vectors)

    if clusterer_name == 'kmeans':
        k = num_components if n_clusters is None else n_clusters
        # n_init=10 is the value scikit-learn currently applies by default;
        # stating it silences the FutureWarning about the default changing.
        cluster_model = KMeans(n_clusters=k, n_init=10, random_state=random_state)
    else:
        cluster_model = DBSCAN(eps=eps, min_samples=min_samples)
    clusters = cluster_model.fit_predict(doc_vectors)

    # A single component cannot be drawn in 2-D: pad with an all-zero second axis
    if doc_vectors.shape[1] == 1:
        doc_vectors = np.hstack((doc_vectors, np.zeros((doc_vectors.shape[0], 1))))

    # Name the columns from the actual width so the padded case above is labelled too
    component_cols = [f"Component_{i+1}" for i in range(doc_vectors.shape[1])]
    reduced_df = pd.DataFrame(doc_vectors, columns=component_cols)
    inc_df['cluster'] = clusters
    plot_df = pd.concat([inc_df, reduced_df], axis=1)

    fig = px.scatter(plot_df, x='Component_1', y='Component_2', color='cluster', hover_name='category', hover_data=['short_description_NER'],
                     opacity=0.6, template='simple_white', width=800, height=600, title=f"{dim_reduction.capitalize()} results with {cluster_method.capitalize()} clustering")

    fig.update_traces(marker=dict(size=6, line=dict(width=0.5, color='black')))
    return fig

In [8]:
# Function to perform DBSCAN clustering and create scatter plot
def run_clustering_and_visualization_dbscan(doc_vectors, eps=0.5, min_samples=5, dim_reduction='umap'):
    """Reduce document vectors to two components, cluster with DBSCAN, and plot.

    Parameters
    ----------
    doc_vectors : array-like or sparse matrix
        One row per incident, in the same row order as the module-level ``inc_df``.
    eps, min_samples
        Forwarded to ``DBSCAN``.
    dim_reduction : str
        'umap' or 'tsne' reduce through ``run_dimreduction_word2vec``; any
        other value (e.g. 'lsi') reduces through ``run_dimreduction``.
        Also used in the chart title.

    Returns
    -------
    The plotly figure produced by ``px.scatter``.

    Notes
    -----
    Side effect: the cluster labels are written to ``inc_df['cluster']`` on the
    module-level frame, so every call overwrites the labels of the previous one.
    """
    # Use the reducer named by the caller so the chart title matches the projection
    reducer_name = dim_reduction.lower()
    if reducer_name in ('umap', 'tsne'):
        description_vecs = run_dimreduction_word2vec(doc_vectors, 2, method=reducer_name)
    else:
        description_vecs = run_dimreduction(doc_vectors, num_components=2)

    dbscan_model = DBSCAN(eps=eps, min_samples=min_samples)
    clusters = dbscan_model.fit_predict(description_vecs)

    reduced_df = pd.DataFrame(description_vecs, columns=['Component_1', 'Component_2'])
    inc_df['cluster'] = clusters
    plot_df = pd.concat([inc_df, reduced_df], axis=1)

    fig = px.scatter(plot_df, x='Component_1', y='Component_2', color='cluster', hover_name='category', hover_data=['short_description_NER'],
                     opacity=0.6, template='simple_white', width=800, height=600, title=f"{dim_reduction.capitalize()} results with DBSCAN clustering")

    fig.update_traces(marker=dict(size=6, line=dict(width=0.5, color='black')))
    return fig

In [9]:
# Function to run dimensionality reduction on word2vec vectors
def run_dimreduction_word2vec(word2vec_vectors, num_components, method='umap'):
    """Reduce document vectors with UMAP or t-SNE.

    Parameters
    ----------
    word2vec_vectors : array-like
        Vectors to project, one row per document.
    num_components : int
        Passed as ``n_components`` to the chosen reducer.
    method : {'umap', 'tsne'}
        Which reducer to construct.

    Returns
    -------
    The result of the reducer's ``fit_transform``.

    Raises
    ------
    ValueError
        If ``method`` is neither 'umap' nor 'tsne'.
    """
    # Reject unknown methods up front
    if method not in ('umap', 'tsne'):
        raise ValueError("Invalid dimensionality reduction method. Use 'umap' or 'tsne'.")

    reducer_cls = UMAP if method == 'umap' else TSNE
    return reducer_cls(n_components=num_components).fit_transform(word2vec_vectors)

In [10]:
# Function to plot labelled scatter plot
def plot_labelled_scatter(in_dat, in_title=""):
    """Scatter-plot Component_1 against Component_2, coloured by cluster label.

    Parameters
    ----------
    in_dat : pandas.DataFrame
        Must contain the columns 'Component_1', 'Component_2', 'cluster',
        'category' and 'short_description_NER'. The frame is not modified.
    in_title : str
        Chart title.

    Returns
    -------
    The plotly figure produced by ``px.scatter``.
    """
    # Cast the labels to str on a derived frame rather than on the caller's
    # DataFrame, so plotting does not change the dtype of the caller's column.
    plot_df = in_dat.assign(cluster=in_dat['cluster'].astype(str))

    out_fig = px.scatter(plot_df, x='Component_1',
                         y='Component_2',
                         color='cluster',  # Use the 'cluster' column for color encoding
                         hover_name='category',
                         hover_data=['short_description_NER'],
                         opacity=0.60,
                         template='simple_white',
                         width=800,
                         height=600,
                         title=in_title)

    out_fig.update_traces(marker=dict(size=6,
                                      line=dict(width=.5,
                                                color='black')),
                          selector=dict(mode='markers'))

    return out_fig


### Next, run dimensionality reduction on the document vectors (the TF-IDF document-term matrix and the averaged Word2Vec embeddings) and visualize the results with different clustering algorithms. The cells below cluster with K-means and DBSCAN; LSI (TruncatedSVD), UMAP, and t-SNE are the dimensionality-reduction methods defined above.

In [11]:
# Build the TF-IDF document-term matrix from the incident short descriptions
corpus_bow = inc_df['short_description_NER']
description_tfidf_matrix = get_doc_term_matrix(corpus_bow)

# Train Word2Vec on the same descriptions and average the word vectors per incident
corpus_word2vec = inc_df['short_description_NER']
word2vec_model = get_word2vec_embeddings(corpus_word2vec)
description_word2vec_vecs = get_avg_word_vectors(word2vec_model, corpus_word2vec)

# TF-IDF vectors reduced to 10 components, K-means requested; the chart plots Component_1 vs Component_2
chart_tfidf_kmeans = run_clustering_and_visualization(description_tfidf_matrix, num_components=10, cluster_method='kmeans', dim_reduction='lsi')

# Same TF-IDF input and 10 components, with cluster_method='dbscan' requested
chart_tfidf_dbscan = run_clustering_and_visualization(description_tfidf_matrix, num_components=10, cluster_method='dbscan', dim_reduction='lsi')

# Averaged Word2Vec vectors reduced to 10 components, K-means requested, dim_reduction='umap'
chart_word2vec_kmeans = run_clustering_and_visualization(description_word2vec_vecs, num_components=10, cluster_method='kmeans', dim_reduction='umap')

# DBSCAN (eps=0.01, min_samples=1) on a 2-component reduction of the TF-IDF matrix
chart_bow_dbscan = run_clustering_and_visualization_dbscan(description_tfidf_matrix, eps=0.01, min_samples=1, dim_reduction='lsi')



  super()._check_params_vs_input(X, default_n_init=10)






In [12]:
# Display the results
# Render the four charts in the order they were built
for chart in (chart_tfidf_kmeans, chart_tfidf_dbscan, chart_word2vec_kmeans, chart_bow_dbscan):
    chart.show()