In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.cluster import KMeans
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import normalize, to_categorical
from tensorflow.keras.models import Model, load_model
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.metrics import davies_bouldin_score, confusion_matrix, classification_report
import seaborn as sns
from collections import Counter
import os
import re
import string
import sys
import joblib
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.tokenize import word_tokenize


In [2]:
"""Function for preprocessing the text."""
# English Snowball stemmer created once at module level; preprocess_text
# (defined next) looks it up by this global name and calls stemmer.stem on tokens.
stemmer = SnowballStemmer('english')
def preprocess_text(text, stop_words, stem=False):
    """Clean one raw text value into a space-joined string of tokens.

    Steps, in order: strip every character that is neither a word character
    nor whitespace, strip digit runs, lower-case, tokenize with
    ``word_tokenize``, drop tokens contained in ``stop_words`` and, only when
    ``stem`` is true, stem the remaining tokens with the module-level
    ``stemmer``.

    Parameters
    ----------
    text : object
        Raw text. Any value that is not a ``str`` (e.g. a NaN float coming
        from a pandas column) yields an empty string.
    stop_words : container of str
        Tokens to discard; only membership (``in``) is used.
    stem : bool, default False
        Stem the kept tokens before joining. The default returns unstemmed
        tokens, which is exactly what this function returned before the
        parameter existed: stems used to be computed and then thrown away.
        NOTE(review): the tokenizer / saved model artefacts used elsewhere in
        this notebook were presumably built from unstemmed text - confirm
        before switching this on.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' for non-string input).
    """
    # Non-string input (e.g. NaN) has nothing to clean.
    if not isinstance(text, str):
        return ''

    # Remove special characters, then digits.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)

    # Tokenize the lower-cased text and drop stop words.
    tokens = [token for token in word_tokenize(text.lower()) if token not in stop_words]

    # Optional stemming; skipped by default so no work is wasted on unused stems.
    if stem:
        tokens = [stemmer.stem(token) for token in tokens]

    return ' '.join(tokens)
 

In [3]:
# Read the unlabeled corpus (cp1252-encoded, malformed rows skipped) and
# drop exact duplicate rows in place.
MainDf = pd.read_csv('merged csv.csv', encoding='cp1252', on_bad_lines='skip')
MainDf.drop_duplicates(inplace=True)

# Clean every description using the English stop-word list.
stop_words = set(stopwords.words('english'))
data = np.array(MainDf['description'].apply(preprocess_text, args=(stop_words,)))

# Fit the tokenizer vocabulary on the cleaned corpus (no padding happens here).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data)


In [4]:
# Length every token-id sequence is padded/truncated to (passed as maxlen below).
max_len = 48

# Read the labeled set and clean its descriptions with the same preprocessing.
df = pd.read_csv('Test_data.csv')
Labels = df['type']
Label_data = df['description'].apply(preprocess_text, args=(stop_words,))

# Token ids -> post-padded matrix -> normalized along axis 1 -> trailing axis of size 1.
lb_sequences = tokenizer.texts_to_sequences(Label_data)
lb_padded = pad_sequences(lb_sequences, maxlen=max_len, padding='post')
Nor_lb_padded = normalize(lb_padded, axis=1)
lb_new_data = np.reshape(
    Nor_lb_padded,
    (Nor_lb_padded.shape[0], Nor_lb_padded.shape[1], 1),
)

# Integer-encode the class labels.
label_encoder = LabelEncoder()
labels_Int = label_encoder.fit_transform(Labels)


In [12]:
from tensorflow.keras.models import load_model
import pickle
# Load the CAE model
encoder = load_model("Final_cae_model_Result18.keras")

# Run the prepared labeled inputs through the loaded model.
Lb_encoded_test = encoder.predict(lb_new_data)

# Flatten each sample's output into a single feature row. The target shape is
# passed positionally because the `newshape=` keyword of np.reshape is
# deprecated in recent NumPy releases.
exp_lb_encoded_test = np.reshape(Lb_encoded_test, (Lb_encoded_test.shape[0], -1))



In [13]:
# Restore the saved clustering model (presumably a fitted KMeans - confirm).
# NOTE(review): joblib.load unpickles the file, so only load artefacts that
# come from a trusted source.
kmeans_model_path = 'kmeans_model_final9.pkl'
kmeans = joblib.load(kmeans_model_path)

# Cluster id predicted for each flattened, encoded labeled sample.
lb_test_cluster_labels = kmeans.predict(exp_lb_encoded_test)


In [14]:
def align_cluster_labels_with_ground_truth(cluster_labels, true_labels):
    """Relabel each cluster with the most frequent true label of its members.

    Parameters
    ----------
    cluster_labels : array-like of int
        Cluster id per sample. Lists, tuples, pandas Series and NumPy arrays
        are all accepted; the input is coerced with ``np.asarray`` so that the
        element-wise comparison below always yields a boolean mask (a plain
        list compared with ``==`` gives a single ``False`` and would silently
        leave every output at zero).
    true_labels : array-like of int
        Ground-truth class per sample, same shape as ``cluster_labels``.
        Values are counted with ``np.bincount``, which rejects negative or
        non-integer values.

    Returns
    -------
    numpy.ndarray
        Array shaped and typed like the coerced ``cluster_labels`` in which
        every sample carries the majority true label of its cluster. On a tie
        the smallest label wins (first maximum of the bincount).

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    cluster_arr = np.asarray(cluster_labels)
    truth_arr = np.asarray(true_labels)
    if cluster_arr.shape != truth_arr.shape:
        raise ValueError(
            "cluster_labels and true_labels must have the same shape, got "
            f"{cluster_arr.shape} and {truth_arr.shape}"
        )

    aligned_labels = np.zeros_like(cluster_arr)
    # Every id comes from np.unique, so each mask selects at least one sample.
    for cluster_id in np.unique(cluster_arr):
        members = cluster_arr == cluster_id
        aligned_labels[members] = np.bincount(truth_arr[members]).argmax()
    return aligned_labels

# Relabel every cluster with its majority ground-truth class so the cluster
# assignments can be scored against labels_Int like classifier predictions.
aligned_cluster_labels = align_cluster_labels_with_ground_truth(lb_test_cluster_labels, labels_Int)

# Evaluate
accuracy = accuracy_score(labels_Int, aligned_cluster_labels)

print(f"Accuracy: {accuracy}")

# Create and print the confusion matrix (true labels first, aligned labels second)
conf_matrix = confusion_matrix(labels_Int, aligned_cluster_labels)
print("Confusion Matrix:")
print(conf_matrix)

# Calculate precision, recall, and F1-score.
# zero_division=0 reports 0.0 for a class that is never predicted (same numbers
# as before) without emitting an UndefinedMetricWarning for it.
report = classification_report(labels_Int, aligned_cluster_labels, zero_division=0)
print("Classification Report:")
print(report)




Accuracy: 0.660377358490566
Confusion Matrix:
[[ 4  0  2  0]
 [ 0  0  6  1]
 [ 3  0  8  4]
 [ 0  0  2 23]]
Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.67      0.62         6
           1       0.00      0.00      0.00         7
           2       0.44      0.53      0.48        15
           3       0.82      0.92      0.87        25

    accuracy                           0.66        53
   macro avg       0.46      0.53      0.49        53
weighted avg       0.58      0.66      0.62        53



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
# Print the clustering evaluation metrics
from sklearn.metrics import silhouette_score, davies_bouldin_score

# Davies-Bouldin is an internal clustering index, so it is computed on the
# cluster assignments returned by kmeans.predict rather than on the
# ground-truth-aligned labels, which can merge several clusters into one class.
print('Test set Davies-Bouldin score:', davies_bouldin_score(exp_lb_encoded_test, lb_test_cluster_labels))

Test set Davies-Bouldin score: 1.06907933796153
