In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import pandas as pd
import numpy as np
from nltk import pos_tag, word_tokenize, RegexpParser
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from googletrans import Translator
from nltk.corpus import stopwords


import random
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
from tensorflow.keras import layers

from tensorflow.keras.losses import BinaryCrossentropy
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.optimizers import Adam
from collections import Counter

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Dataset selection: exactly one `file_path` assignment is left uncommented.
# file_path = 'Hall_2012_cleaned.csv'
# file_path = 'hall_keywords.csv'
file_path = 'Jeyaraman_2020_cleaned.csv'
# file_path = 'Radjenovic_2013_cleaned.csv'
# file_path = 'Smid_2020_cleaned.csv'

# Load the CSV and drop every row that has a missing value in any column.
df = pd.read_csv(file_path, delimiter=',')
df = df.dropna(axis=0)
# Working copy; `df_sample` is reassigned by the augmentation cell further down.
df_sample = df.copy()

In [None]:
from nltk.corpus import wordnet
import random

def get_synonyms(word):
    """Collect WordNet synonyms of ``word``.

    Gathers the lemma names of every synset returned by
    ``wordnet.synsets(word)``, with underscores replaced by spaces. The query
    word itself is excluded (exact, case-sensitive match).

    Returns:
        list[str]: unique synonyms, in arbitrary (set) order.
    """
    candidates = {
        lemma.name().replace('_', ' ')
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    }
    candidates.discard(word)
    return list(candidates)

def synonym_replacement(sentence, n):
    """Replace up to ``n`` distinct words of ``sentence`` with WordNet synonyms.

    The sentence is split on whitespace. Purely alphabetic words are the
    replacement candidates; they are visited in random order and every
    occurrence of a chosen word is replaced by one randomly picked synonym.

    Args:
        sentence: input text.
        n: maximum number of distinct words to replace. ``n <= 0`` replaces
            nothing.

    Returns:
        str: the words re-joined with single spaces (whitespace runs in the
        input are therefore collapsed).
    """
    words = sentence.split()
    # Fix: the limit used to be checked only after a replacement had been
    # made, so n <= 0 still replaced one word.
    if n <= 0:
        return ' '.join(words)

    new_words = words.copy()
    candidates = list({word for word in words if word.isalpha()})
    random.shuffle(candidates)

    num_replaced = 0
    for candidate in candidates:
        synonyms = get_synonyms(candidate)
        if synonyms:
            synonym = random.choice(synonyms)
            new_words = [synonym if word == candidate else word for word in new_words]
            num_replaced += 1
            if num_replaced >= n:  # only replace up to n words
                break

    return ' '.join(new_words)

def augment_text(df, minority_class, augment_by=0.9):
    """Append synonym-augmented copies of minority-class documents to ``df``.

    Args:
        df: DataFrame with a ``'Corpus'`` text column and a
            ``'label_included'`` label column.
        minority_class: label value whose documents are augmented.
        augment_by: number of synthetic rows as a fraction of the minority
            class size (truncated with ``int``).

    Returns:
        DataFrame: all rows of ``df`` followed by the synthetic rows, with a
        fresh integer index. Synthetic rows only carry ``'Corpus'`` and
        ``'label_included'``; any other column of ``df`` is NaN for them.
    """
    # Built once: the original rebuilt this list on every loop iteration.
    minority_texts = df.loc[df['label_included'] == minority_class, 'Corpus'].tolist()
    n_augmentations = int(len(minority_texts) * augment_by)

    # Each synthetic document is a randomly chosen minority document with one
    # word replaced by a synonym (adjust n for more replacements).
    augmented_texts = [
        synonym_replacement(random.choice(minority_texts), n=1)
        for _ in range(n_augmentations)
    ]

    augmented_df = pd.DataFrame(augmented_texts, columns=['Corpus'])
    augmented_df['label_included'] = minority_class
    return pd.concat([df, augmented_df], ignore_index=True)

# Assuming your minority class is identified, for example, as 1
# `augment_text` returns the original rows followed by the synthetic minority
# rows, so its result already is the complete augmented dataset.
df_augmented = augment_text(df, minority_class=1, augment_by=0.9)

# Fix: this used to be `pd.concat([df, df_augmented])`, which appended the
# original rows a second time and so duplicated every original document.
df_sample = df_augmented

# Shuffle the dataframe to mix original and augmented examples (optional)
df_sample = df_sample.sample(frac=1).reset_index(drop=True)


In [None]:
# Class balance after augmentation: number of rows labelled 1 and 0.
label_counts = df_sample['label_included'].value_counts()
label_1_count = label_counts.get(1, 0) 
label_0_count = label_counts.get(0, 0)
print(label_1_count)
print(label_0_count)
# Longest document, measured in whitespace-separated tokens; used as the
# padding length and the model input length.
max_sequence_length = max(len(text.split()) for text in df_sample['Corpus'])  
num_classes = 1
input_shape = (max_sequence_length,)
from collections import Counter
# Vocabulary size = number of distinct whitespace-separated tokens in the corpus.
# NOTE(review): the Keras Tokenizer used later applies its own filtering, so
# its vocabulary may differ from this count - confirm this is acceptable.
token_counts = Counter(word for sentence in  df_sample['Corpus'] for word in sentence.split())
vocab_size = len(token_counts)
vocab_size

In [None]:
# Hyper-parameters shared by the model-building and training cells.
embedding_dim=200  # output dimension of the Embedding layer
learning_rate = 0.001  # passed to the Adam optimizers
batch_size = 10
hidden_units = 128  # not referenced by the cells visible in this notebook
projection_units = 128  # width of the projection head's Dense layer
num_epochs = 5
dropout_rate = 0.3  # not referenced by the cells visible in this notebook
temperature = 0.1  # only referenced by the commented-out n-pairs loss


In [None]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow.keras.models import Sequential
def create_simple_encoder():
    """Build the text encoder: an Embedding layer followed by global average pooling.

    Reads the notebook-level ``vocab_size``, ``embedding_dim`` and
    ``max_sequence_length``.

    Returns:
        A Keras ``Sequential`` model (not compiled).
    """
    embedding = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=max_sequence_length,
    )
    pooling = GlobalAveragePooling1D()
    return Sequential([embedding, pooling])

In [None]:
def create_classifier(encoder, trainable=True):
    """Stack a single sigmoid unit on top of ``encoder`` and compile the model.

    Args:
        encoder: Keras model producing document features.
        trainable: value assigned to ``layer.trainable`` for every layer of
            ``encoder`` (``False`` freezes the encoder).

    Returns:
        Compiled Keras model named ``"simple-text-classifier"`` using Adam,
        binary cross-entropy and the notebook's custom metrics.
    """
    # Freeze or unfreeze the encoder before the classifier is compiled.
    for encoder_layer in encoder.layers:
        encoder_layer.trainable = trainable

    token_ids = keras.Input(shape=input_shape, dtype=tf.int32)
    encoded = encoder(token_ids)
    # Flatten (if needed) so the Dense layer receives one vector per document.
    flat_features = keras.layers.Flatten()(encoded)
    probability = keras.layers.Dense(1, activation="sigmoid")(flat_features)

    classifier = keras.Model(
        inputs=token_ids, outputs=probability, name="simple-text-classifier"
    )

    tracked_metrics = [
        keras.metrics.BinaryAccuracy(),
        balanced_accuracy_metric,
        f1_score_metric,
        mcc_metric,
    ]
    classifier.compile(
        optimizer=keras.optimizers.Adam(learning_rate),
        loss='binary_crossentropy',
        metrics=tracked_metrics,
    )
    return classifier

In [None]:
# class SupervisedContrastiveLoss(keras.losses.Loss):
#     def __init__(self, temperature=1, name=None):
#         super().__init__(name=name)
#         self.temperature = temperature

#     def __call__(self, labels, feature_vectors, sample_weight=None):
     
#         feature_vectors_normalized = tf.math.l2_normalize(feature_vectors, axis=1)
   
#         logits = tf.divide(
#             tf.matmul(
#                 feature_vectors_normalized, tf.transpose(feature_vectors_normalized)
#             ),
#             self.temperature,
#         )
      
#         return tfa.losses.npairs_loss(tf.squeeze(labels), logits)
class SupervisedContrastiveLoss(keras.losses.Loss): #triplet
    """Margin-based pairwise similarity loss over a batch of feature vectors.

    For every ordered pair (i, j) in the batch the cosine similarity ``s_ij``
    of the L2-normalised features is computed. Pairs with equal labels
    contribute ``max(margin - s_ij, 0)`` and pairs with different labels
    contribute ``max(s_ij + margin, 0)``. The result is the mean over all
    ``batch * batch`` pairs, self-pairs included.

    Args:
        margin: margin added inside the hinge.
        name: optional loss name forwarded to ``keras.losses.Loss``.
    """

    def __init__(self, margin=0.3, name=None):
        super().__init__(name=name)
        self.margin = margin

    def __call__(self, labels, feature_vectors, sample_weight=None):
        """Return the scalar loss for one batch; ``sample_weight`` is ignored."""
        # Fix: flatten labels to shape (batch,). When labels arrive as
        # (batch, 1) - which Keras may do for 1-D targets - the two
        # expand_dims below would build a rank-3 mask that broadcasts
        # incorrectly against the (batch, batch) similarity matrix.
        labels = tf.reshape(labels, [-1])

        # positive_mask[i, j] is True when samples i and j share a label.
        positive_mask = tf.equal(tf.expand_dims(labels, 0), tf.expand_dims(labels, 1))
        negative_mask = tf.logical_not(positive_mask)

        feature_vectors_normalized = tf.math.l2_normalize(feature_vectors, axis=1)

        # Calculate pairwise cosine similarities
        similarities = tf.matmul(
            feature_vectors_normalized, tf.transpose(feature_vectors_normalized)
        )

        # Keep each similarity only in the matrix matching its pair type;
        # the other matrix holds 0 at that position.
        zeros = tf.zeros_like(similarities)
        positive_similarity = tf.where(positive_mask, similarities, zeros)
        negative_similarity = tf.where(negative_mask, similarities, zeros)

        # Hinge on (negative - positive + margin), evaluated element-wise.
        loss = tf.maximum(negative_similarity - positive_similarity + self.margin, 0)

        return tf.reduce_mean(loss)


def add_projection_head(encoder):
    """Wrap ``encoder`` with a ReLU Dense projection layer.

    The input has length ``max_sequence_length`` and the Dense layer has
    ``projection_units`` units (both notebook-level settings).

    Returns:
        Uncompiled Keras model named ``"text-encoder_with_projection-head"``.
    """
    token_inputs = keras.Input(shape=(max_sequence_length,))
    encoded = encoder(token_inputs)
    projected = layers.Dense(projection_units, activation="relu")(encoded)
    return keras.Model(
        inputs=token_inputs,
        outputs=projected,
        name="text-encoder_with_projection-head",
    )


In [None]:

# Fix: `encoder_model` was used here and in later cells but never defined in
# the notebook, so a fresh kernel raised NameError. Build it explicitly.
encoder_model = create_simple_encoder()

# Encoder plus projection head, trained with the pairwise margin loss.
encoder_with_projection_head = add_projection_head(encoder_model)
encoder_with_projection_head.compile(
    optimizer=keras.optimizers.Adam(learning_rate),
    loss=SupervisedContrastiveLoss(),
)

encoder_with_projection_head.summary()

In [None]:
def mcc_metric(y_true, y_pred):
    """Matthews correlation coefficient for a batch, built from Keras backend ops.

    Each confusion-matrix cell is the sum of the rounded, [0, 1]-clipped
    product of the relevant true/predicted terms. The backend epsilon is
    added to the denominator to avoid division by zero.
    """
    K = tf.keras.backend

    def _count(product):
        # Round the clipped product to 0/1 and count the ones.
        return K.sum(K.round(K.clip(product, 0, 1)))

    tp = _count(y_true * y_pred)
    tn = _count((1 - y_true) * (1 - y_pred))
    fp = _count((1 - y_true) * y_pred)
    fn = _count(y_true * (1 - y_pred))

    numerator = tp * tn - fp * fn
    denominator = K.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    return numerator / (denominator + K.epsilon())


def balanced_accuracy_metric(y_true, y_pred):
    """Mean of the true-positive rate and the true-negative rate for a batch.

    Predictions are thresholded with ``tf.round``; a small constant is added
    to each denominator to avoid division by zero.
    """
    eps = 1e-7  # Small constant to avoid division by zero

    n_positive = tf.math.reduce_sum(y_true)
    n_negative = tf.math.reduce_sum(1 - y_true)

    hit_positive = tf.math.reduce_sum(y_true * tf.round(y_pred))
    hit_negative = tf.math.reduce_sum((1 - y_true) * tf.round(1 - y_pred))

    sensitivity = hit_positive / (n_positive + eps)
    specificity = hit_negative / (n_negative + eps)
    return 0.5 * (sensitivity + specificity)




def f1_score_metric(y_true, y_pred):
    """F1 score for a batch, built from Keras backend ops.

    Precision and recall are computed from rounded, [0, 1]-clipped values;
    the backend epsilon is added to every denominator.
    """
    K = tf.keras.backend
    eps = K.epsilon()

    tp = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    n_predicted_positive = K.sum(K.round(K.clip(y_pred, 0, 1)))
    n_actual_positive = K.sum(y_true)

    precision = tp / (n_predicted_positive + eps)
    recall = tp / (n_actual_positive + eps)

    return 2 * (precision * recall) / (precision + recall + eps)


In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

# Fix: `train_df` / `test_df` are used by this and later cells but were never
# created in the notebook. Create a stratified hold-out split here.
# NOTE(review): test_size=0.2 and the seed are assumptions - confirm against
# the intended protocol. The split is taken from the augmented `df_sample`,
# so synthetic variants of a document can land on both sides of the split.
train_df, test_df = train_test_split(
    df_sample,
    test_size=0.2,
    stratify=df_sample['label_included'],
    random_state=42,
)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Parameters
n_splits = 5  # Number of folds
kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(train_df['Corpus'])

# Labels and padded token ids for the WHOLE training set. They are sliced per
# fold below and reused by the visualisation cell after the loop.
labels = train_df['label_included'].values
sequences = tokenizer.texts_to_sequences(train_df['Corpus'])
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post', truncating='post')

for fold, (train_idx, validate_idx) in enumerate(kfold.split(train_df, labels)):
    print(f"Running Fold {fold + 1}/{n_splits}")

    # Split data
    train_df_fold = train_df.iloc[train_idx]
    validate_df_fold = train_df.iloc[validate_idx]

    # Fix: fit on the training fold only. Previously every fold was fitted on
    # the full `train_df`, so the validation rows were also training rows.
    train_padded_fold = padded_sequences[train_idx]
    train_labels_fold = labels[train_idx]

    # Prepare validation data
    validate_sequences = tokenizer.texts_to_sequences(validate_df_fold['Corpus'])
    validate_padded = pad_sequences(validate_sequences, maxlen=max_sequence_length, padding='post', truncating='post')
    validate_labels = validate_df_fold['label_included'].values.astype(int)

    # Fresh encoder for this fold.
    # Fix: the "before" and "final" embeddings used to come from
    # `encoder_model`, which is not the model trained in this loop.
    fold_encoder = create_simple_encoder()

    # Create and compile models
    encoder_with_projection_head = add_projection_head(fold_encoder)
    encoder_with_projection_head.compile(
        optimizer=keras.optimizers.Adam(learning_rate),
        loss=SupervisedContrastiveLoss(),
    )

    # Output of the encoder's second-to-last layer, taken before any training.
    extractor = tf.keras.Model(inputs=fold_encoder.inputs,
                               outputs=fold_encoder.layers[-2].output)
    embeddings_before_training = extractor.predict(train_padded_fold)

    # Train the encoder
    history = encoder_with_projection_head.fit(
        x=train_padded_fold,
        y=train_labels_fold,
        batch_size=batch_size,
        epochs=num_epochs
    )

    # Encoder output after training (the projection head shares this encoder).
    final_embeddings = fold_encoder.predict(train_padded_fold)

    # Create and train classifier on top of the frozen encoder + projection head
    classifier = create_classifier(encoder_with_projection_head, trainable=False)
    history_classifier = classifier.fit(
        x=train_padded_fold,
        y=train_labels_fold,
        batch_size=batch_size,
        epochs=num_epochs,
        validation_data=(validate_padded, validate_labels)
    )



# tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
# tokenizer.fit_on_texts(train_df['Corpus'])
# train_sequences = tokenizer.texts_to_sequences(train_df['Corpus'])
# train_padded = pad_sequences(train_sequences, maxlen=max_sequence_length, padding='post', truncating='post')
# scaler = StandardScaler()
# train_padded = scaler.fit_transform(train_padded)

# train_y = train_df['label_included'].values.astype(int)




In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap, Normalize

# "Before" embeddings: output of the second-to-last layer of `encoder_model`.
# NOTE(review): the k-fold cell builds its encoders with
# create_simple_encoder(), so `encoder_model` is a different model instance
# from the one trained there - confirm this is the intended baseline.
extractor = tf.keras.Model(inputs=encoder_model.inputs,
                           outputs=encoder_model.layers[-2].output)

# Predict to get the embeddings
embeddings_before_training = extractor.predict(padded_sequences)

# "After" embeddings: output of the last layer of the encoder + projection
# head model left over from the training cell.
extractor_after_training = tf.keras.Model(inputs=encoder_with_projection_head.input, outputs=encoder_with_projection_head.layers[-1].output)  # use 'model.output' to get embeddings from the encoder part
embeddings_after_training = extractor_after_training.predict(padded_sequences)


# Flatten the embeddings for TSNE: one row per document, all remaining axes merged.
flatten_embeddings_before_training = embeddings_before_training.reshape(embeddings_before_training.shape[0], -1)
flatten_embeddings_after_training = embeddings_after_training.reshape(embeddings_after_training.shape[0], -1)

# Define a function to plot the embeddings
def plot_embeddings(embeddings, labels, title, perplexity=30):
    """Project ``embeddings`` to 2-D with t-SNE and scatter-plot them by label.

    Args:
        embeddings: 2-D array, one row per sample.
        labels: one label per row; used to colour the points.
        title: figure title.
        perplexity: t-SNE perplexity; lowered to ``n_samples - 1`` when the
            data set is not larger than the requested value.
    """
    n_samples = embeddings.shape[0]
    # Fix: t-SNE needs perplexity strictly below n_samples, so the guard must
    # also fire when n_samples == perplexity (it used `<`).
    if n_samples <= perplexity:
        perplexity = n_samples - 1  # Set perplexity to one less than the number of samples

    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=0)
    reduced_embeddings = tsne.fit_transform(embeddings)

    # Discrete colormap with one viridis colour per distinct label.
    unique_labels = np.unique(labels)
    # Fix: `plt.cm.get_cmap` has been removed from Matplotlib; `plt.get_cmap`
    # takes the same (name, lut) arguments.
    base_cmap = plt.get_cmap('viridis', len(unique_labels))
    cmap = ListedColormap(base_cmap(np.linspace(0, 1, len(unique_labels))))
    norm = Normalize(vmin=min(unique_labels), vmax=max(unique_labels))

    plt.figure(figsize=(8, 8))
    scatter = plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c=labels, cmap=cmap, norm=norm)
    # Fix: put ticks at the actual label values rather than assuming the
    # labels are 0..k-1.
    plt.colorbar(scatter, ticks=unique_labels)
    plt.title(title)
    plt.show()

# Plotting the embeddings before and after training
# `labels` must have one entry per row of the embedding arrays.
plot_embeddings(flatten_embeddings_before_training, labels, "Embeddings Before Training")
plot_embeddings(flatten_embeddings_after_training, labels, "Embeddings After Training")

from sklearn.metrics import silhouette_score
# Silhouette score treats the class labels as the cluster assignment.
# Compute Silhouette Score for embeddings before training
silhouette_before = silhouette_score(flatten_embeddings_before_training, labels)
print(f"Silhouette Score Before Training: {silhouette_before}")

# Compute Silhouette Score for embeddings after training
silhouette_after = silhouette_score(flatten_embeddings_after_training, labels)
print(f"Silhouette Score After Training: {silhouette_after}")


In [None]:
from sklearn.metrics import accuracy_score, matthews_corrcoef, balanced_accuracy_score, f1_score, roc_auc_score,confusion_matrix,average_precision_score,recall_score,precision_score

# NOTE(review): `true_labels` and `predicted_labels` are not defined by any
# cell visible in this notebook; they must hold hard (0/1) labels of equal
# length before this cell runs.

mcc = matthews_corrcoef(true_labels, predicted_labels)
balanced_accuracy = balanced_accuracy_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)
# Fix: precision was computed with `average_precision_score`, which is the
# area under the precision-recall curve and expects scores, not hard labels.
precision = precision_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels)


print(f'Matthews Correlation Coefficient: {mcc}')
print(f'Balanced Accuracy: {balanced_accuracy}')
print(f'F1 Score: {f1}')
print(f'precision Score: {precision}')
print(f'recall: {recall}')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Fix: fixing the label set guarantees a 2x2 matrix even when one class is
# missing from both arrays; the previous `reshape(2, 2)` raised in that case.
confusion = confusion_matrix(true_labels, predicted_labels, labels=[0, 1])

# Create a heatmap of the confusion matrix (rows: actual, columns: predicted)
sns.heatmap(confusion, annot=True, fmt="d", cmap="Blues")

plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')

plt.show()

In [None]:
from lime.lime_text import LimeTextExplainer
def predict_proba(texts):
    """Return two-column class probabilities for raw ``texts``.

    The texts are tokenised with the notebook-level ``tokenizer``, padded or
    truncated (both 'post') to ``max_sequence_length`` and scored by the
    notebook-level ``classifier``.

    Returns:
        Array with one row per text: column 0 is ``1 - p``, column 1 is the
        classifier output ``p`` (the layout LIME expects for binary tasks).
    """
    token_ids = pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=max_sequence_length,
        padding='post',
        truncating='post',
    )
    positive = classifier.predict(token_ids)
    negative = 1 - positive
    return np.hstack((negative, positive))

# Class names are ordered to match the columns returned by predict_proba:
# index 0 -> 'Irrelevant', index 1 -> 'Relevant'.
explainer = LimeTextExplainer(class_names=['Irrelevant', 'Relevant'])

# Choose an instance to explain
idx = 9
# For example, explain prediction for the 10th document in your test set
# (positional index 9 via .iloc).
test_doc = test_df['Corpus'].iloc[idx]

In [None]:
# Explain the selected document using at most 10 features.
exp = explainer.explain_instance(test_doc, predict_proba, num_features=10)

# Show the explanation
exp.show_in_notebook(text=True)

In [None]:
# Same explanation rendered as a matplotlib figure.
exp.as_pyplot_figure()
plt.title('Feature contribution for classifying as Relevant')
plt.show()

In [None]:


# Step 1: Predict the probabilities for the test set
predicted_probs = predict_proba(test_df['Corpus'].tolist())

# Step 2: Determine the predicted class based on probability threshold, e.g., > 0.5 for class 1
# (argmax over the two columns [1 - p, p]).
predicted_classes = np.argmax(predicted_probs, axis=1)

# Adding a column for predicted classes to the test_df for convenience
# (this mutates test_df in place).
test_df['predicted_class'] = predicted_classes

# Filter instances that were predicted as class 1 ('Relevant')
predicted_relevant_df = test_df[test_df['predicted_class'] == 1]


# One LIME explanation per predicted-relevant document; `idx` is the
# DataFrame index label of the row and overwrites the earlier `idx`.
for idx, row in predicted_relevant_df.iterrows():
    test_doc = row['Corpus']
    
    # Generate explanation for this instance
    exp = explainer.explain_instance(test_doc, predict_proba, num_features=10)
    
    print(f"Explanation for document {idx} (Predicted as 'Relevant'):")
    exp.show_in_notebook(text=True)

In [None]:
Clustering with the same encoder

In [None]:
from tensorflow.keras.models import load_model
from tensorflow.keras.models import save_model
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, SpectralClustering, OPTICS
from sklearn.metrics import adjusted_rand_score, davies_bouldin_score, silhouette_score, normalized_mutual_info_score, jaccard_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import StandardScaler


# Scaler used later to standardise the encoder embeddings.
scaler = StandardScaler()

# Clustering runs on the full (augmented, shuffled) corpus.
X = df_sample['Corpus']
ground_truth_labels = df_sample['label_included']
# NOTE(review): this rebinds the notebook-level `tokenizer` (previously fitted
# on train_df), which predict_proba also reads - token ids may differ from
# those used during training; confirm this is intended.
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)
# No padding/truncating arguments are passed here, unlike the training cells,
# which pass padding='post', truncating='post'.
X_padded = pad_sequences(sequences, maxlen=max_sequence_length)


In [None]:
def find_optimal_pca_components(data, explained_variance_threshold=0.95):
    """Return the smallest number of principal components whose cumulative
    explained-variance ratio reaches ``explained_variance_threshold``.

    If the threshold is never reached (e.g. a threshold of 1.0 missed by
    floating-point round-off), all components are returned.
    """
    pca = PCA()
    pca.fit(data)
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
    reached = cumulative_variance >= explained_variance_threshold
    # Fix: np.argmax over an all-False mask returns 0, which used to report
    # a single component when the threshold was never reached.
    if not reached.any():
        return len(cumulative_variance)
    return np.argmax(reached) + 1

def perform_pca(data, num_components):
    """Fit a PCA with ``num_components`` components on ``data`` and return the projected data."""
    return PCA(n_components=num_components).fit_transform(data)

def kmeans_clustering(data, k):
    """Cluster ``data`` into ``k`` groups with K-Means (k-means++ init, 50 restarts).

    Returns the cluster label of every row.
    """
    fitted = KMeans(n_clusters=k, init="k-means++", n_init=50).fit(data)
    return fitted.labels_

def hierarchical_clustering(data, k):
    """Ward agglomerative clustering into ``k`` clusters.

    The model is fitted on the pairwise cosine-similarity matrix of ``data``,
    i.e. each sample is described by its similarities to all samples.

    Returns the cluster label of every row.
    """
    cosine_similarity_matrix = cosine_similarity(data)
    # Fix: the `affinity` keyword was deprecated and later removed from
    # AgglomerativeClustering. Ward linkage only supports the Euclidean
    # metric, which is the default, so the argument is simply dropped.
    hierarchical_model = AgglomerativeClustering(n_clusters=k, linkage="ward")
    cluster_labels = hierarchical_model.fit_predict(cosine_similarity_matrix)
    return cluster_labels

def perform_dbscan(data, epsilon, min_samples):
    """Run DBSCAN with ``eps=epsilon`` and ``min_samples`` and return the label of every row."""
    return DBSCAN(eps=epsilon, min_samples=min_samples).fit_predict(data)

def perform_spectral_clustering(data, n_clusters):
    """Run spectral clustering with ``n_clusters`` clusters and return the label of every row."""
    return SpectralClustering(n_clusters=n_clusters).fit_predict(data)

def perform_optics(data, min_samples, max_eps):
    """Run OPTICS with ``min_samples`` and ``max_eps`` and return the label of every row."""
    return OPTICS(min_samples=min_samples, max_eps=max_eps).fit_predict(data)


def evaluate_clustering(ground_truth_labels, cluster_labels, data):
    """Score a clustering against the ground truth and against the data geometry.

    Returns:
        tuple ``(ari, dbi, silhouette_avg, nmi, jaccard_coefficient)``:
        - ari: adjusted Rand index (1 is a perfect match).
        - dbi: Davies-Bouldin index (lower is better).
        - silhouette_avg: mean silhouette (closer to 1 is better).
        - nmi: normalised mutual information (closer to 1 is better).
        - jaccard_coefficient: micro-averaged Jaccard score between the raw
          label ids. Cluster ids are not aligned to class ids first, so the
          value depends on the arbitrary numbering of the clusters.

        ``dbi`` and ``silhouette_avg`` are ``nan`` when the clustering does
        not contain between 2 and ``n_samples - 1`` distinct labels.
    """
    ari = adjusted_rand_score(ground_truth_labels, cluster_labels)
    nmi = normalized_mutual_info_score(ground_truth_labels, cluster_labels)

    # Fix: both geometric scores raise unless 2 <= n_labels <= n_samples - 1
    # (e.g. DBSCAN putting everything into one cluster). Report nan instead
    # of aborting the whole evaluation.
    n_found = len(np.unique(cluster_labels))
    if 2 <= n_found <= len(cluster_labels) - 1:
        dbi = davies_bouldin_score(data, cluster_labels)
        silhouette_avg = silhouette_score(data, cluster_labels)
    else:
        dbi = float('nan')
        silhouette_avg = float('nan')

    jaccard_coefficient = jaccard_score(ground_truth_labels, cluster_labels, average='micro')

    return ari, dbi, silhouette_avg, nmi, jaccard_coefficient


In [None]:
# Round-trip the encoder through an HDF5 file, then use the reloaded model.
# NOTE(review): as far as the visible cells show, `encoder_model` is not the
# encoder instance fitted in the k-fold loop - confirm which weights are
# meant to be saved here.
encoder_model.save('simple_encoder_model.h5')
encoder_model = load_model('simple_encoder_model.h5')
# Generate embeddings using the loaded encoder
embeddings = encoder_model.predict(X_padded)
# Standardise every embedding dimension (zero mean, unit variance).
embeddings_standardized = scaler.fit_transform(embeddings)

In [None]:
# Pick the K-Means cluster count (2..14) with the highest mean silhouette.
silhouette_scores = []
for i in range(2, 15):
    # Fix: the loop used to assign to `labels` and `model`, clobbering the
    # notebook-level training `labels` array used by the embedding plots.
    # A fixed seed makes the selected k reproducible.
    kmeans_candidate = KMeans(n_clusters=i, random_state=42)
    candidate_labels = kmeans_candidate.fit_predict(embeddings_standardized)
    silhouette_avg = silhouette_score(embeddings_standardized, candidate_labels)
    silhouette_scores.append(silhouette_avg)

# Index 0 of the score list corresponds to k = 2.
optimal_k_sil = silhouette_scores.index(max(silhouette_scores)) + 2
print(f"Optimal number of clusters (k): {optimal_k_sil}")

In [None]:
# Component count reaching the default 95% explained variance; it is computed
# but the active code below uses a fixed 2 components instead.
num_components = find_optimal_pca_components(embeddings_standardized)

# reduced_data_stand = perform_pca(embeddings_standardized, num_components)
reduced_data_stand = perform_pca(embeddings_standardized, 2)
# K-Means with k fixed to 2.
kmeans_labels = kmeans_clustering(reduced_data_stand, 2)

# kmeans_labels = kmeans_clustering(reduced_data_stand, optimal_k_sil)

In [None]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# 2-D t-SNE projection of the PCA-reduced embeddings, coloured by K-Means label.
tsne = TSNE(n_components=2, random_state=42)

# Fit and transform your data using t-SNE
combined_features_tsne  = tsne.fit_transform(reduced_data_stand)

plt.scatter(combined_features_tsne[:, 0], combined_features_tsne[:, 1], c=kmeans_labels, cmap='rainbow')
plt.title("K-Means Clustering with t-SNE Visualization")
plt.xlabel("t-SNE Component 1")
plt.ylabel("t-SNE Component 2")
plt.show()

In [None]:
import matplotlib.pyplot as plt
import umap.umap_ as umap

# 2-D UMAP projection of the PCA-reduced embeddings, coloured by K-Means label.
umap_model = umap.UMAP(n_components=2, random_state=42)
combined_features_umap = umap_model.fit_transform(reduced_data_stand)

plt.scatter(combined_features_umap[:, 0], combined_features_umap[:, 1], c=kmeans_labels, cmap='rainbow')
plt.title("K-Means Clustering with UMAP Visualization")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")
plt.show()

In [None]:
# Score the K-Means labels against the ground truth and the PCA-reduced data.
ari_k, dbi_k, silhouette_avg_k, nmi_k, js_k = evaluate_clustering(ground_truth_labels, kmeans_labels,reduced_data_stand)
print("ARI:", ari_k)
print("Davies-Bouldin Index:", dbi_k)
print("Silhouette Score:", silhouette_avg_k)
print("NMI:", nmi_k)
print("Jaccard-coefficient:", js_k)

In [None]:
import scipy.cluster.hierarchy as sch
from sklearn.metrics.pairwise import cosine_similarity
# Ward-linkage dendrogram of the PCA-reduced embeddings; the returned dict is
# kept for the cluster-count cell below.
dendrogram = sch.dendrogram(sch.linkage(reduced_data_stand, method='ward'))

In [None]:
optimal_linkage_distance = 450
# Adjust this value as per your observation

# Count the number of clusters based on the linkage distance.
# Each entry of dendrogram['dcoord'] describes one merge and d[1] is the
# height at which it happens.
num_merges_above_cut = sum(1 for d in dendrogram['dcoord'] if d[1] > optimal_linkage_distance)
# Fix: cutting the tree at a given height leaves (merges above the cut) + 1
# clusters; the previous code reported the merge count itself.
num_clusters = num_merges_above_cut + 1

print(f"Optimal number of clusters: {num_clusters}")

In [None]:
# The cluster count is fixed to 2; the dendrogram-derived value is not used.
# optimal_k_hierarchical = num_clusters
optimal_k_hierarchical = 2

hierarchical_labels = hierarchical_clustering(reduced_data_stand, optimal_k_hierarchical)
from collections import Counter

# Assuming 'hierarchical_labels' is your label array
# NOTE: this rebinds `label_counts`, which an earlier cell used for the class
# counts of df_sample.
label_counts = Counter(hierarchical_labels)

# Print the label counts
for label, count in label_counts.items():
    print(f"Label {label}: Count {count}")

In [None]:
# Score the hierarchical labels against the ground truth and the PCA-reduced data.
ari_h, dbi_h, silhouette_avg_h, nmi_h, js_h = evaluate_clustering(ground_truth_labels, hierarchical_labels, reduced_data_stand)
print("ARI:", ari_h)
print("Davies-Bouldin Index:", dbi_h)
print("Silhouette Score:", silhouette_avg_h)
print("NMI:", nmi_h)
print("Jaccard-coefficient:", js_h)


In [None]:
# 2-D t-SNE projection of the PCA-reduced embeddings, coloured by hierarchical label.
tsne = TSNE(n_components=2, random_state=42)

# Fit and transform your data using t-SNE
combined_features_tsne  = tsne.fit_transform(reduced_data_stand)

plt.scatter(combined_features_tsne[:, 0], combined_features_tsne[:, 1], c=hierarchical_labels, cmap='rainbow')
plt.title("Hierarchical Clustering with t-SNE Visualization")
plt.xlabel("t-SNE Component 1")
plt.ylabel("t-SNE Component 2")
plt.show()

In [None]:
# 2-D UMAP projection of the PCA-reduced embeddings, coloured by hierarchical label.
umap_model = umap.UMAP(n_components=2, random_state=42)
combined_features_umap = umap_model.fit_transform(reduced_data_stand)

plt.scatter(combined_features_umap[:, 0], combined_features_umap[:, 1], c=hierarchical_labels, cmap='rainbow')
plt.title("Hierarchical Clustering with UMAP Visualization")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")
plt.show()

In [None]:
from sklearn.neighbors import NearestNeighbors
# Sorted nearest-neighbour distance curve, plotted to help choose the DBSCAN eps.
neighbors = NearestNeighbors(n_neighbors=20)
neighbors_fit = neighbors.fit(reduced_data_stand)
distances, indices = neighbors_fit.kneighbors(reduced_data_stand)
# Each column is sorted independently; column 1 is then kept. The query points
# are the fitted points themselves, so column 0 is expected to be the
# zero distance of each point to itself.
distances = np.sort(distances, axis=0)
distances = distances[:,1]
plt.figure(figsize=(10,8))
plt.plot(distances)
num_features = reduced_data_stand.shape[1]
print("Number of features:", num_features)

In [None]:
# DBSCAN parameters, set by hand.
epsilon_dbscan = 3

min_samples_dbscan = 5

dbscan_labels = perform_dbscan(reduced_data_stand, epsilon_dbscan, min_samples_dbscan)
# Score the DBSCAN labels against the ground truth and the PCA-reduced data.
ari_db, dbi_db, silhouette_avg_db, nmi_db, js_db = evaluate_clustering(ground_truth_labels, dbscan_labels, reduced_data_stand)
print("ARI:", ari_db)
print("Davies-Bouldin Index:", dbi_db)
print("Silhouette Score:", silhouette_avg_db)
print("NMI:", nmi_db)
print("Jaccard-coefficient:", js_db)

In [None]:

# 2-D t-SNE projection of the PCA-reduced embeddings, coloured by DBSCAN label.
tsne = TSNE(n_components=2, random_state=42)

# Fit and transform your data using t-SNE
combined_features_tsne  = tsne.fit_transform(reduced_data_stand)

plt.scatter(combined_features_tsne[:, 0], combined_features_tsne[:, 1], c=dbscan_labels, cmap='rainbow')
plt.title("DBSCAN Clustering with t-SNE Visualization")
plt.xlabel("t-SNE Component 1")
plt.ylabel("t-SNE Component 2")
plt.show()

In [None]:
# 2-D UMAP projection of the PCA-reduced embeddings, coloured by DBSCAN label.
umap_model = umap.UMAP(n_components=2, random_state=42)
combined_features_umap = umap_model.fit_transform(reduced_data_stand)

plt.scatter(combined_features_umap[:, 0], combined_features_umap[:, 1], c=dbscan_labels, cmap='rainbow')
plt.title("DBSCAn Clustering with UMAP Visualization")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")
plt.show()

In [None]:

# Create a range of possible cluster numbers
cluster_range = range(2, 20)

# NOTE: rebinds `silhouette_scores` from the K-Means sweep cell.
silhouette_scores = []



# Using the Silhouette Score to find the optimal number of clusters
for n_clusters in cluster_range:
    spectral_model = SpectralClustering(n_clusters=n_clusters, affinity='nearest_neighbors')
    spectral_labels = spectral_model.fit_predict(reduced_data_stand)
    silhouette_avg = silhouette_score(reduced_data_stand, spectral_labels)
    silhouette_scores.append(silhouette_avg)
    
# Cluster count with the highest mean silhouette.
optimal_num_clusters = cluster_range[silhouette_scores.index(max(silhouette_scores))]
print(f"Optimal number of clusters: {optimal_num_clusters}")

# Plot the Silhouette Score
# Drawn into the second slot of a 1x2 grid; nothing in this cell fills the first slot.
plt.subplot(1, 2, 2)
plt.plot(cluster_range, silhouette_scores, marker='o')
plt.title("Silhouette Score for Spectral Clustering")
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Score")



plt.tight_layout()
plt.show()


In [None]:
# Building the clustering model 
# The cluster count is fixed to 2. Despite the `_rbf` suffix in the variable
# names, the affinity used is 'nearest_neighbors'.
spectral_model_rbf = SpectralClustering(n_clusters = 2, affinity = 'nearest_neighbors') 
# spectral_model_rbf = SpectralClustering(n_clusters = optimal_num_clusters, affinity = 'nearest_neighbors') 
  
# Training the model and Storing the predicted cluster labels 
labels_rbf = spectral_model_rbf.fit_predict(reduced_data_stand)

In [None]:
# 2-D t-SNE projection of the PCA-reduced embeddings, coloured by spectral label.
tsne = TSNE(n_components=2, random_state=42)

# Fit and transform your data using t-SNE
combined_features_tsne  = tsne.fit_transform(reduced_data_stand)

plt.scatter(combined_features_tsne[:, 0], combined_features_tsne[:, 1], c=labels_rbf, cmap='rainbow')
plt.title("Spectral Clustering with t-SNE Visualization")
plt.xlabel("t-SNE Component 1")
plt.ylabel("t-SNE Component 2")
plt.show()

In [None]:
# 2-D UMAP projection of the PCA-reduced embeddings, coloured by spectral label.
umap_model = umap.UMAP(n_components=2, random_state=42)
combined_features_umap = umap_model.fit_transform(reduced_data_stand)

plt.scatter(combined_features_umap[:, 0], combined_features_umap[:, 1], c=labels_rbf, cmap='rainbow')
plt.title("Spectral Clustering with UMAP Visualization")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")
plt.show()

In [None]:
# Score the spectral labels against the ground truth and the PCA-reduced data.
ari_sc, dbi_sc, silhouette_avg_sc, nmi_sc, js_sc = evaluate_clustering(ground_truth_labels, labels_rbf, reduced_data_stand)
print("ARI:", ari_sc)
print("Davies-Bouldin Index:", dbi_sc)
print("Silhouette Score:", silhouette_avg_sc)
print("NMI:", nmi_sc)
print("Jaccard-coefficient:", js_sc)


In [None]:
# One row per document: text, ground-truth label and the label assigned by
# each clustering method.
data = {
    'Corpus': X,
    'ground_truth_labels': ground_truth_labels,
    'kmeans_labels': kmeans_labels,
    'hierarchical_labels': hierarchical_labels,
    'dbscan_labels': dbscan_labels,
    'spectral_labels': labels_rbf
}

df_result = pd.DataFrame(data)

df_result

In [None]:
# Show full text in displayed cells (no column-width truncation).
pd.set_option('display.max_colwidth', None)
# NOTE(review): K-Means cluster ids are arbitrary, so treating cluster 1 as
# class 1 (and "misclassified" as id mismatch) assumes the ids happen to line
# up with the class labels - confirm before interpreting these tables.
# Filter rows where ground truth is 0 but kmeans is 1
df_misclassified_gt0_kmeans1 = df_result[(df_result['ground_truth_labels'] == 0) & (df_result['kmeans_labels'] == 1)]

# Filter rows where ground truth is 1 but kmeans is 0
df_misclassified_gt1_kmeans0 = df_result[(df_result['ground_truth_labels'] == 1) & (df_result['kmeans_labels'] == 0)]

# Filter rows where ground truth is 0 and kmeans is 0
df_classified_gt0_kmeans0 = df_result[(df_result['ground_truth_labels'] == 0) & (df_result['kmeans_labels'] == 0)]

# Filter rows where ground truth is 1 and kmeans is 1
df_classified_gt1_kmeans1 = df_result[(df_result['ground_truth_labels'] == 1) & (df_result['kmeans_labels'] == 1)]

In [None]:
# Display: ground truth 1, K-Means label 0.
df_misclassified_gt1_kmeans0

In [None]:
# Display: ground truth 0, K-Means label 1.
df_misclassified_gt0_kmeans1

In [None]:
# Display: ground truth 0, K-Means label 0.
df_classified_gt0_kmeans0

In [None]:
# Display: ground truth 1, K-Means label 1.
df_classified_gt1_kmeans1

In [None]:
import altair as alt
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
def NMF_modeling(df_data, components):
    """Assign each document a dominant NMF topic and attach that topic's top bigrams.

    A bigram TF-IDF matrix (English stop words removed) is built from
    ``df_data['Corpus']`` and factorised with NMF (``init='nndsvd'``). The
    input frame is not modified; a copy is annotated.

    Args:
        df_data: DataFrame with a ``'Corpus'`` text column.
        components: number of NMF topics.

    Returns:
        tuple ``(df, model)``: the annotated copy and the fitted
        ``TfidfVectorizer``. Added columns:
        - ``'topic'``: index of the topic with the largest weight.
        - ``'topic_words_tfid'``: list of ``(bigram, score)`` pairs for the
          10 highest-weighted bigrams of that topic; the score is the
          vectorizer's IDF value of the bigram.
        - ``'topic_words'``: the same bigrams without scores.
    """
    n_words = 10
    annotated = df_data.copy()

    # Bigram TF-IDF representation of the documents.
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(2, 2))
    doc_term_matrix = vectorizer.fit_transform(annotated['Corpus'])

    # Factorise, then take the strongest topic per document.
    nmf_model = NMF(n_components=components, init='nndsvd')
    nmf_model.fit(doc_term_matrix)
    annotated['topic'] = nmf_model.transform(doc_term_matrix).argmax(axis=1)

    scored_words_by_topic = {}
    words_by_topic = {}
    feature_names = vectorizer.get_feature_names_out()

    for topic_id, weights in enumerate(nmf_model.components_):
        # Indices of the n_words largest weights, largest first.
        ranked = weights.argsort()[:-n_words - 1:-1]
        words = [feature_names[i] for i in ranked]
        idf_values = [vectorizer.idf_[vectorizer.vocabulary_[w]] for w in words]

        scored_words_by_topic[topic_id] = list(zip(words, idf_values))
        words_by_topic[topic_id] = words

    # Broadcast the per-topic word lists onto the documents.
    dominant_topics = annotated['topic']
    annotated['topic_words_tfid'] = [scored_words_by_topic[t] for t in dominant_topics]
    annotated['topic_words'] = [words_by_topic[t] for t in dominant_topics]

    return annotated, vectorizer


In [None]:
# Two-topic NMF on each of the four ground-truth / K-Means quadrants.
# The suffix is <ground truth>_<kmeans label>; note that the 0_0 and 1_1
# results hold the agreeing rows despite the "missclassified" prefix.
result_missclassified_1_0, tfidf_model = NMF_modeling(df_misclassified_gt1_kmeans0, components=2)
result_missclassified_0_1,tfidf_model_0_1 =NMF_modeling(df_misclassified_gt0_kmeans1, components=2)
result_missclassified_0_0,tfidf_model_0_0 =NMF_modeling(df_classified_gt0_kmeans0, components=2)
result_missclassified_1_1,tfidf_model_1_1 =NMF_modeling(df_classified_gt1_kmeans1, components=2)

In [None]:
# Display: topics for ground truth 0, K-Means label 1.
result_missclassified_0_1

In [None]:
# Display: topics for ground truth 1, K-Means label 0.
result_missclassified_1_0

In [None]:
# Display: topics for ground truth 0, K-Means label 0.
result_missclassified_0_0

In [None]:
# Display: topics for ground truth 1, K-Means label 1.
result_missclassified_1_1

In [None]:
from wordcloud import WordCloud
def visualize_combined_topic_words(df):
    """Show a word cloud and a bar plot of the topic words stored in ``df``.

    Args:
        df: DataFrame with a ``'topic_words_tfid'`` column holding, per row,
            a list of ``(word, score)`` pairs.

    The pairs of all rows are pooled. The word cloud uses one score per
    distinct word (``dict`` keeps the last occurrence); the bar plot receives
    every pooled pair.
    """
    # Pool the (word, score) pairs of every row into one flat list.
    scored_words = []
    for row_pairs in df['topic_words_tfid']:
        scored_words.extend(row_pairs)

    # Word cloud sized by score.
    cloud_builder = WordCloud(width=800, height=400, background_color='white')
    word_cloud = cloud_builder.generate_from_frequencies(dict(scored_words))

    plt.figure(figsize=(12, 6))
    plt.imshow(word_cloud, interpolation='bilinear')
    plt.title('Combined Topic Words')
    plt.axis('off')
    plt.show()

    # Bar plot of the same pairs.
    scores_frame = pd.DataFrame(scored_words, columns=['topic_words', 'tfidf_scores'])
    plt.figure(figsize=(12, 6))
    sns.barplot(x='topic_words', y='tfidf_scores', data=scores_frame)
    plt.title('Combined Topic Words')
    plt.xticks(rotation=45, ha='right')
    plt.show()

# Visualize combined topic words for all rows
# Visualize combined topic words for each ground-truth / K-Means quadrant
# (suffix is <ground truth>_<kmeans label>).
visualize_combined_topic_words(result_missclassified_1_1)

visualize_combined_topic_words(result_missclassified_0_0)
visualize_combined_topic_words(result_missclassified_1_0)
visualize_combined_topic_words(result_missclassified_0_1)