In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from wordcloud import WordCloud
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import umap.umap_ as umap
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
# Location of the pickled DataFrame, relative to the notebook's working directory.
file_path = "../../data/adhd-beliefs-pt/adhd-beliefs-pt-embeddings-serafim.pkl"
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
df = pd.read_pickle(filepath_or_buffer=file_path)

In [None]:
# Collapse the raw survey answers into the two-level English labels used by the plots and the
# group comparisons. The already-translated label is accepted alongside the raw answer so that
# re-running this cell is a no-op; comparing against the raw answer only would flip every row
# to "No ADHD" / "Male" on a second run.
df["adhd_diagnosis"] = df["adhd_diagnosis"].apply(
    lambda x: "ADHD" if x in ("Sim, diagnosticado", "ADHD") else "No ADHD"
)
# NOTE(review): any answer other than "Feminino" (including missing values) is labelled "Male";
# confirm that the survey has no other response options.
df["sex"] = df["sex"].apply(
    lambda x: "Female" if x in ("Feminino", "Female") else "Male"
)

In [None]:
def visualize_umap(selected_column, random_state=42):
    """Project one embedding column to 2-D with UMAP and scatter-plot it.

    Reads the module-level ``df``, standardizes the vectors stored in
    ``selected_column``, fits UMAP (10 neighbours, ``min_dist=0.1``, cosine
    metric) and writes the two coordinates back to ``df`` as ``'UMAP1'`` and
    ``'UMAP2'``, overwriting any earlier projection. Points are coloured by
    ``'adhd_diagnosis'`` and marked by ``'sex'``.

    Args:
        selected_column: Name of the ``df`` column holding one embedding
            vector per row (assumed to be equal-length numeric sequences).
        random_state: Seed forwarded to UMAP so the layout is the same on
            every run. Pass ``None`` for UMAP's unseeded behaviour.
    """
    X = df[selected_column].tolist()
    X = StandardScaler().fit_transform(X)

    # UMAP is stochastic; seeding it keeps the figure reproducible across kernel restarts.
    umap_model = umap.UMAP(
        n_neighbors=10,
        min_dist=0.1,
        metric='cosine',
        random_state=random_state,
    )
    X_umap = umap_model.fit_transform(X)

    # Stored on the shared frame because visualize_kmeans plots on these coordinates.
    df['UMAP1'] = X_umap[:, 0]
    df['UMAP2'] = X_umap[:, 1]

    # Plot UMAP
    sns.scatterplot(data=df, x='UMAP1', y='UMAP2', hue='adhd_diagnosis', style='sex', alpha=0.7)
    plt.title(f'UMAP of Sentence Embeddings ({selected_column})')
    plt.tight_layout()
    plt.show()

In [None]:
def visualize_kmeans(selected_column, n_clusters=5):
    """Cluster one embedding column with KMeans and plot the clusters on the UMAP layout.

    Reads the module-level ``df``, standardizes the vectors stored in
    ``selected_column``, fits KMeans (seeded with ``random_state=42``) and
    writes the labels to ``df['kmeans_cluster']``. The scatter plot uses the
    ``'UMAP1'`` / ``'UMAP2'`` columns that ``visualize_umap`` writes, so that
    function must have been called for the same column beforehand.

    Args:
        selected_column: Name of the ``df`` column holding one embedding
            vector per row.
        n_clusters: Number of KMeans clusters.

    Raises:
        ValueError: If ``df`` has no ``'UMAP1'`` / ``'UMAP2'`` columns yet.
    """
    # Fail before the clustering work if there are no coordinates to plot on.
    missing = [name for name in ('UMAP1', 'UMAP2') if name not in df.columns]
    if missing:
        raise ValueError(
            f"df is missing {missing}; call visualize_umap({selected_column!r}) "
            "before visualize_kmeans so the 2-D coordinates exist."
        )

    X = df[selected_column].tolist()
    X = StandardScaler().fit_transform(X)

    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    df['kmeans_cluster'] = kmeans.fit_predict(X)

    plt.figure(figsize=(8, 6))
    sns.scatterplot(
        data=df,
        x='UMAP1',
        y='UMAP2',
        hue='kmeans_cluster',
        palette='rocket',
        style='adhd_diagnosis',
        s=80,
        edgecolor='w',
        alpha=0.85
    )
    plt.title(f'KMeans Clustering of Sentence Embeddings ({selected_column})', fontsize=14)
    plt.xlabel('UMAP1')
    plt.ylabel('UMAP2')
    # Anchor the legend outside the axes so it does not cover the points.
    plt.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()

In [None]:
def compute_avg_similarity(group_df, selected_column):
    """Return the mean pairwise cosine similarity of the embeddings in a group.

    Args:
        group_df: DataFrame whose ``selected_column`` holds one embedding
            vector per row.
        selected_column: Name of the embedding column to compare.

    Returns:
        The mean cosine similarity over all unordered pairs of distinct rows,
        or NaN when the group has fewer than two rows (there are no pairs).
    """
    # With zero rows np.vstack raises, and with one row the mean of an empty
    # selection is NaN plus a RuntimeWarning; return NaN explicitly for both.
    if len(group_df) < 2:
        return np.nan

    vectors = np.vstack(group_df[selected_column].tolist())
    similarity_matrix = cosine_similarity(vectors)
    # Strict upper triangle: each unordered pair once, self-similarity excluded.
    tri_upper = similarity_matrix[np.triu_indices(len(similarity_matrix), k=1)]
    return tri_upper.mean()

In [None]:
def get_similarity(selected_column):
    """Print the average within-group cosine similarity for six subgroups of ``df``.

    The subgroups are ADHD women, non-ADHD women, all women, all men, all
    ADHD and all non-ADHD participants, selected from the module-level ``df``
    by its ``'sex'`` and ``'adhd_diagnosis'`` columns. Each score comes from
    ``compute_avg_similarity``.

    Args:
        selected_column: Name of the embedding column to compare.
    """
    is_female = df['sex'] == 'Female'
    is_male = df['sex'] == 'Male'
    has_adhd = df['adhd_diagnosis'] == 'ADHD'
    no_adhd = df['adhd_diagnosis'] == 'No ADHD'

    groups = {
        "ADHD Women": df[is_female & has_adhd],
        # Select women without ADHD explicitly. Taking "every row that is not
        # an ADHD woman" would also pull in all men under this label.
        "Non-ADHD Women": df[is_female & no_adhd],
        "Women": df[is_female],
        "Men": df[is_male],
        "ADHD": df[has_adhd],
        "Non-ADHD": df[no_adhd],
    }

    for label, group in groups.items():
        print(f"[{selected_column}] Avg similarity - {label}:", compute_avg_similarity(group, selected_column))

In [None]:
# Embedding columns of `df` to analyze, one entry per text field plus the merged text.
columns = [
    "special_interest_embedding",
    "diary_entry_embedding",
    "selfdefining_memory_embedding",
    "empty_sheet_embedding",
    "merged_text_embedding",
]

In [None]:
# Keep a handle on the frame as it stands before any filtering, so every column is filtered
# from the same starting rows. Re-assigning the filtered result to `df` and filtering that
# again would carry the rows dropped for one column into every later column's analysis.
df_unfiltered = df

for col in columns:
    # Keep only rows whose embedding for this column has at least one non-zero component.
    # NOTE(review): an all-zero vector presumably marks a missing answer; confirm upstream.
    has_nonzero = df_unfiltered[col].apply(lambda vec: any(v != 0 for v in vec))
    # The helper functions read and write the module-level `df`, so rebind it to this column's
    # subset; the copy keeps their column writes from reaching `df_unfiltered`.
    df = df_unfiltered[has_nonzero].copy()
    print(df.shape)
    visualize_umap(col)
    visualize_kmeans(col, n_clusters=5)
    get_similarity(col)
    print(f"Visualizations and clustering for {col} completed.\n")
    print("\n" + "="*50 + "\n")