In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np
import pickle
from sklearn import metrics

# Bare `%matplotlib` (no backend argument) asks IPython to enable matplotlib's
# interactive support with the default GUI backend, so figures are expected to
# open in separate windows rather than inline.
# NOTE(review): confirm a GUI backend (not `inline`) is what is wanted here.
%matplotlib

def _scatter_by_class(embedding, positive_mask, axis_prefix, title):
    """Draw one 2-D embedding as a scatter plot, positives in red, negatives in blue.

    Args:
        embedding: Array of shape (n_samples, 2) holding the projected points.
        positive_mask: Boolean array of length n_samples; True marks a positive sample.
        axis_prefix: Text placed before " 1" / " 2" in the axis labels.
        title: Figure title.
    """
    plt.figure()
    class_styles = (
        (positive_mask, 'red', 'Positive'),
        (~positive_mask, 'blue', 'Negative'),
    )
    for mask, color, label in class_styles:
        # A boolean mask selects rows directly and yields flat 1-D coordinate
        # arrays (indexing with the tuple from np.where gave shape (1, k)).
        plt.scatter(embedding[mask, 0], embedding[mask, 1], color=color, label=label, alpha=0.1)
    plt.xlabel(f'{axis_prefix} 1')
    plt.ylabel(f'{axis_prefix} 2')
    plt.legend()
    plt.title(title)
    plt.show()


def analyze_data(file_name, input_columns, threshold):
    """Project a pickled dataset to 2-D with PCA and t-SNE, plot it, and score class separation.

    The first ``input_columns`` columns of the loaded table are the features.
    The column at position ``input_columns`` is binarised: a sample is positive
    (label 1) when its value is strictly below ``threshold``, otherwise negative
    (label 0). One scatter plot is shown per projection, then the silhouette
    coefficient and Davies-Bouldin index of the two classes are printed for each
    projection.

    Args:
        file_name: Path to a pickle file holding a table that supports ``.iloc``.
        input_columns: Number of leading feature columns; also the positional
            index of the column that is thresholded.
        threshold: Cut-off applied to the thresholded column.

    Returns:
        None. Results are reported through figures and printed text.

    Notes:
        When the threshold puts every sample in the same class, both scores are
        undefined. The plots are still drawn and a message is printed in place
        of the scores, so the remaining calls in a notebook cell keep running.
    """
    # NOTE: pickle.load can execute arbitrary code embedded in the file; only
    # point this at pickle files from a trusted source.
    with open(file_name, 'rb') as fl:
        database_full = pickle.load(fl)

    features = database_full.iloc[:, 0:input_columns]
    is_positive = np.asarray(database_full.iloc[:, input_columns] < threshold, dtype=bool)
    labels = is_positive.astype(int)

    # PCA projection and plot
    embedding_pca = PCA(n_components=2).fit_transform(features)
    _scatter_by_class(embedding_pca, is_positive, 'Principal Component', f'2D PCA for {file_name}')

    # t-SNE projection and plot (fixed seed so repeated runs give the same layout)
    embedding_tsne = TSNE(n_components=2, random_state=42).fit_transform(features)
    _scatter_by_class(embedding_tsne, is_positive, 't-SNE Component', f'2D t-SNE for {file_name}')

    print(f"File: {file_name}")
    if np.unique(labels).size < 2:
        # Both metrics need at least two distinct labels and would raise ValueError.
        print(
            "Silhouette Coefficient and Davies-Bouldin Index are undefined: "
            f"threshold {threshold} puts every sample in one class"
        )
        return

    for method, embedding in (('PCA', embedding_pca), ('t-SNE', embedding_tsne)):
        print(f"{method} Silhouette Coefficient: {metrics.silhouette_score(embedding, labels)}")
        print(f"{method} Davies-Bouldin Index: {metrics.davies_bouldin_score(embedding, labels)}")


In [None]:
# Activity datasets: 54 feature columns; each file is analysed at three
# thresholds (200, 250, 300) on the column that follows the features.
for activity_file in ('./database_full_ac.pkl', './database_high_quality_ac.pkl'):
    for activity_threshold in (200, 250, 300):
        analyze_data(activity_file, 54, activity_threshold)

In [None]:
# Stability datasets: 55 feature columns; each file is analysed at three
# thresholds (-1, 0, 1) on the column that follows the features.
for stability_file in ('./database_full_st.pkl', './database_high_quality_st.pkl'):
    for stability_threshold in (-1, 0, 1):
        analyze_data(stability_file, 55, stability_threshold)