## Imports

In [1]:
import pandas as pd
from MainPackage import MixtureModelBernoulli
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import adjusted_rand_score
import time

# Run model

In [None]:
# Model configuration: number of mixture components and the seed handed to the model.
num_classes = 4
random_state = 100

# Load the observed data; the first CSV column is used as the row index.
df = pd.read_csv("../Data/test_data.csv",index_col=0)

# Set up the model.
# NOTE(review): burn_in=0 presumably keeps every draw so the convergence
# plots further down can show the whole chain - confirm against MainPackage.
C_list = MixtureModelBernoulli(num_classes=num_classes,
                               random_state=random_state,
                               burn_in=0,
                               max_iter=1000)


# Fit the model and time it. perf_counter() is a monotonic, high-resolution
# clock, so the elapsed time cannot be distorted by system clock adjustments
# the way a time.time() difference can.
start_time = time.perf_counter()
C_list.fit(df)
total_time = time.perf_counter() - start_time

print("DONE!")
print(f"Total time: {round(total_time,5)} (secs)")

# Get parameter estimates (class assignments k, item parameters theta, class probabilities pi)
k, theta, pi = C_list.get_params()

# Print Parameters

## Pi

In [None]:
# Wrap the estimated class probabilities in a labelled one-column table:
# one row per cluster, numbered from 0.
pi = pd.DataFrame(pi)
pi.index = [f"Cluster {c}" for c in range(len(pi))]
pi.columns = ["Probability"]
display(pi)

## Theta

In [None]:
# Wrap the estimated theta matrix in a labelled table:
# rows are clusters (numbered from 0), columns are items (numbered from 1).
theta = pd.DataFrame(theta)
theta.index = [f"Cluster {c}" for c in range(theta.shape[0])]
theta.columns = [f"Item {j}" for j in range(1, theta.shape[1] + 1)]
display(theta)

## K

In [None]:
# Wrap the per-subject class estimate in a labelled one-column table:
# one row per subject, numbered from 1.
k = pd.DataFrame(k)
k.index = [f"Subject {s}" for s in range(1, len(k) + 1)]
k.columns = ["Avg Class"]
display(k)

## Class Membership

In [None]:
# Fetch the class membership scores from the fitted model and label them:
# rows are subjects (numbered from 1), columns are clusters (numbered from 0).
memebership = pd.DataFrame(C_list.get_class_membership_scores())
memebership.index = [f"Subject {s}" for s in range(1, memebership.shape[0] + 1)]
memebership.columns = [f"Cluster {c}" for c in range(memebership.shape[1])]
display(memebership)

# Cluster Mapping
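Cluster labels are arbitrary, so the cluster numbers assigned by the model do not have to match the numbering used in the true clustering. We will cross-tabulate the true and predicted k vectors to find the mapping between the two sets of labels that gives the best agreement.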

In [None]:
# Get the true k vector. We subtract 1 to get 0-based class labels,
# matching the 0-based labels of the predictions.
df_combined_k = pd.read_csv("../Data/test_k_vector.csv",index_col=0) - 1
df_combined_k.columns = ["True Class"]

# Round the predicted k to the nearest integer label. k is a one-column
# frame, so flatten it to 1-D before assigning it to a single column.
df_combined_k["Pred Class"] = k.round().astype(np.int64).to_numpy().ravel()

# Frequency table: rows are true classes, columns are predicted classes,
# and each cell counts the subjects with that (true, predicted) pair.
freq_table = df_combined_k.copy()
freq_table["Ones"] = 1
# Pass the aggregation by name: handing pandas the Python builtin `sum`
# is deprecated and emits a FutureWarning in recent pandas versions.
freq_table = freq_table.pivot_table(columns="Pred Class",
                                    index="True Class",
                                    values="Ones",
                                    aggfunc="sum")
# Pairs that never occur come back as NaN; show them as a count of 0.
freq_table = freq_table.fillna(0)
freq_table

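Reading down each predicted-cluster column of the table above, the row with the largest count shows which true cluster it corresponds to; for example, predicted cluster 0 corresponds to true cluster 1. The full mapping (predicted -> true) is: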
* 0->1
* 1->3
* 2->0
* 3->2

We will now relabel the predicted parameters so that the cluster numbers line up

In [None]:
# mapping[pred] = true: the true-cluster label for each predicted-cluster label,
# read off the frequency table.
mapping = [1,3,0,2]
assert sorted(mapping) == list(range(len(mapping))), "mapping must be a permutation of the cluster labels"

# mapping_inv[true] = pred: the inverse permutation. It is derived from
# `mapping` rather than typed in by hand so the two can never disagree.
mapping_inv = np.argsort(mapping).tolist()


def cluster_mapping(x):
    """Return the true-cluster label that predicted-cluster label ``x`` corresponds to."""
    return mapping[x]


# Map pred k: relabel every predicted class with its true-cluster label
df_combined_k_mapped = df_combined_k.copy()
df_combined_k_mapped["Pred Class"] = df_combined_k["Pred Class"].map(cluster_mapping)

# Reorder pi: row t of the result is the predicted row for true cluster t
df_combined_pi_mapped = pi.iloc[mapping_inv].copy()
df_combined_pi_mapped.index = [f"Cluster {x}" for x in range(len(pi))]
df_combined_pi_mapped.columns = ["Pred Prob"]

# Reorder theta the same way
df_pred_theta_mapped = theta.iloc[mapping_inv].copy()
df_pred_theta_mapped.index = [f"Cluster {x}" for x in range(theta.shape[0])]

# Comparing K vectors with Adjusted Rand Score
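The adjusted Rand score compares two clusterings independently of how the clusters are labelled. A score of 1 means the two clusterings are identical, a score near 0 is what random cluster labels would give, and the score can be negative (down to -0.5) when the agreement is worse than chance.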

In [None]:
# Show true and (relabelled) predicted classes side by side, one column per subject.
display(df_combined_k_mapped.T)

# Score the agreement between the two labelings.
rand_score = adjusted_rand_score(
    labels_true=df_combined_k_mapped["True Class"],
    labels_pred=df_combined_k_mapped["Pred Class"],
)
print(f"rand score : {rand_score}")

With a score of .968 the clusters are almost equivalent. This gives us strong evidence the algorithm is able to accurately cluster the subjects.

# Comparing Pi Vector

In [None]:
# Load the true pi vector and give it the same "Cluster N" row labels
# as the predicted table so the two line up row by row.
df_true_pi = pd.read_csv("../Data/test_pi_vector.csv",index_col=0)
df_true_pi.index = [f"Cluster {c}" for c in range(len(pi))]

# Put the true probabilities next to the predicted ones (aligned on the row labels).
df_combined_pi_mapped["True Prob"] = df_true_pi["Probability"]
display(df_combined_pi_mapped)

# Comparing Theta Matrix

In [None]:
# Load the true theta matrix and label its rows "Cluster 0", "Cluster 1", ...
df_true_theta = pd.read_csv("../Data/test_theta_matrix.csv",index_col=0)
df_true_theta.index = [f"Cluster {c}" for c in range(len(pi))]

# Show the true matrix followed by the relabelled predicted matrix.
print("True Theta Matrix")
display(df_true_theta)

print("Pred Theta Matrix")
display(df_pred_theta_mapped)

# Analyze Convergence for thetas

In [None]:
# Seed NumPy's global generator so the random selections made with
# np.random.choice are reproducible. (Constructing np.random.RandomState(100)
# without keeping the object, as was done before, seeds nothing.)
np.random.seed(100)
number_of_rows = 5
number_of_cols = 2
samples_theta_params = C_list.samples_theta_params
print(samples_theta_params.shape)

# Plot every cluster against a few randomly chosen items (axis 2 of the samples).
# Items are drawn without replacement so the same item is not plotted twice.
c_range = [0,1,2,3]
num_items = samples_theta_params.shape[2]
j_range = np.random.choice(num_items, size=min(number_of_cols, num_items), replace=False)

# Number of draws contributing to the running mean at each position: 1, 2, ..., n
draw_counts = np.arange(1, samples_theta_params.shape[0] + 1)

plt.figure(figsize=(10,20))
num_plot = 1
for c in c_range:
    for j in j_range:
        chain = samples_theta_params[:, c, j]
        # Running mean that includes the current draw, computed in one pass.
        # The previous loop left position 0 at an artificial 0 and lagged
        # one draw behind everywhere else.
        running_avg = np.cumsum(chain) / draw_counts

        plt.subplot(number_of_rows,number_of_cols,num_plot)
        # Plot samples
        plt.plot(chain, ".r", label="Sample",alpha=.15)
        # Plot running avg
        plt.plot(running_avg, "-k,", label="Running\nAverage",alpha=.5)
        # Plot true val: predicted cluster c corresponds to true cluster mapping[c]
        truevals = np.full_like(running_avg, df_true_theta.iloc[mapping[c],j])
        plt.plot(truevals, "--b,", label="True Val")

        plt.xlabel("Sample Number")
        plt.ylabel("Parameter Value")
        plt.title(f"Convergence of theta[{c},{j}]")
        plt.legend(bbox_to_anchor = (1.05, 0.6))
        num_plot += 1
plt.tight_layout()

# Convergence Analysis for k

In [None]:
samples_class_assignments = C_list.samples_class_assignments
print(samples_class_assignments.shape)

# Pick random subjects (axis 1 of the samples), one per subplot slot.
# Subjects are drawn without replacement so none is plotted twice.
num_subjects = samples_class_assignments.shape[1]
i_range = np.random.choice(num_subjects,
                           size=min(number_of_cols*number_of_rows, num_subjects),
                           replace=False)

# Number of draws contributing to the running mean at each position: 1, 2, ..., n
draw_counts = np.arange(1, samples_class_assignments.shape[0] + 1)

plt.figure(figsize=(10,20))
num_plot = 1
for i in i_range:
    chain = samples_class_assignments[:, i]
    # Running mean that includes the current draw, computed in one pass.
    # The previous loop left position 0 at an artificial 0 and lagged one
    # draw behind everywhere else.
    running_avg = np.cumsum(chain) / draw_counts

    plt.subplot(number_of_rows,number_of_cols,num_plot)
    # Plot samples
    plt.plot(chain, ".r", label="Sample",alpha=.15)
    # Plot running avg. The transparency belongs on this line, as in the
    # theta and pi convergence plots, not on the subplot itself.
    plt.plot(running_avg, "-k,", label="Running\nAverage",alpha=.5)
    # Plot true val: the samples use the model's own labels, so translate the
    # subject's true class into the predicted label space with mapping_inv
    true_class = df_combined_k_mapped["True Class"].iloc[i]
    truevals = np.full_like(running_avg, mapping_inv[true_class])
    plt.plot(truevals, "--b,", label="True Val")
    plt.xlabel("Sample Number")
    plt.ylabel("Parameter Value")
    plt.title(f"Convergence of k[{i}]")
    plt.legend(bbox_to_anchor = (1.05, 0.6))
    plt.ylim(-.5,3.5)
    num_plot += 1
plt.tight_layout()

# Convergence Analysis for pi

In [None]:
samples_class_probabilities = C_list.samples_class_probabilities
print(samples_class_probabilities.shape)

# Number of draws contributing to the running mean at each position: 1, 2, ..., n
draw_counts = np.arange(1, samples_class_probabilities.shape[0] + 1)

plt.figure(figsize=(10,20))
num_plot = 1
for c in c_range:
    chain = samples_class_probabilities[:, c]
    # Running mean that includes the current draw, computed in one pass.
    # The previous loop left position 0 at an artificial 0 and lagged one
    # draw behind everywhere else.
    running_avg = np.cumsum(chain) / draw_counts

    plt.subplot(number_of_rows,number_of_cols,num_plot)
    # Plot samples
    plt.plot(chain, ".r", label="Sample",alpha=.15)
    # Plot running avg
    plt.plot(running_avg, "-k,", label="Running\nAverage",alpha=.5)
    # Plot true val: predicted cluster c corresponds to true cluster mapping[c].
    # The column is selected by name so the lookup does not depend on column order.
    true_prob = df_combined_pi_mapped["True Prob"].iloc[mapping[c]]
    truevals = np.full_like(running_avg, true_prob)
    plt.plot(truevals, "--b,", label="True Val")
    plt.xlabel("Sample Number")
    plt.ylabel("Parameter Value")
    plt.title(f"Convergence of pi[{c}]")
    plt.legend(bbox_to_anchor = (1.05, 0.6))
    plt.ylim(0,1)
    num_plot += 1
plt.tight_layout()