In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import glob
import numpy as np
%matplotlib inline

def read_data(file_pattern):
    """Load every CSV matching ``file_pattern`` and keep the multifocal rows.

    Parameters
    ----------
    file_pattern : str
        Glob pattern (e.g. ``'dispense_report*.csv'``) selecting the CSV
        files to load.

    Returns
    -------
    pandas.DataFrame
        Rows of all matched files, concatenated in sorted path order, whose
        ``'Type'`` column equals ``'multifocal'``. The index is the one
        produced by the concatenation (``ignore_index=True``), so it can have
        gaps once the non-multifocal rows are removed.

    Raises
    ------
    ValueError
        If no file matches ``file_pattern``.
    """
    # glob returns paths in arbitrary, filesystem-dependent order; sorting makes
    # the row order of the result reproducible from run to run.
    file_paths = sorted(glob.glob(file_pattern))
    if not file_paths:
        # Without this guard pd.concat fails with "No objects to concatenate",
        # which hides the real cause (a pattern that matched nothing).
        raise ValueError(f"No files match pattern {file_pattern!r}")

    # Read and concatenate all CSV files into one DataFrame
    data_frames = [pd.read_csv(path) for path in file_paths]
    data = pd.concat(data_frames, ignore_index=True)
    return data[data['Type'] == 'multifocal']


def clean_data(data, threshold=3):
    """Drop rows whose prescription values are statistical outliers.

    Each of the eight prescription columns is converted to absolute z-scores
    (missing values counted as 0), and a row is kept only when all of its
    z-scores are below ``threshold``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'OD Sphere', 'OD Cylinder', 'OD Axis',
        'OD Add', 'OS Sphere', 'OS Cylinder', 'OS Axis' and 'OS Add'.
        The frame is not modified.
    threshold : float, default 3
        Rows with any absolute z-score greater than or equal to this value
        are removed.

    Returns
    -------
    pandas.DataFrame
        A copy of the rows of ``data`` that passed the filter, with all
        original columns and index labels. Missing values are only filled
        for the z-score computation, not in the returned rows.
    """
    feature_columns = ['OD Sphere', 'OD Cylinder', 'OD Axis', 'OD Add',
                       'OS Sphere', 'OS Cylinder', 'OS Axis', 'OS Add']

    # fillna() without inplace returns a new frame, so the caller's data is
    # never touched and no SettingWithCopyWarning is raised.
    features = data[feature_columns].fillna(0)
    if features.empty:
        # Nothing to score; mean/std of zero rows would only produce NaNs.
        return data.copy()

    values = features.to_numpy(dtype=float)
    centered = values - values.mean(axis=0)
    # Population standard deviation (ddof=0), the same convention
    # scikit-learn's StandardScaler uses.
    spread = values.std(axis=0)

    # A constant column has zero spread; dividing by it would give NaN, and
    # NaN < threshold is False, which would discard every row. Treat such
    # columns as having a z-score of 0 instead.
    constant = values.max(axis=0) == values.min(axis=0)
    spread[constant] = 1.0
    centered[:, constant] = 0.0
    z_scores = np.abs(centered / spread)

    keep = (z_scores < threshold).all(axis=1)
    # Return a copy so later column assignments on the result do not write
    # into a slice of the input frame.
    return data[keep].copy()

# Load all dispense reports, keep only the rows that were actually dispensed,
# then remove the outlier prescriptions.
data = (
    read_data('dispense_report*.csv')
    .loc[lambda reports: reports['dispense type'] == 'DISPENSED']
    .pipe(clean_data)
)


In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import matplotlib.pyplot as plt


# Feature matrix for clustering. Missing values are replaced with 0, the same
# convention clean_data uses: StandardScaler passes NaNs through unchanged and
# linkage() rejects observations that are not finite.
X = data[['OD Sphere', 'OD Cylinder', 'OD Axis', 'OD Add',
          'OS Sphere', 'OS Cylinder', 'OS Axis', 'OS Add']].fillna(0)

# Standardize the filtered data so every feature contributes on the same scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Perform hierarchical (Ward) clustering on the standardized data
Z = linkage(X_scaled, method='ward')

# Plot the dendrogram
fig, ax = plt.subplots(figsize=(10, 7))
dendrogram(Z, ax=ax)
ax.set_title('Hierarchical Clustering Dendrogram')
ax.set_xlabel('Glasses')
ax.set_ylabel('Distance')
plt.show()


In [96]:

# Cut the dendrogram to form a fixed number of flat clusters (e.g., 5 clusters)
num_clusters = 5
cluster_labels = fcluster(Z, t=num_clusters, criterion='maxclust')

# Attach the labels to the data as strings (easier to group and tabulate).
# assign() builds a new frame instead of writing a column into a filtered
# slice, which avoids SettingWithCopyWarning and keeps this cell safe to re-run.
data = data.assign(cluster=cluster_labels).astype({'cluster': str})

In [None]:

from sklearn.decomposition import PCA
# Apply PCA: project the standardized features onto two components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Explained variance ratio
print("Explained variance ratio:", pca.explained_variance_ratio_)

# Loading scores: one row per feature, one column per component
loading_scores = pd.DataFrame(pca.components_.T, columns=['PC1', 'PC2'], index=X.columns)
print("Loading scores:\n", loading_scores)

# Plot the PCA components, colored by hierarchical cluster
plt.figure(figsize=(8, 6))

scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=data['cluster'].astype(int), cmap='viridis', alpha=0.5)
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('PCA of Dispense Report Data')
plt.legend(*scatter.legend_elements(), title="Clusters")
plt.show()

# Plot loading scores.
# Both panels share the same symmetric y-range so bar heights are directly
# comparable. The range is +/-0.5 unless a loading is larger (loadings can
# reach +/-1), in which case it widens so that no bar is clipped.
y_limit = max(0.5, float(loading_scores.abs().to_numpy().max()) * 1.05)
fig, ax = plt.subplots(1, 2, figsize=(14, 6))
for panel, component in zip(ax, ['PC1', 'PC2']):
    loading_scores[component].plot(kind='bar', ax=panel)
    panel.set_title(f'Loading Scores for {component}')
    panel.set_ylabel('Loading Score')
    panel.set_xlabel('Feature')
    panel.set_ylim(-y_limit, y_limit)

plt.tight_layout()
plt.show()

In [None]:
%matplotlib widget
# Project the standardized features onto three principal components
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_scaled)

# Share of the total variance captured by each component
print("Explained variance ratio:", pca.explained_variance_ratio_)

# Loadings: one row per feature, one column per component
component_names = ['PC1', 'PC2', 'PC3']
loading_scores = pd.DataFrame(pca.components_.T, columns=component_names, index=X.columns)
print("Loading scores:\n", loading_scores)

# 3D scatter of the projected points, colored by cluster label
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
cluster_ids = data['cluster'].astype(int)
pc1, pc2, pc3 = X_pca.T
scatter = ax.scatter(pc1, pc2, pc3, c=cluster_ids, cmap='viridis', alpha=0.5)

ax.set(
    xlabel='PCA Component 1',
    ylabel='PCA Component 2',
    zlabel='PCA Component 3',
    title='3D PCA of Dispense Report Data',
)
handles, labels = scatter.legend_elements()
legend1 = ax.legend(handles, labels, title="Clusters")
plt.show()


In [None]:

%matplotlib inline
# Plot loading scores for the three components, one panel per component.
# All panels share the same symmetric y-range so bar heights are directly
# comparable. The range is +/-0.5 unless a loading is larger (loadings can
# reach +/-1), in which case it widens so that no bar is clipped.
y_limit = max(0.5, float(loading_scores.abs().to_numpy().max()) * 1.05)
fig, ax = plt.subplots(1, 3, figsize=(18, 6))
for panel, component in zip(ax, ['PC1', 'PC2', 'PC3']):
    loading_scores[component].plot(kind='bar', ax=panel)
    panel.set_title(f'Loading Scores for {component}')
    panel.set_ylabel('Loading Score')
    panel.set_xlabel('Feature')
    panel.set_ylim(-y_limit, y_limit)

plt.tight_layout()
plt.show()