In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import glob
import numpy as np
%matplotlib inline

def read_data(file_pattern, lens_type='multifocal'):
    """Load every CSV matching ``file_pattern`` and keep rows of one lens type.

    Parameters
    ----------
    file_pattern : str
        Glob pattern (e.g. ``'dispense_report*.csv'``) selecting the CSV files.
    lens_type : str, default 'multifocal'
        Value of the ``'Type'`` column to keep.

    Returns
    -------
    pandas.DataFrame
        Rows from all matching files whose ``'Type'`` equals ``lens_type``.
        The concatenated frame is numbered 0..n-1 before filtering, so the
        returned index has gaps where rows were dropped.

    Raises
    ------
    ValueError
        If no file matches ``file_pattern``.
    """
    # glob returns paths in arbitrary order; sorting makes the row order
    # (and therefore the index) reproducible from run to run.
    file_paths = sorted(glob.glob(file_pattern))
    if not file_paths:
        raise ValueError(f"No files match pattern {file_pattern!r}")

    # Read and concatenate all CSV files into one DataFrame
    data = pd.concat([pd.read_csv(path) for path in file_paths], ignore_index=True)

    # .copy() hands back an independent frame, so callers can add columns
    # without triggering SettingWithCopyWarning.
    return data[data['Type'] == lens_type].copy()


def clean_data(data, threshold=3):
    """Drop rows whose prescription values are statistical outliers.

    A row is kept only if, for every one of the eight prescription columns,
    the absolute z-score of its value is below ``threshold``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the eight ``OD``/``OS`` Sphere, Cylinder, Axis and Add
        columns. It is not modified.
    threshold : float, default 3
        Maximum absolute z-score (exclusive) allowed in any column.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the surviving rows with all original columns.
        Missing values are treated as 0 for the outlier test only; the
        returned rows keep their original (possibly missing) values.
    """
    feature_columns = ['OD Sphere', 'OD Cylinder', 'OD Axis', 'OD Add',
                       'OS Sphere', 'OS Cylinder', 'OS Axis', 'OS Add']

    # Handle missing values (if any) on a fresh frame instead of mutating a slice
    X = data[feature_columns].fillna(0)
    if data.empty:
        return data.copy()

    values = X.to_numpy(dtype=float)

    # Population standard deviation per column. A constant column has std 0;
    # dividing by it would give NaN z-scores, which fail every comparison and
    # would discard all rows, so such columns are left unscaled (z-score 0).
    std = values.std(axis=0)
    std[std == 0] = 1.0

    # Calculate Z-scores
    z_scores = np.abs((values - values.mean(axis=0)) / std)

    # Keep rows where every Z-score is below the threshold
    keep = (z_scores < threshold).all(axis=1)
    return data[keep].copy()

# Load the dispense reports, keep only the rows marked as DISPENSED,
# then remove outlier prescriptions.
data = clean_data(
    read_data('dispense_report*.csv')
    .loc[lambda report: report['dispense type'] == 'DISPENSED']
)


In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import matplotlib.pyplot as plt


# Feature matrix: the eight prescription columns of the cleaned dispense data.
# clean_data() returns rows with their original values, so missing entries can
# still be present here; fill them with 0 (as clean_data and the inventory
# cell do) because linkage cannot work with non-finite values.
X = data[['OD Sphere', 'OD Cylinder', 'OD Axis', 'OD Add',
          'OS Sphere', 'OS Cylinder', 'OS Axis', 'OS Add']].fillna(0)

# Standardize the filtered data so every feature contributes on the same scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Perform hierarchical clustering (Ward linkage) on the standardized data
Z = linkage(X_scaled, method='ward')

# Plot the dendrogram
fig, ax = plt.subplots(figsize=(10, 7))
dendrogram(Z, ax=ax)
ax.set_title('Hierarchical Clustering Dendrogram')
ax.set_xlabel('Glasses')
ax.set_ylabel('Distance')
plt.show()


In [163]:

# Cut the dendrogram into at most `num_clusters` flat clusters
num_clusters = 8
cluster_labels = fcluster(Z, t=num_clusters, criterion='maxclust')

# Attach the labels to the dispense data as strings (easier to group and
# compare). assign() builds a new frame instead of writing into a filtered
# one, so re-running this cell is safe and raises no SettingWithCopyWarning.
data = data.assign(
    cluster=pd.Series(cluster_labels, index=data.index).astype(str)
)

In [None]:

from sklearn.decomposition import PCA
# Apply PCA: project the standardized features onto two components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Explained variance ratio
print("Explained variance ratio:", pca.explained_variance_ratio_)

# Loading scores: one row per feature, one column per component
loading_scores = pd.DataFrame(pca.components_.T, columns=['PC1', 'PC2'], index=X.columns)
print("Loading scores:\n", loading_scores)

# Plot the PCA components, coloured by hierarchical cluster
plt.figure(figsize=(8, 6))

scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=data['cluster'].astype(int), cmap='viridis', alpha=0.5)
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('PCA of Dispense Report Data')
plt.legend(*scatter.legend_elements(), title="Clusters")
plt.show()

# Plot loading scores. Both panels get the same labels and the same symmetric
# y-range so their bars are directly comparable. The range is +/-0.5 unless a
# loading is larger in magnitude, in which case it grows so no bar is clipped.
y_limit = max(0.5, 1.05 * float(loading_scores.abs().to_numpy().max()))

fig, ax = plt.subplots(1, 2, figsize=(14, 6))
for axis, component in zip(ax, ['PC1', 'PC2']):
    loading_scores[component].plot(kind='bar', ax=axis)
    axis.set_title(f'Loading Scores for {component}')
    axis.set_ylabel('Loading Score')
    axis.set_xlabel('Feature')
    axis.set_ylim(-y_limit, y_limit)

plt.tight_layout()
plt.show()

In [None]:
%matplotlib widget
# Re-fit PCA, this time keeping three components
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_scaled)

# Share of the total variance captured by each component
print("Explained variance ratio:", pca.explained_variance_ratio_)

# Loading table: one row per feature, one column per component
component_names = ['PC1', 'PC2', 'PC3']
loading_scores = pd.DataFrame(pca.components_.T, columns=component_names, index=X.columns)
print("Loading scores:\n", loading_scores)

# 3D scatter of the projected points, coloured by cluster label
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
pc1, pc2, pc3 = X_pca.T
point_colors = data['cluster'].astype(int)
scatter = ax.scatter(pc1, pc2, pc3, c=point_colors, cmap='viridis', alpha=0.5)

ax.set(
    xlabel='PCA Component 1',
    ylabel='PCA Component 2',
    zlabel='PCA Component 3',
    title='3D PCA of Dispense Report Data',
)
legend1 = ax.legend(*scatter.legend_elements(), title="Clusters")
plt.show()


In [None]:

%matplotlib inline
# Plot loading scores for the three components side by side.
# All panels share one symmetric y-range so bars are comparable. The range is
# +/-0.5 unless a loading is larger in magnitude, in which case it grows so
# that no bar is clipped.
y_limit = max(0.5, 1.05 * float(loading_scores.abs().to_numpy().max()))

fig, ax = plt.subplots(1, 3, figsize=(18, 6))
for axis, component in zip(ax, ['PC1', 'PC2', 'PC3']):
    loading_scores[component].plot(kind='bar', ax=axis)
    axis.set_title(f'Loading Scores for {component}')
    axis.set_ylabel('Loading Score')
    axis.set_xlabel('Feature')
    axis.set_ylim(-y_limit, y_limit)

plt.tight_layout()
plt.show()

In [None]:
# Load and preprocess inventory data the same way as the dispensed data
inventory_data = clean_data(read_data('inventory*.csv'))

X_inventory = inventory_data[['OD Sphere', 'OD Cylinder', 'OD Axis', 'OD Add',
                              'OS Sphere', 'OS Cylinder', 'OS Axis', 'OS Add']].fillna(0)
# `scaler` must be the one fitted on the dispensed data, so inventory items
# and cluster centroids live in the same feature space.
X_inventory_scaled = scaler.transform(X_inventory)

# `X_scaled` must still be the dispensed-data matrix, row-aligned with `data`.
# Fail with a clear message if a later cell has rebound it.
if X_scaled.shape[0] != len(data):
    raise RuntimeError(
        "X_scaled is not aligned with the dispensed data; "
        "re-run the clustering cells before this one."
    )

# Compute cluster centroids from dispensed data (clusters in numeric order)
cluster_ids = sorted(data['cluster'].unique(), key=int)
dispensed_labels = data['cluster'].to_numpy()
centroids = np.array([
    X_scaled[dispensed_labels == cluster_id].mean(axis=0)
    for cluster_id in cluster_ids
])

# Assign each inventory item to the nearest centroid (Euclidean distance)
distances = np.sqrt(((X_inventory_scaled[:, None] - centroids) ** 2).sum(axis=2))
nearest_cluster_indices = distances.argmin(axis=1)

# Translate centroid positions back to the actual cluster labels rather than
# assuming the labels are exactly 1..k. assign() returns a new frame, so no
# write goes into a filtered slice.
inventory_data = inventory_data.assign(
    cluster=[cluster_ids[position] for position in nearest_cluster_indices]
)

# Compute absolute and relative frequency in the dispensed data
dispense_cluster_count = data['cluster'].value_counts().rename('dispense_cluster_count')
dispense_cluster_freq = data['cluster'].value_counts(normalize=True).rename('dispense_cluster_frequency')

# Compute absolute and relative frequency in the inventory data
inventory_cluster_count = inventory_data['cluster'].value_counts().rename('inventory_cluster_count')
inventory_cluster_freq = inventory_data['cluster'].value_counts(normalize=True).rename('inventory_cluster_frequency')

# Create a comparison DataFrame. Labels are strings, so the default index
# order would be lexicographic ('1', '10', '2', ...); reindex to numeric order.
# Clusters absent from the inventory get 0.
comparison_df = pd.DataFrame({
    'dispense_cluster_count': dispense_cluster_count,
    'dispense_cluster_frequency': dispense_cluster_freq,
    'inventory_cluster_count': inventory_cluster_count,
    'inventory_cluster_frequency': inventory_cluster_freq
}).reindex(cluster_ids).fillna(0)

print(comparison_df)



In [None]:
# Express each cluster's relative frequency as a percentage
percent_columns = {
    'dispense_cluster_percent': 'dispense_cluster_frequency',
    'inventory_cluster_percent': 'inventory_cluster_frequency',
}
for percent_column, frequency_column in percent_columns.items():
    comparison_df[percent_column] = comparison_df[frequency_column] * 100

# Grouped bar chart: one pair of bars (dispensed, inventory) per cluster
fig, ax = plt.subplots(figsize=(8, 6))

bar_width = 0.4
clusters = comparison_df.index
x_positions = range(len(clusters))

# (horizontal offset from the tick, column to plot, legend label)
bar_groups = [
    (-bar_width / 2, 'dispense_cluster_percent', 'Dispensed (%)'),
    (bar_width / 2, 'inventory_cluster_percent', 'Inventory (%)'),
]
for offset, column, label in bar_groups:
    ax.bar(
        [x + offset for x in x_positions],
        comparison_df[column],
        width=bar_width,
        label=label,
    )

ax.set_xticks(x_positions)
ax.set_xticklabels(clusters)
ax.set(
    xlabel='Cluster',
    ylabel='Frequency (%)',
    title='Comparison of Cluster Frequencies: Dispensed vs. Inventory',
)
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Prepare the data
# NOTE: this cell rebinds the notebook-level names X, scaler and X_scaled to
# inventory-based objects. Re-run the clustering cells before re-running any
# earlier cell that expects the dispensed-data versions.
# Missing values are filled with 0, as in the other cells, so the classifier
# never receives NaN.
X = inventory_data[['OD Sphere', 'OD Cylinder', 'OD Axis', 'OD Add',
                    'OS Sphere', 'OS Cylinder', 'OS Axis', 'OS Add']].fillna(0)
y = inventory_data['cluster']

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Get unique clusters, in numeric order so the panels read 1, 2, 3, ...
clusters = sorted(y.unique(), key=int)
features = X.columns

# Plot feature importances for each cluster.
# squeeze=False keeps `axes` two-dimensional even when there is only one
# cluster; otherwise a single Axes object is returned and indexing it fails.
fig, axes = plt.subplots(len(clusters), 1, figsize=(10, len(clusters) * 4), squeeze=False)

for axis, cluster in zip(axes[:, 0], clusters):
    # Create binary labels: 1 for the current cluster, 0 for all others
    y_binary = (y == cluster).astype(int)

    # Train a Random Forest classifier (fixed seed for reproducible importances)
    rf = RandomForestClassifier(n_estimators=200, random_state=42)
    rf.fit(X_scaled, y_binary)

    # Get feature importances
    feature_importances = rf.feature_importances_

    # Plot feature importances
    axis.barh(features, feature_importances)
    axis.set_xlabel('Feature Importance')
    axis.set_ylabel('Feature')
    axis.set_title(f'Feature Importances for Cluster {cluster}')

plt.tight_layout()
plt.show()