In [None]:
from PIL import Image
import numpy as np
import pandas as pd
import os
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt

In [None]:
def calculate_and_store_SVD(csv_path='data/filtered_df2.csv',
                            image_dir="data/eyeball_img/",
                            k=100,
                            out_path="svd_results.npz"):
    """Compute a truncated SVD of the flattened image matrix and save it.

    Every file named in the ``Image`` column of ``csv_path`` is loaded from
    ``image_dir``, converted to RGB, scaled to [0, 1] and flattened into one
    row of a ``(n_images, width * height * 3)`` float32 matrix. The ``k``
    largest singular triplets of that matrix are written to ``out_path``
    under the keys ``U``, ``S`` and ``Vt``, ordered by descending singular
    value so that index 0 is the strongest component.

    Parameters
    ----------
    csv_path : str
        CSV file with an ``Image`` column of image file names.
    image_dir : str
        Directory the image file names are resolved against.
    k : int
        Number of singular triplets to compute (passed to ``svds``).
    out_path : str
        Destination passed to ``np.savez``.

    Raises
    ------
    ValueError
        If an image does not have the same pixel dimensions as the first one.
    """
    df = pd.read_csv(csv_path)

    image_files = list(df['Image'])

    # Only the dimensions of the first image are needed; close it right away.
    # PIL's ``size`` is (width, height).
    with Image.open(os.path.join(image_dir, image_files[0])) as sample_img:
        width, height = sample_img.size
    print(f"Image Dimensions: {width}, {height}")

    n_features = width * height * 3  # three channels after the RGB conversion
    images_array = np.zeros((len(image_files), n_features), dtype=np.float32)

    for i, file in enumerate(image_files):
        print(i)
        # Context manager releases the file handle; the pixel data is copied
        # into a NumPy array before the image is closed.
        with Image.open(os.path.join(image_dir, file)) as img:
            img_array = np.asarray(img.convert('RGB'), dtype=np.float32) / 255.0  # normalize to [0,1]
        if img_array.size != n_features:
            raise ValueError(
                f"Image {file!r} has shape {img_array.shape}, "
                f"expected {height}x{width}x3 like the first image"
            )
        images_array[i, :] = img_array.ravel()  # flatten

    U, S, Vt = svds(images_array, k=k)

    # svds does not return the triplets in descending order, so sort
    # explicitly: consumers index the leading columns as the top components.
    order = np.argsort(S)[::-1]
    U, S, Vt = U[:, order], S[order], Vt[order, :]

    np.savez(out_path, U=U, S=S, Vt=Vt)

In [42]:
# Re-save the decomposition without Vt: only U and S are written to the
# "small" archive (np.savez appends the ".npz" extension to the name).
# The context manager closes the underlying .npz file once the arrays have
# been read into memory.
with np.load("svd_results.npz") as data:
    U, S, Vt = data["U"], data["S"], data["Vt"]
np.savez("svd_results_small", U=U, S=S)

In [None]:
# Load the stored decomposition; the context manager closes the .npz file
# once the arrays have been read into memory.
with np.load("svd_results.npz") as data:
    U, S, Vt = data["U"], data["S"], data["Vt"]

# scipy's svds does not return singular values in descending order, so the
# first two columns of U are not necessarily the two strongest components.
# Rank the components by singular value and pick the two largest explicitly.
# This is also correct if the stored arrays are already sorted descending.
order = np.argsort(S)[::-1]
top_2_U = U[:, order[:2]]
print(np.shape(top_2_U))

# One row of U per image, in the same order as the rows of the CSV that was
# used to build the image matrix.
df = pd.read_csv('data/filtered_df2.csv')
df['top_1_coefficient'] = top_2_U[:, 0]  # Coefficient on the largest singular vector
df['top_2_coefficient'] = top_2_U[:, 1]  # Coefficient on the second-largest singular vector

In [None]:
# Indicator column -> point color, listed in priority order: when a row has
# several indicators set, the first match in this list wins.
INDICATOR_COLORS = [
    ('N', 'grey'),
    ('D', 'red'),
    ('G', 'yellow'),
    ('C', 'green'),
    ('A', 'orange'),
    ('H', 'pink'),
    ('M', 'blue'),
    ('O', 'purple'),
]
# Rows with no indicator set still need a color; otherwise the color list
# would be shorter than the frame and the scatter call would fail.
UNLABELED_COLOR = 'black'

# np.select returns the choice of the first condition that is true for each
# row, which reproduces an if/elif chain without a Python-level row loop.
colors = np.select(
    [(df[column] == 1).to_numpy() for column, _ in INDICATOR_COLORS],
    [color for _, color in INDICATOR_COLORS],
    default=UNLABELED_COLOR,
).tolist()

df.plot.scatter(x='top_1_coefficient', y='top_2_coefficient', c=colors)

In [None]:
indicators = ['N', 'D', 'G', 'C', 'A', 'H', 'M', 'O']
n_indicators = len(indicators)

# One subplot per ordered pair of indicators. squeeze=False keeps `axes`
# two-dimensional even if the indicator list is reduced to a single entry.
fig, axes = plt.subplots(n_indicators, n_indicators, figsize=(20, 20), squeeze=False)

for i, var1 in enumerate(indicators):
    for j, var2 in enumerate(indicators):
        ax = axes[i, j]

        # An indicator compared with itself can never satisfy
        # "var == 1 and var == 0", so the diagonal would only contain empty
        # plots with a duplicated legend entry. Hide those cells instead.
        if var1 == var2:
            ax.axis('off')
            continue

        # Points where var1 == 1 and var2 == 0 (first class)
        class_1 = df[(df[var1] == 1) & (df[var2] == 0)]
        # Points where var1 == 0 and var2 == 1 (second class)
        class_2 = df[(df[var1] == 0) & (df[var2] == 1)]

        # First class in red, second class in blue; low alpha so that dense
        # regions remain readable.
        ax.scatter(class_1['top_1_coefficient'], class_1['top_2_coefficient'], color='red', label=var1, alpha=0.2)
        ax.scatter(class_2['top_1_coefficient'], class_2['top_2_coefficient'], color='blue', label=var2, alpha=0.2)

        ax.set_xlabel('Top Singular Vector 1')
        ax.set_ylabel('Top Singular Vector 2')
        ax.set_title(f'{var1} vs {var2}')
        ax.legend()

# Adjust layout for better spacing
plt.tight_layout()
plt.show()
# Release the figure explicitly. plt.clf() would only clear the *current*
# figure, which may not exist any more after plt.show().
plt.close(fig)