In [None]:
import os
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
import plotly.express as px


In [None]:
# Root folder of the processed 2024 images; it is expected to contain one
# sub-folder per key of `label_map`.
# Set the ORANGES_DATA_DIR environment variable to run the notebook on another
# machine; when it is unset the original local path is used.
base_dir = os.environ.get(
    'ORANGES_DATA_DIR',
    '/Users/sonn/Sonn/Workspace/Projects/OrangesSweetnessML/data/processed/2024',
)
# Label folder name -> integer class id.
label_map = {'A': 0, 'B': 1, 'C': 2}
# Size every image is resized to before being flattened.
img_size = (128, 128)



In [None]:
# Label folder name -> integer class id, and the size images are resized to.
img_size = (128, 128)
label_map = dict(A=0, B=1, C=2)

# Empty containers that later cells fill in lockstep:
# samples (X), integer class ids (y) and label names (label_names).
X, y, label_names = [], [], []


PCA

In [None]:
# 2D PCA
label_map = {'A': 0, 'B': 1, 'C': 2}
img_size = (128, 128)

X = []
y = []
label_names = []

# Read the images from base_dir/<label>/<sub-folder>/<image file>.
# Listings are sorted because os.listdir returns entries in arbitrary order,
# which would make the row order of X differ between runs and machines.
for label_name, label_id in label_map.items():
    label_folder = os.path.join(base_dir, label_name)
    for sub_folder in sorted(os.listdir(label_folder)):
        sub_path = os.path.join(label_folder, sub_folder)
        if not os.path.isdir(sub_path):
            continue
        for file in sorted(os.listdir(sub_path)):
            if not file.lower().endswith(('.png', '.jpg', '.jpeg')):
                continue
            img_path = os.path.join(sub_path, file)
            # The context manager closes the image file as soon as the
            # grayscale, resized copy has been produced.
            with Image.open(img_path) as raw_img:
                img = raw_img.convert('L').resize(img_size)  # grayscale
            X.append(np.array(img).flatten())
            y.append(label_id)
            label_names.append(label_name)

# Fail early with a readable message instead of an opaque PCA shape error.
if not X:
    raise ValueError(f'No images found under {base_dir!r}')

# One flattened image per row.
X = np.array(X)

# random_state pins the result when scikit-learn picks a randomized solver.
X_pca = PCA(n_components=2, random_state=42).fit_transform(X)

df_plot = pd.DataFrame({
    'PCA1': X_pca[:, 0],
    'PCA2': X_pca[:, 1],
    'Label': label_names
})

fig = px.scatter(
    df_plot, x='PCA1', y='PCA2', color='Label',
    title='2D PCA Visualization',
    labels={'PCA1': 'PC 1', 'PCA2': 'PC 2'},
    opacity=0.7,
    height=600,
    width=800
)
fig.update_traces(marker=dict(size=6))
fig.update_layout(template='plotly_white')
fig.show()


In [None]:
# 3D PCA

X = []
y = []
label_names = []

# Read the images from base_dir/<label>/<sub-folder>/<image file>.
# Listings are sorted so the row order of X is reproducible; os.listdir
# returns entries in arbitrary order.
for label_name, label_id in label_map.items():
    label_folder = os.path.join(base_dir, label_name)
    for sub_folder in sorted(os.listdir(label_folder)):
        sub_path = os.path.join(label_folder, sub_folder)
        if not os.path.isdir(sub_path):
            continue
        for file in sorted(os.listdir(sub_path)):
            if not file.lower().endswith(('.png', '.jpg', '.jpeg')):
                continue
            img_path = os.path.join(sub_path, file)
            # The context manager closes the image file as soon as the
            # grayscale, resized copy has been produced.
            with Image.open(img_path) as raw_img:
                img = raw_img.convert('L').resize(img_size)  # grayscale
            X.append(np.array(img).flatten())
            y.append(label_id)
            label_names.append(label_name)

# Fail early with a readable message instead of an opaque PCA shape error.
if not X:
    raise ValueError(f'No images found under {base_dir!r}')

# One flattened image per row.
X = np.array(X)

# random_state pins the result when scikit-learn picks a randomized solver.
X_pca = PCA(n_components=3, random_state=42).fit_transform(X)

df_plot = pd.DataFrame({
    'PCA1': X_pca[:, 0],
    'PCA2': X_pca[:, 1],
    'PCA3': X_pca[:, 2],
    'Label': label_names
})

fig = px.scatter_3d(
    df_plot,
    x='PCA1', y='PCA2', z='PCA3',
    color='Label',
    title='3D PCA Visualization',
    opacity=0.7
)
fig.update_traces(marker=dict(size=4))
fig.update_layout(template='plotly_white')

# Create the report folder if needed so write_html does not fail on a fresh checkout.
report_path = "/Users/sonn/Sonn/Workspace/Projects/OrangesSweetnessML/reports/2024_3D_PCA_Classification.html"
os.makedirs(os.path.dirname(report_path), exist_ok=True)
fig.write_html(report_path)
fig.show()


t-SNE


In [None]:
# 2D T-SNE Visualization
from sklearn.manifold import TSNE

# The iteration count is deliberately left at its default of 1000: the keyword
# was renamed from `n_iter` to `max_iter` in scikit-learn 1.5, so passing either
# name explicitly breaks on some versions, while the default is the same on all.
tsne = TSNE(n_components=2, perplexity=30, learning_rate=200, random_state=42)
X_tsne = tsne.fit_transform(X)


# One row per sample: the two embedding coordinates plus the label used for colouring.
df_plot = pd.DataFrame({
    'Dim1': X_tsne[:, 0],
    'Dim2': X_tsne[:, 1],
    'Label': label_names
})

fig = px.scatter(
    df_plot, x='Dim1', y='Dim2', color='Label',
    title='2D t-SNE Visualization',
    opacity=0.7,
    width=800, height=600
)
fig.update_traces(marker=dict(size=5))
fig.update_layout(template='plotly_white')
fig.show()


'n_iter' was renamed to 'max_iter' in version 1.5 and will be removed in 1.7.



CNN AutoEncoder

In [None]:
import os
import numpy as np
from PIL import Image
from sklearn.preprocessing import StandardScaler



# Start from fresh containers. After the PCA cells X is already a NumPy array
# (which has no .append), and y / label_names still hold the previous load, so
# appending to them would leave the labels out of step with the samples.
X = []
y = []
label_names = []

# Read the images from base_dir/<label>/<sub-folder>/<image file>.
# Listings are sorted so the row order is reproducible; os.listdir returns
# entries in arbitrary order.
for label_name in label_map:
    label_folder = os.path.join(base_dir, label_name)
    for sub_folder in sorted(os.listdir(label_folder)):
        sub_path = os.path.join(label_folder, sub_folder)
        if not os.path.isdir(sub_path):
            continue
        for file in sorted(os.listdir(sub_path)):
            if not file.lower().endswith(('.jpg', '.jpeg', '.png')):
                continue
            img_path = os.path.join(sub_path, file)
            # The context manager closes the image file as soon as the
            # grayscale, resized copy has been produced.
            with Image.open(img_path) as raw_img:
                img = raw_img.convert('L').resize(img_size)
            X.append(np.array(img).flatten())
            y.append(label_map[label_name])
            label_names.append(label_name)

# Fail early with a readable message instead of a shape error further down.
if not X:
    raise ValueError(f'No images found under {base_dir!r}')

# Scale pixel values to [0, 1]. Building the array as float32 avoids the
# float64 copy that dividing a uint8 array by 255.0 would create.
X = np.array(X, dtype='float32') / 255.0
# Add a trailing channel axis: (samples, pixels) -> (samples, pixels, 1).
X = np.expand_dims(X, -1)


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Length of one flattened image (second axis of X).
input_dim = X.shape[1]

# Filters of the stride-2 encoder convolutions; the last entry produces the latent map.
encoder_filters = (16, 32, 64, 128)

# Each stride-2 'same' convolution rounds the length up to ceil(L / 2), while each
# transposed convolution exactly doubles it. The reconstruction therefore only has
# the input's length when input_dim is a multiple of 2 ** (number of encoder layers).
downsample_factor = 2 ** len(encoder_filters)
if input_dim % downsample_factor:
    raise ValueError(
        f'input_dim={input_dim} must be a multiple of {downsample_factor} '
        'so the decoder output matches the input length; adjust img_size.'
    )

# Encoder
inputs = tf.keras.Input(shape=(input_dim, 1))
x = inputs
for n_filters in encoder_filters[:-1]:
    x = layers.Conv1D(n_filters, 3, strides=2, padding='same', activation='relu')(x)
latent = layers.Conv1D(encoder_filters[-1], 3, strides=2, padding='same', activation='relu')(x)  # [N, L, 128]

# Decoder: mirrors the encoder, ending in a single sigmoid channel in [0, 1].
x = latent
for n_filters in reversed(encoder_filters[:-1]):
    x = layers.Conv1DTranspose(n_filters, 3, strides=2, padding='same', activation='relu')(x)
outputs = layers.Conv1DTranspose(1, 3, strides=2, padding='same', activation='sigmoid')(x)

# Reconstruction model trained with mean squared error.
autoencoder = models.Model(inputs, outputs)
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.summary()


In [None]:
# Train the autoencoder to reconstruct its own input (X is both input and target).
# If the GPU is overloaded, you can try:
#   1. Reducing batch_size to 32
#   2. Reducing the number of epochs to 10
#   3. Reducing the image size
autoencoder.fit(
    X,
    X,
    epochs=20,
    batch_size=64,
    shuffle=True,
)

Epoch 1/20
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 124ms/step - loss: 0.0865
Epoch 2/20
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 130ms/step - loss: 0.0150
Epoch 3/20
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 131ms/step - loss: 4.2690e-04
Epoch 4/20
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 132ms/step - loss: 1.5239e-04
Epoch 5/20
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 133ms/step - loss: 9.2204e-05
Epoch 6/20
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 135ms/step - loss: 7.1459e-05
Epoch 7/20
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 135ms/step - loss: 5.9561e-05
Epoch 8/20
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 138ms/step - loss: 5.1710e-05
Epoch 9/20
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 147ms/step - loss: 4.9216e-05
Epoch 10/20
[1m231/231[0m [32m━━━

<keras.src.callbacks.history.History at 0x3a0bd0cd0>

In [None]:
# Encoder half of the autoencoder: maps an input to its latent feature map.
encoder = models.Model(inputs, latent)
latent_output = encoder.predict(X)

# Flatten each sample's latent map into a single feature vector for t-SNE.
z = latent_output.reshape(latent_output.shape[0], -1)

# The iteration count is left at its default of 1000: the keyword was renamed from
# `n_iter` to `max_iter` in scikit-learn 1.5, so naming it explicitly breaks on
# some versions. random_state makes the embedding reproducible between runs.
z_tsne = TSNE(n_components=2, perplexity=30, random_state=42).fit_transform(z)

df_plot = pd.DataFrame({
    'Dim1': z_tsne[:, 0],
    'Dim2': z_tsne[:, 1],
    'Label': label_names
})

fig = px.scatter(df_plot, x='Dim1', y='Dim2', color='Label',
                 title='Latent Space by 1D-CNN Autoencoder + t-SNE (TF)',
                 width=800, height=600)

# Create the report folder if needed so write_html does not fail on a fresh checkout.
report_path = "/Users/sonn/Sonn/Workspace/Projects/OrangesSweetnessML/reports/2024_2D_CNNAutoEncoder-TSNE.html"
os.makedirs(os.path.dirname(report_path), exist_ok=True)
fig.write_html(report_path)


[1m461/461[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step


