# 1. Data Augmentation for generating images for train:test (80:20 ratio).

In [None]:
import os
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img
from sklearn.model_selection import train_test_split
import shutil

# Folder that holds the original, un-augmented images
base_dir = "data_set"

# Top-level output folders for the two augmented sets
aug_dir1, aug_dir2 = "augmented_set1", "augmented_set2"

# Nested folders that actually receive the generated images
train_dir = os.path.join(aug_dir1, "train_set")
test_dir = os.path.join(aug_dir2, "test_set")

# Create every output folder up front; existing folders are left untouched
for _folder in (aug_dir1, aug_dir2, train_dir, test_dir):
    os.makedirs(_folder, exist_ok=True)

# Data augmentation
# Random transformations applied to every image drawn from the generator
augmentation_settings = {
    # geometric jitter: rotation, shifts, shear and zoom
    'rotation_range': 40,
    'width_shift_range': 0.2,
    'height_shift_range': 0.2,
    'shear_range': 0.2,
    'zoom_range': 0.2,
    # random left-right mirroring
    'horizontal_flip': True,
    # strategy for filling pixels exposed by a transformation
    'fill_mode': 'nearest',
}
data_gen = ImageDataGenerator(**augmentation_settings)

# Generate augmented images
def generate_augmented_images(base_dir, augmented_dir, batch_size=1, num_augmented_images=50):
    """Write randomly augmented copies of every image found in ``base_dir``.

    Each image file located directly inside ``base_dir`` is loaded, wrapped in
    a one-element batch and fed through the module-level ``data_gen``. Every
    batch drawn from the generator is saved into ``augmented_dir`` as a JPEG
    whose name is prefixed with the source file name.

    Args:
        base_dir: Folder containing the source images.
        augmented_dir: Folder receiving the augmented images; created if missing.
        batch_size: Batch size forwarded to ``data_gen.flow``.
        num_augmented_images: Number of batches to draw (and therefore save)
            per source image. Values <= 0 produce no output.
    """
    os.makedirs(augmented_dir, exist_ok=True)

    # Drawing from the generator saves a file before any counter check can
    # run, so a non-positive request has to return before the loop starts.
    if num_augmented_images <= 0:
        return

    image_suffixes = ('.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tif', '.tiff', '.webp')

    # Sorted so that the processing order does not depend on the filesystem
    for img_name in sorted(os.listdir(base_dir)):
        img_path = os.path.join(base_dir, img_name)

        # Skip sub-folders and stray non-image files (e.g. .DS_Store, Thumbs.db)
        # instead of letting the image loader fail on them.
        if not os.path.isfile(img_path) or not img_name.lower().endswith(image_suffixes):
            continue

        img = load_img(img_path)  # Load the image
        x = img_to_array(img)  # Convert the image to a numpy array
        x = np.expand_dims(x, axis=0)  # Add the batch dimension expected by flow()

        # The generator is endless; stop once enough batches have been saved.
        flow = data_gen.flow(
            x,
            batch_size=batch_size,
            save_to_dir=augmented_dir,
            save_prefix=img_name,
            save_format='jpeg',
        )
        for drawn, _ in enumerate(flow, start=1):
            if drawn >= num_augmented_images:
                break

# Augment the source folder twice: into the training folder with the default
# number of images per source file, and into the test folder with 10 per file.
# NOTE(review): both runs read the same base_dir, so train and test images
# derive from the same originals — confirm this overlap is intended.
augmentation_runs = (
    (train_dir, {}),
    (test_dir, {'num_augmented_images': 10}),
)
for output_dir, overrides in augmentation_runs:
    generate_augmented_images(base_dir, output_dir, **overrides)

# 2. Using VGG16.

In [None]:
import sys
import os
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt

import numpy as np

# Raise Python's recursion limit (temporary workaround)
sys.setrecursionlimit(10000)

# VGG16 convolutional base with ImageNet weights and no classifier head
base_model = VGG16(include_top=False, weights='imagenet', input_shape=(224, 224, 3))

# Keep every pre-trained layer fixed during this first training stage
for layer in base_model.layers:
    layer.trainable = False

# Stack a small binary-classification head on top of the frozen base
model = Sequential()
model.add(base_model)
model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Single sigmoid output, so binary cross-entropy is the loss
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


# Options shared by the training and validation directory iterators
generator_options = dict(
    target_size=(224, 224),
    batch_size=32,
    class_mode='binary',
    color_mode='rgb',
)

# Training images, pixel values scaled by 1/255
train_datagen = ImageDataGenerator(rescale=1./255)
train_generator = train_datagen.flow_from_directory('augmented_set1', **generator_options)

# Validation images, scaled the same way
validate_datagen = ImageDataGenerator(rescale=1./255)
validate_generator = validate_datagen.flow_from_directory('validation_set', **generator_options)


# Stop when validation loss has not improved for 5 epochs and roll back to the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the classification head
history = model.fit(
    train_generator,
    epochs=10,
    callbacks=[early_stopping],
    validation_data=validate_generator,
)

In [None]:
import matplotlib.pyplot as plt

# One figure per metric, each comparing the training and validation curves
for metric_key, metric_name in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    plt.plot(history.history[metric_key], label=f'Training {metric_name}')
    plt.plot(history.history[f'val_{metric_key}'], label=f'Validation {metric_name}')
    plt.xlabel('Epochs')
    plt.ylabel(metric_name)
    plt.legend()
    plt.show()

# 3. Fine-tuning.

In [None]:
from tensorflow.keras.optimizers import Adam
# Unfreeze only the last four layers of the VGG16 base for fine-tuning.
# `model` itself has exactly four layers (base_model, Flatten, Dense, Dense),
# so slicing model.layers[-4:] would select base_model as a whole, and setting
# trainable=True on it propagates to every VGG16 layer. Slicing the base
# model's own layer list keeps the earlier convolutional layers frozen.
for layer in base_model.layers[-4:]:
    layer.trainable = True


# Compile the model again (required after changing `trainable`) with a lower
# learning rate for fine-tuning
model.compile(optimizer=Adam(learning_rate=0.00001),
              loss='binary_crossentropy',
              metrics=['accuracy']
             )

# Load and preprocess the dataset for training
train_datagen = ImageDataGenerator(rescale=1./255)
train_generator = train_datagen.flow_from_directory(
    'augmented_set2', # as my machine is not compatible to huge data we will use small Dummy_train set
    target_size=(224, 224),
    batch_size=32,
    class_mode='binary',
    color_mode='rgb',
)

# Load and preprocess the dataset for validating
validate_datagen = ImageDataGenerator(rescale=1./255)
validate_generator = validate_datagen.flow_from_directory(
    'validation_set',
    target_size=(224, 224),
    batch_size=32,
    class_mode='binary',
    color_mode='rgb',
)

# Early stopping: halt after 5 epochs without val_loss improvement and restore the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
# Train the model
history = model.fit(train_generator, epochs=10, callbacks=[early_stopping], validation_data=validate_generator)

In [None]:
import matplotlib.pyplot as plt

# One figure per metric, each comparing the training and validation curves
for metric_key, metric_name in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    plt.plot(history.history[metric_key], label=f'Training {metric_name}')
    plt.plot(history.history[f'val_{metric_key}'], label=f'Validation {metric_name}')
    plt.xlabel('Epochs')
    plt.ylabel(metric_name)
    plt.legend()
    plt.show()

# 4. Feature Extraction

In [None]:
# Load and preprocess the dataset used for feature extraction
test_datagen = ImageDataGenerator(rescale=1./255)
test_generator = test_datagen.flow_from_directory(
    'augmented_set2',
    target_size=(224, 224),
    batch_size=32,
    class_mode='binary',
    color_mode='rgb',
    # flow_from_directory shuffles by default. The predictions are later
    # paired row-by-row with test_generator.filenames, which is only valid
    # when batches are yielded in the same order as that list.
    shuffle=False,
)


import numpy as np

# Run the model over every batch produced by the test generator.
# NOTE(review): `model` ends in a single sigmoid unit, so this yields one value
# per image; if convolutional features are wanted, confirm whether base_model
# should be used here instead.
features = model.predict(test_generator)

# Collapse everything after the sample axis into one feature vector per image
flattened_features = np.reshape(features, (len(features), -1))  # (num_images, feature_vector_size)

print("Feature shape:", flattened_features.shape)

# 5. Using K-means clustering for clusters

In [None]:
from sklearn.cluster import KMeans

# Define the number of clusters (adjust based on your dataset and task).
# KMeans raises a ValueError when there are fewer samples than clusters, so
# the requested 21 is capped by the number of feature vectors available.
num_clusters = min(21, flattened_features.shape[0])

# Train k-means on the feature vectors; the fixed seed makes runs repeatable
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(flattened_features)

# Get the cluster label assigned to each image (one entry per feature row)
cluster_labels = kmeans.labels_

print("Cluster labels:", cluster_labels)

In [15]:
import pandas as pd

# Pair every image path with the cluster it was assigned to.
# NOTE(review): the pairing is positional, so it is only meaningful if the
# predictions were produced in the same order as test_generator.filenames.
filenames = test_generator.filenames  # List of image file paths
results_df = pd.DataFrame({'Filename': filenames}).assign(Cluster=cluster_labels)

# Write the table to disk without the index column
results_df.to_csv('clustering_results.csv', index=False)

In [None]:
from sklearn.decomposition import PCA

# Report the feature matrix dimensions: (n_samples, n_features)
feature_shape = flattened_features.shape
print("Feature shape:", feature_shape)

# Use at most 2 components, never more than the data can support
num_samples, num_features = feature_shape
n_components = min((2, num_samples, num_features))

# Fit PCA and project the features in one step
pca = PCA(n_components=n_components)
reduced_features = pca.fit_transform(flattened_features)

print("Reduced features shape:", reduced_features.shape)