# SimCLR

Here, SimCLR Architecture is defined and implemented along with K-means to cluster the images. For further evaluation of the clusters, Logistic Regression is employed to evaluate the accuracy, precision, recall, and F1-Score.

The necessary libraries are imported and the processed images and labels are uploaded.

In [1]:
import pandas as pd #import necessary libraries
import os
import numpy as np

In [2]:
def load_saved_images_labels(load_dir):
    images_filename = os.path.join(load_dir, "images.npy")
    labels_filename = os.path.join(load_dir, "labels.npy")

    image_list = np.load(images_filename, allow_pickle=True)
    labels = np.load(labels_filename, allow_pickle=True)

    return image_list, labels


load_dir = 'D:/Down/processedimages' #Replace the location to where the processed data is stored.
images, labels = load_saved_images_labels(load_dir)

In [3]:
len(images)

50000

In [4]:
pip install tensorflow numpy pandas scikit-learn scikit-image #install necessary libraries and packages

Note: you may need to restart the kernel to use updated packages.


In [5]:
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input


Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.


In [6]:
pip install keras-applications

Note: you may need to restart the kernel to use updated packages.


The ResNet50 Model is loaded and a dense layer is added to reduce dimensionality. The SimCLR Model is then defined.

In [7]:

from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Model
import tensorflow as tf

base_encoder = ResNet50(include_top=False, weights='imagenet', input_shape=(64, 64, 3), pooling='avg') #the base encoder (resnet50) is defined

projection_head = tf.keras.layers.Dense(128, activation='relu')(base_encoder.output)

model = Model(inputs=base_encoder.input, outputs=projection_head)


In [8]:
from tensorflow.keras.layers.experimental import preprocessing

def create_augmenter():
    augmenter = tf.keras.Sequential([
        preprocessing.Rescaling(1./255), #The augmentor augments the images using Rescaling, Random Rotation, Random Zoom and Random Flip
        preprocessing.RandomRotation(0.2),
        preprocessing.RandomZoom(0.2),
        preprocessing.RandomFlip("horizontal"),
    ])
    return augmenter

def simclr_data_generator(images, batch_size=64): #pairs of images are augmented
    augmenter = create_augmenter()
    while True:
        indices = np.random.choice(len(images), batch_size)
        batch_images = images[indices]
        augmented_images_1 = augmenter(batch_images)
        augmented_images_2 = augmenter(batch_images)
        yield [augmented_images_1, augmented_images_2], np.zeros(batch_size)
        
def build_simclr_model(base_encoder, hidden_units=128):
    inputs1 = tf.keras.Input(shape=(64, 64, 3)) 
    inputs2 = tf.keras.Input(shape=(64, 64, 3))
    augmenter = create_augmenter()
    
    augmented1 = augmenter(inputs1)
    augmented2 = augmenter(inputs2)
    
    embeddings1 = base_encoder(augmented1)
    embeddings2 = base_encoder(augmented2)
    
    combined_embeddings = tf.keras.layers.Concatenate()([embeddings1, embeddings2]) #the pairs of augmented images are combined and sent to the projection head
    
    projection_head = tf.keras.layers.Dense(hidden_units, activation='relu')(combined_embeddings)
    
    simclr_model = tf.keras.Model([inputs1, inputs2], projection_head)
    return simclr_model


def contrastive_loss(y_true, y_pred, temperature=0.1): #the loss function is defined
    half_batch_size = tf.shape(y_pred)[0] // 2
    embeddings1 = y_pred[:half_batch_size]
    embeddings2 = y_pred[half_batch_size:]

    similarity_matrix1 = tf.matmul(embeddings1, embeddings1, transpose_b=True)
    similarity_matrix2 = tf.matmul(embeddings2, embeddings2, transpose_b=True)

    similarity_matrix = (similarity_matrix1 + similarity_matrix2) / 2.0
    similarity_matrix = similarity_matrix / temperature
    
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=similarity_matrix, labels=tf.eye(half_batch_size)))
    return loss



The model is run for 10 epochs with a batch size of 64.

In [9]:
simclr_model = build_simclr_model(base_encoder)
simclr_model.compile(optimizer='adam', loss=contrastive_loss)

data_gen = simclr_data_generator(images, batch_size=64)

simclr_model.fit(data_gen, epochs=10, steps_per_epoch=len(images) // 64) #the model is trained for 10 epochs

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2086b775010>

In [14]:
embeddings = base_encoder.predict(images)



The embeddings are fitted on K-means and Logistic Regression is employed for further evaluation.

In [13]:
from sklearn.cluster import KMeans

num_clusters = 4  # Four Clusters are chosen based on the Elbow Method Results
kmeans = KMeans(n_clusters=num_clusters).fit(embeddings)
cluster_labels = kmeans.labels_




In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(embeddings, cluster_labels, test_size=0.2, random_state=42)


In [15]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(max_iter=1000) #Logistic Regression is employed for further evaluation and comparison to other methods used
classifier.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
from sklearn.metrics import accuracy_score

y_pred = classifier.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)


Accuracy: 0.4087


In [17]:
from sklearn.metrics import classification_report

report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)


Classification Report:
              precision    recall  f1-score   support

           0       0.35      0.47      0.40      2624
           1       0.73      0.22      0.34      2529
           2       0.35      0.41      0.38      1441
           3       0.43      0.50      0.46      3406

    accuracy                           0.41     10000
   macro avg       0.46      0.40      0.39     10000
weighted avg       0.47      0.41      0.40     10000



In [18]:
from sklearn.metrics import silhouette_score

silhouette_avg = silhouette_score(embeddings, cluster_labels)
print("Silhouette Score:", silhouette_avg)


Silhouette Score: 0.52488387


In [26]:
from sklearn.metrics import classification_report, accuracy_score

y_pred = classifier.predict(X_test)

acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)
report = classification_report(y_test, y_pred, output_dict=True) 


print("Classification Report:")
print(classification_report(y_test, y_pred))

average_precision = report['macro avg']['precision']
average_recall = report['macro avg']['recall']
average_f1 = report['macro avg']['f1-score']

print("\nAverage Precision:", average_precision)
print("Average Recall:", average_recall)
print("Average F1-Score:", average_f1)


Accuracy: 0.4087
Classification Report:
              precision    recall  f1-score   support

           0       0.35      0.47      0.40      2624
           1       0.73      0.22      0.34      2529
           2       0.35      0.41      0.38      1441
           3       0.43      0.50      0.46      3406

    accuracy                           0.41     10000
   macro avg       0.46      0.40      0.39     10000
weighted avg       0.47      0.41      0.40     10000


Average Precision: 0.46254423604474193
Average Recall: 0.4001982861443847
Average F1-Score: 0.3935027962818093
