In [None]:
#image cluster
#Creates 4 groups of photos, grouped by content similarity. Prints 7 from each group.
#Sonia Yaco
#Rutgers University
#2024

In [None]:
# Load the Drive helper and mount Google Drive into the Colab runtime.
from google.colab import drive

# force_remount=True is passed so an existing mount at this path is remounted.
drive.mount('/content/drive/', force_remount=True)

In [None]:
#set up paths
# NOTE(review): both paths are relative, so they resolve against the current
# working directory rather than the Drive mount point - confirm the working
# directory is what you expect before running the cells below.
# Directory intended for analysis outputs (joined with the cache file names
# further down in the notebook).
analys_path = "analysis"
# Root directory that is searched recursively for the .tif photos.
folder_path ="data/photos"

In [None]:
from glob import glob
import os
# load libraries for loading/processing the images and text analysis
from keras.utils import load_img
from keras.utils import img_to_array
from keras.applications.vgg16 import preprocess_input

# models
from keras.applications.vgg16 import VGG16
from keras.models import Model

# clustering and dimension reduction
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# for everything else
import numpy as np
import matplotlib.pyplot as plt
from random import randint
import pandas as pd
import pickle

#for natural language processing
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('punkt')
nltk.download('maxent_ne_chunker')
nltk.download('words')
#removed some libraries here

#translation libraries
from googletrans import Translator
import subprocess
import json



In [None]:
# Build the full VGG16 network, then wrap it in a new Model that stops at the
# second-to-last layer, so predictions are that layer's activations rather
# than the output of the final layer.
full_vgg16 = VGG16()
model = Model(
    inputs=full_vgg16.inputs,
    outputs=full_vgg16.layers[-2].output,
)

def extract_features(file, model):
    """Return the feature vector that ``model`` produces for one image file.

    Parameters
    ----------
    file : str
        Path of the image to load.
    model : object
        Anything exposing a Keras-style ``predict(batch, verbose=...)`` method.

    Returns
    -------
    The value returned by ``model.predict`` for the single-image batch.

    Raises
    ------
    Any exception raised while loading the image or running the model is
    propagated to the caller.
    """
    # load the image resized to 224x224
    img = load_img(file, target_size=(224,224))
    # convert from 'PIL.Image.Image' to numpy array
    img = np.array(img)
    # add the leading batch axis: (num_of_samples, dim 1, dim 2, channels)
    reshaped_img = img.reshape(1,224,224,3)
    # prepare image for model
    imgx = preprocess_input(reshaped_img)
    # `use_multiprocessing` is deliberately not passed: it only ever applied to
    # generator / Sequence inputs, and Keras 3's Model.predict no longer accepts
    # it (the call raises TypeError). verbose=0 suppresses the per-image
    # progress bar, which would otherwise be printed once per photo.
    features = model.predict(imgx, verbose=0)
    return features

In [None]:
#load images
# Collect every .tif file below folder_path (searched recursively). glob
# returns matches in a filesystem-dependent order, so the list is sorted to
# give the feature rows the same order on every run and every machine.
flist = sorted(glob(os.path.join(folder_path, "**", "*.tif"), recursive = True))


In [None]:
# Sanity check: how many image files the search found.
print(len(flist))

In [None]:
# Toggle: True re-extracts the features of every image in flist and rewrites
# the cache files; False loads the previously cached results instead.
reload = True
#reload = False

# Cache files live inside analys_path. The file names must not start with a
# slash: os.path.join discards all earlier components when it meets an
# absolute one, so '/npfeats.pkl' would point at the filesystem root.
feats_file = os.path.join(analys_path, 'npfeats.pkl')
fnames_file = os.path.join(analys_path, 'fnames.pkl')

if reload:
  # loop through each image in the dataset
  data = []
  fnames = []
  skipped = []
  for img in flist:
    # keep going when one image fails, but remember why so the failure is
    # reported instead of silently shrinking the dataset
    try:
      feat = extract_features(img,model)
    except Exception as err:
      skipped.append((img, err))
      continue
    # data and fnames grow together, so row i of npdata belongs to fnames[i]
    data.append(feat)
    fnames.append(img)

  if skipped:
    print(f"Skipped {len(skipped)} of {len(flist)} images:")
    for skipped_path, err in skipped:
      print(f"  {skipped_path}: {err!r}")

  # one row of 4096 values per successfully processed image
  npdata = np.array(data).reshape(-1,4096)

  # make sure the output directory exists before writing the cache
  os.makedirs(analys_path, exist_ok=True)
  with open(feats_file, 'wb') as f:
    pickle.dump(npdata, f)
  with open(fnames_file, 'wb') as f:
    pickle.dump(fnames, f)

else:
  # NOTE: pickle.load can execute arbitrary code - only load cache files that
  # were written by this notebook.
  with open(feats_file, 'rb') as f:
    npdata = pickle.load(f)
  with open(fnames_file, 'rb') as f:
    fnames = pickle.load(f)

In [None]:
# Show the files whose features were collected (same order as the rows of npdata).
fnames

In [None]:
# applying PCA
# The component count is derived from the feature matrix itself rather than
# from len(flist): the two can differ when some images failed feature
# extraction or when the features were loaded from the cache. The intended
# value "number of images - 100" is kept, but clamped to the range PCA
# accepts: at least 1 and no more than the number of features.
n_samples, n_features = npdata.shape
if n_samples == 0:
    raise ValueError("npdata is empty - no image features are available for PCA")
n_components = max(1, min(n_samples - 100, n_features))
pca = PCA(n_components=n_components, random_state=22)
# changed to use number of images instead of static assignment #pca = PCA(n_components=100, random_state=22)
pca.fit(npdata)
x = pca.transform(npdata)

In [None]:
# Split the PCA-reduced features into 4 clusters; fit() returns the estimator
# itself, so construction and fitting are chained.
kmeans = KMeans(n_clusters=4, random_state=22).fit(x)
# Leave the fitted estimator as the cell's displayed value.
kmeans

In [None]:
# Cluster label assigned to each row of x.
kmeans.labels_

In [None]:
# Map each cluster label to the list of image positions (indices into fnames,
# i.e. rows of the feature matrix) that were assigned to it.
assert len(fnames) == len(kmeans.labels_), "fnames and cluster labels are out of sync"
groups = {}
# enumerate() covers every label; a fixed range(0,425) would silently drop any
# image beyond the 425th.
for file, cluster in enumerate(kmeans.labels_):
    # int() turns the numpy label into a plain int key; groups[0] etc. still work
    groups.setdefault(int(cluster), []).append(file)

In [None]:
# Preview a random handful of images from cluster 0 in a single row.
num = 7
fig = plt.figure(figsize=(15, 5))

# Shuffle the cluster's index list in place, then keep the first `num` entries.
np.random.shuffle(groups[0])
grp0 = groups[0][:num]

print("Group 0")
for slot, idx in enumerate(grp0, start=1):
    # one borderless panel per image, filled left to right
    ax = fig.add_subplot(1, num, slot)
    ax.axis('off')
    ax.imshow(load_img(fnames[idx]))
    print(fnames[idx])

In [None]:
# Preview a random handful of images from cluster 1 in a single row.
num = 7
fig = plt.figure(figsize=(15, 5))

# Shuffle the cluster's index list in place, then keep the first `num` entries.
np.random.shuffle(groups[1])
grp1 = groups[1][:num]

print("Group 1")
for slot, idx in enumerate(grp1, start=1):
    # one borderless panel per image, filled left to right
    ax = fig.add_subplot(1, num, slot)
    ax.axis('off')
    ax.imshow(load_img(fnames[idx]))
    print(fnames[idx])

In [None]:
# Preview a random handful of images from cluster 2 in a single row.
num = 7
fig = plt.figure(figsize=(15, 5))

# Shuffle the cluster's index list in place, then keep the first `num` entries.
np.random.shuffle(groups[2])
grp2 = groups[2][:num]

print("Group 2")
for slot, idx in enumerate(grp2, start=1):
    # one borderless panel per image, filled left to right
    ax = fig.add_subplot(1, num, slot)
    ax.axis('off')
    ax.imshow(load_img(fnames[idx]))
    print(fnames[idx])

In [None]:
# Preview a random handful of images from cluster 3 in a single row.
num = 7
fig = plt.figure(figsize=(15, 5))

# Shuffle the cluster's index list in place, then keep the first `num` entries.
np.random.shuffle(groups[3])
grp3 = groups[3][:num]

print("Group 3")
for slot, idx in enumerate(grp3, start=1):
    # one borderless panel per image, filled left to right
    ax = fig.add_subplot(1, num, slot)
    ax.axis('off')
    ax.imshow(load_img(fnames[idx]))
    print(fnames[idx])