In [None]:
#adapted from https://towardsdatascience.com/how-to-cluster-images-based-on-visual-similarity-cd6e7209fe34
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras as keras

In [None]:
# for loading/processing the images  

from keras.preprocessing.image import load_img 
from keras.preprocessing.image import img_to_array 
from keras.applications.vgg16 import preprocess_input 

# models 
from keras.applications.vgg16 import VGG16 
from keras.models import Model

# clustering and dimension reduction
from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation
from sklearn.decomposition import PCA

# for everything else
import os
import numpy as np
import matplotlib.pyplot as plt
from random import randint
import pandas as pd
import pickle

In [None]:
import os

# Pawpularity labels; only the image id and its score are used downstream.
scores = pd.read_csv('/kaggle/input/petfinder-pawpularity-score/train.csv')
scores = scores[['Id','Pawpularity']]

# This list holds the full path of every image file found under the train directory.
pets = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input/petfinder-pawpularity-score/train')
    for filename in filenames
]
# Directory listings come back in arbitrary filesystem order; sort so that
# pets[0] and the order fed to the clustering cells are stable across runs.
pets.sort()

print(pets[:10])

In [None]:
# Read the first pet photo, resized to 224x224 on load
sample_image = load_img(pets[0], target_size=(224, 224))
# Turn the PIL image into a numpy array so it can be reshaped and fed to the model
img = np.array(sample_image)

print(img.shape)

In [None]:
# Add a leading batch axis of size 1: (num_of_samples, height, width, channels)
reshaped_img = np.reshape(img, (1, 224, 224, 3))
print(reshaped_img.shape)

In [None]:
# Apply the VGG16-specific input preprocessing to the single-image batch.
# NOTE(review): `x` is rebound later in the notebook to the PCA-transformed features.
x = preprocess_input(reshaped_img)

In [None]:
# Show the local devices TensorFlow reports, as a sanity check before running the model.
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

In [None]:
# Query the visible GPUs once and report how many were found
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

In [None]:
# Start from the full VGG16 network
base_model = VGG16()
# Cut the network at its second-to-last layer so the model emits that layer's
# activations (used as image features) instead of the final output layer
model = Model(inputs=base_model.inputs, outputs=base_model.layers[-2].output)
# Store the truncated network on disk
model.save('feature_extractor')

In [None]:
# Sanity-check the extractor on the sample image.
# The batch goes through preprocess_input first, exactly as extract_features
# does; predicting on the raw pixel array would skip the VGG16 input preprocessing.
with tf.device('/GPU:0'):
    features = model.predict(preprocess_input(reshaped_img))
print(features.shape)

In [None]:
def extract_features(file, model):
    """Return the model's output for a single image file.

    Parameters
    ----------
    file : str
        Path of the image to load.
    model :
        Object exposing ``predict``; called on a preprocessed batch of
        shape (1, 224, 224, 3).

    Returns
    -------
    Whatever ``model.predict`` returns for the one-image batch
    (the notebook later reshapes these outputs to 4096 columns).
    """
    # load the image resized to 224x224
    img = load_img(file, target_size=(224,224))
    # convert from 'PIL.Image.Image' to numpy array
    img = np.array(img)
    # add the batch axis: reshape(num_of_samples, dim 1, dim 2, channels)
    reshaped_img = img.reshape(1,224,224,3)
    # apply the VGG16-specific input preprocessing
    imgx = preprocess_input(reshaped_img)
    # get the feature vector; `use_multiprocessing` was dropped because it only
    # applies to generator/Sequence inputs and is rejected by newer Keras releases
    features = model.predict(imgx)
    return features

In [None]:
data = {}
p = '/kaggle/working/pet_features.pkl'

# loop through each image in the dataset
for pet in pets:
    # try to extract the features and update the dictionary
    try:
        with tf.device('/GPU:0'):
            data[pet] = extract_features(pet, model)
    # Best effort: report the failing file, checkpoint what has been extracted
    # so far, and carry on with the next image. Catching Exception (not a bare
    # except) lets KeyboardInterrupt/SystemExit stop the loop as expected.
    except Exception as err:
        print(f"Feature extraction failed for {pet}: {err!r}")
        with open(p, 'wb') as checkpoint:
            pickle.dump(data, checkpoint)

# get a list of the filenames
filenames = np.array(list(data.keys()))

# stack the per-image outputs and flatten them to rows of 4096 values
# (presumably one row per image, given a (1, 4096) output per prediction)
feat = np.array(list(data.values())).reshape(-1, 4096)
print(feat.shape)

In [None]:
# Fit a 100-component PCA on the extracted feature matrix
pca = PCA(n_components=100, random_state=22)
pca.fit(feat)
# Persist the fitted PCA; the context manager guarantees the file is flushed and closed
with open('pca.pkl', 'wb') as pca_file:
    pickle.dump(pca, pca_file)
# Project the features onto the principal components
x = pca.transform(feat)

In [None]:
# Elbow analysis: fit KMeans for a range of k and record the inertia of each fit
sse = []
list_k = list(range(3, 50))

for k in list_k:
    # `n_jobs` is omitted: it was deprecated in scikit-learn 0.23 and removed
    # in 1.0, where passing it raises a TypeError
    km = KMeans(n_clusters=k, random_state=22)
    km.fit(x)

    sse.append(km.inertia_)

# Plot sse against k
plt.figure(figsize=(6, 6))
plt.plot(list_k, sse)
plt.xlabel(r'Number of clusters *k*')
plt.ylabel('Sum of squared distance');

In [None]:
# select 20 for number of clusters based on inflection point above

# `n_jobs` is omitted: it was deprecated in scikit-learn 0.23 and removed in 1.0
kmeans = KMeans(n_clusters=20, random_state=22)
kmeans.fit(x)

# af = AffinityPropagation(random_state=5).fit(x)... produces clusters too small for regression step
os.chdir('/kaggle/working')
# Persist the fitted clustering model; the context manager closes the file handle
with open('clustering_model.pkl', 'wb') as model_file:
    pickle.dump(kmeans, model_file)

In [None]:
# Build three per-cluster lookups keyed by cluster id:
#   groups   -> { id: [image paths] }
#   features -> { id: [PCA feature vectors] }
#   score    -> { id: [Pawpularity scores] }
import re

# Image id = the first alphanumeric run that is immediately followed by a '.'
# (raw string so that '\.' is a regex escape, not an invalid string escape)
id_pattern = re.compile(r'[a-zA-Z0-9]+(?=\.)')

# One-off Id -> Pawpularity table, so each image is a dict lookup instead of
# a scan over the whole scores DataFrame
score_by_id = dict(zip(scores['Id'], scores['Pawpularity'].tolist()))

groups = {}
feature_map = dict(zip(filenames, x))
features = {}
score = {}

for file, cluster in zip(filenames, kmeans.labels_):
    m = id_pattern.search(file)
    if m is None:
        raise ValueError(f"Could not extract an image id from {file!r}")
    # setdefault creates the per-cluster list the first time a cluster is seen
    groups.setdefault(cluster, []).append(file)
    features.setdefault(cluster, []).append(feature_map[file])
    score.setdefault(cluster, []).append(score_by_id[m.group(0)])



In [None]:
def view_cluster(cluster):
    """Plot the images assigned to one cluster.

    Parameters
    ----------
    cluster :
        Cluster identifier; must be a key of the module-level ``groups`` dict.

    At most 30 images are drawn; larger clusters are truncated to their first
    30 files and a message is printed.
    """
    plt.figure(figsize = (25,25));
    # gets the list of filenames for a cluster
    files = groups[cluster]
    # only allow up to 30 images to be shown at a time
    if len(files) > 30:
        print(f"Clipping cluster size from {len(files)} to 30")
        # keep exactly 30 files, matching the message above (was [:29], i.e. 29)
        files = files[:30]
    # plot each image in the cluster on a 10x10 subplot grid
    for index, file in enumerate(files):
        plt.subplot(10,10,index+1);
        img = load_img(file)
        img = np.array(img)
        plt.title('Cluster Number: {}'.format(cluster))
        plt.imshow(img)
        plt.axis('off')

In [None]:
# for cluster in (groups.keys()):
#     view_cluster(cluster)

In [None]:
# !pip install pycaret

# # params = {"max_depth": np.random.randint(1, (len(data.columns)*.85),20),
# #           "max_features": np.random.randint(1, len(data.columns),20),
# #           "min_samples_leaf": [2,3,4,5,6],
# #           "criterion": ["gini", "entropy"]
# #           }

# from pycaret.regression import *
# for cluster in (groups.keys()):
#     a = np.stack(features[cluster], axis = 0)
#     df = pd.DataFrame(a)
#     df['score'] = pd.DataFrame(score[cluster])
#     exp_name = setup(data = df,  target = 'score', use_gpu = True, silent = True)
#     best = compare_models(exclude = ['catboost'], sort = 'RMSE')
# #     tuned_top3 = [tune_model(i, optimize ='RMSE', n_iter = 50, custom_grid = params) for i in best]
#     #stacked = stack_models(best)
#     # stacker = stack_models(tuned_top3)
#     # best_rmse_model = automl()
#     #boosted_best = ensemble_model(best, method = 'Boosting')
#     save_model(best, 'saved_lr_model_'+str(cluster))

In [None]:
from sklearn.ensemble import AdaBoostRegressor

# Fit one AdaBoost regressor per cluster (PCA features -> Pawpularity score)
# and pickle each fitted model to its own file.
for cluster in groups:
    # rows = images in this cluster, columns = PCA components
    X = np.stack(features[cluster], axis=0)
    # 1-D target vector of scores, aligned with the rows of X
    y = np.asarray(score[cluster])
    # fixed seed, matching the random_state used for PCA/KMeans, so refits are reproducible
    ada = AdaBoostRegressor(random_state=22)
    ada.fit(X, y)
    # the context manager closes each model file as soon as it is written
    with open('trained_lr_'+str(cluster)+'.pkl', 'wb') as model_file:
        pickle.dump(ada, model_file)