In [1]:
%load_ext autoreload
%autoreload 2

Workflow:
- get an example input file (in bytes)
- preprocess it --> returns the image resized and flattened (a NumPy array)
- KNN predict --> returns the indices of the closest images
- indices to URLs --> returns a list of URLs of the closest images

In [75]:
from vincentvanbot.preprocessing.utils import preprocess_image
import pandas as pd

In [20]:
# Target size handed to preprocess_image. 36 * 42 * 3 = 4536, which equals the
# number of feature columns (0..4535) of the training frame previewed further
# down -- presumably 3 colour channels per pixel; TODO confirm in preprocess_image.
dim = (36,42)

# Preprocess the example picture; the displayed result is a 2-D float32 array
# holding a single row.
img = preprocess_image('example-input.jpg',dim=dim)
img

array([[0.8745098 , 0.7372549 , 0.5803922 , ..., 0.42745098, 0.40392157,
        0.35686275]], dtype=float32)

In [24]:
# Get the training data: get_pickle() returns the preprocessed image table
# (the preview below shows 4536 numeric feature columns, presumably one row
# per image).
# NOTE(review): where the pickle is read from is not visible in this notebook --
# check vincentvanbot.data.get_pickle.
from vincentvanbot.data import get_pickle

img_db = get_pickle()
# Preview the first rows only; the frame is far too wide to display in full.
img_db.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4526,4527,4528,4529,4530,4531,4532,4533,4534,4535
0,0.27451,0.282353,0.188235,0.215686,0.2,0.164706,0.180392,0.168627,0.14902,0.317647,...,0.078431,0.12549,0.086275,0.078431,0.090196,0.062745,0.039216,0.152941,0.109804,0.094118
1,0.262745,0.278431,0.211765,0.254902,0.294118,0.25098,0.219608,0.290196,0.243137,0.160784,...,0.156863,0.364706,0.301961,0.203922,0.345098,0.294118,0.231373,0.305882,0.231373,0.164706
10,0.066667,0.054902,0.027451,0.058824,0.031373,0.007843,0.078431,0.05098,0.027451,0.094118,...,0.0,0.286275,0.117647,0.05098,0.113725,0.035294,0.0,0.07451,0.019608,0.019608
100,0.176471,0.172549,0.164706,0.164706,0.145098,0.121569,0.192157,0.152941,0.14902,0.141176,...,0.243137,0.360784,0.301961,0.27451,0.356863,0.294118,0.298039,0.286275,0.239216,0.239216
101,0.254902,0.192157,0.094118,0.239216,0.172549,0.101961,0.215686,0.145098,0.098039,0.254902,...,0.105882,0.133333,0.129412,0.121569,0.137255,0.137255,0.129412,0.113725,0.109804,0.094118


In [67]:
# train
from sklearn.neighbors import NearestNeighbors
import pickle

def train_model(df_transformed):  # previously called return_closest_images
    """Fit a nearest-neighbours model on the preprocessed training data.

    Parameters
    ----------
    df_transformed : pandas.DataFrame
        Preprocessed training data; it is passed unchanged to
        ``NearestNeighbors.fit`` and its ``.index`` is returned.

    Returns
    -------
    tuple
        ``(fitted_model, index)`` where ``fitted_model`` is the fitted
        ``NearestNeighbors`` instance (default parameters) and ``index`` is
        ``df_transformed.index``, i.e. the labels of the training rows.

    Notes
    -----
    The model is only returned; nothing is saved (neither locally nor to
    cloud storage) by this function.
    """
    fitted_knn = NearestNeighbors()
    # fit() trains the estimator in place, so the same object is handed back.
    fitted_knn.fit(df_transformed)
    return fitted_knn, df_transformed.index

In [68]:
# predict
def get_closest_images_indexes(user_input_transformed, model, indexes, nsimilar=3):
    """Return the training-set labels of the images closest to the user input.

    Parameters
    ----------
    user_input_transformed : array-like
        Preprocessed user image, passed unchanged to ``model.kneighbors``.
    model : object
        Fitted model exposing ``kneighbors(X, n_neighbors=...)`` whose result
        holds the neighbour positions at position ``[1]``.
    indexes : sequence
        Labels of the training rows; a neighbour position ``p`` is translated
        to ``indexes[p]``.
    nsimilar : int, default 3
        Number of neighbours to request.

    Returns
    -------
    list of int
        ``int(indexes[p])`` for every neighbour position ``p`` of the first
        (only) query row, in the order returned by the model.
    """
    knn_result = model.kneighbors(user_input_transformed, n_neighbors=nsimilar)
    # Element 1 holds the neighbour positions; row 0 belongs to the first query sample.
    neighbour_positions = knn_result[1][0]

    closest_labels = []
    for position in neighbour_positions:
        closest_labels.append(int(indexes[position]))
    return closest_labels

In [69]:
# Fit the nearest-neighbours model on the whole training frame and keep the
# frame's row labels for translating neighbour positions later on.
model, indexes = train_model(img_db)

In [70]:
# Labels of the closest training images for the example input
# (nsimilar is left at its default of 3).
get_closest_images_indexes(img,model,indexes)

[41, 42, 91]

In [59]:
from vincentvanbot.data import get_data_locally

In [60]:
# Load the initial dataset from the local copy.
# NOTE(review): 100_000 is presumably an upper bound on the rows to read -- the
# shape below reports 32008 rows; confirm against get_data_locally.
initial_df = get_data_locally(100_000)
initial_df.shape

(32008, 11)

In [71]:
# Print the URL of each closest image. The labels returned by the model are used
# as row *positions* (iloc) in initial_df.
# NOTE(review): the links printed here differ from the ones returned by the
# catalog.csv lookup at the end of the notebook, so the two tables may not share
# the same row order -- verify which one lines up with the training data.
for index in get_closest_images_indexes(img,model,indexes):
    print(initial_df.iloc[index]['URL'])

https://www.wga.hu/art/a/abbati/abbati5.jpg
https://www.wga.hu/art/a/abbatini/cornaro.jpg
https://www.wga.hu/art/a/aertsen/christ_m.jpg


In [73]:
from google.cloud import storage
from vincentvanbot.params import BUCKET_NAME
from vincentvanbot.preprocessing.utils import get_jpg_link

# Folder inside the bucket that holds the initial dataset file; it is used to
# build the gs:// path read by get_info_from_index.
BUCKET_INITIAL_DATASET_FOLDER = 'data'

def get_info_from_index(indexes, all_info=False):
    """Return the jpg links of the catalogue rows at the given positions.

    Reads the initial dataset (``catalog.csv``) from the project bucket
    (``gs://BUCKET_NAME/BUCKET_INITIAL_DATASET_FOLDER/``), selects the rows at
    ``indexes`` by position and converts their ``URL`` value with
    ``get_jpg_link``.

    Parameters
    ----------
    indexes : iterable of int
        Row positions in the catalogue (``DataFrame.iloc`` semantics).
    all_info : bool, default False
        Currently ignored: only the links are returned whatever its value.
        Kept so existing calls stay valid.

    Returns
    -------
    list
        One converted link per entry of ``indexes``, in the same order
        (an empty list for empty ``indexes``).

    Raises
    ------
    IndexError
        If a position lies outside the catalogue.
    """
    dataset_filename = 'catalog.csv'
    path = f"gs://{BUCKET_NAME}/{BUCKET_INITIAL_DATASET_FOLDER}/{dataset_filename}"

    # pandas resolves the gs:// path itself, so no google.cloud.storage client
    # has to be created here.
    df = pd.read_csv(path, encoding='unicode_escape')

    # Pick the requested rows first and convert only those links, rather than
    # mapping the whole column and building one row Series per index.
    positions = list(indexes)
    return df['URL'].iloc[positions].map(get_jpg_link).tolist()

In [76]:
# Full pipeline: fit the model, find the closest training images, resolve their links.
model, indexes = train_model(img_db)
closest_i = get_closest_images_indexes(img,model,indexes)
# The lookup helper defined in this notebook is get_info_from_index; the former
# name get_url_from_index is not defined anywhere here and raises NameError on a
# fresh kernel.
get_info_from_index(closest_i)

['https://www.wga.hu/art/a/abbate/torfani2.jpg',
 'https://www.wga.hu/art/a/abbati/abbati1.jpg',
 'https://www.wga.hu/art/a/ademollo/ark1.jpg']