Workflow
- import preprocessing pipeline from Julia
- use it to train KNN
- get pre-trained KNN from Google storage
- use example title to get distances from both models
- merge results

In [1]:
import pandas as pd
import numpy as np
from google.cloud import storage
import joblib

#### Build random input

In [2]:
from vincentvanbot.data import get_data_locally

In [3]:
# Load the paintings metadata from the local copy of the dataset.
# presumably the argument is the number of rows to read — confirm against get_data_locally
n_rows = 10_000
df = get_data_locally(n_rows)

# Peek at a single record to see which columns are available.
df.head(n=1)

Unnamed: 0,AUTHOR,BORN-DIED,TITLE,DATE,TECHNIQUE,LOCATION,URL,FORM,TYPE,SCHOOL,TIMEFRAME
0,"AACHEN, Hans von","(b. 1552, Köln, d. 1615, Praha)",Venus and Adonis,1574-88,"Oil on canvas, 68 x 95 cm","Fogg Art Museum, Harvard University, Cambridge",https://www.wga.hu/art/a/aachen/adonis.jpg,painting,mythological,German,1601-1650


In [4]:
# For this example the feature space is built from two categorical columns:
# the distinct painting types and the distinct schools found in the data.
types, schools = (df[column].unique() for column in ('TYPE', 'SCHOOL'))

In [5]:
# Column labels of the feature matrix: every type first, then every school.
# random_one relies on this ordering (types occupy the first len(types) slots).
cols = np.concatenate([types, schools], axis=0)

In [6]:
# All-zero feature matrix with one row per painting in df and one column per
# type/school label. The shape is derived from the data instead of being
# hard-coded, so it keeps working when the number of loaded rows or the number
# of distinct categories changes.
rand_df = pd.DataFrame(np.zeros((len(df), len(cols))), columns=cols)

In [7]:
def random_one(row):
    """Set one random "type" slot and one random "school" slot of `row` to 1.

    The first ``len(types)`` positions of the row are treated as the type
    columns and the remaining positions as the school columns (``types`` is
    read from the notebook's global namespace). One position is drawn
    uniformly from each group and set to 1.

    The row is modified in place and also returned, so the function can be
    used with ``DataFrame.apply(..., axis=1)``.
    """
    n_types = len(types)
    # Draw order matters for reproducibility under a fixed seed: type first, then school.
    type_pos = np.random.randint(0, n_types)
    school_pos = np.random.randint(n_types, len(row))
    row.iloc[[type_pos, school_pos]] = 1
    return row

In [8]:
# Seed NumPy's global RNG so the random rows are identical on every
# "Restart & Run All".
np.random.seed(42)

# Start from an all-zero frame with the same index/columns rather than from the
# current contents of rand_df: re-running this cell would otherwise stack
# additional 1s on rows that were already filled by a previous run.
rand_df = pd.DataFrame(0.0, index=rand_df.index, columns=rand_df.columns).apply(random_one, axis=1)

#### Train KNN

In [14]:
from sklearn.neighbors import NearestNeighbors

# Unsupervised nearest-neighbour index over the (random) feature matrix,
# created with scikit-learn's default settings.
model_features = NearestNeighbors()
model_features.fit(rand_df)

# Keep the row labels of the training frame so neighbour positions returned by
# the model can be mapped back to painting indexes.
indexes_features = rand_df.index

In [23]:
from vincentvanbot.params import BUCKET_NAME

# Download the trained picture model from Google Cloud Storage and load it.
# The object lives under "predict/<file name>" in the bucket and is saved under
# the same file name in the current working directory.
local_name = 'model.joblib'
storage_location = f"predict/{local_name}"

# NOTE: `client` actually holds the bucket handle (not the storage client);
# the name is kept because later cells call `client.blob(...)`.
gcs = storage.Client()
client = gcs.bucket(BUCKET_NAME)

blob = client.blob(storage_location)
blob.download_to_filename(local_name)
print(f"=> {local_name} downloaded from storage")

# joblib.load unpickles the file: only load artefacts from a trusted bucket.
model_picture = joblib.load(local_name)

=> model.joblib downloaded from storage


In [13]:
# Download the index mapping that accompanies the picture model and load it.
# The object is read from "predict/<file name>" in the bucket and written to the
# current working directory under the same file name.
local_name = 'train_indexes.joblib'
storage_location = f"predict/{local_name}"
# `client` is the bucket handle created in the model-download cell above.
blob = client.blob(storage_location)
blob.download_to_filename(local_name)
print(f"=> {local_name} downloaded from storage")
# presumably maps row positions of the picture model's training matrix to
# painting indexes (it is indexed that way further down) — confirm with the training code.
# joblib.load unpickles the file: only load artefacts from a trusted bucket.
indexes_picture = joblib.load(local_name)

=> train_indexes.joblib downloaded from storage


In [15]:
# Example query: the title of the first painting in the dataset.
# The closest paintings are predicted for this title further down.
test_title = df['TITLE'].iloc[0]
test_title

'Venus and Adonis'

In [56]:
# get test input title
# there are several paintings with the same title. I'll take the first one for now.
test_index = df[df['TITLE']==test_title].iloc[0].name
# NOTE(review): test_index is an index *label* of df but is used as a *position*
# in rand_df below; this lines up only while df keeps a default 0..n-1 index — confirm.
user_input_preproc_features = rand_df.iloc[test_index].to_numpy().reshape(1,-1) # this should be replaced by the feature preprocessing pipeline

# Query the feature model once. kneighbors returns (distances, indices), each of
# shape (n_queries, n_neighbors); previously the same search was executed twice
# just to read the two halves of the result separately.
distances_features, neighbors_features = model_features.kneighbors(user_input_preproc_features, n_neighbors=100)

# get closest paintings index based on features
index_neighbors_features = neighbors_features[0]  # single query -> first (only) row
indexes_closest_features = [int(indexes_features[i]) for i in index_neighbors_features]

# get closest paintings distance based on features
distance_neighbors_features = distances_features[0]

In [57]:
# get preprocessed data for picture
from vincentvanbot.preprocessing.utils import preprocess_image
# NOTE(review): the image path is hard-coded to 0.jpg; presumably this is the
# picture of the painting selected as the test input — confirm.
user_input_preproc_picture = preprocess_image('../raw_data/images/0.jpg')

# Query the picture model once. kneighbors returns (distances, indices), each of
# shape (n_queries, n_neighbors); previously the same search was executed twice
# just to read the two halves of the result separately.
distances_picture, neighbors_picture = model_picture.kneighbors(user_input_preproc_picture, n_neighbors=100)

# get closest paintings index based on picture
index_neighbors_picture = neighbors_picture[0]  # single query -> first (only) row
indexes_closest_picture = [int(indexes_picture[i]) for i in index_neighbors_picture]

# get closest paintings distance based on picture
distance_neighbors_picture = distances_picture[0]