In [1]:
import sys

%reload_ext autoreload
%autoreload 2

sys.path.insert(0, '../..')

In [2]:
from icecream import ic

In [None]:
from src.metrics import *
from src.utils import *

In [4]:
root_path = '../../data/artifacts'

In [5]:
from typing import List, Optional, Callable, Union, Any, Tuple, Dict

import os
import pandas as pd
import json
import numpy as np
from sklearn.preprocessing import MinMaxScaler

import faiss
import tensorflow_hub as hub

import torch
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample
)

from moviepy.editor import VideoFileClip

from img2vec_pytorch import Img2Vec
from PIL import Image

import warnings
warnings.filterwarnings('ignore')

from utils import *
from base import Recommendation

In [6]:
class Metadata(Recommendation):
    """
    Using text, images and videos to identify interest sub-categories
    """
    
    device = "cpu" # set to cuda to load on GPU

    @property
    def processed_file_names(self) -> Union[str, List[str], Tuple]:
        """The name of the files in the processed_path folder that
        must be present in order to skip training."""
        return ['multimodal_adventures.p',
                'multimodal_img_faiss.index',
                'multimodal_text_faiss.index',
                'kinetics_id_to_classname.p']


    def init_processed_paths(self):
        """
        initiate the file paths for processed data
        """
        self.processed_path_adventures = os.path.join(self.processed_dir, 'multimodal_adventures.p')
        self.processed_path_img_faiss_index = os.path.join(self.processed_dir, 'multimodal_img_faiss.index')
        self.processed_path_text_faiss_index = os.path.join(self.processed_dir, 'multimodal_text_faiss.index')
        self.processed_path_model_use_v5 = os.path.join(self.processed_dir, 'models/pretrained/use_v5')
        self.processed_path_model_torch_model_zoo = os.path.join(self.processed_dir, 'models/pretrained/torch_model_zoo')
        self.processed_path_kinetics_id_to_classname = os.path.join(self.processed_dir, 'kinetics_id_to_classname.p')

        os.environ['TORCH_HOME'] = self.processed_path_model_torch_model_zoo


    def process(self):
        """
        load the raw data, process it and save into processed data folder
        """
        # load the adventure interest sub-category table
        adventures = pd.read_csv(self.raw_path_adventures)

        # load the text vectorisation model
        embed = hub.KerasLayer(self.processed_path_model_use_v5)
        
        # process and save the adventures table
        adventures = adventures[['id', 'name', 'icon', 'parent_id']]
        adventures['save_path'] = adventures['icon'].apply(get_img_path, basepath=self.raw_path_images_adventures)
        save_pickle(adventures, self.processed_path_adventures)

        # convert text to vectors and save
        embeddings = embed(adventures['name'].tolist()).numpy()
        index_flat_text = IndexFlatL2(512, adventures, embeddings)
        index_flat_text.build()
        faiss.write_index(index_flat_text.index, self.processed_path_text_faiss_index)

        # convert image to vectors and save
        BUFFERSIZE = 100
        img_paths = adventures.save_path.tolist()
        img2vec = Img2Vec(cuda=False)
        img_vecs = None
        for i in range(0, len(img_paths), BUFFERSIZE):
            max_range = (i+BUFFERSIZE) if (i+BUFFERSIZE) <= len(img_paths) else len(img_paths)
            _img_paths = img_paths[i:i+BUFFERSIZE]
            _vectors = img2vec.get_vec([Image.open(ipath).convert('RGB') for ipath in _img_paths])
            if img_vecs is None:
                img_vecs = _vectors
            else:
                img_vecs = np.vstack((img_vecs, _vectors))
                
        index_flat_img = IndexFlatL2(512, adventures, img_vecs)
        index_flat_img.build()
        faiss.write_index(index_flat_img.index, self.processed_path_img_faiss_index)
        
        with open(self.raw_path_kinetics_classnames , "r") as f:
            kinetics_classnames = json.load(f)

        # Create an id to label name mapping
        kinetics_id_to_classname = {}
        for k, v in kinetics_classnames.items():
            kinetics_id_to_classname[v] = str(k).replace('"', "")

        kinetics_id_to_classname = pd.DataFrame(kinetics_id_to_classname.items(), columns=['id','label']).sort_values(by='id')
        kinetics_id_to_classname.set_index('id', inplace=True)
        kinetics_id_to_classname.to_pickle(self.processed_path_kinetics_id_to_classname)


    def load(self):
        """
        load the processed data from processed data folder into memory
        """
        self.adventures = load_pickle(self.processed_path_adventures)
        self.embed = hub.KerasLayer(self.processed_path_model_use_v5)
        img2vec = Img2Vec(cuda=False)
        self.img_embed = lambda x: img2vec.get_vec(x)
        text_index = faiss.read_index(self.processed_path_text_faiss_index)
        self.index_flat_text = IndexFlatL2(512, self.adventures, index=text_index)
        self.index_flat_text.build()
        img_index = faiss.read_index(self.processed_path_img_faiss_index)
        self.index_flat_img = IndexFlatL2(512, self.adventures, index=img_index)
        self.index_flat_img.build()
        
        self.kinetics_id_to_classname = pd.read_pickle(self.processed_path_kinetics_id_to_classname)

        device = self.device
        model_name = "x3d_xs"
        mean = [0.45, 0.45, 0.45]
        std = [0.225, 0.225, 0.225]
        frames_per_second = 30
        model_transform_params  = {
            "x3d_xs": {
                "side_size": 182,
                "crop_size": 182,
                "num_frames": 4,
                "sampling_rate": 12,
            }
        }
    
        self.model = torch.hub.load("facebookresearch/pytorchvideo:main",
                                    model=model_name,
                                    pretrained=True)
        # set to eval mode and move to desired device
        self.model = self.model.to(device)
        self.model = self.model.eval()

        transform_params = model_transform_params[model_name]

        self.transform =  ApplyTransformToKey(
            key="video",
            transform=Compose(
                [
                    UniformTemporalSubsample(transform_params["num_frames"]),
                    Lambda(lambda x: x/255.0),
                    NormalizeVideo(mean, std),
                    ShortSideScale(size=transform_params["side_size"]),
                    CenterCropVideo(
                        crop_size=(transform_params["crop_size"], transform_params["crop_size"])
                    )
                ]
            ),
        )

        # duration of the input clip is specific to the model
        self.clip_duration = (transform_params["num_frames"] * transform_params["sampling_rate"])/frames_per_second


    def classify_video(self,
                       video_path,
                       topk = 5,
                       headstart = 0,
                       limit = 60,
                       verbose = True,
                       ):
        
        pred_class_names_all = []
        
        clip_length = VideoFileClip(video_path).duration # in seconds
        clip_length = clip_length - headstart
        clip_length = clip_length if clip_length < limit else limit

        segments = int(clip_length // self.clip_duration)

        for i in range(0, segments):

            start_sec = headstart + i * self.clip_duration

            # Select the duration of the clip to load by specifying the start and end duration
            # The start_sec should correspond to where the action occurs in the video
            end_sec = start_sec + self.clip_duration

            if verbose:
                print('Analysing {:.2f}s-{:.2f}s clip segment | Segment {}/{}'\
                    .format(start_sec, end_sec, i+1, segments))

            # Initialize an EncodedVideo helper class
            video = EncodedVideo.from_path(video_path)

            # Load the desired clip
            video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)

            # Apply a transform to normalize the video input
            video_data = self.transform(video_data)

            # Move the inputs to the desired device
            inputs = video_data["video"]
            inputs = inputs.to(self.device)[None, ...] # for X3D model

            # Pass the input clip through the model 
            preds = self.model(inputs)

            # Get the predicted classes 
            post_act = torch.nn.Softmax(dim=1)
            preds = post_act(preds)
            pred_classes = preds.topk(k=topk).indices

            # Map the predicted classes to the label names
            pred_class_names = self.kinetics_id_to_classname.loc[[int(i) for i in pred_classes[0]], 'label'].values.tolist()
            pred_class_names_all.extend(pred_class_names)

        return pred_class_names_all
    

    def recommend(self,
                    title : str = None,
                    description : str = None,
                    image_path : str = None,
                    video_path : str = None,
                    top_k : int = 10,
                    threshold : int = 50,
                    weights : dict = None,
                    return_df : bool = False,
                    headstart = 10,
                    limit = 30,
                    verbose = False):
        """
        For the given title, description, image path,
        and video path, 
        identify and return the interest sub-categories
        """
        # get top-k interest sub-categories for the title
        title_candidates = pd.DataFrame(columns=['name','distance'])
        if title is not None:
            title_candidates = pd.DataFrame([{'name':x[1]['name'], 'distance':x[0]} for x in \
                                                get_ann_top_items(self.embed, self.index_flat_text, 
                                                                title, threshold)])

        # get top-k interest sub-categories for the description
        description_candidates = pd.DataFrame(columns=['name','distance'])
        if description is not None:
            description_candidates = pd.DataFrame([{'name':x[1]['name'], 'distance':x[0]} for x in \
                                                    get_ann_top_items(self.embed, self.index_flat_text, 
                                                                        description, threshold)])

        # get top-k interest sub-categories for the image
        image_candidates = pd.DataFrame(columns=['name','distance_image'])
        if image_path is not None:
            image_candidates = pd.DataFrame([{'name':x[1]['name'], 'distance_image':x[0]} for x in \
                                                get_ann_top_items_img(self.img_embed, self.index_flat_img, 
                                                                    image_path, threshold)])

        # get top-k interest sub-categories for the videos
        video_candidates = pd.DataFrame(columns=['name','distance_video'])
        video_weights = {'video': 1}
        
        if video_path is not None:
            # get list of labels in video
            labels = self.classify_video(video_path,
                                        headstart=headstart,
                                        limit=limit,
                                        verbose=verbose)
            
            # convert list to countdict
            labels = {x:labels.count(x) for x in labels}
            
            # convert dict to df
            labels = pd.DataFrame(labels.items(), columns=['label','count'])

            # select top-10 labels
            n_labels = 10
            labels = labels.sort_values(by='count', ascending=False).head(n_labels)

            video_weights = labels.set_index('label').to_dict()['count']
            col_names = labels['label'].tolist()
            
            video_candidates = pd.DataFrame(columns=['name','distance'])

            for index, row in labels.iterrows():
                _df = pd.DataFrame([{'name':x[1]['name'], 'distance':x[0]} for x in \
                    get_ann_top_items(self.embed, self.index_flat_text,
                                        row.label, threshold)])
                video_candidates = video_candidates.merge(_df, on='name', how='outer')

            video_candidates.set_index('name', inplace=True)
            video_candidates = video_candidates.iloc[: , 1:]
            video_candidates.columns = col_names

        # merge
        candidates = title_candidates.merge(description_candidates, on='name', how='outer', 
                                            suffixes=('_title','_description'))
        candidates = candidates.merge(image_candidates, on='name', how='outer')
        candidates = candidates.merge(video_candidates, on='name', how='outer')
        candidates.set_index('name', inplace=True)
        
        if weights is None:
            weights = {'title': 1, 'description': 1, 'image': 1}
        # add video candidate weights
        weights.update(video_weights)
        
        col_names = candidates.columns.tolist()

        # combine
        min_max_scaler = MinMaxScaler()
        x_scaled = min_max_scaler.fit_transform(candidates.values)
        candidates = pd.DataFrame(x_scaled, index=candidates.index)

        candidates.columns = col_names
        candidates = candidates.rename(columns={'distance_title':'title',
                                                    'distance_description':'description',
                                                    'distance_image':'image',
                                                    'distance_video':'video'})
        candidates.fillna(1, inplace=True)
        candidates = candidates.astype('float')
        candidates.replace(0, 1e-2, inplace=True)
        candidates = 1/np.sqrt(candidates)
        
        print(candidates.columns)
        print(weights)
        
        candidates['final_score'] = np.array([(candidates[x]*weights[x]).values for x in \
                                                candidates.columns]).sum(axis=0)
        candidates.sort_values(by='final_score', ascending=False, inplace=True)
        
        candidates = candidates[~candidates.index.duplicated(keep='first')]

        # selecting top-k
        candidates = candidates.head(top_k)

        if return_df:
            return candidates.to_json()

        candidates = candidates.index.tolist()

        return candidates

In [7]:
m = Metadata(root_path=root_path)

2022-05-25 21:27:51.342095: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
Using cache found in ../../data/artifacts/processed/models/pretrained/torch_model_zoo/hub/facebookresearch_pytorchvideo_main


In [None]:
pd.read_csv(m.raw_path_sights)

In [31]:
m.recommend(title='fine-dine experience',
            video_path=root_path+'/raw/videos/q0nm.mp4',
            headstart = 10,
            limit = 3,
            verbose = True)

Analysing 10.00s-11.60s clip segment | Segment 1/1
Index(['title', 'description', 'image', 'dining', 'tasting beer',
       'setting table', 'playing poker', 'drinking beer'],
      dtype='object')
{'title': 1, 'description': 1, 'image': 1, 'dining': 1, 'tasting beer': 1, 'setting table': 1, 'playing poker': 1, 'drinking beer': 1}


['Special Experience',
 'Dinner',
 'Beer Tasting and Tours',
 'Alcohol',
 'Restaurant',
 'Table Cloth',
 'Casino',
 'Multiple Experience',
 'Drinks',
 'Child Experience']

In [32]:
m.recommend(title='Rock climbing',
            description='Enjoy rock climbing adventure in the beautiful mountains',
            image_path=root_path+'/raw/images/aac3d.png',
            video_path=root_path+'/raw/videos/q0nm.mp4',
            headstart = 10,
            limit = 3,
            verbose = True)

Analysing 10.00s-11.60s clip segment | Segment 1/1
Index(['title', 'description', 'image', 'dining', 'tasting beer',
       'setting table', 'playing poker', 'drinking beer'],
      dtype='object')
{'title': 1, 'description': 1, 'image': 1, 'dining': 1, 'tasting beer': 1, 'setting table': 1, 'playing poker': 1, 'drinking beer': 1}


['Shower Chair',
 'Kids Toiletries',
 'Capri Set',
 'Adapted Bath',
 'Cabonmonoxide Alarm',
 'Toilet With Grab Rails',
 'Child foldable pushchair',
 'Climbing Tours',
 'Dinner',
 'Beer Tasting and Tours']

In [23]:
def recommend(self,
                title : str = None,
                description : str = None,
                image_path : str = None,
                video_path : str = None,
                top_k : int = 10,
                threshold : int = 50,
                weights : dict = None,
                return_df : bool = False,
                headstart = 10,
                limit = 30,
                verbose = False):
    """
    For the given title, description, image path,
    and video path, 
    identify and return the interest sub-categories
    """
    # get top-k interest sub-categories for the title
    title_candidates = pd.DataFrame(columns=['name','distance'])
    if title is not None:
        title_candidates = pd.DataFrame([{'name':x[1]['name'], 'distance':x[0]} for x in \
                                            get_ann_top_items(self.embed, self.index_flat_text, 
                                                            title, threshold)])

    # get top-k interest sub-categories for the description
    description_candidates = pd.DataFrame(columns=['name','distance'])
    if description is not None:
        description_candidates = pd.DataFrame([{'name':x[1]['name'], 'distance':x[0]} for x in \
                                                get_ann_top_items(self.embed, self.index_flat_text, 
                                                                    description, threshold)])

    # get top-k interest sub-categories for the image
    image_candidates = pd.DataFrame(columns=['name','distance_image'])
    if image_path is not None:
        image_candidates = pd.DataFrame([{'name':x[1]['name'], 'distance_image':x[0]} for x in \
                                            get_ann_top_items_img(self.img_embed, self.index_flat_img, 
                                                                image_path, threshold)])

    # get top-k interest sub-categories for the videos
    video_candidates = pd.DataFrame(columns=['name','distance_video'])
    video_weights = {'video': 1}
    
    if video_path is not None:
        # get list of labels in video
        labels = self.classify_video(video_path,
                                    headstart=headstart,
                                    limit=limit,
                                    verbose=verbose)
        
        # convert list to countdict
        labels = {x:labels.count(x) for x in labels}
        
        # convert dict to df
        labels = pd.DataFrame(labels.items(), columns=['label','count'])

        # select top-10 labels
        n_labels = 10
        labels = labels.sort_values(by='count', ascending=False).head(n_labels)

        video_weights = labels.set_index('label').to_dict()['count']
        col_names = labels['label'].tolist()
        
        video_candidates = pd.DataFrame(columns=['name','distance'])

        for index, row in labels.iterrows():
            _df = pd.DataFrame([{'name':x[1]['name'], 'distance':x[0]} for x in \
                get_ann_top_items(self.embed, self.index_flat_text,
                                    row.label, threshold)])
            video_candidates = video_candidates.merge(_df, on='name', how='outer')

        video_candidates.set_index('name', inplace=True)
        video_candidates = video_candidates.iloc[: , 1:]
        video_candidates.columns = col_names

    # merge
    candidates = title_candidates.merge(description_candidates, on='name', how='outer', 
                                        suffixes=('_title','_description'))
    candidates = candidates.merge(image_candidates, on='name', how='outer')
    candidates = candidates.merge(video_candidates, on='name', how='outer')
    candidates.set_index('name', inplace=True)
    
    if weights is None:
        weights = {'title': 1, 'description': 1, 'image': 1}
    # add video candidate weights
    weights.update(video_weights)
    
    col_names = candidates.columns.tolist()

    # combine
    min_max_scaler = MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(candidates.values)
    candidates = pd.DataFrame(x_scaled, index=candidates.index)

    candidates.columns = col_names
    candidates = candidates.rename(columns={'distance_title':'title',
                                                'distance_description':'description',
                                                'distance_image':'image',
                                                'distance_video':'video'})
    candidates.fillna(1, inplace=True)
    candidates = candidates.astype('float')
    candidates.replace(0, 1e-2, inplace=True)
    candidates = 1/np.sqrt(candidates)
    
    print(candidates.columns)
    print(weights)
    
    candidates['final_score'] = np.array([(candidates[x]*weights[x]).values for x in \
                                            candidates.columns]).sum(axis=0)
    candidates.sort_values(by='final_score', ascending=False, inplace=True)
    
    candidates = candidates[~candidates.index.duplicated(keep='first')]

    # selecting top-k
    candidates = candidates.head(top_k)

    if return_df:
        return candidates.to_json()

    candidates = candidates.index.tolist()

    return candidates

In [24]:
recommend(m,
        title='fine-dine experience',
        video_path=root_path+'/raw/videos/q0nm.mp4',
        headstart = 10,
        limit = 3,
        verbose = True)

Analysing 10.00s-11.60s clip segment | Segment 1/1
Index(['title', 'description', 'image', 'dining', 'tasting beer',
       'setting table', 'playing poker', 'drinking beer'],
      dtype='object')
{'title': 1, 'description': 1, 'image': 1, 'dining': 1, 'tasting beer': 1, 'setting table': 1, 'playing poker': 1, 'drinking beer': 1}


['Special Experience',
 'Dinner',
 'Beer Tasting and Tours',
 'Alcohol',
 'Restaurant',
 'Table Cloth',
 'Casino',
 'Multiple Experience',
 'Drinks',
 'Child Experience']

In [25]:
recommend(m,
          title='Rock climbing',
            description='Enjoy rock climbing adventure in the beautiful mountains',
            image_path=root_path+'/raw/images/aac3d.png',
            video_path=root_path+'/raw/videos/q0nm.mp4',
            headstart = 10,
            limit = 3,
            verbose = True)

Analysing 10.00s-11.60s clip segment | Segment 1/1
Index(['title', 'description', 'image', 'dining', 'tasting beer',
       'setting table', 'playing poker', 'drinking beer'],
      dtype='object')
{'title': 1, 'description': 1, 'image': 1, 'dining': 1, 'tasting beer': 1, 'setting table': 1, 'playing poker': 1, 'drinking beer': 1}


['Shower Chair',
 'Kids Toiletries',
 'Capri Set',
 'Adapted Bath',
 'Cabonmonoxide Alarm',
 'Toilet With Grab Rails',
 'Child foldable pushchair',
 'Climbing Tours',
 'Dinner',
 'Beer Tasting and Tours']

In [33]:
recommend(m,
          title='Rock climbing',
            description='Enjoy rock climbing adventure in the beautiful mountains',
            image_path=root_path+'/raw/images/aac3d.png',
            video_path=None,
            headstart = 10,
            limit = 3,
            verbose = True)

Index(['title', 'description', 'image', 'video'], dtype='object')
{'title': 1, 'description': 1, 'image': 1, 'video': 1}


['Shower Chair',
 'Adapted Bath',
 'Kids Toiletries',
 'Capri Set',
 'Child foldable pushchair',
 'Toilet With Grab Rails',
 'Cabonmonoxide Alarm',
 'Climbing Tours',
 'Three Piece Suit ( Skirt suits)',
 'Dinner Suits']

In [68]:
# get top-k interest sub-categories for the title
title_candidates = pd.DataFrame(columns=['name','distance'])
if title is not None:
    title_candidates = pd.DataFrame([{'name':x[1]['name'], 'distance':x[0]} for x in \
                                        get_ann_top_items(m.embed, m.index_flat_text, 
                                                        title, threshold)])

# get top-k interest sub-categories for the description
description_candidates = pd.DataFrame(columns=['name','distance'])
if description is not None:
    description_candidates = pd.DataFrame([{'name':x[1]['name'], 'distance':x[0]} for x in \
                                            get_ann_top_items(m.embed, m.index_flat_text, 
                                                                description, threshold)])

# get top-k interest sub-categories for the image
image_candidates = pd.DataFrame(columns=['name','distance_image'])
if image_path is not None:
    image_candidates = pd.DataFrame([{'name':x[1]['name'], 'distance':x[0]} for x in \
                                        get_ann_top_items_img(m.img_embed, m.index_flat_img, 
                                                            image_path, threshold)])

# get top-k interest sub-categories for the videos
video_candidates = pd.DataFrame(columns=['name','distance_video'])
video_weights = {'video': 1}
if video_path is not None:
    # get list of labels in video
    labels = m.classify_video(video_path,
                                headstart=headstart,
                                limit=limit,
                                verbose=verbose)
    
    # convert list to countdict
    labels = {x:labels.count(x) for x in labels}
    
    # convert dict to df
    labels = pd.DataFrame(labels.items(), columns=['label','count'])

    # select top-10 labels
    n_labels = 10
    labels = labels.sort_values(by='count', ascending=False).head(n_labels)

    video_weights = labels.set_index('label').to_dict()['count']
    col_names = labels['label'].tolist()
    
    video_candidates = pd.DataFrame(columns=['name','distance'])

    for index, row in labels.iterrows():
        _df = pd.DataFrame([{'name':x[1]['name'], 'distance':x[0]} for x in \
            get_ann_top_items(m.embed, m.index_flat_text,
                                row.label, threshold)])
        video_candidates = video_candidates.merge(_df, on='name', how='outer')

    video_candidates.set_index('name', inplace=True)
    video_candidates = video_candidates.iloc[: , 1:]
    video_candidates.columns = col_names

# merge
candidates = title_candidates.merge(description_candidates, on='name', how='outer', 
                                    suffixes=('_title','_description'))
candidates = candidates.merge(image_candidates, on='name', how='outer')
candidates = candidates.merge(video_candidates, on='name', how='outer')
candidates.set_index('name', inplace=True)

if weights is None:
    weights = {'title': 1, 'description': 1, 'image': 1}
# add video candidate weights
weights.update(video_weights)

col_names = candidates.columns.tolist()

# combine
min_max_scaler = MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(candidates.values)
candidates = pd.DataFrame(x_scaled, index=candidates.index)

candidates.columns = col_names
candidates = candidates.rename(columns={'distance_title':'title',
                                            'distance_description':'description',
                                            'distance_image':'image',
                                            'distance_video':'video'})
candidates

Unnamed: 0_level_0,title,description,image,video
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Multiple Experience,0.0,,,
Special Experience,0.001913,,,
Child Experience,0.035976,,,
Infant Experience,0.144389,,,
Toddler Experience,0.289676,,,
School age Experience,0.357417,,,
Special Needs Experience,0.374378,,,
Preschool Experience,0.447892,,,
Margy's initiation,0.732655,,,
Water Adventure,0.816415,,,


In [69]:
candidates.fillna(1, inplace=True)
candidates = candidates.astype('float')
candidates.replace(0, 1e-2, inplace=True)
candidates = 1/np.sqrt(candidates)
candidates

Unnamed: 0_level_0,title,description,image,video
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Multiple Experience,10.0,1.0,1.0,1.0
Special Experience,22.86105,1.0,1.0,1.0
Child Experience,5.272236,1.0,1.0,1.0
Infant Experience,2.631683,1.0,1.0,1.0
Toddler Experience,1.85799,1.0,1.0,1.0
School age Experience,1.672679,1.0,1.0,1.0
Special Needs Experience,1.634348,1.0,1.0,1.0
Preschool Experience,1.494217,1.0,1.0,1.0
Margy's initiation,1.168289,1.0,1.0,1.0
Water Adventure,1.106738,1.0,1.0,1.0


In [70]:
weights

{'title': 1, 'description': 1, 'image': 1, 'video': 1}

In [71]:

candidates['final_score'] = np.array([(candidates[x]*weights[x]).values for x in \
                                        candidates.columns]).sum(axis=0)
candidates.sort_values(by='final_score', ascending=False, inplace=True)

In [72]:
candidates

Unnamed: 0_level_0,title,description,image,video,final_score
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Special Experience,22.86105,1.0,1.0,1.0,25.86105
Multiple Experience,10.0,1.0,1.0,1.0,13.0
Child Experience,5.272236,1.0,1.0,1.0,8.272236
Infant Experience,2.631683,1.0,1.0,1.0,5.631683
Toddler Experience,1.85799,1.0,1.0,1.0,4.85799
School age Experience,1.672679,1.0,1.0,1.0,4.672679
Special Needs Experience,1.634348,1.0,1.0,1.0,4.634348
Preschool Experience,1.494217,1.0,1.0,1.0,4.494217
Margy's initiation,1.168289,1.0,1.0,1.0,4.168289
Water Adventure,1.106738,1.0,1.0,1.0,4.106738
