## Bunny Studio
### Search Data Scientist Test

Objective:

- Find samples(content) given a search pattern and make sure recommendations are as close as possible to the search pattern based not only on the sample metadata but also on it's creator's (pro) statistics in the platform.


Workflow:

- Read and preprocess data as described in Figure 1
- Obtain embeddings from every sample attributes and tags
- Export embeddings 


In [1]:
##lets import some libraries to read and manipulate data

import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
from unidecode import unidecode
from utils import *
from config import ATTRIBUTES_BY_CATEGORY,MODEL_NAME_HF

from sentence_transformers import SentenceTransformer

model = SentenceTransformer(MODEL_NAME_HF)



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jhonp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\jhonp\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### Read Data

In [10]:
pro_stats=pd.read_csv("result/pro_stats.csv")
sample_attributes=pd.read_csv("result/sample_attributes.csv")
sample_tags=pd.read_csv("result/sample_tags.csv")

print("[INFO] PRO_STATS shape: {}".format(pro_stats.shape))
print("[INFO] SAMPLE_ATTRIBUTES shape: {}".format(sample_attributes.shape))
print("[INFO] SAMPLE_TAGS shape: {}".format(sample_tags.shape))

[INFO] PRO_STATS shape: (108140, 10)
[INFO] SAMPLE_ATTRIBUTES shape: (632084, 6)
[INFO] SAMPLE_TAGS shape: (293603, 4)


In [None]:
#As sample attributes may have multiple kinds of attributes, we must find which ones could be filtered in order to avoid wordy 

### Exploring Sample Attributes by category


In [3]:
##make descriptive plot to show how every feature per category was selected
sample_attributes.query('category=="article"').attribute_value.value_counts()

0                                           3257
eng-us                                      1290
-0.2                                        1106
-0.5                                         878
-1                                           863
                                            ... 
Political,Racism,Xenophobia                    1
Thriller,Fiction,Action                        1
Poems,Creative,concise                         1
Crime,Short,Stories,Murder                     1
mental health,covid-19,frontline workers       1
Name: attribute_value, Length: 1444, dtype: int64

In [12]:
##fill null values with an empty string
sample_attributes.fillna("",inplace=True) 
sample_attributes.loc[:,'if_article_specs']=sample_attributes.apply(lambda k: k['attribute_name'].split("_")[-1] if k['attribute_name'] in ['tone_serious_humorous','tone_sincere_sarcastic','tone_journalistic_creative'] and to_float(k['attribute_value'])>0 else "",axis=1)
sample_attributes.head()

Unnamed: 0,sample_id,category,pro_id,booking_score,attribute_name,attribute_value,if_article_specs
0,7,audio,D6,0.0,purpose,phoneSystems,
1,7,audio,D6,0.0,language_id,eng-us,
2,7,audio,D6,0.0,gender_and_age_id,youngAdultFemale,
3,21,audio,2362,0.0,purpose,phoneSystems,
4,21,audio,2362,0.0,language_id,eng-us,


In [109]:
sample_attributes.category.value_counts()

audio      602586
article     22968
video        3692
image        2838
Name: category, dtype: int64

In [13]:
rows_to_keep=sample_attributes.apply(lambda k: True if k['attribute_name'] in ATTRIBUTES_BY_CATEGORY[k['category']] else False, axis=1)

In [14]:
## new sample_attributes file
sample_attributes=sample_attributes.loc[rows_to_keep,:]

In [15]:
#modify attributes value to be more specific
sample_attributes.loc[:,'attribute_value']=sample_attributes.apply(lambda d: clean_sentence(str(d['attribute_value'])) if "language" not in d['attribute_name'] else decode_language_code(str(d['attribute_value'])),axis=1)

In [16]:
#concatenate attribute_value in one single column
sample_attributes.loc[:,'attribute_value']=sample_attributes.attribute_value+""+sample_attributes.if_article_specs

In [17]:
sample_attributes=sample_attributes.groupby(['sample_id','category'])['attribute_value'].agg(lambda d: " ".join(list(set(d)))).reset_index()

In [27]:
sample_attributes.shape

(205301, 3)

### Exploring Sample Tags

In [19]:
sample_descriptions=sample_tags.groupby(["sample_id","category"])["tag_name"].agg(lambda d: ",".join(list(set(d)))).reset_index()

In [20]:
sample_descriptions.loc[:,'tag_name']=sample_descriptions.tag_name.apply(lambda d: remove_multispaces(remove_special_characters(d.lower(),replacement=" ")))
sample_descriptions.head()

Unnamed: 0,sample_id,category,tag_name
0,324,audio,articulate neutral phone operator vibrant engl...
1,350,audio,approachable educational calm
2,441,audio,announcer approachable neutral informative eng...
3,499,audio,storyteller neutral authentic english us gener...
4,964,audio,serious neutral attentive english us general a...


In [22]:
samples_composed_data=sample_attributes.merge(sample_descriptions,on=['sample_id','category'],how="inner")
samples_composed_data.head()

Unnamed: 0,sample_id,category,attribute_value,tag_name
0,324,audio,phone system middle age female english american,articulate neutral phone operator vibrant engl...
1,350,audio,radio tv ad middle age male english american,approachable educational calm
2,441,audio,phone system middle age gender english american,announcer approachable neutral informative eng...
3,499,audio,audiobooks podcasts senior gender english amer...,storyteller neutral authentic english us gener...
4,964,audio,presentation educational middle age gender eng...,serious neutral attentive english us general a...


In [23]:
samples_composed_data.loc[:,'embeddings_attributes']=samples_composed_data.attribute_value.apply(lambda d: model.encode(d))
samples_composed_data.loc[:,'embeddings_tags']=samples_composed_data.tag_name.apply(lambda d: model.encode(d))

In [51]:
##average embeddings
samples_composed_data.loc[:,'avg_embeddings']=samples_composed_data.apply(lambda k: (k['embeddings_attributes']+k['embeddings_tags'])/2 if k['tag_name']!="" else k['embeddings_attributes'],axis=1)


In [52]:
## Checkpoint: Save Embeddings to a file to unnecessarily processing datasets
samples_composed_data.drop(columns=['embeddings_attributes','embeddings_tags']).to_pickle("samples_composed_data.pkl")

In [56]:
samples_composed_data.shape

(65410, 8)

In [57]:
samples_composed_data=pd.read_pickle("samples_composed_data.pkl")

In [58]:
samples_composed_data

Unnamed: 0,sample_id,category,attribute_value,tag_name,avg_embeddings
0,324,audio,phone system middle age female english american,articulate neutral phone operator vibrant engl...,"[-0.033360533, 0.022383057, -0.042735875, -0.0..."
1,350,audio,radio tv ad middle age male english american,approachable educational calm,"[0.05191251, 0.04593984, -0.014720706, -0.0286..."
2,441,audio,phone system middle age gender english american,announcer approachable neutral informative eng...,"[-0.018108007, 0.013963416, -0.06848956, -0.08..."
3,499,audio,audiobooks podcasts senior gender english amer...,storyteller neutral authentic english us gener...,"[0.0032551463, -0.056001894, -0.07373111, -0.0..."
4,964,audio,presentation educational middle age gender eng...,serious neutral attentive english us general a...,"[0.07064739, 0.024581958, -0.02573366, -0.0170..."
...,...,...,...,...,...
65405,525119,audio,middle age male radio tv ad english american,natural friendly believable conversational tru...,"[0.008179849, -0.027468357, -0.049302094, -0.0..."
65406,525126,audio,senior male movie trailer english american,english us southern deep gravely sam elliot co...,"[-0.022263903, -0.01084226, -0.058256976, -0.0..."
65407,525138,audio,senior male videogames english american afr,deep calm james earl jones commanding king aut...,"[-0.029892016, -0.0022064876, 0.01211681, -0.0..."
65408,525154,audio,radio tv ad french france young adult female,conversational professional friendly mysterious,"[-0.021844044, 0.034046784, -0.026430318, -0.0..."


In [None]:
##join tags and samples and remove duplicated words 
##generate embeddings for descriptions and search terms

In [34]:
from sentence_transformers import util

def cosine_sim(vec1,vec2):
    return np.dot(vec1, vec2)/(np.linalg.norm(vec1)*np.linalg.norm(vec2))

sentence1="british"
sentence2="united kingdom"
emb1=model.encode(sentence1)
emb2=model.encode(sentence2)

cosine_sim(emb1,emb2)

0.6134414

In [32]:
search="uk voice over"
search=decode_language_code(search,sep=" ")
print(search)
search=clean_sentence(search)
print(search)
embeddings_search=model.encode(search)


british voice over
british voice


In [36]:
samples_composed_data.loc[:,'sim_attributes']=samples_composed_data.embeddings_attributes.apply(lambda d: cosine_sim(d,embeddings_search))

In [None]:
samples_composed_data.loc[:,'sim_tags']=samples_composed_data.embeddings_attributes.apply(lambda d: cosine_sim(d,embeddings_search))

In [94]:
class SearchRecommender:
    
    def __init__(self,samples_filepath: str,pros_filepath: str,tags_filepath: str,separator=","):
        
        self.samples=samples_composed_data
        self.pros=pd.read_csv(pros_filepath,sep=separator)
        self.model=SentenceTransformer(MODEL_NAME_HF)


    @staticmethod
    def preprocess_data():
        pass

    def detect_intent(search: str):
        pass

    def suggest_samples(self,search: str,top_n: int=50):
        search=decode_language_code(search,sep=" ")
        search=clean_sentence(search)
        search_emb=self.model.encode(search)



    
        