## Model to do

- Try to build a model that handles the word splits better
- Build a simple matcher that generates example sentences

In [89]:
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from scipy.spatial import distance
import numpy as np
import pandas as pd
import os

In [275]:
import inspect

In [45]:
class Vectorizer:
    """Thin wrapper that turns text into embeddings with a SentenceTransformer model."""

    def __init__(self, model_name = "paraphrase-albert-small-v2"):
        """Instantiate the SentenceTransformer identified by ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def vectorize_list(self, input_list):
        """Encode every string in ``input_list``.

        Returns whatever ``self.model.encode`` produces for the list
        (presumably one embedding row per input string -- confirm against
        the sentence-transformers docs).
        """
        return self.model.encode(input_list)

    def vectorize_single(self, input_string):
        """Encode one string.

        The string is wrapped in a one-element list before encoding, so the
        result has the same container shape as ``vectorize_list`` output for
        a single item; callers in this notebook pass it straight to
        ``distance.cdist`` as the first matrix.
        """
        return self.model.encode([input_string])

In [77]:
# Load the fine-tuned token-classification (NER) model and its tokenizer.
# The two are read from different names: the tokenizer comes from the "-tok" variant.
# Presumably both are local directories next to this notebook -- confirm before re-running.
loaded_model = AutoModelForTokenClassification.from_pretrained("distilbert-base-case-finetuned-app-ner")
loaded_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-case-finetuned-app-ner-tok")

In [78]:
# Label maps exactly as stored on the model config.
id2label_dict = loaded_model.config.id2label
label2id_dict = loaded_model.config.label2id
# Each lookup table below is built by inverting the *opposite* config map:
#   model_label_2_id: label -> id   (inverse of id2label)
#   model_id_2_label: id -> label   (inverse of label2id)
model_label_2_id = {label: idx for idx, label in id2label_dict.items()}
model_id_2_label = {idx: label for label, idx in label2id_dict.items()}

In [79]:
# Two Hugging Face pipelines that share the same model and tokenizer.
# ner_pipe: token classification with aggregation_strategy="max", so results carry an
#           "entity_group" and a merged "word" (both are read by extract_entities).
# feature_pipe: "feature-extraction" pipeline; not referenced by any other visible cell.
ner_pipe = pipeline("ner", aggregation_strategy="max", model=loaded_model,tokenizer=loaded_tokenizer)
feature_pipe = pipeline("feature-extraction", model=loaded_model,tokenizer=loaded_tokenizer)

In [106]:
def generate_matching_dictionary(matcher, DATA_PATH = "data"):
    """Read the reference CSVs and embed their phrases for each entity type.

    Parameters
    ----------
    matcher : object
        Anything exposing ``vectorize_list(list_of_str)``; its return value is
        stored untouched under the ``"embedding"`` key.
    DATA_PATH : str
        Directory holding ``app_matching.csv``, ``metric_matching.csv``,
        ``platform_matching.csv``, ``region_matching.csv`` and
        ``category_matching.csv``.

    Returns
    -------
    dict
        Keys ``"APP"``, ``"CAT"``, ``"PLAT"``, ``"REG"``, ``"MET"``. Each value
        holds ``"embedding"`` and ``"matching_list"``; ``"MET"`` additionally
        holds ``"metric_type"`` (the ``metric_type`` column as a list).
    """
    matching_dict = {"APP":{},"CAT":{},"PLAT":{},"REG":{},"MET":{}}

    # One row per CSV, in the order the files are read:
    # (entity key, file name, phrase column, extra column copied verbatim or None)
    sources = (
        ("APP", "app_matching.csv", "matching_phrase", None),
        ("MET", "metric_matching.csv", "metric_name", "metric_type"),
        ("PLAT", "platform_matching.csv", "matching_phrase", None),
        ("REG", "region_matching.csv", "matching_phrase", None),
        ("CAT", "category_matching.csv", "matching_phrase", None),
    )

    for ent_key, csv_name, phrase_col, extra_col in sources:
        frame = pd.read_csv(os.path.join(DATA_PATH, csv_name))
        phrases = frame[phrase_col].tolist()
        # Pull the extra column before embedding so a missing column fails early.
        extra_values = frame[extra_col].tolist() if extra_col is not None else None

        entry = matching_dict[ent_key]
        entry["embedding"] = matcher.vectorize_list(phrases)
        entry["matching_list"] = phrases
        if extra_col is not None:
            entry[extra_col] = extra_values

    return matching_dict
    
    

In [242]:
# Shared embedding helper built with the default model name.
# get_top_matches_to_ent reads this module-level `matcher`, so this cell must run first.
matcher = Vectorizer()

In [109]:
# Embed every reference phrase list once up front; reads the *_matching.csv files under "data".
matching_dict = generate_matching_dictionary(matcher,DATA_PATH="data")

In [377]:
class RandomMatchSelector(object):
    """Choose candidate matches for one entity type, favouring the closest ones.

    Parameters
    ----------
    ent_type : str
        Entity type key (e.g. ``"APP"``, ``"MET"``). For ``"MET"`` each
        candidate is stored as a ``(match, type)`` tuple.
    matcher_frame : pandas.DataFrame or other
        Frame with ``"match"`` and ``"score"`` columns (plus ``"type"`` when
        ``ent_type == "MET"``), ordered best-first. ``score`` is treated as a
        distance: smaller is better. Any non-DataFrame value produces a
        selector with no candidates.

    Attributes
    ----------
    raw_probs : numpy.ndarray or None
        ``1 - score`` for each candidate.
    probs : numpy.ndarray or None
        Sampling distribution over the candidates (sums to 1).
    match_list : list or None
        Candidates in the order given by ``matcher_frame``.
    """

    def __init__(self,ent_type,matcher_frame):

        self.ent_type = ent_type
        self.raw_probs = None
        self.probs = None
        self.match_list = None

        if isinstance(matcher_frame,pd.DataFrame):
            self.raw_probs = 1 - np.array(matcher_frame["score"])
            # Clip before squaring: a distance above 1 gives a negative similarity,
            # and squaring it would hand the *worst* candidates a positive weight.
            weights = np.clip(self.raw_probs, 0, None)**2
            total = np.sum(weights)
            if len(weights) > 0:
                if np.isfinite(total) and total > 0:
                    self.probs = weights/total
                else:
                    # All weights are zero (or unusable): fall back to a uniform
                    # draw instead of passing NaNs to np.random.choice.
                    self.probs = np.ones(len(weights))/len(weights)
            self.match_list = matcher_frame["match"].tolist()

            if self.ent_type == "MET":
                metric_types = matcher_frame["type"].tolist()
                self.match_list = [(m,t) for m,t in zip(self.match_list,metric_types)]

    def return_matches(self,ndraws=5):
        """Return ``ndraws`` candidates, or ``None`` when there are none.

        The first (up to) two candidates are always included in order; the
        remaining slots are filled by sampling with replacement according to
        ``self.probs``. The result always has exactly ``ndraws`` items
        (an empty list for ``ndraws <= 0``).
        """
        if not self.match_list:
            return None

        head = self.match_list[:max(min(ndraws, 2), 0)]
        # Fill up to ndraws even when fewer than two candidates exist, and never
        # request a negative sample size.
        n_random = max(ndraws - len(head), 0)
        picks = np.random.choice(np.arange(len(self.match_list)),size=n_random,replace=True,p=self.probs)
        return head + [self.match_list[i] for i in picks]


def construct_example_sentences(input_sentence,all_matches,vectorizer,nmatch=5):
    """Build candidate query sentences from matched entities and rank them.

    Sentence ``i`` is the ``i``-th metric candidate followed by the ``i``-th
    candidate of every other entity type that NER found, in the iteration
    order of ``all_matches``. The surviving sentences are ordered by cosine
    distance between their embedding and the embedding of ``input_sentence``.

    Parameters
    ----------
    input_sentence : str
        The original user query, used only for ranking.
    all_matches : dict
        Entity type -> ``{"ner_match": list, "fuzzy_matches": ..., "matches_for_sentence": list}``.
        Must contain ``"MET"`` and ``"APP"``. ``"MET"`` candidates are
        ``(metric_name, metric_type)`` tuples.
    vectorizer : object
        Must expose ``vectorize_list(list_of_str)`` and ``vectorize_single(str)``.
    nmatch : int
        Maximum number of sentences to attempt. Fewer are attempted when any
        participating candidate list is shorter.

    Returns
    -------
    (dict, list)
        ``sentences``: sentence -> components used (``{"MET": name, <ent>: text, ...}``),
        and the sentences ordered from most to least similar to the input.
        ``({}, [])`` when nothing valid could be built.

    Rules that discard a sentence:
      * the metric is of type ``"app"`` but no app entity was found;
      * the metric is of type ``"market"`` and an app entity was found;
      * a category entity was found together with an app entity.
        NOTE(review): the original comment for this rule said the category
        should be *removed*, yet the code drops the whole sentence; behaviour
        is kept as-is -- confirm which one is intended.
    """
    # there must be a metric match
    metric_comps = all_matches["MET"]["matches_for_sentence"]
    if not metric_comps:
        # no matches
        return {}, []

    has_app = len(all_matches["APP"]["ner_match"]) > 0

    # Non-metric entity types that NER actually found, with their candidate lists.
    slot_fillers = [
        (ent_id, v["matches_for_sentence"] or [])
        for ent_id, v in all_matches.items()
        if ent_id != "MET" and len(v["ner_match"]) > 0
    ]

    # Never index past the shortest participating candidate list.
    n_sentences = min([nmatch, len(metric_comps)] + [len(comps) for _, comps in slot_fillers])

    sentences = {}
    for sent_id in range(n_sentences):
        metric_name, metric_type = metric_comps[sent_id]

        # an app-level metric makes no sense when no app is present
        if metric_type == "app" and not has_app:
            continue

        parts = [metric_name]
        sent_comps = {"MET": metric_name}
        valid_sent = True

        for ent_id, comps in slot_fillers:
            # An app combined with a market-level metric would need a
            # competitive-set style query, which is not generated here.
            if ent_id == "APP" and metric_type == "market":
                valid_sent = False
                break
            # App + category together: the sentence is discarded (see NOTE above).
            if ent_id == "CAT" and has_app:
                valid_sent = False
                break
            parts.append(comps[sent_id])
            sent_comps[ent_id] = comps[sent_id]

        if valid_sent:
            sentences[" ".join(parts).strip()] = sent_comps

    if not sentences:
        # no matches
        return {}, []

    # Rank with the sentence-embedding model: closest to the input first.
    candidates = list(sentences)
    matched_sentences_embedding = vectorizer.vectorize_list(candidates)
    input_sentence_embedding = vectorizer.vectorize_single(input_sentence)
    sentence_ordering = np.argsort(
        distance.cdist(input_sentence_embedding,matched_sentences_embedding,metric="cosine")[0]
    )

    return sentences, [candidates[i] for i in sentence_ordering]

def extract_entities(model_output):
    """Group the words found by the NER pipeline by entity type.

    Only ``model_output[0]`` (the result for the first input) is read. Each
    item must provide ``"entity_group"`` and ``"word"``. The group is
    round-tripped through the module-level ``model_label_2_id`` /
    ``model_id_2_label`` tables; labels without a ``"-"`` are ignored, and for
    the rest the text after the first ``"-"`` selects the bucket.

    Returns a dict with keys ``APP``, ``REG``, ``DATE``, ``PLAT``, ``MET`` and
    ``CAT``, each mapping to the list of words in encounter order.
    """
    ents_data = {key: [] for key in ("APP", "REG", "DATE", "PLAT", "MET", "CAT")}

    first_result = model_output[0]
    for span in first_result:
        label = model_id_2_label[model_label_2_id[span['entity_group']]]
        if "-" not in label:
            # no prefix separator in the label: nothing to bucket
            continue
        ents_data[label.split("-")[1]].append(span["word"])

    return ents_data


def get_top_matches_to_ent(ents,match_dict,ent_type,nmatch=5,vectorizer=None,max_distance=0.6,verbose=True):
    """Fuzzy-match the extracted entities of one type against the reference phrases.

    Parameters
    ----------
    ents : dict
        Entity type -> list of extracted words.
    match_dict : dict
        Entity type -> ``{"embedding": matrix, "matching_list": list, ...}``;
        ``"MET"`` entries also carry ``"metric_type"``.
    ent_type : str
        One of ``APP``, ``MET``, ``CAT``, ``REG``, ``PLAT``, ``DATE``.
    nmatch : int
        Maximum number of reference phrases returned per query string.
    vectorizer : object, optional
        Must expose ``vectorize_single(str)``. Defaults to the module-level
        ``matcher`` so existing calls keep working.
    max_distance : float
        A query is kept only if its best cosine distance is below this value.
    verbose : bool
        Print each query string with its best distance (the previous,
        unconditional behaviour).

    Returns
    -------
    pandas.DataFrame or None
        Columns ``query``, ``match``, ``score`` (plus ``type`` for ``MET``),
        sorted by ascending score; ``None`` when ``ent_type`` has no reference
        data, there is nothing to match, or every query is too far away.

    Notes
    -----
    Up to two extracted words are joined into one query; with more than two,
    the first two form one query and the remainder a second one.
    Embedding similarity does not work well for typos; an n-gram character
    matcher (as used by the autocomplete matcher) would be a better fit.
    """
    assert ent_type in ["APP", "MET", "CAT", "REG", "PLAT","DATE"]

    # Use the dictionary that was passed in, not the notebook-level global.
    if ent_type not in match_dict:
        return None

    ents_to_match = ents[ent_type]
    if len(ents_to_match) == 0:
        # nothing extracted: do not embed an empty string
        return None

    if vectorizer is None:
        vectorizer = matcher  # notebook-level Vectorizer instance

    if len(ents_to_match) <= 2:
        matched_ents = [" ".join(ents_to_match)]
    else:
        matched_ents = [" ".join(ents_to_match[:2])," ".join(ents_to_match[2:])]

    reference = match_dict[ent_type]
    matched_dfs = []

    for matched_ent in matched_ents:

        matching_vector = vectorizer.vectorize_single(matched_ent)
        distances = distance.cdist(matching_vector,reference["embedding"],metric="cosine")[0]
        if len(distances) == 0:
            continue
        sorted_distances = np.argsort(distances)[:nmatch]
        best_distance = distances.min()

        if verbose:
            print(matched_ent,best_distance)

        if best_distance < max_distance:
            # attempt to prevent matches that are really poor
            # The reference list may hold fewer than nmatch phrases, so size the
            # query column from what was actually found.
            n_found = len(sorted_distances)
            columns = {
                "query":[matched_ent]*n_found,
                "match":[reference["matching_list"][i] for i in sorted_distances],
                "score":distances[sorted_distances],
            }
            if ent_type == "MET":
                columns["type"] = [reference["metric_type"][i] for i in sorted_distances]
            matched_dfs.append(pd.DataFrame(columns))

    if matched_dfs:
        return pd.concat(matched_dfs).sort_values(by="score").reset_index(drop=True)
    else:
        return None

def get_matches(ents_data,match_dict):
    """Fuzzy-match every extracted entity type and pick candidates for sentence building.

    Parameters
    ----------
    ents_data : dict
        Entity type -> list of words extracted by NER.
    match_dict : dict
        Reference data passed through to ``get_top_matches_to_ent``.

    Returns
    -------
    dict
        Entity type -> ``{"ner_match", "fuzzy_matches", "matches_for_sentence"}``.
        All of ``APP``, ``REG``, ``DATE``, ``PLAT``, ``MET`` and ``CAT`` are
        always present. A type with nothing extracted, or with no acceptable
        fuzzy match, gets an empty entry (``ner_match`` is ``[]`` in both
        cases, so downstream code treats it as absent).
    """
    def _no_match():
        # fresh dict each time so entries never share mutable state
        return {
            "ner_match":[],
            "fuzzy_matches":None,
            "matches_for_sentence":[]
        }

    # Start from empty entries rather than None: consumers subscript every value.
    ents_matched_dict = {ent: _no_match() for ent in ("APP","REG","DATE","PLAT","MET","CAT")}

    for ent, matches in ents_data.items():

        if len(matches) == 0:
            # no matches
            ents_matched_dict[ent] = _no_match()
            continue

        # Forward the caller's dictionary instead of the notebook-level global.
        matched_ents = get_top_matches_to_ent(ents_data,match_dict=match_dict,ent_type=ent)

        if matched_ents is None:
            # no matches
            ents_matched_dict[ent] = _no_match()
            continue

        match_selector = RandomMatchSelector(ent,matched_ents)
        chosen_matches_for_sentence = match_selector.return_matches()

        ents_matched_dict[ent] = {
            "ner_match":matches,
            "fuzzy_matches":matched_ents,
            "matches_for_sentence":chosen_matches_for_sentence
        }

    return ents_matched_dict
    

In [383]:
# End-to-end demo on a deliberately noisy query (misspelt app name, filler words).
query = "blah blash apps snapcht ratings in france"
# Run NER over the query (the pipeline is given a one-element batch)
op = ner_pipe([query])
# Extract the entities from the model output, grouped by entity type
ents = extract_entities(op)
# Fuzzy-match each entity type against the reference phrases and pick candidates
all_matches = get_matches(ents,match_dict=matching_dict)
# Generate example sentences and rank them against the query - these are the final matches
sentences_map, ordered_sentences = construct_example_sentences(query,all_matches,vectorizer=matcher)
ordered_sentences



blah blash 0.6878438266315849
snapcht 0.34731029404777103
france 7.439604488013174e-13
ratings 5.472289288377397e-13


['ratings snap france', 'ratings snap germany', 'reviews snapchat italy']

In [365]:
all_matches

{'APP': {'ner_match': ['blah', 'blash'],
  'fuzzy_matches':         query     match     score
  0  blah blash   audible  0.687844
  1  blah blash  doordash  0.774296
  2  blah blash  doordash  0.774296
  3  blah blash  snapchat  0.777177
  4  blah blash    caviar  0.778069,
  'matches_for_sentence': ['audible',
   'doordash',
   'audible',
   'caviar',
   'snapchat']},
 'REG': {'ner_match': [], 'fuzzy_matches': None, 'matches_for_sentence': []},
 'DATE': {'ner_match': [], 'fuzzy_matches': None, 'matches_for_sentence': []},
 'PLAT': {'ner_match': [], 'fuzzy_matches': None, 'matches_for_sentence': []},
 'MET': {'ner_match': ['apps reviews'],
  'fuzzy_matches':           query                match     score    type
  0  apps reviews              reviews  0.526014     app
  1  apps reviews    top chart reviews  0.646233  market
  2  apps reviews              ratings  0.683010     app
  3  apps reviews  top chart downloads  0.688862  market
  4  apps reviews            downloads  0.720671  

In [255]:
all_matches

{'APP': {'ner_match': ['facebook'],
  'fuzzy_matches':       query               match         score
  0  facebook            facebook  6.376011e-13
  1  facebook  facebook messenger  2.701812e-01
  2  facebook           messenger  6.388313e-01
  3  facebook        youtube kids  6.431609e-01
  4  facebook             youtube  6.466601e-01,
  'match_selector': <__main__.RandomMatchSelector at 0x7f975b11a208>,
  'matches_for_sentence': ['facebook',
   'facebook messenger',
   'youtube',
   'messenger',
   'facebook']},
 'REG': {'ner_match': ['uk'],
  'fuzzy_matches':   query           match         score
  0    uk              uk  6.382672e-13
  1    uk  united kingdom  2.337332e-01
  2    uk   great britain  3.025736e-01
  3    uk       australia  5.053589e-01
  4    uk             usa  5.859149e-01,
  'match_selector': <__main__.RandomMatchSelector at 0x7f975b994438>,
  'matches_for_sentence': ['uk', 'uk', 'uk', 'uk', 'uk']},
 'DATE': {'ner_match': [],
  'fuzzy_matches': None,
  'match

In [289]:
# Scratch: source text of an earlier RandomMatchSelector draft, kept as a plain string.
# It is never executed. Note that the signature inside it is not valid Python
# (a parameter without a default follows one with a default).
vv = 'class RandomMatchSelector(object):\n    \n    def __init__(self,ent_type="APP",matcher_frame):\n        \n        probs = 1 - np.array(matcher_frame["score"])\n        self.probs = probs/np.max(probs)\n        self.match_list = matcher_frame["fuzzy_matches"].tolist()\n    \n    def return_matches(self,ndraws=5):\n        \n        return np.random.choice(self.match_list,size=ndraws,replace=True,p=self.probs)\n        \n    '

In [290]:
# Show the stored draft source with its newlines rendered.
print(vv)

class RandomMatchSelector(object):
    
    def __init__(self,ent_type="APP",matcher_frame):
        
        probs = 1 - np.array(matcher_frame["score"])
        self.probs = probs/np.max(probs)
        self.match_list = matcher_frame["fuzzy_matches"].tolist()
    
    def return_matches(self,ndraws=5):
        
        return np.random.choice(self.match_list,size=ndraws,replace=True,p=self.probs)
        
    
