## Model to do

- Try to build a model that handles the word splits better
- Build a simple matcher that generates example sentences
- Use the workflow here to identify issues with the NER, improve the training set to fix them

### Possible NER issues:
- train more with apostrophes to indicate app
- make sure that capitalization is used - help to distinguish apps
- train with some typos

Build a typo generator?
See https://github.com/tdhoward/pyTypo/blob/master/tpo.py 

In [423]:
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from scipy.spatial import distance
import numpy as np
import pandas as pd
import os
import itertools

In [275]:
import inspect

In [45]:
class Vectorizer(object):
    """Small convenience wrapper around a ``SentenceTransformer`` model."""

    def __init__(self, model_name = "paraphrase-albert-small-v2"):
        """Instantiate the sentence-transformer named ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def vectorize_list(self, input_list):
        """Return whatever ``self.model.encode`` produces for ``input_list``."""
        return self.model.encode(input_list)

    def vectorize_single(self, input_string):
        """Encode one string.

        The string is wrapped in a one-element list before being handed to
        the model, so the result is the encoding of a batch of size one.
        """
        single_item_batch = [input_string]
        return self.model.encode(single_item_batch)

In [77]:
# Load the fine-tuned token-classification model and its tokenizer by name.
# Note the two identifiers differ: the tokenizer one carries a "-tok" suffix.
loaded_model = AutoModelForTokenClassification.from_pretrained("distilbert-base-case-finetuned-app-ner")
loaded_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-case-finetuned-app-ner-tok")

In [78]:
# Label mappings as stored in the model config.
id2label_dict = loaded_model.config.id2label
label2id_dict = loaded_model.config.label2id
# Inverted copies of the two config maps: inverting id2label gives label -> id,
# inverting label2id gives id -> label. Both are read by extract_entities.
model_label_2_id = {v:k for k,v in id2label_dict.items()}
model_id_2_label = {v:k for k,v in label2id_dict.items()}

In [79]:
# NER pipeline used for entity extraction; aggregation_strategy="max" is
# passed so the pipeline returns aggregated entities rather than raw tokens.
ner_pipe = pipeline("ner", aggregation_strategy="max", model=loaded_model,tokenizer=loaded_tokenizer)
# Feature-extraction pipeline over the same model/tokenizer. It is not
# referenced again in the visible part of this notebook.
feature_pipe = pipeline("feature-extraction", model=loaded_model,tokenizer=loaded_tokenizer)

In [106]:
def generate_matching_dictionary(matcher, DATA_PATH = "data"):
    """Build the per-entity lookup of reference phrases and their embeddings.

    For each entity type a CSV under ``DATA_PATH`` is read, its phrase column
    is turned into a list and embedded with ``matcher.vectorize_list``.

    Parameters
    ----------
    matcher : object exposing ``vectorize_list(list_of_str)``.
    DATA_PATH : directory containing ``app_matching.csv``,
        ``metric_matching.csv``, ``platform_matching.csv``,
        ``region_matching.csv`` and ``category_matching.csv``.

    Returns
    -------
    dict keyed by "APP", "CAT", "PLAT", "REG", "MET"; every value holds
    "embedding" and "matching_list", and "MET" additionally holds
    "metric_type".
    """
    # (entity key, csv file, phrase column) in the order the files are loaded.
    sources = (
        ("APP", "app_matching.csv", "matching_phrase"),
        ("MET", "metric_matching.csv", "metric_name"),
        ("PLAT", "platform_matching.csv", "matching_phrase"),
        ("REG", "region_matching.csv", "matching_phrase"),
        ("CAT", "category_matching.csv", "matching_phrase"),
    )

    matching_dict = {key: {} for key in ("APP", "CAT", "PLAT", "REG", "MET")}

    for ent_key, file_name, phrase_column in sources:
        frame = pd.read_csv(os.path.join(DATA_PATH, file_name))
        phrases = frame[phrase_column].tolist()
        # Only the metric file carries a second column describing the metric type.
        metric_types = frame["metric_type"].tolist() if ent_key == "MET" else None

        entry = matching_dict[ent_key]
        entry["embedding"] = matcher.vectorize_list(phrases)
        entry["matching_list"] = phrases
        if metric_types is not None:
            entry["metric_type"] = metric_types

    return matching_dict
    
    

In [242]:
# Shared sentence-embedding wrapper (default model) used for fuzzy matching below.
matcher = Vectorizer()

In [109]:
# Reference phrases and their embeddings per entity type, built from the CSVs in "data".
matching_dict = generate_matching_dictionary(matcher,DATA_PATH="data")

In [439]:
class RandomMatchSelector(object):
    """Pick candidate matches for an entity, favouring low-distance ones.

    ``matcher_frame`` is expected to be a DataFrame with a "match" column and
    a "score" column (a distance: lower is better); for ``ent_type == "MET"``
    a "type" column is also read and each match becomes a ``(match, type)``
    tuple. Anything that is not a DataFrame is treated as "no matches".
    """

    def __init__(self,ent_type,matcher_frame):
        self.ent_type = ent_type
        # Always define these so the instance is consistent without a frame.
        self.raw_probs = None
        self.probs = None

        if isinstance(matcher_frame,pd.DataFrame):
            scores = np.asarray(matcher_frame["score"], dtype=float)
            self.raw_probs = 1 - scores

            # Cosine distances can exceed 1, which would make ``1 - score``
            # negative and cause np.random.choice to reject the probabilities.
            # Clip to zero so such matches are simply never drawn at random.
            weights = np.clip(self.raw_probs, 0.0, None)
            total = weights.sum()
            if total > 0:
                self.probs = weights / total
            elif len(weights) > 0:
                # No usable weight (all zero or NaN): fall back to uniform.
                self.probs = np.full(len(weights), 1.0 / len(weights))
            else:
                self.probs = weights

            self.match_list = matcher_frame["match"].tolist()

            if self.ent_type == "MET":
                metric_types = matcher_frame["type"].tolist()
                self.match_list = [(m,t) for m,t in zip(self.match_list,metric_types)]
        else:
            self.match_list = None

    def return_matches(self,ndraws=5):
        """Return ``ndraws`` matches, or ``None`` when there is nothing to pick.

        The first (up to) two entries of the match list are always included,
        in order; the remaining slots are filled by sampling with replacement
        according to ``self.probs``. For ``ndraws >= 2`` and at least two
        available matches this is the historical behaviour. Smaller
        ``ndraws`` values and shorter match lists no longer raise or return
        a short list.
        """
        if not self.match_list:
            return None

        head = self.match_list[:min(2, max(ndraws, 0))]
        n_random = max(ndraws - len(head), 0)
        if n_random == 0:
            return head

        picks = np.random.choice(len(self.match_list),size=n_random,replace=True,p=self.probs)
        return head + [self.match_list[i] for i in picks]


def construct_example_sentences(input_sentence,all_matches,vectorizer,nmatch=5):
    """Assemble candidate sentences from per-entity matches and rank them.

    Sentence ``i`` is the ``i``-th metric match followed by the ``i``-th match
    of every other entity that has a non-empty "ner_match", in the iteration
    order of ``all_matches``. A sentence is discarded when

    * its metric type is "app" but no app entity was found,
    * its metric type is "market" and an app entity is present, or
    * both an app and a category entity are present.

    Parameters
    ----------
    input_sentence : the original query, used only for ranking.
    all_matches : dict of entity type -> dict with "ner_match" and
        "matches_for_sentence"; must contain "MET" and "APP". "MET" matches
        are ``(metric_name, metric_type)`` pairs.
    vectorizer : object exposing ``vectorize_list`` and ``vectorize_single``.
    nmatch : maximum number of sentences to attempt. Fewer are built when any
        participating entity offers fewer matches (this used to raise
        ``IndexError``).

    Returns
    -------
    ``(sentences, ordered)`` where ``sentences`` maps sentence text to its
    ``{entity type: chosen match}`` components and ``ordered`` lists the
    sentence texts by increasing cosine distance to ``input_sentence``.
    ``({}, [])`` when nothing valid can be built.
    """

    ## NO LONGER USED, SEE GET MATCHING SENTENCES BELOW

    # there must be a metric match
    metric_comps = all_matches["MET"]["matches_for_sentence"]
    if not metric_comps:
        return {}, []

    has_app = len(all_matches["APP"]["ner_match"]) > 0

    # Non-metric entities that NER actually found, with their candidate matches.
    active = [
        (ent_id, v["matches_for_sentence"] or [])
        for ent_id, v in all_matches.items()
        if ent_id != "MET" and len(v["ner_match"]) > 0
    ]

    # Never index past the shortest candidate list.
    n_sentences = min([nmatch, len(metric_comps)] + [len(comps) for _, comps in active])

    sentences = {}
    for sent_id in range(n_sentences):
        metric_name, metric_type = metric_comps[sent_id]

        # an app-level metric makes no sense when no app was recognised
        if metric_type == "app" and not has_app:
            continue

        sent_comps = {"MET": metric_name}
        words = [metric_name]
        valid_sent = True
        for ent_id, comps in active:
            sent_comp = comps[sent_id]
            sent_comps[ent_id] = sent_comp

            # A market-level metric combined with a specific app is rejected
            # (a "vs. its competitors" phrasing was considered but disabled).
            if ent_id == "APP" and metric_type == "market":
                valid_sent = False
                break
            # Where we have an app and a category, the sentence is rejected.
            if ent_id == "CAT" and has_app:
                valid_sent = False
                break
            words.append(sent_comp)

        if valid_sent:
            sentences[" ".join(words).strip()] = sent_comps

    if not sentences:
        # no matches
        return {}, []

    ## USE A SENTENCE TRANSFORMER MATCHER HERE SINCE WE ARE TRYING TO ORDER THE
    ## OUTPUT BY MOST RELEVANT TO THE INPUT
    candidate_sentences = list(sentences.keys())
    matched_sentences_embedding = vectorizer.vectorize_list(candidate_sentences)
    input_sentence_embedding = vectorizer.vectorize_single(input_sentence)
    sentence_ordering = np.argsort(distance.cdist(input_sentence_embedding,matched_sentences_embedding,metric="cosine")[0])

    return sentences, [candidate_sentences[i] for i in sentence_ordering]

def extract_entities(model_output):
    """Group the words of the first pipeline result by entity type.

    Each entity's "entity_group" is passed through the module-level
    ``model_label_2_id`` and ``model_id_2_label`` maps; if the resulting
    label contains a "-", the part after the first "-" selects the bucket
    that receives the entity's "word". Labels without "-" are ignored.

    Returns a dict with keys "APP", "REG", "DATE", "PLAT", "MET", "CAT",
    each mapping to a list of words in the order they were encountered.
    """
    grouped = {key: [] for key in ("APP", "REG", "DATE", "PLAT", "MET", "CAT")}

    for entity in model_output[0]:
        label = model_id_2_label[model_label_2_id[entity['entity_group']]]
        if "-" not in label:
            continue
        bucket = label.split("-")[1]
        grouped[bucket].append(entity["word"])

    return grouped


def get_top_matches_to_ent(ents,match_dict,ent_type,nmatch=3,vectorizer=None,max_distance=0.7):
    """Fuzzy-match the extracted words of one entity type against reference phrases.

    Parameters
    ----------
    ents : dict of entity type -> list of extracted words.
    match_dict : dict of entity type -> {"embedding", "matching_list"} (plus
        "metric_type" for "MET"). This argument is now actually used; the
        function previously read the module-level ``matching_dict`` instead.
    ent_type : one of "APP", "MET", "CAT", "REG", "PLAT", "DATE".
    nmatch : maximum number of closest phrases to keep per query.
    vectorizer : object exposing ``vectorize_single``; defaults to the
        module-level ``matcher``.
    max_distance : a query contributes matches only if its best cosine
        distance is below this value.

    Returns
    -------
    DataFrame with columns "query", "match", "score" (and "type" for "MET"),
    sorted by ascending score, or ``None`` when ``ent_type`` has no reference
    data, no words were extracted, or no query was close enough.
    """
    assert ent_type in ["APP", "MET", "CAT", "REG", "PLAT","DATE"]

    if ent_type not in match_dict:
        return None

    if vectorizer is None:
        vectorizer = matcher

    # Doesn't work well for typos - need to use the ngram char matcher instead
    # from the autocomplete matcher

    ents_to_match = ents[ent_type]
    if not ents_to_match:
        # nothing extracted: do not fuzzy-match an empty string
        return None

    # Up to two words form one query; with more, the first two and the
    # remainder are matched as two separate queries.
    if len(ents_to_match) <= 2:
        matched_ents = [" ".join(ents_to_match)]
    else:
        matched_ents = [" ".join(ents_to_match[:2])," ".join(ents_to_match[2:])]

    reference = match_dict[ent_type]
    matched_dfs = []

    for matched_ent in matched_ents:

        ## USE A MATCHER THAT IS BETTER ABLE TO DEAL WITH TYPOS HERE
        ## THIS IS FOR FUZZY MATCHING FROM THE EXTRACTED ENTITIES TO THE DBS

        matching_vector = vectorizer.vectorize_single(matched_ent)
        distances = distance.cdist(matching_vector,reference["embedding"],metric="cosine")[0]

        # attempt to prevent matches that are really poor
        if len(distances) == 0 or not min(distances) < max_distance:
            continue

        closest = np.argsort(distances)[:nmatch]
        # Size every column from the indices actually selected: the reference
        # list may hold fewer than nmatch phrases.
        columns = {
            "query": [matched_ent] * len(closest),
            "match": [reference["matching_list"][i] for i in closest],
            "score": distances[closest],
        }
        if ent_type == "MET":
            columns["type"] = [reference["metric_type"][i] for i in closest]
        matched_dfs.append(pd.DataFrame(columns))

    if matched_dfs:
        return pd.concat(matched_dfs).sort_values(by="score").reset_index(drop=True)
    else:
        return None

def get_matches(ents_data,match_dict):
    """Fuzzy-match every extracted entity type and pick matches for sentences.

    Parameters
    ----------
    ents_data : dict of entity type -> list of extracted words.
    match_dict : reference data handed to ``get_top_matches_to_ent``. It is
        now forwarded as given; previously the module-level ``matching_dict``
        was passed regardless of this argument.

    Returns
    -------
    dict with keys "APP", "REG", "DATE", "PLAT", "MET", "CAT". Keys absent
    from ``ents_data`` stay ``None``; every other key maps to a dict with
    "ner_match", "fuzzy_matches" and "matches_for_sentence". When nothing was
    extracted or no fuzzy match was found these are ``[]``, ``None`` and
    ``[]`` respectively.
    """

    def _no_match():
        # A fresh dict per entity so entries never share mutable lists.
        return {
            "ner_match":[],
            "fuzzy_matches":None,
            "matches_for_sentence":[]
        }

    ents_matched_dict = {"APP":None,"REG":None,"DATE":None,"PLAT":None,"MET":None,"CAT":None}

    for ent, matches in ents_data.items():

        if len(matches) == 0:
            # no matches
            ents_matched_dict[ent] = _no_match()
            continue

        matched_ents = get_top_matches_to_ent(ents_data,match_dict=match_dict,ent_type=ent)

        if matched_ents is None:
            # NER found something but nothing in the reference data was close enough
            ents_matched_dict[ent] = _no_match()
            continue

        match_selector = RandomMatchSelector(ent,matched_ents)
        chosen_matches_for_sentence = match_selector.return_matches()

        ents_matched_dict[ent] = {
            "ner_match":matches,
            "fuzzy_matches":matched_ents,
            "matches_for_sentence":chosen_matches_for_sentence
        }

    return ents_matched_dict
    

In [440]:
# End-to-end run of the older sentence-construction flow on a sample query.
query = "call of duty on ios in china by reviews"
# Run the NER pipeline on the query (passed as a one-element batch)
op = ner_pipe([query])
# Extract the entities from the model output, grouped by entity type
ents = extract_entities(op)
# Build the per-entity dictionary of NER words and their fuzzy matches
all_matches = get_matches(ents,match_dict=matching_dict)
# generate example sentences and then rank them - these will be the final matches
sentences_map, ordered_sentences = construct_example_sentences(query,all_matches,vectorizer=matcher)
ordered_sentences



call of duty 3.9479530755670567e-13
china 3.7614356074300304e-13
ios 0.0
reviews 1.0373923942097463e-12


['reviews call of duty china ios',
 'ratings clash of clans taiwan ios store',
 'reviews clash of clans hong kong ios store',
 'reviews clash of clans china apple']

In [413]:
sentences_map

{}

In [441]:
all_matches

{'APP': {'ner_match': ['call', 'of duty'],
  'fuzzy_matches':           query           match         score
  0  call of duty    call of duty  3.947953e-13
  1  call of duty  clash of clans  7.185045e-01
  2  call of duty   into the dead  7.232482e-01,
  'matches_for_sentence': ['call of duty',
   'clash of clans',
   'clash of clans',
   'call of duty',
   'clash of clans']},
 'REG': {'ner_match': ['china'],
  'fuzzy_matches':    query      match         score
  0  china      china  3.761436e-13
  1  china     taiwan  3.067843e-01
  2  china  hong kong  3.143438e-01,
  'matches_for_sentence': ['china', 'taiwan', 'china', 'china', 'hong kong']},
 'DATE': {'ner_match': [], 'fuzzy_matches': None, 'matches_for_sentence': []},
 'PLAT': {'ner_match': ['ios'],
  'fuzzy_matches':   query      match     score
  0   ios        ios  0.000000
  1   ios  ios store  0.241801
  2   ios      apple  0.551396,
  'matches_for_sentence': ['ios', 'ios store', 'apple', 'ios', 'ios store']},
 'MET': {'ner_m

In [487]:
def get_matching_sentences(all_matches, topk=20):
    """Combine per-entity fuzzy matches into the best-scoring candidate sentences.

    Entities are considered in the fixed order MET, APP, PLAT, REG, CAT; only
    those whose "fuzzy_matches" is a DataFrame take part. Every match becomes
    a tuple ``(ent_type, match, score, metric_type)`` where ``metric_type``
    is the "type" column for "MET" and ``""`` otherwise. All cross-entity
    combinations are ranked by the sum of their scores (lower is better) and
    the ``topk`` best are kept.

    Returns
    -------
    dict mapping the space-joined match texts to the tuple of match tuples,
    in ranking order. ``{}`` when no entity has fuzzy matches (previously a
    single entry keyed by the empty string was returned). If several
    combinations yield the same sentence text, the best-scoring one is kept
    (previously the worst-scoring one overwrote it).
    """
    per_entity = []
    for ent_type in ["MET","APP","PLAT","REG","CAT"]:

        entry = all_matches.get(ent_type) or {}
        frame = entry.get("fuzzy_matches")
        if not isinstance(frame,pd.DataFrame):
            continue

        if ent_type == "MET":
            rows = frame[["match","score","type"]].to_records(index=False)
            per_entity.append([("MET",*row) for row in rows])
        else:
            rows = frame[["match","score"]].to_records(index=False)
            per_entity.append([(ent_type,*row,"") for row in rows])

    if not per_entity:
        # itertools.product() with no inputs yields one empty combination,
        # which would otherwise surface as a bogus "" sentence.
        return {}

    # sorted() is stable, so equally-scored combinations keep product order.
    ranked = sorted(itertools.product(*per_entity), key=lambda combo: sum(p[2] for p in combo))

    top_combination_sentences = {}
    for combo in ranked[:topk]:
        # setdefault keeps the first (best-scoring) combination per sentence.
        top_combination_sentences.setdefault(" ".join(p[1] for p in combo), combo)

    return top_combination_sentences

In [510]:
def clean_match_sentences(top_sentences):
    """Drop candidate sentences whose entity mix is not allowed.

    Each value of ``top_sentences`` is a sequence of tuples whose first item
    is the entity type and whose last item is the metric type ("" for
    non-metric entities). A sentence is kept only if it has an app-level or
    market-level metric and none of the following holds:

    * app-level metric without an app,
    * market-level metric together with an app,
    * an app together with a category.

    Returns a new dict with the surviving entries in their original order.
    """
    kept = {}

    for sentence, combo in top_sentences.items():
        tags = {part[0] + "_" + part[-1] for part in combo}

        has_app = "APP_" in tags
        has_app_metric = "MET_app" in tags
        has_market_metric = "MET_market" in tags

        # no metrics = skip
        if not (has_app_metric or has_market_metric):
            continue
        # app metric but no app = skip
        if has_app_metric and not has_app:
            continue
        # app with market level metric = skip
        if has_market_metric and has_app:
            continue
        # app and category = skip
        if has_app and "CAT_" in tags:
            continue

        kept[sentence] = combo

    return kept

def semantic_match_with_input(cleaned_sentences, input_query, matcher):
    """Re-order candidate sentences by similarity to the original query.

    The keys of ``cleaned_sentences`` are embedded with
    ``matcher.vectorize_list``, the query with ``matcher.vectorize_single``,
    and the entries are returned in a new dict ordered by increasing cosine
    distance to the query. An empty input yields ``{}``.
    """
    if not len(cleaned_sentences) > 0:
        return {}

    ## USE A SENTENCE TRANSFORMER MATCHER HERE SINCE WE ARE TRYING TO ORDER THE
    ## OUTPUT BY MOST RELEVANT TO THE INPUT
    candidates = list(cleaned_sentences.keys())
    candidate_vectors = matcher.vectorize_list(candidates)
    query_vector = matcher.vectorize_single(input_query)
    cosine_distances = distance.cdist(query_vector,candidate_vectors,metric="cosine")[0]

    ranked = {}
    for position in np.argsort(cosine_distances):
        sentence = candidates[position]
        ranked[sentence] = cleaned_sentences[sentence]

    return ranked

In [524]:
# End-to-end run of the combination-based flow on a sample query.
query = "uber eats reviews on google play"
# Run the NER pipeline on the query (passed as a one-element batch)
op = ner_pipe([query])
# Extract the entities from the model output, grouped by entity type
ents = extract_entities(op)
# Build the per-entity dictionary of NER words and their fuzzy matches
all_matches = get_matches(ents,match_dict=matching_dict)

# Enumerate entity combinations, drop the disallowed ones, then order the
# survivors by similarity to the original query.
all_combinations = get_matching_sentences(all_matches)
cleaned_combinations = clean_match_sentences(all_combinations)
ordered_combinations = semantic_match_with_input(cleaned_combinations,query,matcher)
ordered_combinations



uber eats 4.629630012686903e-13
google play 1.1102230246251565e-16
reviews 1.0373923942097463e-12


{'reviews uber eats google play': (('MET',
   'reviews',
   1.0373923942097463e-12,
   'app'),
  ('APP', 'uber eats', 4.629630012686903e-13, ''),
  ('PLAT', 'google play', 1.1102230246251565e-16, '')),
 'ratings uber eats google play': (('MET',
   'ratings',
   0.4157903265217979,
   'app'),
  ('APP', 'uber eats', 4.629630012686903e-13, ''),
  ('PLAT', 'google play', 1.1102230246251565e-16, '')),
 'reviews uber google play': (('MET',
   'reviews',
   1.0373923942097463e-12,
   'app'),
  ('APP', 'uber', 0.34374647890056054, ''),
  ('PLAT', 'google play', 1.1102230246251565e-16, '')),
 'reviews uber eats google store': (('MET',
   'reviews',
   1.0373923942097463e-12,
   'app'),
  ('APP', 'uber eats', 4.629630012686903e-13, ''),
  ('PLAT', 'google store', 0.4170538587712811, '')),
 'ratings uber eats google store': (('MET',
   'ratings',
   0.4157903265217979,
   'app'),
  ('APP', 'uber eats', 4.629630012686903e-13, ''),
  ('PLAT', 'google store', 0.4170538587712811, '')),
 'reviews uber

In [523]:
ents

{'APP': [],
 'REG': [],
 'DATE': [],
 'PLAT': ['google', 'play'],
 'MET': ['uber reviews'],
 'CAT': []}

In [503]:
cleaned_combinations = clean_match_sentences(all_combinations)

In [511]:
ordered_combinations = semantic_match_with_input(cleaned_combinations,query,matcher)

In [512]:
ordered_combinations

{'reviews call of duty ios china': (('MET',
   'reviews',
   1.0373923942097463e-12,
   'app'),
  ('APP', 'call of duty', 3.9479530755670567e-13, ''),
  ('PLAT', 'ios', 0.0, ''),
  ('REG', 'china', 3.7614356074300304e-13, '')),
 'ratings call of duty ios china': (('MET',
   'ratings',
   0.4157903265217979,
   'app'),
  ('APP', 'call of duty', 3.9479530755670567e-13, ''),
  ('PLAT', 'ios', 0.0, ''),
  ('REG', 'china', 3.7614356074300304e-13, '')),
 'reviews call of duty ios store china': (('MET',
   'reviews',
   1.0373923942097463e-12,
   'app'),
  ('APP', 'call of duty', 3.9479530755670567e-13, ''),
  ('PLAT', 'ios store', 0.2418007206294126, ''),
  ('REG', 'china', 3.7614356074300304e-13, '')),
 'reviews call of duty ios taiwan': (('MET',
   'reviews',
   1.0373923942097463e-12,
   'app'),
  ('APP', 'call of duty', 3.9479530755670567e-13, ''),
  ('PLAT', 'ios', 0.0, ''),
  ('REG', 'taiwan', 0.3067842515118486, '')),
 'reviews call of duty ios hong kong': (('MET',
   'reviews',
   1.

In [452]:
# Scratch cell: flatten each entity's fuzzy-match frame into tagged tuples.
# Non-metric entities give (tag, match, score); metrics also carry their type,
# giving (tag, match, score, type).
regions = [("REG",*e) for e in all_matches["REG"]["fuzzy_matches"][["match","score"]].to_records(index=False)]
apps = [("APP",*e) for e in all_matches["APP"]["fuzzy_matches"][["match","score"]].to_records(index=False)]
platforms = [("PLAT",*e) for e in all_matches["PLAT"]["fuzzy_matches"][["match","score"]].to_records(index=False)]
#categories = all_matches["CAT"]["fuzzy_matches"]["match"].tolist()
metrics = [("MET",*e) for e in all_matches["MET"]["fuzzy_matches"][["match","score","type"]].to_records(index=False)]

In [453]:
l3 = [regions,apps,platforms,metrics]

In [474]:
# Scratch cell: rank every cross-entity combination by summed score
# (index 2 of each tuple) and keep the topk lowest.
topk = 10
combinations = list(itertools.product(*l3))
combination_scores = [sum([p[2] for p in q]) for q in combinations]
combinations_sorted = np.argsort(combination_scores)
top_combinations = [combinations[i] for i in combinations_sorted[:topk]]
# Key each kept combination by its space-joined match texts (index 1).
top_combination_sentences = {" ".join([p[1] for p in q]):q for q in top_combinations}

In [475]:
top_combination_sentences

{'china call of duty ios reviews': (('REG', 'china', 3.7614356074300304e-13),
  ('APP', 'call of duty', 3.9479530755670567e-13),
  ('PLAT', 'ios', 0.0),
  ('MET', 'reviews', 1.0373923942097463e-12, 'app')),
 'china call of duty ios store reviews': (('REG',
   'china',
   3.7614356074300304e-13),
  ('APP', 'call of duty', 3.9479530755670567e-13),
  ('PLAT', 'ios store', 0.2418007206294126),
  ('MET', 'reviews', 1.0373923942097463e-12, 'app')),
 'taiwan call of duty ios reviews': (('REG', 'taiwan', 0.3067842515118486),
  ('APP', 'call of duty', 3.9479530755670567e-13),
  ('PLAT', 'ios', 0.0),
  ('MET', 'reviews', 1.0373923942097463e-12, 'app')),
 'hong kong call of duty ios reviews': (('REG',
   'hong kong',
   0.31434379374675603),
  ('APP', 'call of duty', 3.9479530755670567e-13),
  ('PLAT', 'ios', 0.0),
  ('MET', 'reviews', 1.0373923942097463e-12, 'app')),
 'china call of duty ios ratings': (('REG', 'china', 3.7614356074300304e-13),
  ('APP', 'call of duty', 3.9479530755670567e-13),
 

In [465]:
len(combinations)

81