## Model to-do

- Try to build a model that handles the word splits better
- Build a simple matcher that generates example sentences

In [89]:
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from scipy.spatial import distance
import numpy as np
import pandas as pd
import os

In [45]:
class Vectorizer:
    """Small convenience wrapper around a ``SentenceTransformer`` model.

    The wrapped model is stored on ``self.model``; both public methods simply
    forward to its ``encode`` method.
    """

    def __init__(self, model_name = "paraphrase-albert-small-v2"):
        """Instantiate the sentence-transformers model named ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def vectorize_list(self, input_list):
        """Encode every string in ``input_list``.

        Returns whatever ``self.model.encode`` returns for the list
        (presumably one embedding row per input string - confirm against the
        sentence-transformers documentation).
        """
        return self.model.encode(input_list)

    def vectorize_single(self, input_string):
        """Encode a single string.

        The string is wrapped in a one-element list before encoding, so the
        result has the same layout as ``vectorize_list`` called with one item
        rather than being a bare vector.
        """
        return self.model.encode([input_string])

In [77]:
# Load the token-classification (NER) model and its tokenizer from their saved locations.
# NOTE(review): the tokenizer is loaded from a separate "-tok" path, not the model path -
# confirm both directories are available wherever this notebook is run.
loaded_model = AutoModelForTokenClassification.from_pretrained("distilbert-base-case-finetuned-app-ner")
loaded_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-case-finetuned-app-ner-tok")

In [78]:
# Label maps as stored in the model config.
id2label_dict = loaded_model.config.id2label
label2id_dict = loaded_model.config.label2id
# Each lookup below is built by inverting (swapping keys and values of) one of the config maps:
# label -> id comes from inverting id2label, id -> label from inverting label2id.
model_label_2_id = {v:k for k,v in id2label_dict.items()}
model_id_2_label = {v:k for k,v in label2id_dict.items()}

In [79]:
# NER pipeline over the loaded model/tokenizer. aggregation_strategy="max" asks the pipeline
# to merge token-level predictions into grouped entities (see the transformers
# TokenClassificationPipeline documentation for the exact rule).
ner_pipe = pipeline("ner", aggregation_strategy="max", model=loaded_model,tokenizer=loaded_tokenizer)
# Feature-extraction pipeline built on the same model/tokenizer; it is not referenced by
# the other cells visible in this notebook.
feature_pipe = pipeline("feature-extraction", model=loaded_model,tokenizer=loaded_tokenizer)

In [106]:
def generate_matching_dictionary(matcher, DATA_PATH = "data"):
    """Build the per-entity lookup of matching phrases and their embeddings.

    For each entity type a CSV file under ``DATA_PATH`` is read, its phrase
    column is turned into a list, and that list is embedded with
    ``matcher.vectorize_list``.

    Parameters
    ----------
    matcher : object
        Anything exposing ``vectorize_list(list_of_str)``.
    DATA_PATH : str
        Directory containing ``app_matching.csv``, ``metric_matching.csv``,
        ``platform_matching.csv``, ``region_matching.csv`` and
        ``category_matching.csv``.

    Returns
    -------
    dict
        Keys ``"APP"``, ``"CAT"``, ``"PLAT"``, ``"REG"``, ``"MET"``. Every
        value holds ``"embedding"`` and ``"matching_list"``; ``"MET"``
        additionally holds ``"metric_type"`` (the ``metric_type`` column).
    """
    # (entity label, CSV file name, phrase column, extra columns copied as lists)
    # Listed in the order the files are read.
    sources = (
        ("APP", "app_matching.csv", "matching_phrase", ()),
        ("MET", "metric_matching.csv", "metric_name", ("metric_type",)),
        ("PLAT", "platform_matching.csv", "matching_phrase", ()),
        ("REG", "region_matching.csv", "matching_phrase", ()),
        ("CAT", "category_matching.csv", "matching_phrase", ()),
    )

    # Key order of the returned dictionary is fixed up front.
    lookup = {label: {} for label in ("APP", "CAT", "PLAT", "REG", "MET")}

    for label, file_name, phrase_column, extra_columns in sources:
        table = pd.read_csv(os.path.join(DATA_PATH, file_name))
        phrases = table[phrase_column].tolist()
        # Extra columns are pulled out before embedding so a missing column
        # fails before any encoding work is done for this file.
        extras = {column: table[column].tolist() for column in extra_columns}

        entry = lookup[label]
        entry["embedding"] = matcher.vectorize_list(phrases)
        entry["matching_list"] = phrases
        entry.update(extras)

    return lookup
    
    

In [242]:
# Shared vectorizer (default model name); later cells read this as a module-level global.
matcher = Vectorizer()

In [109]:
# Embed the matching phrases for every entity type once, from the CSV files under "data".
matching_dict = generate_matching_dictionary(matcher,DATA_PATH="data")

In [226]:
def construct_example_sentences(input_sentence,all_matches,vectorizer,nmatch=5):
    """Assemble candidate sentences from per-entity matches and rank them.

    For each position ``0 .. nmatch-1`` one sentence is built by joining, with
    single spaces, the match chosen at that position for every entity type
    that was actually detected (non-empty ``"ner_match"``). Identical
    sentences collapse into one entry. The distinct sentences are then ordered
    by cosine distance between their embedding and the embedding of
    ``input_sentence`` (closest first).

    Parameters
    ----------
    input_sentence : str
        The original query.
    all_matches : dict
        Maps entity type to a dict with at least ``"ner_match"`` (list) and
        ``"matches_for_sentence"`` (list of candidate strings).
    vectorizer : object
        Exposes ``vectorize_list(list_of_str)`` and ``vectorize_single(str)``,
        both returning 2-D arrays usable by ``scipy.spatial.distance.cdist``.
    nmatch : int
        Number of candidate positions to build sentences for.

    Returns
    -------
    tuple
        ``(sentences, ordered)`` where ``sentences`` maps each sentence to the
        ``{entity_type: chosen_match}`` dict it was built from and ``ordered``
        lists the sentences from most to least similar to the input.
    """
    sentences = {}
    for sent_id in range(nmatch):
        sent_comps = {}
        for ent_id, ent_info in all_matches.items():
            if len(ent_info["ner_match"]) == 0:
                continue
            candidates = ent_info["matches_for_sentence"]
            # An entity type without a lookup table (e.g. one whose selector
            # yields [None]) or with fewer than nmatch candidates has nothing
            # to contribute at this position; leave it out instead of failing
            # on None concatenation or an out-of-range index.
            if sent_id >= len(candidates) or candidates[sent_id] is None:
                continue
            sent_comps[ent_id] = candidates[sent_id]
        sent = " ".join(sent_comps.values()).strip()
        sentences[sent] = sent_comps

    # nmatch == 0 leaves nothing to embed or rank.
    if not sentences:
        return sentences, []

    # Materialise the keys once; the ranking indexes into this list.
    candidate_sentences = list(sentences)
    matched_sentences_embedding = vectorizer.vectorize_list(candidate_sentences)
    input_sentence_embedding = vectorizer.vectorize_single(input_sentence)
    cosine_distances = distance.cdist(input_sentence_embedding,matched_sentences_embedding,metric="cosine")[0]
    sentence_ordering = np.argsort(cosine_distances)

    return sentences, [candidate_sentences[i] for i in sentence_ordering]

def extract_entities(model_output):
    """Group the words recognised by the NER pipeline by entity type.

    Only the first element of ``model_output`` is inspected (the pipeline is
    called with a one-item list elsewhere in this notebook). Each entity's
    ``"entity_group"`` is looked up in the module-level ``model_label_2_id``
    and ``model_id_2_label`` maps; labels containing ``"-"`` are reduced to
    the part after the first dash and the entity's ``"word"`` is appended to
    that bucket. Labels without a dash are ignored.

    Returns
    -------
    dict
        Keys ``"APP"``, ``"REG"``, ``"DATE"``, ``"PLAT"``, ``"MET"``,
        ``"CAT"``, each mapping to a list of recognised words (possibly empty).
    """
    grouped = {label: [] for label in ("APP", "REG", "DATE", "PLAT", "MET", "CAT")}

    for entity in model_output[0]:
        # Round-trip label -> id -> label through the model's own maps; an
        # unknown label raises KeyError here.
        label = model_id_2_label[model_label_2_id[entity['entity_group']]]

        if "-" not in label:
            continue

        grouped[label.split("-")[1]].append(entity["word"])

    return grouped


def get_top_matches_to_ent(ents,match_dict,ent_type,nmatch=5,vectorizer=None):
    """Return the closest known phrases for the entities of one type.

    The detected words for ``ent_type`` are joined with spaces, embedded, and
    compared by cosine distance against the pre-computed embeddings stored in
    ``match_dict[ent_type]``.

    Parameters
    ----------
    ents : dict
        Maps entity type to the list of words detected for it.
    match_dict : dict
        Maps entity type to a dict with ``"embedding"`` (2-D array) and
        ``"matching_list"`` (phrases, row-aligned with the embedding).
    ent_type : str
        One of ``"APP"``, ``"MET"``, ``"CAT"``, ``"REG"``, ``"PLAT"``,
        ``"DATE"``.
    nmatch : int
        Maximum number of matches to return.
    vectorizer : object, optional
        Exposes ``vectorize_single(str)``. Defaults to the module-level
        ``matcher``.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``query``, ``match`` and ``score`` (cosine distance, ascending)
        with at most ``nmatch`` rows, or ``None`` when ``match_dict`` has no
        entry for ``ent_type``.
    """
    assert ent_type in ["APP", "MET", "CAT", "REG", "PLAT","DATE"]

    # Consult the dictionary that was passed in, not the notebook-level global.
    if ent_type not in match_dict:
        return None

    if vectorizer is None:
        vectorizer = matcher

    # Doesn't work well for typos - need to use the ngram char matcher instead
    # from the autocomplete matcher

    entry = match_dict[ent_type]
    matched_ent = " ".join(ents[ent_type])
    matching_vector = vectorizer.vectorize_single(matched_ent)
    distances = distance.cdist(matching_vector,entry["embedding"],metric="cosine")[0]
    sorted_distances = np.argsort(distances)[:nmatch]
    closest_matches = [entry["matching_list"][i] for i in sorted_distances]
    # Size the query column from the matches actually found: there may be
    # fewer than nmatch phrases available for this entity type.
    matched_df = pd.DataFrame({"query":[matched_ent]*len(closest_matches),"match":closest_matches,"score":distances[sorted_distances]})

    return matched_df

def get_matches(ents_data,match_dict):
    """Collect fuzzy matches and chosen sentence components for every entity type.

    Parameters
    ----------
    ents_data : dict
        Maps entity type to the list of words detected for it.
    match_dict : dict
        Per-entity lookup of phrases and embeddings, forwarded to
        ``get_top_matches_to_ent``.

    Returns
    -------
    dict
        Keys ``"APP"``, ``"REG"``, ``"DATE"``, ``"PLAT"``, ``"MET"``,
        ``"CAT"``. Entity types present in ``ents_data`` map to a dict with
        ``"ner_match"``, ``"fuzzy_matches"``, ``"match_selector"`` and
        ``"matches_for_sentence"``; types absent from ``ents_data`` stay
        ``None``.
    """
    ents_matched_dict = {"APP":None,"REG":None,"DATE":None,"PLAT":None,"MET":None,"CAT":None}

    for ent, matches in ents_data.items():

        # Forward the caller's dictionary rather than the notebook-level global.
        matched_ents = get_top_matches_to_ent(ents_data,match_dict=match_dict,ent_type=ent)
        # NOTE(review): RandomMatchSelector is not defined in the cells visible
        # here - it must be defined before this function is called.
        match_selector = RandomMatchSelector(ent,matched_ents)
        chosen_matches_for_sentence = match_selector.return_matches()
        ents_matched_dict[ent] = {"ner_match":matches, "fuzzy_matches":matched_ents, "match_selector":match_selector, "matches_for_sentence":chosen_matches_for_sentence}

    return ents_matched_dict
    

In [249]:
# End-to-end example: raw query -> NER entities -> fuzzy matches -> ranked example sentences.
query = "the top chart ranks in uk"
# Run the NER pipeline on the query (passed as a one-item list)
op = ner_pipe([query])
# Extract the entities from the model output, grouped by entity type
ents = extract_entities(op)
# For every entity type: the NER words, their fuzzy matches and the matches chosen for sentences
all_matches = get_matches(ents,match_dict=matching_dict)
# Generate example sentences and then rank them against the query - these will be the final matches
sentences_map, ordered_sentences = construct_example_sentences(query,all_matches,vectorizer=matcher)

  cpuset_checked))


In [246]:
# Inspect the per-entity match dictionary built for the query above.
all_matches

{'APP': {'ner_match': [],
  'fuzzy_matches':   query      match     score
  0              bbc  0.571618
  1             bing  0.571880
  2         bbc news  0.582875
  3         cbc news  0.616941
  4        economist  0.622728,
  'match_selector': <__main__.RandomMatchSelector at 0x7f975b0cb048>,
  'matches_for_sentence': ['bbc', 'bbc', 'bbc news', 'bing', 'economist']},
 'REG': {'ner_match': ['uk'],
  'fuzzy_matches':   query           match         score
  0    uk              uk  6.382672e-13
  1    uk  united kingdom  2.337332e-01
  2    uk   great britain  3.025736e-01
  3    uk       australia  5.053589e-01
  4    uk             usa  5.859149e-01,
  'match_selector': <__main__.RandomMatchSelector at 0x7f975b511048>,
  'matches_for_sentence': ['uk', 'australia', 'uk', 'uk', 'united kingdom']},
 'DATE': {'ner_match': [],
  'fuzzy_matches': None,
  'match_selector': <__main__.RandomMatchSelector at 0x7f975b5114e0>,
  'matches_for_sentence': [None]},
 'PLAT': {'ner_match': [],
  'f

In [247]:
# Candidate sentences, ordered from most to least similar to the query.
ordered_sentences

['uk top chart ranks',
 'united kingdom top chart ranks',
 'uk top chart reviews',
 'australia top chart downloads']

In [248]:
# Same inspection as the earlier all_matches cell (the displayed output is identical).
all_matches

{'APP': {'ner_match': [],
  'fuzzy_matches':   query      match     score
  0              bbc  0.571618
  1             bing  0.571880
  2         bbc news  0.582875
  3         cbc news  0.616941
  4        economist  0.622728,
  'match_selector': <__main__.RandomMatchSelector at 0x7f975b0cb048>,
  'matches_for_sentence': ['bbc', 'bbc', 'bbc news', 'bing', 'economist']},
 'REG': {'ner_match': ['uk'],
  'fuzzy_matches':   query           match         score
  0    uk              uk  6.382672e-13
  1    uk  united kingdom  2.337332e-01
  2    uk   great britain  3.025736e-01
  3    uk       australia  5.053589e-01
  4    uk             usa  5.859149e-01,
  'match_selector': <__main__.RandomMatchSelector at 0x7f975b511048>,
  'matches_for_sentence': ['uk', 'australia', 'uk', 'uk', 'united kingdom']},
 'DATE': {'ner_match': [],
  'fuzzy_matches': None,
  'match_selector': <__main__.RandomMatchSelector at 0x7f975b5114e0>,
  'matches_for_sentence': [None]},
 'PLAT': {'ner_match': [],
  'f