In [1]:
# _importing required libraries

import pandas as pd
import os
import string
import numpy as np
import hdbscan
import itertools

import tensorflow as tf
import tensorflow_text
import tensorflow_hub as hub

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import spacy

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import DBSCAN
from clusteval import clusteval

import texthero as hero
from texthero import preprocessing

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline

import umap

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tf_model = hub.load(os.getcwd()+ '/../../models/USE_model')

In [3]:
keywords_df = pd.read_pickle(os.getcwd()+'/final_keywords_dataframe.pkl')
keywords_df = keywords_df[['id', 'keywords', 'lang']]
keywords_df = keywords_df.rename(columns={'id': 'page_id'})

In [4]:
keywords_df.columns

Index(['page_id', 'text', 'text_len', 'lang', 'text_tokens', 'nc_vec', 'label',
       'label_name', 'title', 'pubDate', 'url', 'doc_repr_vec', 'keywords'],
      dtype='object')

In [5]:
main_df = pd.read_pickle(os.getcwd()+'/../dataframes/retrieval_dataset.pkl')

In [14]:
meta_data = []

for idx, row in main_df.iterrows():
    
    page_id = row['page_id']
    page_row = keywords_df[keywords_df['page_id'] == page_id]
    
    keywords = page_row['keywords'].values[0]
    lang = page_row['lang'].values[0]
    
    meta_data.append({
        'page_id': page_id,
        'query': row['query'],
        'label': row['label'],
        'text': row['text'],
        'text_len': row['text_len'],
        'noun_chunks': row['noun_chunks'],
        'mean_nc_vec': row['mean_nc_vec'],
        'title': row['title'],
        'published_date': row['published_date'],
        'source_url': row['source_url'],
        'keywords': keywords,
        'lang': lang
    })

In [16]:
len(meta_df.index)

629

In [15]:
meta_df = pd.DataFrame(meta_data)
meta_df.sample(2)

Unnamed: 0,page_id,query,label,text,text_len,noun_chunks,mean_nc_vec,title,published_date,source_url,keywords,lang
232,210705_news_305315,Satellitenkommunikation,2,SAN FRANCISCO – The U.S. Army plans to evaluat...,452,"[the U.S. Army, the performance, Isotropic Sys...","[[-0.011022232472896576, -4.06815015594475e-05...",US military to test Isotropic Systems antennas...,2020-09-24 09:08:33,https://spacenews.com/army-test-isotropic-ses/,"[(an SES geostationary satellite, 0.53605896),...",en
365,210705_news_337519,Defense,3,"Dublin, Nov. 03, 2020 (GLOBE NEWSWIRE) -- Th...",1016,"[Dublin, GLOBE NEWSWIRE, the ""Defense Drone A...","[[-0.011885600164532661, 0.010710920207202435,...",Global Defense Drone Antenna Market 2020-2027:...,2020-11-03 09:48:00,https://www.globenewswire.com/news-release/202...,"[(Global Defense Drone Antenna Market, 0.58164...",en


In [13]:
main_df.columns

Index(['page_id', 'query', 'label', 'text', 'text_len', 'noun_chunks',
       'mean_nc_vec', 'title', 'published_date', 'source_url'],
      dtype='object')

In [17]:
meta_df.to_pickle(os.getcwd()+'/../dataframes/final_xxx_dataset.pkl')

In [62]:
query_list = set(meta_df['query'].values)
query_df_folderpath = os.getcwd()+'/../dataframes/query_dataframes/'

for query in query_list:
    query_updated = query.lower().replace(' ', '_')
    filename = query_df_folderpath + f'{query_updated}_df.pkl'
    
    query_df = meta_df[meta_df['query'] == query]
    query_df.to_pickle(filename)

In [119]:
mr_df = main_df[main_df['query'] == 'Mixed Reality']
vs_df = main_df[main_df['query'] == 'Visualisierung']

In [120]:
mr_df = pd.concat([mr_df.set_index('page_id'), keywords_df.set_index('page_id')], axis=1, join='inner').reset_index()
vs_df = pd.concat([vs_df.set_index('page_id'), keywords_df.set_index('page_id')], axis=1, join='inner').reset_index()

### 1. keyword extraction
### 2. Candidate pool extraction

In [3]:
def get_modified_vectors(vec_data):
    
    new_data = []
    for val in vec_data:
        new_data.append(val)
    
    new_data = np.array(new_data).reshape(-1, 512)
    return new_data

def get_pool_vec(doc_vec_list, pool):
    
    doc_vec_list = get_modified_vectors(doc_vec_list)
    if pool == 'mean':
        return np.nanmean(doc_vec_list, axis=0)
    elif pool == 'max':
        return np.nanmax(doc_vec_list, axis=0)

def get_document_vec(text):
    
    return tf_model(text)['outputs'].numpy()[0].reshape(1, -1)

def get_sent_transformers_keywords_use(keywords, query_vec, max_keyword_cnt = 30):
    
    keywords = list(dict(keywords).keys())
    
    candidate_embeddings_keywords = [tf_model(kw)['outputs'].numpy()[0] for kw in keywords]
        
    query_distances = cosine_similarity([query_vec], candidate_embeddings_keywords)
    subtopic_keywords_dict = dict()
    for index in query_distances.argsort()[0][-max_keyword_cnt:]: 
        
        subtopic_keywords_dict[keywords[index]] = query_distances[0][index]
    
    subtopic_keywords_dict = sorted(subtopic_keywords_dict.items(), key=lambda x: x[1], reverse=True)

    return subtopic_keywords_dict

def get_candidate_pool(subtopic_keywords_list):
    
    candidate_pool = []
    
    lower_limit = 0.2
    upper_limit = 0.4
    
    for key, value in subtopic_keywords_list:
        
        if value > 0.2 and value < 0.4:
            candidate_pool.append(key)
            
    return candidate_pool

In [98]:
query_1 = 'Mixed Reality'
query_vec_1 = tf_model(query_1)['outputs'].numpy()[0]

query_2 = 'Visualisierung'
query_vec_2 = tf_model(query_2)['outputs'].numpy()[0]

In [121]:
%timeit mr_df['keywords_use'] = mr_df.apply(lambda x:get_sent_transformers_keywords_use(x['keywords'], query_vec_1, max_keyword_cnt = 30), axis=1)

31.5 s ± 443 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [101]:
%timeit vs_df['keywords_use'] = vs_df.apply(lambda x:get_sent_transformers_keywords_use(x['keywords'], query_vec_1, max_keyword_cnt = 30), axis=1)

32.4 s ± 680 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [122]:
%timeit mr_df['candidate_pool'] = mr_df.apply(lambda x:get_candidate_pool(x['keywords_use']), axis=1)

3.05 ms ± 55 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [103]:
%timeit vs_df['candidate_pool'] = vs_df.apply(lambda x:get_candidate_pool(x['keywords_use']), axis=1)

3.02 ms ± 43.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [14]:
df_xlm = pd.read_pickle(os.getcwd()+'/final_dataframe.pkl')
df_xlm = df_xlm[['id', 'lang']]
df_xlm = df_xlm.rename(columns={'id': 'page_id'})

In [16]:
mr_df = pd.concat([mr_df.set_index('page_id'), df_xlm.set_index('page_id')], axis=1, join='inner').reset_index()
vs_df = pd.concat([vs_df.set_index('page_id'), df_xlm.set_index('page_id')], axis=1, join='inner').reset_index()

In [24]:
mr_df.to_pickle(os.getcwd()+'/../dataframes/retrieval_dataset_mr.pkl')
vs_df.to_pickle(os.getcwd()+'/../dataframes/retrieval_dataset_vs.pkl')

In [4]:
mr_df = pd.read_pickle(os.getcwd()+'/../dataframes/retrieval_dataset_mr.pkl')
vs_df = pd.read_pickle(os.getcwd()+'/../dataframes/retrieval_dataset_vs.pkl')

In [5]:
print(f'Retrieval score for Mixed reality: {mr_df.label.sum()}')
print(f'Retrieval score for Visualisierung: {vs_df.label.sum()}')

Retrieval score for Mixed reality: 73
Retrieval score for Visualisierung: 87


In [10]:
def get_irp(a, b):
    return ((a-b)/a)*100

In [21]:
nlp_de = spacy.load("de_core_news_sm")
nlp_en = spacy.load("en_core_web_sm")

In [19]:
def get_document_entities(text, lang):
    
    doc = None

    try:
        if len(text) >= 999999:
            return None

        if lang == 'en':
            doc = nlp_en(text)
        elif lang == 'de':
            doc = nlp_de(text)
        else:
            return None
        
        entities = []
        
        for ent in doc.ents:
            entities.append(ent.label_) 
    except Exception as e:
        return None
            
    return entities

In [23]:
mr_df['entities'] = mr_df.apply(lambda x:get_document_entities(x['text'], x['lang']),axis=1) 
vs_df['entities'] = vs_df.apply(lambda x:get_document_entities(x['text'], x['lang']),axis=1) 

In [32]:
def get_semantic_query_expectation(df):
    
    query = df['query'].values[0]
    query_vec = get_modified_vectors(tf_model(query)['outputs'].numpy()[0])
    
    df['query_sim'] = df.apply(lambda x:cosine_similarity(query_vec, x['mean_nc_vec'])[0][0], axis=1)
    df = df.sort_values(by=['query_sim'], ascending=False)
    
    exp_score_5 = exp_score_10 = exp_score_15 = exp_score_20 = exp_score_25 = 0
    
    for idx, row in df.iterrows():
        
        if row['label'] in [1, 2]:
            if idx <= 5:
                exp_score_5 += 1
            elif idx <= 10:
                exp_score_10 += 1
            elif idx <= 15:
                exp_score_15 += 1
            elif idx <= 20:
                exp_score_20 += 1
            elif idx <= 25:
                exp_score_25 += 1
    
    exp_score_10 += exp_score_5
    exp_score_15 += exp_score_10
    exp_score_20 += exp_score_15
    exp_score_25 += exp_score_20
    
    return  {
        'exp_5': exp_score_5,
        'exp_10': exp_score_10,
        'exp_15': exp_score_15,       
        'exp_20': exp_score_20,
        'exp_25': exp_score_25,
    }

In [33]:
get_semantic_query_expectation(mr_df)

{'exp_5': 2, 'exp_10': 5, 'exp_15': 7, 'exp_20': 11, 'exp_25': 14}

In [34]:
get_semantic_query_expectation(vs_df)

{'exp_5': 2, 'exp_10': 6, 'exp_15': 7, 'exp_20': 8, 'exp_25': 9}

In [21]:
mr_df.label.value_counts()

2    10
3     8
4     6
1     5
Name: label, dtype: int64

In [22]:
get_semantic_query_expectation(vs_df)

(1, 3, 1, 0, 0)

### 3. Clustering

In [5]:
def get_umap_output(vec_array, dim_size=5):
    
    umap_obj = umap.UMAP(n_neighbors=40, 
                        n_components=dim_size, 
                        min_dist=0.01,
                        metric='cosine',
                        random_state=123).fit(vec_array) 
    
    umap_output = umap_obj.transform(vec_array) 
    return umap_output, umap_obj

def get_hdbscan_output(data_points, cluster_size=7):
    
    hdbscan_output = hdbscan.HDBSCAN(
        #min_cluster_size=cluster_size,
#                                       min_samples=2,
                                      metric='euclidean',
                                     cluster_selection_method='eom').fit(data_points)
    return hdbscan_output

def get_dbscan_output(data_points, cluster_size=7):
    
    dbscan_output = DBSCAN(
        #eps=3,
#                                       min_samples=2,
                                      metric='euclidean').fit(data_points)
    return dbscan_output

def project_on_2Dplane(umap_output, cluster_ids):
    
    umap_df = pd.DataFrame(np.column_stack((umap_output, cluster_ids)), columns=['x', 'y', 'cluster ids'])
    grid = sns.FacetGrid(umap_df, hue='cluster ids', height=7)
    grid.map(plt.scatter, 'x', 'y').add_legend()
    
def get_clustering_analysis(cluster_df, final_candidate_pool_vecs, dimen_size=5, cluster_size=7):
    
    umap_output_5, umap_5 = get_umap_output(final_candidate_pool_vecs, dim_size=dimen_size)
    hdbscan_output = get_hdbscan_output(umap_output_5, cluster_size=cluster_size)
#     hdbscan_output = get_dbscan_output(umap_output_5, cluster_size=cluster_size)
    
    cluster_df['cluster_id'] = hdbscan_output.labels_
#     cluster_df.cluster_id.hist(bins=150)
    
    umap_output_2, umap_2 = get_umap_output(final_candidate_pool_vecs, dim_size=2)
#     project_on_2Dplane(umap_output_2, cluster_df['cluster_id'])
    
    return cluster_df

def get_nearest_keyword(keywords, keyword_vecs, mean_vec):
    
    query_distances = cosine_similarity([mean_vec], list(keyword_vecs))
    subtopic_keywords_dict = dict()
    for index in query_distances.argsort()[0]: 
        
        subtopic_keywords_dict[keywords[index]] = query_distances[0][index]
    
    subtopic_keywords_dict = sorted(subtopic_keywords_dict.items(), key=lambda x: x[1], reverse=True)
    return subtopic_keywords_dict[0][0]

def get_topics(cluster_data_df, candidate_pool):
    
    topic_list = []
    
    for idx, row in cluster_data_df.iterrows():
        
        candidate_words = row['candidate_words']
        topic = row['topic']
        
        for cp in candidate_pool:
            if cp in candidate_words:
                topic_list.append(topic)
                break
    
    return topic_list

def get_topic_documents(topic_words, final_df):
    
    doc_id_list = []
    for idx, row in final_df.iterrows():

        candidate_pool = row['candidate_pool']
        doc_id = row['page_id']

        for tw in topic_words:
            if tw in candidate_pool:
                doc_id_list.append(doc_id)

    return list(set(doc_id_list))

def get_meta_data(cluster_id, df, page_id_list):
    
    entity_list = []
    product_cnt = person_cnt = law_cnt = 0
    label_1 = label_2 = label_3 = label_4 = 0
    page_len = len(page_id_list)
    
    for page_id in page_id_list:
        entities = df[df['page_id'] == page_id]['entities'].values[0]
        
        product_cnt += entities.count('PRODUCT')
        person_cnt += entities.count('PERSON')
        law_cnt += entities.count('LAW')
        
        label = int(df[df['page_id'] == page_id]['label'].values[0])
        
        if label == 1:
            label_1 += 1
        elif label == 2:
            label_2 += 1
        elif label == 3:
            label_3 += 1
        elif label == 4:
            label_4 += 1
            
    return {
        'cluster_id': cluster_id,
        'product_cnt': product_cnt/page_len,
        'person_cnt': person_cnt/page_len,
        'law_cnt': law_cnt/page_len,
        'label_1': label_1,
        'label_2': label_2,
        'label_3': label_3,
        'label_4': label_4,        
    }

def get_sub_topic_modelling(query_df):
    
    final_candidate_pool = []

    for idx, row in query_df.iterrows():
        final_candidate_pool.extend(row['candidate_pool'])
        
    final_candidate_pool = list(set(final_candidate_pool))

    final_candidate_pool_vecs = [tf_model(nc)['outputs'].numpy()[0] for nc in final_candidate_pool]

    df_data = []
    for word, vec in zip(final_candidate_pool, final_candidate_pool_vecs):
        df_data.append((word, vec))

    cluster_df = pd.DataFrame(df_data, columns= ['candidate_words', 'candidate_vecs'])
    cluster_df = get_clustering_analysis(cluster_df, final_candidate_pool_vecs, dimen_size=5, cluster_size=8)

    cluster_data = []

    for cluster_id in set(cluster_df.cluster_id.values):

        if cluster_id != -1:
            df = cluster_df[cluster_df['cluster_id'] == cluster_id]
            cluster_data.append((cluster_id, df.candidate_words.values, df.candidate_vecs.values))

    cluster_data_df = pd.DataFrame(cluster_data, columns=['cluster_id', 'candidate_words', 'candidate_vecs'])
    cluster_data_df['mean_vec'] = cluster_data_df.apply(lambda x:get_pool_vec(x['candidate_vecs'], 'mean'), axis=1)
    cluster_data_df['topic'] = cluster_data_df.apply(lambda x:get_nearest_keyword(x['candidate_words'], x['candidate_vecs'], x['mean_vec']), axis=1)

    cluster_data_df['page_id_list'] = cluster_data_df.apply(lambda x:get_topic_documents(x['candidate_words'], query_df), axis=1)
    cluster_data_df['doc_cnt'] = cluster_data_df.apply(lambda x:len(x['page_id_list']), axis=1)
    
    meta_data = []
    for idx, row in cluster_data_df.iterrows():
        meta_data.append(get_meta_data(row['cluster_id'], query_df, row['page_id_list']))
        
    entity_df = pd.DataFrame(meta_data)
    cluster_data_df = pd.concat([cluster_data_df.set_index('cluster_id'), entity_df.set_index('cluster_id')], axis=1, join='inner').reset_index()
    
    return cluster_data_df

In [8]:
vs_df.columns

Index(['page_id', 'query', 'label', 'text', 'text_len', 'noun_chunks',
       'mean_nc_vec', 'title', 'published_date', 'source_url', 'keywords',
       'keywords_use', 'candidate_pool', 'lang', 'entities'],
      dtype='object')

In [6]:
cluster_data_df_mr = get_sub_topic_modelling(mr_df)
cluster_data_df_vs = get_sub_topic_modelling(vs_df)

In [29]:
columns = ['cluster_id', 'topic','doc_cnt','label_1','label_2','label_3','label_4', 'product_cnt', 'person_cnt', 'law_cnt']
cluster_data_df_mr[columns]

In [29]:
def get_covered_doc_data(page_id_list, df):
    
    covered_df = df[df['page_id'].isin(page_id_list)]
    covered_df = covered_df[covered_df['label'].isin([1, 2])]    
    
    return len(covered_df.index)
    
    
def calculate_expectation(cluster_dict, df, n_results):
    
    cluster_data = np.array(list(cluster_dict.keys()))
    
#     print(f'Running n_results, cluster_cnt ... {n_results} -- {len(cluster_data)}\n')
    
    all_combinations = []

    if len(cluster_dict) < 7:
        all_combinations = list(itertools.permutations(cluster_data))
    else:
        while len(set(all_combinations)) < 1000:
            all_combinations.append(tuple(np.random.permutation(cluster_data)))
            
    list_of_found_relevant_doc_per_permutation = []
    for comb in all_combinations:
        
        number_of_found_relevant_docs = 0
        number_of_inspected_docs = 0
        
        page_id_covered = []
        
        for cluster in comb:
            page_id_covered.extend(cluster_dict[cluster])
            page_id_covered = list(set(page_id_covered))
            
            if number_of_inspected_docs > n_results:
                break
            else:
                covered_pos_docs = get_covered_doc_data(page_id_covered, df)
                covered_docs = len(page_id_covered)
                
#                 print(f'Covered_docs and pos docs ... {covered_docs} ---- {covered_pos_docs}')

                number_of_inspected_docs = len(set(page_id_covered))
                number_of_found_relevant_docs = covered_pos_docs
                
        list_of_found_relevant_doc_per_permutation.append(number_of_found_relevant_docs)
                
    return sum(list_of_found_relevant_doc_per_permutation) / len(list_of_found_relevant_doc_per_permutation)

def get_cluster_expectation_scores(cluster_df, df):
    
    cluster_dict = dict()
    
    for idx, row in cluster_df.iterrows():
        cluster_dict[row['cluster_id']] = row['page_id_list']
        
    return {
        'exp_5': calculate_expectation(cluster_dict, df, n_results=5),
        'exp_10': calculate_expectation(cluster_dict, df, n_results=10),
        'exp_15': calculate_expectation(cluster_dict, df, n_results=15),       
        'exp_20': calculate_expectation(cluster_dict, df, n_results=20),
        'exp_25': calculate_expectation(cluster_dict, df, n_results=25),
    }

def get_document_labels(page_id_list, df):
    
    label_1 = label_2 = label_3 = label_4 = 0
    for page_id in list(page_id_list):
        
        label = df[df['page_id'] == page_id]['label'].values[0]
        
        if label == 1:
            label_1 += 1
        elif label == 2:
            label_2 += 1
        elif label == 3:
            label_3 += 1
        elif label == 4:
            label_4 += 1
            
    return label_1, label_2, label_3, label_4

def get_elimination_cluster_cnt(cluster_df, df):
    
    cluster_cnt_eliminated = 0
    neg_doc_eliminated = []
    query = df['query'].values[0]
    
    negative_page_id_list = []
    for page_id, label in zip(df.page_id.values, df.label.values):
        if label == 3 or label == 4:
            negative_page_id_list.append(page_id)
        
    total_doc_cnt = len(set(negative_page_id_list))
    
    for idx, row in cluster_df.iterrows():
        
        label_1 = row['label_1']
        label_2 = row['label_2']
        label_3 = row['label_3']
        label_4 = row['label_4']
        
        criteria = label_2 - (label_3+label_4)
        if label_1 == 0 and criteria < 0:
            
            cluster_cnt_eliminated += 1
            common_neg_docs = list(set(negative_page_id_list) & set(row['page_id_list']))
            neg_doc_eliminated.extend(common_neg_docs)
    
    targeted_doc_cnt = len(set(neg_doc_eliminated))
    
    clustered_page_id_list = []
    for id_list in cluster_df.page_id_list.values:
        clustered_page_id_list.extend(id_list)
    
    cluster_page_ids = set(clustered_page_id_list)
    all_page_ids = set(df.page_id.values)
    
    clustering_loss = all_page_ids - cluster_page_ids
    missed_documents = get_document_labels(clustering_loss, df)
    
    expectation_data = get_cluster_expectation_scores(cluster_df, df)
    
    return query, len(cluster_df.index), cluster_cnt_eliminated, targeted_doc_cnt, total_doc_cnt, round((targeted_doc_cnt/total_doc_cnt) * 100, 2), missed_documents[0], missed_documents[1], missed_documents[2], missed_documents[3], expectation_data

In [30]:
get_elimination_cluster_cnt(cluster_data_df_mr, mr_df)

('Mixed Reality',
 14,
 5,
 10,
 14,
 71.43,
 0,
 0,
 1,
 0,
 {'exp_5': 3.807,
  'exp_10': 6.191,
  'exp_15': 8.564,
  'exp_20': 11.112,
  'exp_25': 13.912})

In [31]:
get_elimination_cluster_cnt(cluster_data_df_vs, vs_df)

('Visualisierung',
 16,
 4,
 11,
 21,
 52.38,
 0,
 0,
 0,
 1,
 {'exp_5': 3.022,
  'exp_10': 4.32,
  'exp_15': 5.87,
  'exp_20': 7.241,
  'exp_25': 8.414})

In [89]:
vs_df.label.mean()

2.9

In [44]:
mr_df

Unnamed: 0,page_id,query,label,text,text_len,noun_chunks,mean_nc_vec,title,published_date,source_url,keywords,keywords_use,candidate_pool,lang,entities
0,210705_news_450807,Mixed Reality,2,Apple arbeitet an seinem ersten eigenen Mixed-...,333,"[Apple, sein erster eigen Mixed-Reality-Headse...","[[-0.008407283574342728, 0.005582862533628941,...",Apples Augmented-Reality-Headset angeblich mit...,2021-03-23 08:35:00,https://www.heise.de/news/Apples-Augmented-Rea...,"[(sein erster eigen Mixed-Reality-Headset, 0.3...","[(der geplant Mixed-Reality-Headset, 0.6080339...","[der Augmented-Reality-Technik, Zukunftsmusik,...",de,"[ORG, LOC, LOC, LOC, PER, MISC, ORG, MISC, MIS..."
1,210705_news_316645,Mixed Reality,3,Kampfflugzeuge sind seit jeher ein Symbol für ...,329,"[Kampfflugzeug, ein Symbol, national Hochtechn...","[[-0.020709725096821785, -0.000700523902196437...",Konzeptionelle Vorstellungen zum Future Comba...,2020-10-07 11:17:09,https://esut.de/2020/10/fachbeitraege/22981/ko...,"[(ein Future Combat Air, 0.60385466), (Kampffl...","[(der Einsatzrealität, 0.5460597), (kollaborat...","[kollaborativ Konzept, zunehmend hybrid prakti...",de,"[PER, ORG, PER, MISC, PER, MISC, LOC, LOC, LOC..."
2,210705_news_93916,Mixed Reality,4,"Die experimentelle Plattform namens ""Assembler...",284,"[der experimentell Plattform, --Assembler, For...","[[-0.018326913937926292, 0.001563811907544732,...",Google: Neue Plattform gegen Falschinformation...,2020-02-05 13:45:00,https://www.heise.de/newsticker/meldung/Deepfa...,"[(der Thema Falschinformations-Kampagn, 0.3010...","[(Deepfak, 0.47651172), (Fälschung, 0.40600026...","[Extremismus, Faktencheckern, verschieden Wiss...",de,"[MISC, ORG, ORG, PER, PER, ORG, LOC, LOC, LOC,..."
3,210705_news_223568,Mixed Reality,3,Im European Defence Industrial Development Pro...,275,[European Defence Industrial Development Progr...,"[[-0.010143241845071316, -0.001262652920559048...",Entwicklung eines europäischen unbemannten Bod...,2020-06-22 14:39:05,https://esut.de/2020/06/meldungen/21212/entwic...,[(European Defence Industrial Development Prog...,"[(künstlich Intelligenz, 0.3486532), (Milrem R...","[künstlich Intelligenz, Milrem Robotic, der Gr...",de,"[ORG, ORG, MISC, LOC, ORG, LOC, LOC, LOC, LOC,..."
4,210705_news_395875,Mixed Reality,3,DARPA’s Robotic Autonomy in Complex Environme...,228,"[DARPA’s Robotic Autonomy, Complex Environmen...","[[-0.020166562870144844, 0.0002533818187657743...",DARPA RACER-SIM,2021-01-20 11:37:10,https://euro-sd.com/2021/01/news/industry-news...,"[(DARPA’s Robotic Autonomy, 0.61190784), (RAC...","[(the real world, 0.5533277), (simulation, 0.4...",[a demonstrable simulation-to-real world capab...,en,"[ORG, ORG, DATE, CARDINAL, CARDINAL, PERSON, O..."
5,210705_news_418451,Mixed Reality,2,Plant Apple die Verwendung neuartiger Micro-OL...,379,"[Plant Apple, der Verwendung, neuartig Micro-O...","[[-0.008398597128689289, -0.001110038720071315...",Apples Augmented-Reality-Brille: Micro-OLEDs v...,2021-02-15 10:30:00,https://www.heise.de/news/Apple-Brille-Micro-O...,"[(neuartig Micro-OLED-Bildschirme, 0.47497174)...","[(Mixed-Reality-Erlebnisse, 0.80068773), (Mixe...","[der VR-Welt, diverser Patente, Sehstärke, der...",de,"[ORG, LOC, MISC, ORG, ORG, LOC, ORG, MISC, MIS..."
6,210705_news_420397,Mixed Reality,1,Das Technische Hilfswerk (THW) hat für die Aus...,260,"[der technisch Hilfswerk, --THW, der Ausbildun...","[[-0.022845558822155, 0.004152955021709204, -0...",Deichläuferausbildung mit Virtual Reality,2021-02-17 11:48:10,https://esut.de/2021/02/meldungen/25614/deichl...,[(der Virtual-Reality-Deichverteidigungs-Simul...,"[(Reality-Brillen, 0.52013505), (ein virtuell ...","[der Virtual-Reality-Trainingssystem, künstlic...",de,"[ORG, ORG, MISC, ORG, ORG, ORG, MISC, LOC, PER..."
7,210705_news_221064,Mixed Reality,1,"VIDEO In Mixed Reality News June 19, 2020 – ...",426,"[video, Mixed Reality News, MedCognition, a Te...","[[-0.008169465698301792, -0.000316161662340164...",MedCognition delivers Mixed Reality military m...,2020-06-19 07:13:17,https://www.auganix.org/medcognition-delivers-...,"[(US Army Combat Veteran Physician, 0.54729724...","[(realistic patient simulation, 0.46624163), (...","[MedCognition, an augmented reality medical si...",en,"[DATE, GPE, ORG, GPE, ORG, MONEY, DATE, ORG, P..."
8,210705_news_437186,Mixed Reality,3,Der bekannte Analyst Ming-Chi Kuo vom taiwanis...,377,"[der bekannt Analyst Ming-Chi Kuo, taiwanisch ...","[[-0.015583396889269352, -0.005017408635467291...","Apples Augmented-Reality-Pläne: Helm, Brille, ...",2021-03-08 08:23:00,https://www.heise.de/news/Apples-AR-Plaene-Hel...,"[(Augmented Reality, 0.31786597), (Apple, 0.30...","[(Mixed-Reality-Headset, 0.72614324), (Augment...","[auch Augmented-Reality-Erfahrunge, Ultimative...",de,"[ORG, LOC, ORG, MISC, ORG, MISC, LOC, PER, LOC..."
9,210705_news_489243,Mixed Reality,2,Apple hat seine Investition in einen auf Laser...,259,"[Apple, sein Investition, Laser-Technik, mehr,...","[[-0.012443368323147297, 0.0017926818691194057...","Apple pumpt mehr Geld in Laser-Technik ""made i...",2021-05-05 15:08:00,https://www.heise.de/news/Apple-pumpt-mehr-Gel...,"[(Laser-Technik, 0.366076), (apple Autoprojekt...","[(Augmented-Reality-App, 0.4578149), (2022 erw...","[2022 erwartetes, der Technik, der Gesichtserk...",de,"[ORG, LOC, LOC, ORG, ORG, ORG, ORG, LOC, MISC,..."


In [177]:
cluster_data_df_vs[columns]

Unnamed: 0,cluster_id,topic,doc_cnt,relevance_score,label_1,label_2,label_3,label_4,product_cnt,person_cnt,law_cnt
0,0,peripheral color perception,3,1,1,1,0,1,0.333333,10.333333,0.0
1,1,VR display,3,2,1,1,1,0,0.333333,4.0,0.0
2,2,ein 3D-Visualisierung,7,-1,2,1,2,2,0.285714,2.142857,0.0
3,3,Natural Scenes,2,1,1,0,1,0,0.5,5.5,0.0
4,4,Gehirn,3,-1,0,1,2,0,0.0,2.0,0.0
5,5,digital Einzigartigkeit,5,-7,0,1,0,4,0.0,0.0,0.0
6,6,Simulatoren,1,1,0,1,0,0,0.0,0.0,0.0
7,7,mehrere Roboter,6,-2,1,1,3,1,0.0,0.0,0.0
8,8,der Projekt,9,-12,0,1,3,5,0.0,1.0,0.666667
9,9,video,3,0,1,0,2,0,0.333333,5.333333,0.0


In [171]:
get_elimination_cluster_cnt(cluster_data_df_vs)

(6, 19, 1, 10)

In [155]:
cluster_data_df_mr[columns].describe()

Unnamed: 0,cluster_id,doc_cnt,relevance_score,label_1,label_2,label_3,label_4,product_cnt,person_cnt,law_cnt
count,14.0,14.0,14.0,14.0,14.0,14.0,14.0,14.0,14.0,14.0
mean,6.5,6.0,-0.214286,1.428571,1.285714,2.214286,1.071429,0.588876,6.175479,0.256439
std,4.1833,2.601775,4.209487,1.2225,1.325987,1.476929,1.071612,1.204264,11.957039,0.29681
min,0.0,2.0,-6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.25,5.0,-3.0,0.25,0.0,1.25,0.0,0.025,1.1,0.0
50%,6.5,5.5,-1.5,1.5,1.5,2.0,1.0,0.225,3.05,0.190909
75%,9.75,7.75,2.75,2.0,2.0,2.75,1.75,0.534091,5.334821,0.35625
max,13.0,11.0,7.0,4.0,4.0,6.0,3.0,4.666667,46.666667,1.0


In [77]:
#  cluster_data_df_mr.boxplot(column=['product_cnt', 'person_cnt', 'law_cnt'])

In [156]:
cluster_data_df_mr[['doc_cnt', 'relevance_score','label_1','label_2','label_3','label_4', 'product_cnt', 'person_cnt', 'law_cnt']].corr()

Unnamed: 0,doc_cnt,relevance_score,label_1,label_2,label_3,label_4,product_cnt,person_cnt,law_cnt
doc_cnt,1.0,0.344155,0.701352,0.53513,0.740678,-0.05518,-0.30604,-0.288658,0.045731
relevance_score,0.344155,1.0,0.661976,0.714656,-0.05391,-0.729607,-0.281333,-0.371604,-0.512523
label_1,0.701352,0.661976,1.0,0.298279,0.24345,-0.142601,-0.294241,-0.330188,0.024412
label_2,0.53513,0.714656,0.298279,1.0,0.241284,-0.610955,-0.155829,-0.219576,-0.466567
label_3,0.740678,-0.05391,0.24345,0.241284,1.0,-0.156223,-0.48617,-0.391508,-0.123831
label_4,-0.05518,-0.729607,-0.142601,-0.610955,-0.156223,1.0,0.455509,0.487131,0.831166
product_cnt,-0.30604,-0.281333,-0.294241,-0.155829,-0.48617,0.455509,1.0,0.953579,0.627961
person_cnt,-0.288658,-0.371604,-0.330188,-0.219576,-0.391508,0.487131,0.953579,1.0,0.676839
law_cnt,0.045731,-0.512523,0.024412,-0.466567,-0.123831,0.831166,0.627961,0.676839,1.0


In [157]:
def get_elimination_pool(df, column_name, quantile):
    
    df = df[df[column_name] > quantile]
    
    elimination_pool = []
    for page_id in df.page_id_list.values:
        elimination_pool.extend(page_id)
    
    return list(set(elimination_pool))

In [158]:
def get_retrieval_updates(percentile):
    
    law_cnt_75 = cluster_data_df_mr.law_cnt.quantile(percentile)
#     person_cnt_75 = cluster_data_df_mr.person_cnt.quantile(percentile)
    product_cnt_75 = cluster_data_df_mr.product_cnt.quantile(percentile)

    law_elimination_pool = get_elimination_pool(cluster_data_df_mr, 'law_cnt', law_cnt_75)
#     person_elimination_pool = get_elimination_pool(cluster_data_df_mr, 'person_cnt', person_cnt_75)
    product_elimination_pool = get_elimination_pool(cluster_data_df_mr, 'product_cnt', product_cnt_75)

    elimination_pool = list(set(law_elimination_pool) & set(product_elimination_pool))

    elm_df = mr_df[mr_df['page_id'].isin(elimination_pool)]
    print(elm_df.label.value_counts())

In [159]:
for prnt in np.arange(0.1, 1, 0.02):
    
    prnt = round(prnt, 2)
    print(f'\n{prnt} ==== ')
    get_retrieval_updates(prnt)


0.1 ==== 
2    7
3    7
4    6
1    5
Name: label, dtype: int64

0.12 ==== 
2    7
3    7
4    6
1    5
Name: label, dtype: int64

0.14 ==== 
2    7
3    7
4    6
1    5
Name: label, dtype: int64

0.16 ==== 
2    7
3    7
4    6
1    5
Name: label, dtype: int64

0.18 ==== 
2    7
3    7
4    6
1    5
Name: label, dtype: int64

0.2 ==== 
2    7
3    7
4    6
1    5
Name: label, dtype: int64

0.22 ==== 
2    7
3    7
4    6
1    5
Name: label, dtype: int64

0.24 ==== 
2    7
3    7
4    6
1    5
Name: label, dtype: int64

0.26 ==== 
2    7
3    7
4    6
1    5
Name: label, dtype: int64

0.28 ==== 
2    7
3    7
4    6
1    5
Name: label, dtype: int64

0.3 ==== 
2    7
3    7
4    6
1    5
Name: label, dtype: int64

0.32 ==== 
2    7
3    7
4    6
1    5
Name: label, dtype: int64

0.34 ==== 
2    7
3    7
4    6
1    5
Name: label, dtype: int64

0.36 ==== 
2    7
3    7
4    6
1    5
Name: label, dtype: int64

0.38 ==== 
2    7
3    7
4    6
1    5
Name: label, dtype: int64

0.4 ==== 
2 