In [1]:
import pandas as pd
import os
import string
import numpy as np
import hdbscan
import itertools
import json

import tensorflow as tf
import tensorflow_text
import tensorflow_hub as hub

import warnings
warnings.filterwarnings('ignore')

In [28]:
# Load the text-embedding model from a directory located relative to the
# current working directory, so the notebook must be started from its own
# folder. (Presumably a Universal Sentence Encoder export, judging by the
# directory name -- TODO confirm.)
use_model_dir = os.getcwd()+ '/../../models/USE_model'
tf_model = hub.load(use_model_dir)

In [3]:
def read_document_data(filepath):
    """Load a JSON document from ``filepath`` on a best-effort basis.

    Parameters
    ----------
    filepath : str or path-like
        Location of the JSON file to read.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file. If the file cannot be
        opened or does not contain valid JSON, the problem is printed and an
        empty ``dict`` is returned so that callers can keep going.
    """
    try:
        # Pin the encoding: the platform default is not UTF-8 everywhere,
        # and the search data in this notebook contains non-ASCII text.
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file.
        # ValueError: malformed JSON (JSONDecodeError) or undecodable bytes
        # (UnicodeDecodeError) -- both are ValueError subclasses.
        print(f'Could not read {filepath!r}: {e}')
        return dict()

In [37]:
# Earlier input file, kept for reference:
# keywords_df = pd.read_pickle(os.getcwd()+'/final_keywords_dataframe.pkl')

# NOTE(review): unpickling executes arbitrary code, so this file must come
# from a trusted source.
keywords_pickle_path = os.getcwd()+'/../dataframes/final_keywords_dataframe_cdd.pkl'
keywords_df = pd.read_pickle(keywords_pickle_path)

# Disabled column rename, kept for reference. Without it the frame keeps the
# column name 'id' rather than 'page_id'.
# keywords_df = keywords_df.rename(columns={'id': 'page_id', 'nc_vec': 'mean_nc_vec', 'text_tokens': 'noun_chunks'})

In [None]:
def get_modified_vectors(vec_data, dim=512):
    """Stack a collection of vectors into a 2-D array with ``dim`` columns.

    Parameters
    ----------
    vec_data : iterable
        Vectors (or scalars) to stack. The iterable is materialised into a
        plain list first so that numpy builds the array from the individual
        elements, e.g. when ``vec_data`` is an object array holding one
        array per entry.
    dim : int, optional
        Width of each output row. Defaults to 512, the value that used to be
        hard-coded here.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(-1, dim)``.

    Raises
    ------
    ValueError
        If the total number of values is not a multiple of ``dim``.
    """
    return np.array(list(vec_data)).reshape(-1, dim)

def get_pool_vec(doc_vec_list, pool):
    """Pool a collection of vectors into a single vector.

    Parameters
    ----------
    doc_vec_list : iterable
        Vectors to pool; they are first stacked row-wise by
        ``get_modified_vectors``.
    pool : {'mean', 'max'}
        Pooling strategy, applied along axis 0 (across the stacked rows).
        NaN entries are ignored by both strategies.

    Returns
    -------
    numpy.ndarray
        The pooled vector.

    Raises
    ------
    ValueError
        If ``pool`` is neither ``'mean'`` nor ``'max'``. Previously such a
        value made the function fall through and return ``None``.
    """
    if pool not in ('mean', 'max'):
        raise ValueError(f"Unsupported pool {pool!r}; expected 'mean' or 'max'")

    doc_vec_list = get_modified_vectors(doc_vec_list)
    if pool == 'mean':
        return np.nanmean(doc_vec_list, axis=0)
    return np.nanmax(doc_vec_list, axis=0)

def get_document_vec(text):
    """Embed ``text`` with the global ``tf_model`` and return a single-row array.

    The model's ``'outputs'`` entry is converted to numpy, its first element
    is taken, and that element is reshaped to ``(1, -1)``.
    """
    model_result = tf_model(text)
    first_embedding = model_result['outputs'].numpy()[0]
    return first_embedding.reshape(1, -1)

def get_sent_transformers_keywords_use(keywords, query_vec, max_keyword_cnt = 30):
    """Rank a document's keywords by cosine similarity to a query vector.

    Parameters
    ----------
    keywords : iterable of (keyword, score) pairs
        Anything ``dict()`` accepts. Only the keyword part is used; duplicate
        keywords collapse to a single entry.
    query_vec : array-like
        Embedding of the query, comparable with the keyword embeddings
        produced by the global ``tf_model``.
    max_keyword_cnt : int, optional
        Maximum number of keywords to keep (the most similar ones).

    Returns
    -------
    list of (keyword, similarity) tuples
        Sorted by similarity, highest first. Empty when ``keywords`` is empty.
    """
    keywords = list(dict(keywords).keys())
    if not keywords:
        # Nothing to embed or rank.
        return []

    # One model call per keyword, keeping the first element of each output.
    candidate_embeddings_keywords = np.array(
        [tf_model(kw)['outputs'].numpy()[0] for kw in keywords]
    )
    query_vec = np.asarray(query_vec).reshape(-1)

    # Cosine similarity computed with numpy: `cosine_similarity` was never
    # imported in this notebook. Zero-length vectors get a norm of 1 so they
    # score 0 instead of producing NaN.
    keyword_norms = np.linalg.norm(candidate_embeddings_keywords, axis=1)
    keyword_norms[keyword_norms == 0] = 1.0
    query_norm = np.linalg.norm(query_vec)
    if query_norm == 0:
        query_norm = 1.0
    similarities = (candidate_embeddings_keywords @ query_vec) / (keyword_norms * query_norm)

    # argsort is ascending, so the tail holds the most similar keywords.
    top_indices = np.argsort(similarities)[-max_keyword_cnt:]
    subtopic_keywords = [(keywords[index], similarities[index]) for index in top_indices]

    return sorted(subtopic_keywords, key=lambda x: x[1], reverse=True)

def get_candidate_pool(subtopic_keywords_list, lower_limit=0.2, upper_limit=0.4):
    """Select keywords whose score lies strictly between two limits.

    Parameters
    ----------
    subtopic_keywords_list : iterable of (keyword, score) pairs
        Scored keywords to filter.
    lower_limit : float, optional
        Exclusive lower bound on the score. Defaults to 0.2.
    upper_limit : float, optional
        Exclusive upper bound on the score. Defaults to 0.4.

    Returns
    -------
    list
        Keywords with ``lower_limit < score < upper_limit``, in input order.
    """
    # The limits used to be assigned to locals and then ignored in favour of
    # duplicated literals; the comparison now actually uses them.
    return [
        key
        for key, value in subtopic_keywords_list
        if lower_limit < value < upper_limit
    ]

In [35]:
# Inspect which Python type the 'keywords' cell holds for one sample document.
sample_doc_mask = keywords_df['id'] == '210705_news_213812'
type(keywords_df[sample_doc_mask]['keywords'].values[0])

numpy.ndarray

In [38]:
# List the column names of the loaded keywords frame.
keywords_df.columns

Index(['id', 'text', 'text_len', 'lang', 'nc_vec', 'label', 'label_name',
       'title', 'pubDate', 'url', 'doc_repr_vec', 'text_tokens', 'keywords'],
      dtype='object')

In [29]:
# Show two randomly chosen rows as a sanity check. No random_state is
# passed, so a re-run will generally display different rows.
keywords_df.sample(2)

Unnamed: 0,id,text,text_len,lang,text_tokens,nc_vec,label,label_name,title,pubDate,url,doc_repr_vec,keywords,text_tokens.1,keywords.1
1861,210705_news_423869,Three workers died and three more were injured...,271,en,"[three worker, Saturday, repair work, a proces...","[[-0.004616789054125547, -0.012914112769067287...",1,technology,3 Dead in Russian Mining Giant's Arctic Accident,2021-02-21 08:05:19,https://www.themoscowtimes.com/2021/02/21/3-de...,"[-0.006567743, -0.020839494, 0.072888, 0.00717...","[(russian mining giant Norilsk Nickel, 0.64577...","[orereloading facility, Russian arctic, court,...","[(Russian arctic, 0.49613178), (Russias riches..."
2522,210705_news_213812,Photo: GomSpace The Norwegian Defence Research...,154,en,"[photo, GomSpace, FFI, GomSpace, a contract, a...","[[-0.012575079686939716, 0.0164578128606081, 0...",2,military,GomSpace to Develop Norwegian Military Satelli...,2020-06-11 18:37:31,https://www.satellitetoday.com/government-mili...,"[0.006033211, 0.029967265, 0.09030905, 0.02045...","[(a military communication nanosatellite, 0.49...","[polar LowEarth orbit, Arctic satellite relay,...","[(military communications nanosatellite, 0.499..."


In [9]:
# For every target query: embed the query, collect the documents returned by
# the BM25 and semantic searches, and score each document's keywords against
# the query.
#
# NOTE(review): `document_cnt_data` and `final_page_id_list` are initialised
# here but never filled in this cell, and `cdd_df` is rebound on every
# iteration, so only the last query's frame survives the loop.
document_cnt_data = []
final_page_id_list = []
target_queries = ['Schutz von unbemannten Systemen', 'Waffen Systeme', 'Defense', 'militärische Entscheidungsfindung', 'unbemannte Wirksysteme', 'Data Centric Warfare', 'Militärische Kommunikation', 'Unbemannte Landsysteme', 'Cyber Attack', 'Kryptologie', 'Quantentechnologie', 'Satellitenkommunikation', 'Big Data, KI für Analyse', 'IT-Standards', 'Edge computing', 'Mixed Reality', 'Architekturanalyse', 'Kommunikationsnetze', 'Methode Architektur', 'Robotik', 'Visualisierung', 'Wellenformen und -ausbreitung']
search_results_folder = os.getcwd() + '/../dataframes/search_results_index/'

# The frame loaded in this notebook names its identifier column 'id'; a
# renamed variant uses 'page_id'. Accept either so the cell runs on both.
page_id_col = 'page_id' if 'page_id' in keywords_df.columns else 'id'

for query in target_queries:
    # Result files are named after the lower-cased query with spaces
    # replaced by underscores.
    query_updated = query.lower().replace(' ', '_')
    query_vec = tf_model(query)['outputs'].numpy()[0]

    search_type = 'bm25'
    es_data = read_document_data(search_results_folder + f'{query_updated}_{search_type}_result.json')
    search_type = 'semantic'
    semantic_data = read_document_data(search_results_folder + f'{query_updated}_{search_type}_result.json')

    # Union of the document ids found by either search, without duplicates.
    cdd_page_id_list = list(set(list(es_data.values()) + list(semantic_data.values())))

    # .copy() so the column assignments below modify an independent frame
    # rather than a slice of keywords_df.
    cdd_df = keywords_df[keywords_df[page_id_col].isin(cdd_page_id_list)].copy()
    cdd_df['query'] = query

    # Series.apply (instead of DataFrame.apply with axis=1) also works when
    # no document matched, e.g. because a result file was missing.
    # The original passed an undefined `query_vec_1` here.
    cdd_df['keywords_use'] = cdd_df['keywords'].apply(
        lambda doc_keywords: get_sent_transformers_keywords_use(doc_keywords, query_vec, max_keyword_cnt = 25)
    )
    cdd_df['candidate_pool'] = cdd_df['keywords_use'].apply(get_candidate_pool)

    # NOTE(review): get_cdd_label is not defined in the visible part of this
    # notebook -- confirm it is available before running. This also
    # overwrites the 'label' column that keywords_df already provides.
    cdd_df['label'] = cdd_df[page_id_col].apply(get_cdd_label)
    
    
    

107
102
106
110
106
108
110
103
108
58
91
105
107
106
101
106
108
97
105
104
73
86


In [None]:
# Load the stored BM25 and semantic search results for the "mixed reality"
# query; paths are relative to the current working directory.
mr_bm25_path = os.getcwd()+'/../dataframes/search_results_index/mixed_reality_bm25_result.json'
mr_semantic_path = os.getcwd()+'/../dataframes/search_results_index/mixed_reality_semantic_result.json'

es_mr_data = read_document_data(mr_bm25_path)
ss_mr_data = read_document_data(mr_semantic_path)