### I. Global feature analysis -- query is unknown

     1. Document length, no.of tokens, no.of sentences
     2. Title length, no. of tokens
     3. Pos-tagging distribution -- only NC tested
     4. Entity types and distribution

### II. Local feature analysis -- query is known

     1. Sub-topic distribution in each class
     
### Assumptions:

     1. Best label in case of ambiguity
     2. Spearman correlation
         Ordinal vs Continuous

## I. Global feature analysis

In [48]:
# _importing required libraries

import pandas as pd
import os
import string
import numpy as np
import hdbscan

import tensorflow as tf
import tensorflow_text
import tensorflow_hub as hub

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import spacy

from sklearn.metrics.pairwise import cosine_similarity

import texthero as hero
from texthero import preprocessing

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline

import umap

import warnings
warnings.filterwarnings('ignore')

In [97]:
tf_model = hub.load(os.getcwd()+ '/../../models/USE_model')

In [25]:
final_df = pd.read_pickle(os.getcwd()+'/../dataframes/retrieval_dataset.pkl')
final_df.head(2)

Unnamed: 0,page_id,query,label,text,text_len,noun_chunks,mean_nc_vec,title,published_date,source_url
0,210705_news_167749,Methode Architektur,3,"ROME, N.Y. – U.S. Air Force researchers are a...",467,"[N.Y., industry help, big improvement, small, ...","[[-0.010767139494419098, -0.001241824007593095...",SWaP embedded computing artificial intelligenc...,2020-04-27 07:30:41,https://www.militaryaerospace.com/computers/ar...
1,210705_news_499712,Methode Architektur,3,A recently published 156-page paper from a tea...,758,"[a recently publish 156-page paper, a team, Im...","[[-0.008159193210303783, -0.001107345684431493...",An Erlangen Programme to Establish the Geometr...,2021-05-17 11:45:17,https://syncedreview.com/2021/05/05/deepmind-p...


In [49]:
df_xlm = pd.read_pickle(os.getcwd()+'/final_dataframe.pkl')
df_xlm = df_xlm[['id', 'lang']]
df_xlm = df_xlm.rename(columns={'id': 'page_id'})

In [51]:
final_df = pd.concat([final_df.set_index('page_id'), df_xlm.set_index('page_id')], axis=1, join='inner').reset_index()

In [26]:
duplicated_page_ids = set(list(final_df[final_df['page_id'].duplicated() == True].page_id.values))

for page_id in duplicated_page_ids:
    final_df.loc[final_df['page_id'] == page_id, 'label'] = min(final_df[final_df['page_id'] == page_id].label.values)

In [33]:
final_df = final_df.rename(columns={'text_len': 'num_tokens'})
final_df = final_df.drop_duplicates(subset=['page_id'])

In [42]:
doc_df = final_df[['page_id', 'label', 'text', 'num_tokens', 'noun_chunks']]
doc_df['doc_len'] = doc_df.apply(lambda x:len(x['text']),axis=1)
doc_df['num_sents'] = doc_df.apply(lambda x:len(sent_tokenize(x['text'])),axis=1)
doc_df['nc_cnt'] = doc_df.apply(lambda x:len(x['noun_chunks']),axis=1)

In [84]:
title_df = final_df[['page_id', 'label', 'title']]
title_df['title_len'] = title_df.apply(lambda x:len(x['title']),axis=1)
title_df['title_num_tokens'] = title_df.apply(lambda x:len(x['title'].split()),axis=1)

In [85]:
doc_title_df = pd.concat([doc_df.set_index('page_id'), title_df.set_index('page_id')], axis=1, join='inner').reset_index()
column_names = doc_title_df.columns.values
column_names[1] = 'label_duplicate'
doc_title_df.columns = column_names

doc_title_df.drop('label_duplicate', axis=1, inplace=True)

## Average document and title feature analysis

In [86]:
doc_title_df = doc_title_df.groupby(['label']).agg({'doc_len': 'mean',
'num_tokens': 'mean',
'num_sents': 'mean',
'nc_cnt': 'mean',
'title_len': 'mean',
'title_num_tokens': 'mean',})
doc_title_df

Unnamed: 0_level_0,doc_len,num_tokens,num_sents,nc_cnt,title_len,title_num_tokens
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,4893.706667,695.626667,31.306667,198.493333,59.64,7.893333
2,5288.601562,741.601562,34.671875,208.460938,59.898438,7.789062
3,6098.787072,863.692015,38.722433,239.403042,65.965779,8.714829
4,6310.435294,882.035294,40.294118,250.729412,66.858824,9.058824


In [46]:
doc_len_sim = doc_df["label"].corr(doc_df["doc_len"], method='spearman')
doc_token_sim = doc_df["label"].corr(doc_df["num_tokens"], method='spearman')
doc_sent_sim = doc_df["label"].corr(doc_df["num_sents"], method='spearman')
doc_nc_sim = doc_df["label"].corr(doc_df["nc_cnt"], method='spearman')

title_len_sim = title_df["label"].corr(title_df["title_len"], method='spearman')
title_token_sim = title_df["label"].corr(title_df["title_num_tokens"], method='spearman')

In [47]:
print(f'Document length correlation score: {doc_len_sim}')
print(f'Document token correlation score: {doc_token_sim}')
print(f'Document sentence correlation score: {doc_sent_sim}')
print(f'Document noun-chunks correlation score: {doc_nc_sim}\n')

print(f'Title length correlation score: {title_len_sim}')
print(f'Title token correlation score: {title_token_sim}')

Document length correlation score: 0.021493587282691552
Document token correlation score: 0.02595750949939813
Document sentence correlation score: -0.010399220857056942
Document noun-chunks correlation score: 0.023655502592754032

Title length correlation score: 0.09400850528936555
Title token correlation score: 0.10060472213293657


In [55]:
nlp_de = spacy.load("de_core_news_sm")
nlp_en = spacy.load("en_core_web_sm")

In [61]:
def get_document_entities(text, lang):
    
    doc = None

    try:
        if len(text) >= 999999:
            return None

        if lang == 'en':
            doc = nlp_en(text)
        elif lang == 'de':
            doc = nlp_de(text)
        else:
            return None
        
        entities = []
        
        for ent in doc.ents:
            entities.append(ent.label_) 
    except Exception as e:
        return None
            
    return entities

In [62]:
final_df['entities'] = final_df.apply(lambda x:get_document_entities(x['text'], x['lang']),axis=1) 

In [65]:
unique_entities = []

for entities in final_df.entities.values:
    unique_entities.extend(set(entities))
    
print(set(unique_entities))

{'CARDINAL', 'PERCENT', 'LOC', 'MISC', 'PERSON', 'TIME', 'WORK_OF_ART', 'EVENT', 'PER', 'NORP', 'ORDINAL', 'LANGUAGE', 'LAW', 'GPE', 'PRODUCT', 'DATE', 'QUANTITY', 'ORG', 'MONEY', 'FAC'}


#### 4. Entity analysis

##### Important entities

    1. LOC - Location
    2. PERSON - People
    3. DATE - date
    4. EVENT - Named wars, events, sports, hurricanes, 
    5. NORP - Nationalities, or Religious or Political groups
    6. LAW - law related
    7. GPE - Countries, states, cities
    8. PRODUCT - Objects
    9. ORG - Organisations, companies, institutes

In [67]:
entity_data_list = []

for idx, row in final_df.iterrows():
    
    loc = person = date = event = norp = law = gpe = product = org = None
    entities = row['entities']
    
    entity_data_list.append(
    {
        'id': row['page_id'],
        'label': row['label'],
        'LOC': entities.count('LOC'),
        'PERSON': entities.count('PERSON'),
        'DATE': entities.count('DATE'),
        'EVENT': entities.count('EVENT'),
        'NORP': entities.count('NORP'),
        'LAW': entities.count('LAW'),
        'GPE': entities.count('GPE'),
        'PRODUCT': entities.count('PRODUCT'),
        'ORG': entities.count('ORG'),
    })

In [70]:
entity_df = pd.DataFrame(entity_data_list)
entity_df['dummy'] = 1

## Average entity feature analysis

In [77]:
group_df = entity_df.groupby(['label']).agg({'LOC': 'mean',
'PERSON': 'mean',
'DATE': 'mean',
'EVENT': 'mean',
'NORP': 'mean',
'LAW': 'mean',
'GPE': 'mean',
'PRODUCT': 'mean',
'ORG': 'mean'})
group_df  # _person und product

Unnamed: 0_level_0,LOC,PERSON,DATE,EVENT,NORP,LAW,GPE,PRODUCT,ORG
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,3.413333,4.173333,2.013333,0.026667,1.026667,0.053333,4.546667,0.373333,16.333333
2,4.273438,3.65625,2.8125,0.109375,0.375,0.015625,2.015625,0.398438,18.054688
3,5.091255,4.69962,5.486692,0.277567,1.798479,0.155894,5.642586,0.551331,21.821293
4,4.211765,8.011765,4.035294,0.141176,1.729412,0.211765,6.376471,1.011765,23.764706


In [79]:
loc_entity_sim = entity_df["label"].corr(entity_df["LOC"], method='spearman')
person_entity_sim = entity_df["label"].corr(entity_df["PERSON"], method='spearman')
date_entity_sim = entity_df["label"].corr(entity_df["DATE"], method='spearman')
event_entity_sim = entity_df["label"].corr(entity_df["EVENT"], method='spearman')
norp_entity_sim = entity_df["label"].corr(entity_df["NORP"], method='spearman')
law_entity_sim = entity_df["label"].corr(entity_df["LAW"], method='spearman')
gpe_entity_sim = entity_df["label"].corr(entity_df["GPE"], method='spearman')
product_entity_sim = entity_df["label"].corr(entity_df["PRODUCT"], method='spearman')
org_entity_sim = entity_df["label"].corr(entity_df["ORG"], method='spearman')

In [80]:
print(f'Entity LOC correlation score: {loc_entity_sim}')
print(f'Entity PERSON correlation score: {person_entity_sim}')
print(f'Entity DATE correlation score: {date_entity_sim}')
print(f'Entity EVENT correlation score: {event_entity_sim}')
print(f'Entity NORP correlation score: {norp_entity_sim}')
print(f'Entity LAW correlation score: {law_entity_sim}')
print(f'Entity GPE correlation score: {gpe_entity_sim}')
print(f'Entity PRODUCT correlation score: {product_entity_sim}')
print(f'Entity ORG correlation score: {org_entity_sim}')

Entity LOC correlation score: -0.020544408137479313
Entity PERSON correlation score: 0.06299336879161169
Entity DATE correlation score: 0.11564571552662409
Entity EVENT correlation score: 0.08821053385550824
Entity NORP correlation score: 0.04181442232350223
Entity LAW correlation score: 0.12179940407770956
Entity GPE correlation score: 0.04873733273720441
Entity PRODUCT correlation score: -0.016908961987197324
Entity ORG correlation score: 0.03417995528927692


## II. Local feature analysis

In [91]:
keywords_df = pd.read_pickle(os.getcwd()+'/final_keywords_dataframe.pkl')
keywords_df = keywords_df[['id', 'keywords']]
keywords_df = keywords_df.rename(columns={'id': 'page_id'})

In [88]:
main_df = pd.read_pickle(os.getcwd()+'/../dataframes/retrieval_dataset.pkl')

In [119]:
mr_df = main_df[main_df['query'] == 'Mixed Reality']
vs_df = main_df[main_df['query'] == 'Visualisierung']

In [120]:
mr_df = pd.concat([mr_df.set_index('page_id'), keywords_df.set_index('page_id')], axis=1, join='inner').reset_index()
vs_df = pd.concat([vs_df.set_index('page_id'), keywords_df.set_index('page_id')], axis=1, join='inner').reset_index()

### 1. keyword extraction
### 2. Candidate pool extraction

In [99]:
def get_modified_vectors(vec_data):
    
    new_data = []
    for val in vec_data:
        new_data.append(val)
    
    new_data = np.array(new_data).reshape(-1, 512)
    return new_data

def get_pool_vec(doc_vec_list, pool):
    
    doc_vec_list = get_modified_vectors(doc_vec_list)
    if pool == 'mean':
        return np.nanmean(doc_vec_list, axis=0)
    elif pool == 'max':
        return np.nanmax(doc_vec_list, axis=0)

def get_document_vec(text):
    
    return tf_model(text)['outputs'].numpy()[0].reshape(1, -1)

def get_sent_transformers_keywords_use(keywords, query_vec, max_keyword_cnt = 30):
    
    keywords = list(dict(keywords).keys())
    
    candidate_embeddings_keywords = [tf_model(kw)['outputs'].numpy()[0] for kw in keywords]
        
    query_distances = cosine_similarity([query_vec], candidate_embeddings_keywords)
    subtopic_keywords_dict = dict()
    for index in query_distances.argsort()[0][-max_keyword_cnt:]: 
        
        subtopic_keywords_dict[keywords[index]] = query_distances[0][index]
    
    subtopic_keywords_dict = sorted(subtopic_keywords_dict.items(), key=lambda x: x[1], reverse=True)

    return subtopic_keywords_dict

def get_candidate_pool(subtopic_keywords_list):
    
    candidate_pool = []
    
    lower_limit = 0.2
    upper_limit = 0.4
    
    for key, value in subtopic_keywords_list:
        
        if value > 0.2 and value < 0.4:
            candidate_pool.append(key)
            
    return candidate_pool

In [98]:
query_1 = 'Mixed Reality'
query_vec_1 = tf_model(query_1)['outputs'].numpy()[0]

query_2 = 'Visualisierung'
query_vec_2 = tf_model(query_2)['outputs'].numpy()[0]

In [121]:
%timeit mr_df['keywords_use'] = mr_df.apply(lambda x:get_sent_transformers_keywords_use(x['keywords'], query_vec_1, max_keyword_cnt = 30), axis=1)

31.5 s ± 443 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [101]:
%timeit vs_df['keywords_use'] = vs_df.apply(lambda x:get_sent_transformers_keywords_use(x['keywords'], query_vec_1, max_keyword_cnt = 30), axis=1)

32.4 s ± 680 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [122]:
%timeit mr_df['candidate_pool'] = mr_df.apply(lambda x:get_candidate_pool(x['keywords_use']), axis=1)

3.05 ms ± 55 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [103]:
%timeit vs_df['candidate_pool'] = vs_df.apply(lambda x:get_candidate_pool(x['keywords_use']), axis=1)

3.02 ms ± 43.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [123]:
mr_df.to_pickle(os.getcwd()+'/../dataframes/retrieval_dataset_mr.pkl')
vs_df.to_pickle(os.getcwd()+'/../dataframes/retrieval_dataset_vs.pkl')

In [125]:
mr_df = pd.read_pickle(os.getcwd()+'/../dataframes/retrieval_dataset_mr.pkl')
vs_df = pd.read_pickle(os.getcwd()+'/../dataframes/retrieval_dataset_vs.pkl')

### 3. Clustering

In [126]:
def get_umap_output(vec_array, dim_size=5):
    
    umap_obj = umap.UMAP(n_neighbors=40, 
                        n_components=dim_size, 
                        min_dist=0.01,
                        metric='cosine',
                        random_state=123).fit(vec_array) 
    
    umap_output = umap_obj.transform(vec_array) 
    return umap_output, umap_obj

def get_hdbscan_output(data_points, cluster_size=7):
    
    hdbscan_output = hdbscan.HDBSCAN(
        #min_cluster_size=cluster_size,
#                                       min_samples=2,
                                      metric='euclidean',
                                     cluster_selection_method='eom').fit(data_points)
    return hdbscan_output

def project_on_2Dplane(umap_output, cluster_ids):
    
    umap_df = pd.DataFrame(np.column_stack((umap_output, cluster_ids)), columns=['x', 'y', 'cluster ids'])
    grid = sns.FacetGrid(umap_df, hue='cluster ids', height=7)
    grid.map(plt.scatter, 'x', 'y').add_legend()
    
def get_clustering_analysis(cluster_df, final_candidate_pool_vecs, dimen_size=5, cluster_size=7):
    
    umap_output_5, umap_5 = get_umap_output(final_candidate_pool_vecs, dim_size=dimen_size)
    hdbscan_output = get_hdbscan_output(umap_output_5, cluster_size=cluster_size)
    
    cluster_df['cluster_id'] = hdbscan_output.labels_
    cluster_df.cluster_id.hist(bins=150)
    
    umap_output_2, umap_2 = get_umap_output(final_candidate_pool_vecs, dim_size=2)
    project_on_2Dplane(umap_output_2, cluster_df['cluster_id'])
    
    return cluster_df

def get_nearest_keyword(keywords, keyword_vecs, mean_vec):
    
    query_distances = cosine_similarity([mean_vec], list(keyword_vecs))
    subtopic_keywords_dict = dict()
    for index in query_distances.argsort()[0]: 
        
        subtopic_keywords_dict[keywords[index]] = query_distances[0][index]
    
    subtopic_keywords_dict = sorted(subtopic_keywords_dict.items(), key=lambda x: x[1], reverse=True)
    return subtopic_keywords_dict[0][0]

def get_topics(cluster_data_df, candidate_pool):
    
    topic_list = []
    
    for idx, row in cluster_data_df.iterrows():
        
        candidate_words = row['candidate_words']
        topic = row['topic']
        
        for cp in candidate_pool:
            if cp in candidate_words:
                topic_list.append(topic)
                break
    
    return topic_list

def get_sub_topic_modelling(query_df):
    
    final_candidate_pool = []

    for idx, row in query_df.iterrows():
        final_candidate_pool.extend(row['candidate_pool'])
        
    final_candidate_pool = list(set(final_candidate_pool))

    final_candidate_pool_vecs = [tf_model(nc)['outputs'].numpy()[0] for nc in final_candidate_pool]

    df_data = []
    for word, vec in zip(final_candidate_pool, final_candidate_pool_vecs):
        df_data.append((word, vec))

    cluster_df = pd.DataFrame(df_data, columns= ['candidate_words', 'candidate_vecs'])
    cluster_df = get_clustering_analysis(cluster_df, final_candidate_pool_vecs, dimen_size=5, cluster_size=8)

    cluster_data = []

    for cluster_id in set(cluster_df.cluster_id.values):

        if cluster_id != -1:
            df = cluster_df[cluster_df['cluster_id'] == cluster_id]
            cluster_data.append((cluster_id, df.candidate_words.values, df.candidate_vecs.values))

    cluster_data_df = pd.DataFrame(cluster_data, columns=['cluster_id', 'candidate_words', 'candidate_vecs'])
    cluster_data_df['mean_vec'] = cluster_data_df.apply(lambda x:get_pool_vec(x['candidate_vecs'], 'mean'), axis=1)
    cluster_data_df['topic'] = cluster_data_df.apply(lambda x:get_nearest_keyword(x['candidate_words'], x['candidate_vecs'], x['mean_vec']), axis=1)
    
    query_df['topics'] = query_df.apply(lambda x:get_topics(cluster_data_df, x['candidate_pool']), axis=1)

    return query_df

In [131]:
mr_df = get_sub_topic_modelling(mr_df)
vs_df = get_sub_topic_modelling(vs_df)

In [132]:
mr_df['topics_cnt'] = mr_df.apply(lambda x:len(x['topics']), axis=1)
vs_df['topics_cnt'] = vs_df.apply(lambda x:len(x['topics']), axis=1)

In [133]:
mr_topic_sim = mr_df["label"].corr(mr_df["topics_cnt"], method='spearman')
vs_topic_sim = vs_df["label"].corr(vs_df["topics_cnt"], method='spearman')

## Average sub-topic feature analysis

In [135]:
mr_group_df = mr_df.groupby(['label']).agg({'topics_cnt': 'mean'})
mr_group_df

Unnamed: 0_level_0,topics_cnt
label,Unnamed: 1_level_1
1,4.0
2,2.2
3,3.75
4,2.333333


In [136]:
vs_group_df = vs_df.groupby(['label']).agg({'topics_cnt': 'mean'})
vs_group_df

Unnamed: 0_level_0,topics_cnt
label,Unnamed: 1_level_1
1,1.4
2,1.5
3,1.1
4,1.090909


In [134]:
print(f'Sub-topics Mixed Reality correlation score: {mr_topic_sim}')
print(f'Sub-topics Visualisierung correlation score: {vs_topic_sim}')

Sub-topics Mixed Reality correlation score: -0.08978520557597833
Sub-topics Visualisierung correlation score: -0.27062314559150014


In [None]:
# _which clusters are useful for identifying positive clusters 
# _which clusters are useful for identifying negative clusters