# Co-relation analysis with arXiv abstracts

In [50]:
import sqlite3
import os
import pandas as pd
import numpy as np
import math

import json

import pickle
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import ndcg_score

import warnings
warnings.filterwarnings('ignore')

In [147]:
query = 'robotik'
query_updated = query.lower().replace(' ', '_')

with open(f'search_results_index/{query_updated}_bm25_result.json', 'r') as f:
    bm25_ranking = json.load(f)

with open(f'search_results_index/{query_updated}_semantic_result.json', 'r') as f:
    semantic_ranking = json.load(f)

dataframe_folder_path = f'dataframes/{query.lower()}/'
results_folder_path = f'results/{query.lower()}/'

final_df_filepath = dataframe_folder_path+'final_df.pkl'
arxiv_query_df_filepath = dataframe_folder_path+'arxiv_query_tf_df.pkl'

ndcg_results_filepath = results_folder_path+f'{query_updated}_ndcg_results.csv'
correlation_results_filepath = results_folder_path+f'{query_updated}_correlation_results.csv'

if not os.path.isdir(results_folder_path):
    os.mkdir(results_folder_path)
    
ndcg_results = []
correal_results = []

In [148]:
final_df = pd.read_pickle(final_df_filepath)
arxiv_query_df = pd.read_pickle(arxiv_query_df_filepath)

In [149]:
def transform_gt_rank(rank):
    
    if rank == 1:
        return 3
    elif rank == 2:
        return 2
    elif rank == 3:
        return 1
    elif rank == 4:
        return 0
    
def transform_gt_rank_new(rank):
    
    if rank == 3:
        return 3
    elif rank == 2:
        return 2
    elif rank == 0:
        return 1
    
def get_rank_from_rank_df(idx):
    return arxiv_query_df[arxiv_query_df['id'] == idx]['gt_rank'].values[0]

In [150]:
final_df['gt_rank'] = final_df.apply(lambda x:transform_gt_rank(x['gt_rank']), axis=1)

In [151]:
final_df['gt_rank'].value_counts()

2    13
3     9
1     7
Name: gt_rank, dtype: int64

In [152]:
gt_rank_df = final_df[['id', 'gt_rank']].sort_values('gt_rank', ascending=False)
gt_rank_df.head(2)

Unnamed: 0,id,gt_rank
14,210705_news_324381,3
8,210705_news_230374,3


# Performance evaluation using nDCG

1. BM25
2. Semantic search

In [153]:
bm25_doc_list = []
semantic_doc_list = []

for key, item in bm25_ranking.items():
    bm25_doc_list.append(item)

for key, item in semantic_ranking.items():
    semantic_doc_list.append(item)

In [154]:
gt_rank_bm25_df = gt_rank_df[gt_rank_df['id'].isin(bm25_doc_list)]
gt_rank_semantic_df = gt_rank_df[gt_rank_df['id'].isin(semantic_doc_list)]

In [155]:
def get_gt_ranking(df):
    return df.gt_rank.values

def get_gt_rank_for_id(idx):
    return gt_rank_df[gt_rank_df['id'] == idx]['gt_rank'].values[0]

def get_bm25_semantic_relevance():
    
    bm25_relevance = []
    semantic_relevance = []
    
    for key, item in bm25_ranking.items():
        bm25_relevance.append(get_gt_rank_for_id(item))
        
    for key, item in semantic_ranking.items():
        semantic_relevance.append(get_gt_rank_for_id(item))
        
    return np.asarray([bm25_relevance]), np.asarray([semantic_relevance])

def get_ndcg_scores(true_ranking, predicted_ranking):
    
    ndcg_15 = ndcg_score(true_ranking, predicted_ranking, k=15)
    ndcg_10 = ndcg_score(true_ranking, predicted_ranking, k=10)
    ndcg_5 = ndcg_score(true_ranking, predicted_ranking, k=5)
    
    print(f'NDCG@15 -- {ndcg_15}')
    print(f'NDCG@10 -- {ndcg_10}')
    print(f'NDCG@5 -- {ndcg_5}')
    
    return (ndcg_15, ndcg_10, ndcg_5)

In [156]:
true_relevance_bm25 = np.asarray([get_gt_ranking(gt_rank_bm25_df)])
true_relevance_semantic = np.asarray([get_gt_ranking(gt_rank_semantic_df)])

bm25_relevance, semantc_relevance = get_bm25_semantic_relevance()

In [157]:
true_relevance_bm25

array([[3, 3, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1]], dtype=int64)

In [158]:
true_relevance_semantic

array([[3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 1, 1]], dtype=int64)

In [159]:
semantc_relevance

array([[3, 3, 3, 3, 2, 2, 3, 2, 1, 2, 3, 2, 2, 2, 1]], dtype=int64)

In [160]:
bm25_relevance

array([[1, 1, 1, 1, 3, 2, 2, 3, 2, 1, 3, 1, 2, 2, 2]], dtype=int64)

In [161]:
bm25_ndcg_results = get_ndcg_scores(true_relevance_bm25, bm25_relevance)
ndcg_results.append((query, 'BM-25', bm25_ndcg_results[0], bm25_ndcg_results[1], bm25_ndcg_results[2]))

NDCG@15 -- 0.8343759803568778
NDCG@10 -- 0.673731805029147
NDCG@5 -- 0.595158707838956


In [162]:
semantic_ndcg_results = get_ndcg_scores(true_relevance_semantic, semantc_relevance)
ndcg_results.append((query, 'Semantic', semantic_ndcg_results[0], semantic_ndcg_results[1], semantic_ndcg_results[2]))

NDCG@15 -- 0.9617381426458651
NDCG@10 -- 0.9253884151646965
NDCG@5 -- 0.8888888888888887


# Arxiv dataset loading

In [163]:
def query_docs(abstract):
    
    abstract = abstract.lower()
    if 'quantum technology' in abstract:
        return 1
    return 0

In [164]:
# arxiv_query_df['quant_label'] = arxiv_query_df.apply(lambda x:query_docs(x['abstract']), axis=1)
# arxiv_query_df = arxiv_query_df[arxiv_query_df['quant_label'] == 1]

In [165]:
# arxiv_query_df['lang'].value_counts()

In [166]:
# arxiv_query_df = arxiv_query_df[arxiv_query_df['abstract_len'] < 1000]
# arxiv_query_df = arxiv_query_df[arxiv_query_df['abstract_len'] > 500]

In [167]:
# arxiv_query_df['abstract_len'] = arxiv_query_df.apply(lambda x:len(x['abstract']), axis=1)
# arxiv_query_df['abstract_len'].hist()

In [168]:
# arxiv_query_df.abstract_len.describe()

# I. Does manual labels correlate with arxiv abstracts? (cosine sim)

1. Co-relation analyis between ranked result vectors and mean/max abstracts vectors
2. Co-relation analyis between ranked result **noun-chunk** vectors and mean/max abstracts **noun-chunk** vectors
3. Co-relation analyis between ranked result **keyword** vectors and mean/max abstracts **keyword** vectors
4. Co-relation analyis between ranked result **paragraph** vectors and abstracts **summarization** vectors

In [169]:
def get_modified_vectors(vec_data):
    
    new_data = []
    for val in vec_data:
        new_data.append(val)
    
    new_data = np.array(new_data).reshape(-1, 512)
    return new_data

def get_document_vec(text):
    
    return tf_model(text)['outputs'].numpy()[0].reshape(1, -1)

def get_pool_vec(doc_vec_list, pool):
    
    doc_vec_list = get_modified_vectors(doc_vec_list)
    if pool == 'mean':
        return np.nanmean(doc_vec_list, axis=0)
    elif pool == 'max':
        return np.nanmax(doc_vec_list, axis=0)
    
def get_cosine_sim(vec_1, vec_2):
    
    return cosine_similarity(vec_1.reshape(1, -1), vec_2.reshape(1, -1))[0][0]

In [170]:
mean_doc_vec = get_pool_vec(arxiv_query_df.doc_vec.values, pool='mean')
max_doc_vec = get_pool_vec(arxiv_query_df.doc_vec.values, pool='max')

In [171]:
final_df['mean_doc_sim'] = final_df.apply(lambda x:get_cosine_sim(x['doc_vec'], mean_doc_vec), axis=1)
final_df['max_doc_sim'] = final_df.apply(lambda x:get_cosine_sim(x['doc_vec'], max_doc_vec), axis=1)

In [172]:
mean_doc_cor = final_df['gt_rank'].corr(final_df['mean_doc_sim'], method='spearman')
mean_doc_cor

0.10951307978491295

In [173]:
max_doc_cor = final_df['gt_rank'].corr(final_df['max_doc_sim'], method='spearman')
max_doc_cor

-0.03914960340136985

In [174]:
final_df[['gt_rank', 'mean_doc_sim', 'max_doc_sim']].groupby('gt_rank').mean()

Unnamed: 0_level_0,mean_doc_sim,max_doc_sim
gt_rank,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.417297,0.101856
2,0.44097,0.102524
3,0.451104,0.098856


#### 2. Co-relation analyis between ranked result **noun-chunk** vectors and mean/max abstracts **noun-chunk** vectors

In [175]:
mean_doc_vec_nounchunk = get_pool_vec(arxiv_query_df.nounchunk_mean_vec.values, pool='mean')
max_doc_vec_nounchunk = get_pool_vec(arxiv_query_df.nounchunk_mean_vec.values, pool='max')

final_df['mean_sim_nounchunk'] = final_df.apply(lambda x:get_cosine_sim(x['nounchunk_mean_vec'], mean_doc_vec_nounchunk), axis=1)
final_df['max_sim_nounchunk'] = final_df.apply(lambda x:get_cosine_sim(x['nounchunk_mean_vec'], max_doc_vec_nounchunk), axis=1)

In [176]:
mean_nounchunk_cor = final_df['gt_rank'].corr(final_df['mean_sim_nounchunk'], method='spearman')
mean_nounchunk_cor

-0.12485549192869304

In [177]:
max_nounchunk_cor = final_df['gt_rank'].corr(final_df['max_sim_nounchunk'], method='spearman' )
max_nounchunk_cor

0.053962966850536816

In [178]:
final_df[['gt_rank', 'mean_sim_nounchunk', 'max_sim_nounchunk']].groupby('gt_rank').mean()

Unnamed: 0_level_0,mean_sim_nounchunk,max_sim_nounchunk
gt_rank,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.902752,0.510114
2,0.918923,0.526424
3,0.900972,0.51543


In [179]:
noun_chunk_scores_df = final_df[['id', 'mean_sim_nounchunk']]
noun_chunk_scores_df.sample(2)

Unnamed: 0,id,mean_sim_nounchunk
8,210705_news_230374,0.902723
12,210705_news_275343,0.932061


In [180]:
bm25_nounchunk_ranking_df = pd.concat([noun_chunk_scores_df.set_index('id'),gt_rank_bm25_df.set_index('id')], axis=1, join='inner').reset_index()
semantic_nounchunk_ranking_df = pd.concat([noun_chunk_scores_df.set_index('id'),gt_rank_semantic_df.set_index('id')], axis=1, join='inner').reset_index()

In [181]:
bm25_nounchunk_ranking_df = bm25_nounchunk_ranking_df.sort_values(['mean_sim_nounchunk'], ascending=False)
semantic_nounchunk_ranking_df = semantic_nounchunk_ranking_df.sort_values(['mean_sim_nounchunk'], ascending=False)

In [182]:
bm25_nc_relevance = np.asarray([get_gt_ranking(bm25_nounchunk_ranking_df)])
semantic_nc_relevance = np.asarray([get_gt_ranking(semantic_nounchunk_ranking_df)])

In [183]:
bm25_nc_ndcg_results = get_ndcg_scores(true_relevance_bm25, bm25_nc_relevance)
ndcg_results.append((query, 'BM-25 and Noun-chunk', bm25_nc_ndcg_results[0], bm25_nc_ndcg_results[1], bm25_nc_ndcg_results[2]))

NDCG@15 -- 0.826371681748925
NDCG@10 -- 0.7250510793674484
NDCG@5 -- 0.5745690876613313


In [184]:
semantic_nc_ndcg_results = get_ndcg_scores(true_relevance_semantic, semantic_nc_relevance)
ndcg_results.append((query, 'Semantic and Noun-chunk', semantic_nc_ndcg_results[0], semantic_nc_ndcg_results[1], semantic_nc_ndcg_results[2]))

NDCG@15 -- 0.8702134212763282
NDCG@10 -- 0.7602830880644119
NDCG@5 -- 0.611111111111111


#### 3. Co-relation analyis between ranked result **keyword** vectors and mean/max abstracts **keyword** vectors

In [185]:
mean_doc_vec_keyword = get_pool_vec(arxiv_query_df.keyword_mean_vec.values, pool='mean')
max_doc_vec_keyword = get_pool_vec(arxiv_query_df.keyword_mean_vec.values, pool='max')

final_df['mean_sim_keyword'] = final_df.apply(lambda x:get_cosine_sim(x['keyword_mean_vec'], mean_doc_vec_keyword), axis=1)
final_df['max_sim_keyword'] = final_df.apply(lambda x:get_cosine_sim(x['keyword_mean_vec'], max_doc_vec_keyword), axis=1)

In [186]:
mean_keyword_cor = final_df['gt_rank'].corr(final_df['mean_sim_keyword'], method='spearman')
mean_keyword_cor

-0.004232389556904848

In [187]:
max_keyword_cor = final_df['gt_rank'].corr(final_df['max_sim_keyword'], method='spearman')
max_keyword_cor

-0.09099637547345422

In [188]:
# final_df[['gt_rank', 'mean_sim_keyword', 'max_sim_keyword']].sort_values('gt_rank')

#### 4. Co-relation analyis between ranked result **paragraph** vectors and abstracts summarization vectors

In [189]:
mean_paragraph_cor = final_df['gt_rank'].corr(final_df['mean_sim_summ'], method='spearman')
mean_paragraph_cor

0.038620554706756745

In [190]:
# final_df.to_pickle('dataframes/quantum_technologie/final_df.pkl')
# quant_df.to_pickle('dataframes/quantum_technologie/quant_df.pkl')

In [191]:
ndcg_df = pd.DataFrame(ndcg_results, columns=['Query', 'method', 'ndcg@15', 'ndcg@10', 'ndcg@5'])
ndcg_df

Unnamed: 0,Query,method,ndcg@15,ndcg@10,ndcg@5
0,robotik,BM-25,0.834376,0.673732,0.595159
1,robotik,Semantic,0.961738,0.925388,0.888889
2,robotik,BM-25 and Noun-chunk,0.826372,0.725051,0.574569
3,robotik,Semantic and Noun-chunk,0.870213,0.760283,0.611111


In [192]:
ndcg_df.to_csv(ndcg_results_filepath, encoding='utf-8', index=False, sep='|')

In [193]:
correal_results = [(query, mean_doc_cor, max_doc_cor, mean_nounchunk_cor, max_nounchunk_cor, mean_keyword_cor, max_keyword_cor, mean_paragraph_cor)]
corr_df = pd.DataFrame(correal_results, columns=['Query', 'mean doc corr', 'max doc corr', 'mean nounchunk corr', 'max nounchunk corr', 'mean keyword corr', 'max keyword corr', 'mean paragraph corr'])
corr_df

Unnamed: 0,Query,mean doc corr,max doc corr,mean nounchunk corr,max nounchunk corr,mean keyword corr,max keyword corr,mean paragraph corr
0,robotik,0.109513,-0.03915,-0.124855,0.053963,-0.004232,-0.090996,0.038621


In [194]:
corr_df.to_csv(correlation_results_filepath, encoding='utf-8', index=False, sep='|')

## II. Can arxiv abstracts be used for knowledge set creation? (related keywords)