In [None]:
import bibtexparser
import itertools
import numpy as np
import pandas as pd
import scipy
import scipy.cluster.hierarchy as shc
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio

In [None]:
# Global configuration shared by the cells below.
#
# EPOCH selects the publication-year window that is analysed:
#   Epoch 0 = 1993 - 2003
#   Epoch 1 = 2004 - 2013
#   Epoch 2 = 2014 - 2023
#   Epoch 3 = 1993 - 2023 (the whole corpus)
# The same value also picks the clustering threshold, the output filenames and
# the figure title further down.
EPOCH = 3
# When True, the CSV / LaTeX / PNG outputs are written to disk.
WRITE_FILES = False
# When True, the strategic diagram is displayed with fig.show().
SHOW_FIGURES = True

In [None]:
def get_articles_by_year(year_range, article_list):
    """Return the articles whose ``year`` attribute is contained in ``year_range``.

    The input order is preserved and a new list is always returned.
    """
    return [candidate for candidate in article_list if candidate.year in year_range]

def count_keywords_in_articles(article_list):
    """Tally how often each keyword index occurs over ``article_list``.

    Each article contributes one count for every entry of its
    ``keyword_indices``. Returns a dict mapping keyword index -> count.
    """
    tally = {}
    for entry in article_list:
        for kw_index in entry.keyword_indices:
            tally[kw_index] = tally.get(kw_index, 0) + 1
    return tally

def parse_authors(authors):
    """Split an author string on the literal separator ``' and '`` and return the pieces."""
    separator = ' and '
    names = authors.split(separator)
    return names

def parse_keywords(keywords):
    """Split a keyword string and normalise each keyword.

    The string is split on ';' when it contains one, otherwise on ','.
    Each piece is stripped and lower-cased; every space then becomes a
    double underscore and every hyphen a single underscore.
    """
    delimiter = ';' if ';' in keywords else ','
    normalised = []
    for piece in keywords.split(delimiter):
        token = piece.strip().lower()
        token = token.replace(' ', '__')
        token = token.replace('-', '_')
        normalised.append(token)
    return normalised

def association_strength(i, j, C):
    """Return ``C[i][j]`` divided by the product of the diagonal entries ``C[i][i]`` and ``C[j][j]``."""
    joint = C[i][j]
    diagonal_product = C[i][i] * C[j][j]
    return joint / diagonal_product

def cosine_index(i, j, C):
    """Return ``C[i][j]`` divided by the square root of ``C[i][i] * C[j][j]``."""
    joint = C[i][j]
    geometric_norm = (C[i][i] * C[j][j]) ** 0.5
    return joint / geometric_norm

def inclusion_index(i, j, C):
    """Return ``C[i][j]`` divided by the smaller of the diagonal entries ``C[i][i]`` and ``C[j][j]``."""
    joint = C[i][j]
    smaller_total = min(C[i][i], C[j][j])
    return joint / smaller_total

def jaccard_index(i, j, C):
    """Return ``C[i][j] / (C[i][i] + C[j][j] - C[i][j])``."""
    joint = C[i][j]
    union_size = C[i][i] + C[j][j] - joint
    return joint / union_size

# NOTE(review): `years` is not referenced anywhere else in the visible notebook,
# and it starts at 1994 while the EPOCH year ranges start at 1993 - confirm
# whether it is still needed and which start year is intended.
years = range(1994, 2024)

class Article:
    """One record of the article list together with its parsed keyword data.

    ``authors`` is split with parse_authors(); ``ieee_terms`` and ``keywords``
    are normalised with parse_keywords(). ``keyword_indices`` and
    ``topic_indices`` start as empty sets and are filled in later.
    """

    def __init__(self, key, year, title, authors, venue, abstract, cc, ieee_terms, keywords):
        # Fields stored as given
        self.key = key
        self.year = year
        self.title = title
        # Fields parsed into lists
        self.authors = parse_authors(authors)
        self.venue = venue
        self.abstract = abstract
        self.citation_count = cc
        self.ieee_terms = parse_keywords(ieee_terms)
        self.keywords = parse_keywords(keywords)
        # Filled by set_keyword_indices() and by later analysis code
        self.keyword_indices = set()
        self.topic_indices = set()

    def set_keyword_indices(self, in_dict):
        """Add the index of every keyword of this article to ``keyword_indices``.

        ``in_dict`` maps keyword -> index. Keywords that are missing from
        ``in_dict`` are recorded as index 0.
        """
        self.keyword_indices.update(in_dict.get(keyword, 0) for keyword in self.keywords)

class BigramSimilarity():
    """Direct similarity scores for one pair of keywords.

    Parameters
    ----------
    i, j : positions of the two keywords in the co-occurrence matrix ``C``
    C : co-occurrence matrix, indexed as ``C[i][j]``
    idx_to_kw : mapping from matrix position to keyword identifier

    Attributes
    ----------
    i, j : the keyword identifiers looked up in ``idx_to_kw``
    association_strength, cosine_index, inclusion_index, jaccard_index :
        the value returned by the function of the same name for ``(i, j, C)``
    """
    def __init__(self, i, j, C, idx_to_kw):
        self.i = idx_to_kw[i]
        self.j = idx_to_kw[j]
        self.association_strength = association_strength(i, j, C)
        self.cosine_index = cosine_index(i, j, C)
        self.inclusion_index = inclusion_index(i, j, C)
        # Must call jaccard_index(); this attribute used to be filled from
        # inclusion_index(), which made the two scores identical.
        self.jaccard_index = jaccard_index(i, j, C)

In [None]:
#
# Read the article list from a tab-delimited text file
#
# Columns, in the order they are consumed below:
#   0 year, 1 key, 2 title, 3 authors, 4 venue, 5 abstract,
#   6 citation count, 7 IEEE terms, 8 keywords
#

articles_dict = {}

with open('all_vr.txt') as infile:
    for line in infile:
        line_cells = line.split('\t')
        # Keyword arguments are evaluated in column order, so a malformed line
        # fails on the same cell as it would when reading the columns one by one.
        record = Article(year=int(line_cells[0]),
                         key=line_cells[1],
                         title=line_cells[2],
                         authors=line_cells[3],
                         venue=line_cells[4],
                         abstract=line_cells[5],
                         cc=int(line_cells[6]),
                         ieee_terms=line_cells[7],
                         keywords=line_cells[8])
        # Articles are keyed by their key column; a repeated key replaces the earlier record
        articles_dict[record.key] = record

In [None]:
# Load the processed keyword groups from disk.
# Each line of the file is one group of tab-separated spellings; the line number
# becomes the group's index. Spellings are normalised the same way as article
# keywords, except that they are not lower-cased here.
# NOTE(review): presumably the file is already lower-case - confirm, otherwise
# these entries cannot match the lower-cased article keywords.

indices_to_keywords_dict = {}
keywords_to_indices_dict = {}
index_count_dict = {}

with open('vr_keywords_CLEAN_AND_PRUNED.txt') as infile:
    for idx, line in enumerate(infile):
        index_count_dict[idx] = 0
        words = [cell.strip().replace(' ', '__').replace('-', '_')
                 for cell in line.strip().split('\t')]
        indices_to_keywords_dict[idx] = words
        # Empty cells stay in the group list but are not given a reverse lookup
        keywords_to_indices_dict.update((word, idx) for word in words if len(word))

In [None]:
#
# Count occurrences of each keyword index across all articles
#

# Start from zero so that re-running this cell does not add to earlier totals
for idx in index_count_dict:
    index_count_dict[idx] = 0

for article in articles_dict.values():
    # Resolve the article's keywords to keyword indices (unknown keywords -> 0)
    article.set_keyword_indices(keywords_to_indices_dict)
    # keyword_indices is a set, so each index counts once per article
    for idx in article.keyword_indices:
        index_count_dict[idx] += 1

In [None]:
#
# Get a year range of articles, count the keywords that appear in those articles
#

# Publication years (end exclusive) covered by each supported EPOCH value
EPOCH_YEAR_RANGES = {
    0: range(1993, 2004),
    1: range(2004, 2014),
    2: range(2014, 2024),
    3: range(1993, 2024),
}

# Fail loudly on a bad setting rather than leaving `r` undefined, or silently
# reusing the range left over from an earlier run of this cell.
if EPOCH not in EPOCH_YEAR_RANGES:
    raise ValueError('Unsupported EPOCH {!r}; expected one of {}'.format(
        EPOCH, sorted(EPOCH_YEAR_RANGES)))

r = EPOCH_YEAR_RANGES[EPOCH]

article_list = get_articles_by_year(r, articles_dict.values())

num_articles = len(article_list)

# keyword index -> number of selected articles carrying it
keyword_count_dict = count_keywords_in_articles(article_list)

In [None]:
# Rank (keyword index, count) pairs from most to least frequent, then discard
# the top-ranked pair.
# NOTE(review): presumably the discarded pair is index 0, the bucket that
# receives every unrecognised keyword - confirm that it is always ranked first.

ranked_pairs = list(keyword_count_dict.items())
ranked_pairs.sort(key=lambda pair: pair[1], reverse=True)
sorted_kw_indices_by_count = ranked_pairs[1:]

In [None]:
#
# Get the top 100 (plus ties) keywords OR all keywords that appear more than once,
#     whichever is smaller
#
# Keywords are taken in rank order from sorted_kw_indices_by_count. Selection
# stops at the first keyword seen fewer than twice or, once 100 keywords have
# been taken, at the first keyword whose count differs from that of the 100th.
#
# Results used by later cells:
#   top100_dict                       keyword index   -> count
#   top100_indices_dict               keyword index   -> column position (0-based)
#   keyword_indices_from_top100_idx   column position -> keyword index
#   num_keywords, num_columns         number of selected keywords
#

top100_dict = {}
top100_indices_dict = {}
keyword_indices_from_top100_idx = {}
checking_ties = False
tie_count = -1
repeated_keywords_dict = {}

# Histogram of counts: count value -> number of keywords seen that many times
for _, v in sorted_kw_indices_by_count:
    repeated_keywords_dict[v] = repeated_keywords_dict.get(v, 0) + 1

if WRITE_FILES:
    # One line per keyword, each line holding that keyword's count
    with open('repeated_keywords.csv', 'w') as outfile:
        for k, v in repeated_keywords_dict.items():
            for _ in range(v):
                outfile.write('{}\n'.format(k))

num_keywords = 0

# Iterating the list directly (instead of indexing it with a running counter)
# also covers the case where every keyword qualifies and the list simply ends.
for k, v in sorted_kw_indices_by_count:
    if v < 2:
        break
    if checking_ties and v != tie_count:
        checking_ties = False
        break
    top100_dict[k] = v
    # Column positions are 0-based and contiguous. A position of -1 must never
    # be handed out: NumPy reads it as "last column", which would merge this
    # keyword's occurrences into another keyword's column.
    top100_indices_dict[k] = num_keywords
    keyword_indices_from_top100_idx[num_keywords] = k
    num_keywords += 1
    if num_keywords == 100:
        # From here on only keywords tied with the 100th are accepted
        checking_ties = True
        tie_count = v

# One matrix column per selected keyword
num_columns = num_keywords

In [None]:
#
# Build the occurrence matrix O for the selected (top100) keywords
# (rows are articles, columns are keywords; 1 marks a keyword present on an article)
#

occurrence_matrix = np.zeros((num_articles, num_columns))
for a_idx, article in enumerate(article_list):
    # Only keywords that made it into the top100 selection own a column
    selected = (kw_idx for kw_idx in article.keyword_indices if kw_idx in top100_dict)
    for kw_idx in selected:
        occurrence_matrix[a_idx, top100_indices_dict[kw_idx]] = 1

In [None]:
#
# Build the co-occurrence matrix C
# C = O^T O (transpose of O times O), see (van Eck and Waltman, 2009)
#

co_occurrence_matrix = occurrence_matrix.T @ occurrence_matrix

In [None]:
#
# Score every unordered pair of selected keywords with the direct similarity measures
# For the definitions, see (van Eck and Waltman, 2009)
#

kw_bigrams = itertools.combinations(range(num_columns), 2)
bigram_scores = []
for i, j in kw_bigrams:
    bigram_scores.append(
        BigramSimilarity(i, j, co_occurrence_matrix, keyword_indices_from_top100_idx))

In [None]:
#
# Wrap the transposed occurrence matrix in a pandas DataFrame, one row per keyword
#

# Each column is labelled with the first spelling of its keyword group
kw_labels = [
    '{}'.format(indices_to_keywords_dict[keyword_indices_from_top100_idx[column]][0])
    for column in range(num_columns)
]

occurrence_df = pd.DataFrame(occurrence_matrix.T, index=kw_labels)

In [None]:
#
# Pairwise cosine distances between the keyword occurrence vectors
#

# One row per keyword; pdist's result is passed directly to shc.linkage below
keyword_vectors = occurrence_matrix.T
cosine_distance_matrix = scipy.spatial.distance.pdist(keyword_vectors, metric='cosine')

In [None]:
#
# Hierarchical clustering with Ward's method, then a flat cut at a per-epoch height
#
# NOTE(review): SciPy documents 'ward' linkage as correctly defined only for
# Euclidean distances, while the input here is cosine distances - confirm that
# this combination is intended.
#

clusters = shc.linkage(cosine_distance_matrix, method='ward')

# Cut height for each EPOCH setting
epoch_cut_heights = {0: 1.39, 1: 1.42, 2: 1.36, 3: 1.25}
if EPOCH in epoch_cut_heights:
    cluster_threshold = epoch_cut_heights[EPOCH]

# T[i] is the flat cluster id assigned to keyword column i
T = shc.fcluster(clusters, t=cluster_threshold, criterion='distance')


In [None]:
#
# Group the keyword labels by cluster, ready for the strategic diagrams
#

# One slot per keyword column; slot (cluster id - 1) collects that cluster's
# labels and the summed keyword counts. Slots that no cluster maps to stay empty.
num_slots = len(T)
topic_clusters = [[] for _ in range(num_slots)]
cluster_counts = [0] * num_slots

for idx, cluster_id in enumerate(T):
    kw_idx = keyword_indices_from_top100_idx[idx]
    slot = cluster_id - 1
    topic_clusters[slot].append('{}'.format(indices_to_keywords_dict[kw_idx][0]))
    cluster_counts[slot] += top100_dict[kw_idx]

# Keep only the slots that received at least one observation
threshholded_topic_clusters = {}
for slot, members in enumerate(topic_clusters):
    if cluster_counts[slot] >= 1:
        threshholded_topic_clusters[slot] = members
TTC = threshholded_topic_clusters   # Short alias used by the cells below

topic_kws_dict = dict(TTC)
topic_sizes_dict = {idx: len(members) for idx, members in TTC.items()}
cluster_counts_dict = {idx: cluster_counts[idx] for idx in TTC}

total_papers_count = len(article_list)

In [None]:
#
# Collect the distinct keywords of the selected articles
#

unique_keywords = {kw for article in article_list for kw in article.keywords}
total_unique_kws = len(unique_keywords)

if WRITE_FILES:
    # One row per selected keyword that belongs to a topic: keyword index, label,
    # topic id, first keyword of that topic, and the keyword's count.
    csv_row = '{},{},{},{},{}\n'
    with open('keywords.csv', 'w') as outfile:
        outfile.write(csv_row.format('index', 'keyword', 'cluster', 'topic', 'observations'))
        for k, v in top100_dict.items():
            keyword = indices_to_keywords_dict[k][0]
            for topic_idx in TTC:
                members = topic_kws_dict[topic_idx]
                if keyword in members:
                    outfile.write(csv_row.format(k, keyword, topic_idx, members[0], v))
                    # Only the first topic containing the label is reported
                    break
                    break

In [None]:
#
# Compute frequency, co-word frequency, and cohesion
#
# For every topic this cell fills:
#   frequency_dict         matching (article, keyword) pairs / num_articles
#   coword_frequency_dict  articles flagged as "co-word" / num_articles
#   cohesion_dict          coword_frequency / frequency
# It also records the topic on each matching article (article.topic_indices).
#
# NOTE(review): matching compares the topic's label strings with the raw
# article.keywords, not with article.keyword_indices - confirm that articles
# using another spelling of the same keyword group are meant to be skipped.
#

frequency_dict = {}
coword_frequency_dict = {}


for topic_idx in TTC:
    frequency_dict[topic_idx] = 0
    coword_frequency_dict[topic_idx] = 0

    for article in article_list:
        topic_match = False
        co_words = False
        for kw in article.keywords:
            # This check runs BEFORE the current keyword is tested, so co_words
            # becomes True as soon as any keyword follows a topic keyword in the
            # list, whether or not that later keyword belongs to the topic.
            # NOTE(review): a topic keyword in last position never sets the flag,
            # so the result depends on keyword order - confirm the intent
            # (possibly "at least two topic keywords on the article").
            if topic_match and not co_words:
                co_words = True
            if kw in topic_kws_dict[topic_idx]:
                # Counted once per matching keyword, not once per article
                frequency_dict[topic_idx] += 1
                topic_match = True
        if topic_match:
            article.topic_indices.add(topic_idx)
            if co_words:
                coword_frequency_dict[topic_idx] += 1

for topic_idx in TTC:
    # Turn the raw counts into fractions of the selected articles
    frequency_dict[topic_idx] /= num_articles
    coword_frequency_dict[topic_idx] /= num_articles
    # Cohesion is the probability of multiple keywords on a paper given a single keyword:
    # P(multiple | single) = P(multiple AND single) / P(single) = P(multiple) / P(single) = coword_freq / freq
    #   the probability of a single keyword on a paper is frequency
    #   the probability of multiple keywords on a paper is coword_frequency

# NOTE(review): raises ZeroDivisionError for a topic whose frequency is 0,
# i.e. when none of its labels appears literally in any article's keywords.
cohesion_dict = {idx:(coword_frequency_dict[idx] / frequency_dict[idx]) for idx in TTC}

In [None]:
# Compute connectedness (per keyword) and centrality (per topic)

# Every keyword label that belongs to a topic, in topic order
list_of_kws_in_topics = [kw for topic_idx in TTC for kw in topic_kws_dict[topic_idx]]

# One step: all keywords sharing an article with the topic keyword (itself included)
connected_kws_1step_dict = {}
for kw in list_of_kws_in_topics:
    neighbours = set()
    for article in article_list:
        if kw in article.keywords:
            neighbours.update(article.keywords)
    connected_kws_1step_dict[kw] = neighbours

# Two steps: union of the one-step sets of those neighbours that are topic keywords
connected_kws_2step_dict = {}
for kw in list_of_kws_in_topics:
    reachable = set()
    for kw2 in connected_kws_1step_dict[kw]:
        if kw2 in connected_kws_1step_dict:
            reachable.update(connected_kws_1step_dict[kw2])
    connected_kws_2step_dict[kw] = reachable

# Connectedness: share of all distinct keywords reachable within two steps
kw_connectedness_dict = {
    kw: len(connected_kws_2step_dict[kw]) / total_unique_kws
    for kw in list_of_kws_in_topics
}

# Centrality: mean connectedness of the topic's keywords
centrality_dict = {}
for topic_idx in TTC:
    members = topic_kws_dict[topic_idx]
    running_total = 0
    for kw in members:
        running_total += kw_connectedness_dict[kw]
    centrality_dict[topic_idx] = running_total / len(members)

In [None]:
# Compute density
#
# For each topic, density is the share of ordered keyword pairs (off-diagonal
# cells of the topic's n x n matrix) that co-occur on at least one article.

density_dict = {}

for topic_idx in TTC:
    topic_kws = topic_kws_dict[topic_idx]
    n = topic_sizes_dict[topic_idx]
    kw_cooccurrence_matrix = np.zeros((n, n))
    for article in article_list:
        if topic_idx not in article.topic_indices:
            continue
        # Positions (within the topic) of the topic keywords present on this article
        kws_present = [pos for pos, kw in enumerate(topic_kws) if kw in article.keywords]
        # permutations() yields nothing for fewer than two keywords
        for pair in itertools.permutations(kws_present, 2):
            kw_cooccurrence_matrix[pair] += 1
    non_diag_matrix_cells = (n * n) - n
    if non_diag_matrix_cells == 0:
        # A topic with a single keyword has no keyword pairs; report 0.0
        # instead of dividing by zero.
        density_dict[topic_idx] = 0.0
    else:
        cooccurrence = np.count_nonzero(kw_cooccurrence_matrix)
        density_dict[topic_idx] = cooccurrence / non_diag_matrix_cells

In [None]:
# Produce output: a tab-separated table of the per-topic measures

table_row = '{}\t{}\t{}\t{}\t{}\t{}\t{}'
table_rule = '=' * 80

print(table_row.format('idx', 'size', 'Freq', 'CW-F', 'Cohes', 'Cent', 'Dens'))
print(table_rule)
for topic_idx in TTC:
    measures = (topic_idx,
                topic_sizes_dict[topic_idx],
                frequency_dict[topic_idx],
                coword_frequency_dict[topic_idx],
                cohesion_dict[topic_idx],
                centrality_dict[topic_idx],
                density_dict[topic_idx])
    print(table_row.format(*measures))
print(table_rule)

# Year span used in the output filenames for each supported EPOCH value
EPOCH_FILE_LABELS = {0: '1993-2003', 1: '2004-2013', 2: '2014-2023', 3: '1993-2023'}

# Fail loudly on a bad setting rather than writing to a stale or undefined filename
if EPOCH not in EPOCH_FILE_LABELS:
    raise ValueError('Unsupported EPOCH {!r}; expected one of {}'.format(
        EPOCH, sorted(EPOCH_FILE_LABELS)))

epoch_label = EPOCH_FILE_LABELS[EPOCH]

filename = 'topicclusters_{}.csv'.format(epoch_label)

if WRITE_FILES:
    # CSV with one row of measures per topic
    csv_row = '{},{},{},{},{},{},{},{}\n'
    with open(filename, 'w') as outfile:
        outfile.write(csv_row.format('idx',
                                     'size',
                                     'Observations',
                                     'Frequency',
                                     'Co-word Frequency',
                                     'Cohesion',
                                     'Centrality',
                                     'Density'))

        for topic_idx in TTC:
            outfile.write(csv_row.format(topic_idx,
                                         topic_sizes_dict[topic_idx],
                                         cluster_counts_dict[topic_idx],
                                         frequency_dict[topic_idx],
                                         coword_frequency_dict[topic_idx],
                                         cohesion_dict[topic_idx],
                                         centrality_dict[topic_idx],
                                         density_dict[topic_idx]))

    filename = 'latex_table_{}.csv'.format(epoch_label)

    # LaTeX table rows. Each row ends with the LaTeX row terminator (\\) and a
    # newline, so that rows land on separate lines instead of one long line.
    latex_row = 'D{} & XXX & {} & {} & {:.3f} & {:.3f} & {:.3f} & {:.3f} & {:.3f}\\\\\n'
    with open(filename, 'w') as outfile:
        for topic_idx in TTC:
            outfile.write(latex_row.format(topic_idx,
                                           topic_kws_dict[topic_idx],
                                           cluster_counts_dict[topic_idx],
                                           frequency_dict[topic_idx],
                                           coword_frequency_dict[topic_idx],
                                           cohesion_dict[topic_idx],
                                           centrality_dict[topic_idx],
                                           density_dict[topic_idx]))

#
# Plot Strategic Diagram of topic_clusters (Density vs Centrality)
#

# Plotly
# One row per topic: id, frequency, centrality, density
strategic_diagram_nparray = np.zeros((len(TTC), 4))
for row, topic_idx in enumerate(TTC):
    strategic_diagram_nparray[row] = (topic_idx,
                                      frequency_dict[topic_idx],
                                      centrality_dict[topic_idx],
                                      density_dict[topic_idx])

strategic_diagram_df = pd.DataFrame(data=strategic_diagram_nparray,
                                    columns=['id', 'frequency', 'centrality', 'density'])

# The first keyword of each topic names its marker colour in the legend
topic_leaders = [topic_kws_dict[topic_idx][0] for topic_idx in TTC]
strategic_diagram_df['leader'] = topic_leaders
# Ids went through a float array; turn them back into integer-looking strings
strategic_diagram_df['id'] = strategic_diagram_df['id'].astype(int).astype(str)

# Bubble chart: centrality on x, density on y, bubble size from frequency
fig = px.scatter(strategic_diagram_df,
                 x='centrality',
                 y='density',
                 size='frequency',
                 color='leader',
                 size_max=60)

epoch_titles = {
    0: 'Strategic diagram for VRAIS/VR 1993-2003',
    1: 'Strategic diagram for VR 2004-2013',
    2: 'Strategic diagram for VR 2014-2023',
    3: 'Strategic diagram for VRAIS/VR 1993-2023',
}
if EPOCH in epoch_titles:
    fig_title = epoch_titles[EPOCH]

# Both axes are drawn free-floating through the middle of the plot area
crossing_axis_style = dict(gridcolor='white',
                           gridwidth=2,
                           linecolor='black',
                           linewidth=4,
                           anchor='free',
                           position=0.5)

# NOTE(review): the x axis plots 'centrality' but is titled 'Density of topic',
# and the y axis plots 'density' but is titled 'Centrality of topic'. With the
# large standoffs this may be a deliberate way to label the crossing axis
# lines - confirm that the rendered labels read as intended.
title_layout = dict(text=fig_title,
                    x=0.5,
                    y=0.975,
                    xanchor='center',
                    yanchor='top')
xaxis_layout = dict(crossing_axis_style,
                    title=dict(text='Density of topic', standoff=460))
yaxis_layout = dict(crossing_axis_style,
                    title=dict(text='Centrality of topic', standoff=475))

fig.update_layout(title=title_layout,
                  xaxis=xaxis_layout,
                  yaxis=yaxis_layout,
                  paper_bgcolor='rgb(243, 243, 243)',
                  plot_bgcolor='rgb(243, 243, 243)')

if WRITE_FILES:
    # Image filename for each supported EPOCH value. EPOCH 2 covers 2014-2023,
    # matching the year span used by the other output files for that epoch.
    image_filenames = {
        0: 'strategicdiagram_1993-2003.png',
        1: 'strategicdiagram_2004-2013.png',
        2: 'strategicdiagram_2014-2023.png',
        3: 'strategicdiagram_1993-2023.png',
    }
    # Fail loudly on a bad setting rather than overwriting a previously named file
    if EPOCH not in image_filenames:
        raise ValueError('Unsupported EPOCH {!r}; expected one of {}'.format(
            EPOCH, sorted(image_filenames)))
    filename = image_filenames[EPOCH]

    # scale multiplies the 1308 x 1080 layout size to give the pixel size of the image
    pio.write_image(fig,
                    filename,
                    scale=12,
                    width=1308,
                    height=1080)

if SHOW_FIGURES:
    fig.show()