In [1]:
# Fusion ART Model for Professor Data
# Python fusionART.py code by Dr. Budhitama Subagdja, modified for this dataset by Patrick Tjahjadi

# ----- Imported Libraries -----
from fusionART import *

from collections import defaultdict

import pandas as pd

import math

import random

from numpy import array
from numpy import argmax

import numpy as np

from itertools import islice

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize 

import gensim
from gensim.models import Word2Vec
from gensim.test.utils import datapath
import gensim.downloader as api
from gensim import corpora

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import csv

import wikipedia

import dill
# -------------------------------

# Load the professor profile table, decoding the CSV as ISO-8859-1.
# Later cells read its first four columns by position.
data = pd.read_csv('SCSE ProfProfile.csv', encoding='ISO-8859-1')

In [2]:
def convert_to_2d(list_to_convert, list_per_entry):
    """Split a flat sequence into consecutive sub-lists of the requested sizes.

    Args:
        list_to_convert: Flat iterable whose items are handed out in order.
        list_per_entry: Iterable of integers; the k-th value is the number of
            items placed in the k-th sub-list.

    Returns:
        A list of lists. If the flat input runs out early, the remaining
        sub-lists are shorter (possibly empty); surplus items are ignored.
    """
    # A single shared iterator makes every slice continue where the last one stopped.
    remaining = iter(list_to_convert)
    chunks = []
    for size in list_per_entry:
        chunks.append(list(islice(remaining, size)))
    return chunks

In [3]:
def search_for_ngrams(ngram_list, text_list, max_n=6):
    """Insert known multi-word phrases into a token stream.

    For every position in ``text_list`` the n-grams (2 <= n <= ``max_n``) that
    END at that position are built by joining the lower-cased tokens with
    underscores. Each n-gram found in ``ngram_list`` is emitted (shortest
    first) immediately before the token itself, which is emitted unchanged.

    Args:
        ngram_list: Collection of known phrases in ``word_word`` form.
        text_list: Sequence of string tokens.
        max_n: Longest n-gram to look for (defaults to 6).

    Returns:
        A new list containing the original tokens interleaved with the
        matching n-grams.
    """
    # Set membership keeps the per-token lookups cheap for long texts.
    known_ngrams = set(ngram_list)
    lowered = [token.lower() for token in text_list]

    ngrammed_text = []
    for position, token in enumerate(text_list):
        for n in range(2, max_n + 1):
            start = position - n + 1
            # Never reach before the start of the text: a negative index would
            # silently wrap around and glue the tail of the text onto its head.
            if start < 0:
                break
            candidate = "_".join(lowered[start:position + 1])
            if candidate in known_ngrams:
                ngrammed_text.append(candidate)
        ngrammed_text.append(token)
    return ngrammed_text

In [4]:
def clean_text(word):
    """Strip punctuation and the digits 1-9 from ``word`` and lower-case it.

    Args:
        word: The string token to clean.

    Returns:
        The lower-cased token with every character listed in ``punctuations``
        removed. Whitespace and any other characters are kept.
    """
    # The backslash is written as an explicit escape: the former ``\,`` was an
    # invalid escape sequence, which newer Python versions warn about.
    # NOTE(review): the digit '0' is not in this set, so it survives cleaning - confirm
    # whether that is intended.
    punctuations = '''!()-[]{};:'"\\,<>./?@#$%^&*_123456789~'''
    return "".join(char for char in word if char not in punctuations).lower()

In [5]:
def normalise_vector(vector):
    """Min-max scale the values of ``vector`` into the range [0, 1].

    Args:
        vector: Sequence of numbers.

    Returns:
        A new list of the same length where the smallest input maps to 0 and
        the largest to 1. An empty input gives an empty list. A vector whose
        values are all equal has no spread to scale, so it maps to all zeros
        instead of dividing by zero.
    """
    if len(vector) == 0:
        return []

    min_value = min(vector)
    span = max(vector) - min_value
    if span == 0:
        return [0.0] * len(vector)

    return [(value - min_value) / span for value in vector]

In [6]:
# Convert each attribute of the dataset to separate lists.
# Columns are addressed by position with .iloc: plain integer keys on a
# label-indexed row (row[0]) are deprecated positional lookups in recent pandas.
name = data.iloc[:, 0].tolist()
group = data.iloc[:, 1].tolist()
university = data.iloc[:, 2].tolist()
research_interest = data.iloc[:, 3].tolist()

# Problem: research_interest and university use semicolons to denote multiple
# interests and universities. Split each entry into a list per professor.
university_processed = [entry.split(";") for entry in university]
research_interest_processed = [entry.split(";") for entry in research_interest]

# Flatten the per-professor lists into 1D lists for one hot encoding
university_1d = [uni for universities in university_processed for uni in universities]
research_interest_1d = [
    interest for interests in research_interest_processed for interest in interests
]
        


In [7]:
# Record how many universities and research interests each professor lists;
# these counts are what allow the flattened 1D lists to be regrouped per professor.
universities_per_person = [len(universities) for universities in university_processed]

research_interest_per_person = [
    len(research_interests) for research_interests in research_interest_processed
]


In [8]:
# Turn every research keyword into a single phrase token: spaces become
# underscores, text is lower-cased, and colons and commas are dropped.
# The list is updated in place so existing references keep seeing the result.
research_interest_1d[:] = [
    interest.replace(" ", "_").lower().replace(":", "").replace(",", "")
    for interest in research_interest_1d
]


In [9]:
# Integer-encode and then one-hot encode name, group, university and research interests.
# Each attribute keeps its own LabelEncoder so that labels can be decoded later;
# the single OneHotEncoder is simply re-fitted for every attribute in turn.
name_label_encoder = LabelEncoder()
uni_label_encoder = LabelEncoder()
group_label_encoder = LabelEncoder()
research_label_encoder = LabelEncoder()

# NOTE(review): newer scikit-learn releases are understood to rename `sparse` to
# `sparse_output` - confirm against the installed version.
onehot_encoder = OneHotEncoder(sparse=False)

# Professor names (one row per professor)
name_integer_encoded = name_label_encoder.fit_transform(name).reshape(-1, 1)
name_onehot = onehot_encoder.fit_transform(name_integer_encoded)

# Research groups (one row per professor)
group_integer_encoded = group_label_encoder.fit_transform(group).reshape(-1, 1)
group_onehot = onehot_encoder.fit_transform(group_integer_encoded)

# Universities (one row per flattened university entry)
university_integer_encoded = uni_label_encoder.fit_transform(array(university_1d)).reshape(-1, 1)
university_onehot = onehot_encoder.fit_transform(university_integer_encoded)

# Research interests (one row per flattened keyword entry)
research_interest_integer_encoded = research_label_encoder.fit_transform(
    array(research_interest_1d)
).reshape(-1, 1)
research_interest_onehot = onehot_encoder.fit_transform(research_interest_integer_encoded)


In [10]:
# Turn the one-hot numpy arrays into nested Python lists
(name_onehotlist,
 group_onehotlist,
 university_onehotlist,
 research_interest_onehotlist) = (
    matrix.tolist()
    for matrix in (name_onehot, group_onehot, university_onehot, research_interest_onehot)
)


In [41]:
def one_hot_keyword_flip(query_research, noise_limit):
    """Build a multi-hot research keyword vector and randomly flip its entries.

    Args:
        query_research: Semicolon-separated research keywords.
        noise_limit: Probability, per vector position, that the entry is flipped.

    Returns:
        A list with one entry per known research keyword. Before noise, each
        entry is the number of times that keyword appears in the query. A
        flipped entry becomes 1 if it was 0 and 0 otherwise. The list is empty
        when none of the query keywords is known.

    Side effects:
        Prints the number of flipped positions and advances the global
        ``random`` generator once per vector position.
    """
    # Preprocess the research keyword queries to the format used for word embedding (phrases)
    keywords = [
        keyword.replace(" ", "_").lower().replace(":", "").replace(",", "").rstrip("\n")
        for keyword in query_research.split(";")
    ]

    # Decode every one-hot row once and remember the first row seen for each
    # keyword, instead of rescanning and re-decoding all rows per query keyword.
    vector_by_keyword = {}
    if research_interest_onehotlist:
        labels = research_label_encoder.inverse_transform(
            [argmax(onehot_vec) for onehot_vec in research_interest_onehotlist]
        )
        for label, onehot_vec in zip(labels, research_interest_onehotlist):
            vector_by_keyword.setdefault(str(label), onehot_vec)

    # Keywords that could not be converted to input vectors are dropped
    matched_vectors = [
        vector_by_keyword[keyword] for keyword in keywords if keyword in vector_by_keyword
    ]

    # Combine all the research keyword inputs into one vector
    research_vector = [sum(research_ints) for research_ints in zip(*matched_vectors)]

    # With a defined probability, flip the binary research keyword entries due to noise
    num_change = 0
    for index in range(len(research_vector)):
        if random.random() < noise_limit:
            num_change += 1
            research_vector[index] = 1 if research_vector[index] == 0 else 0
    print(num_change)
    return research_vector


In [42]:
# Build a noisy research keyword vector for every professor in the batch file.
# Each vector entry is flipped with probability 0.1 (i.e. 10% noise).
name_keyword_dict = {}

# The batch file holds one tab-separated "name<TAB>keywords" row per professor
with open('research_query.bat', 'r') as f:
    for batch_name, keywords in csv.reader(f, delimiter = '\t'):
        name_keyword_dict[batch_name] = one_hot_keyword_flip(keywords, 0.1)



25
28
28
20
35
24
36
28
31
27
28
26
24
24
26
24
16
30
24
23
21
31
20
23
24
24
35
31
26
20
35
25
22
20
20
27
29
22
19
26
32
17
26
23
27
30
18
22
24
22
31
22
21
23
18
29
30
19
18
26
30
28
31
27
36
24
25
39
24
25
17
27
25
26
33
24
26
23
25
24
30
21
31


In [39]:
# Spot check: show the first entry of the flattened research keyword list
print(research_interest_1d[0])

computer_graphics


In [43]:
# Decode every professor's noisy keyword vector back into keyword names.
# NOTE(review): this rebinds research_interest_per_person to the NOISY counts, while
# cells further down still pass it to convert_to_2d together with the clean
# research_interest_1d. On a top-to-bottom run the two no longer line up - confirm
# whether the noisy list was meant to replace the clean one.
noisy_research_interest_1d = []
research_interest_per_person = []
for professor in name:
    noisy_research_vector = name_keyword_dict[professor]
    # Only positions holding exactly 1 count as an active keyword
    active_positions = [
        position
        for position, value in enumerate(noisy_research_vector)
        if value == 1.0
    ]
    for position in active_positions:
        # The vector position is the integer label of the keyword
        research_name = research_label_encoder.inverse_transform([position])[0]
        noisy_research_interest_1d.append(research_name)
    research_interest_per_person.append(len(active_positions))


['big_data_analytics', 'cam', 'collaborative_internet_computing_technologies_and_applications', 'compositional_verification_and_synthesis', 'computational_economics', 'computational_social_choice', 'computer_graphics', 'cybermedicine', 'data_analytics', 'deep_learning', 'distributed_and_network_systems', 'distributed_machine_learning', 'distributed_systems_security', 'intelligent_multi_agents', 'intelligent_techniques', 'medical_visualization', 'multi_modal_biometrics_in_homeland_security', 'natural_language_processing', 'object_recognition', 'optimization', 'pattern_recognition', 'perceptual_visual_quality_gauging', 'pervasive_computing', 'radio_resource_management', 'scientific_visualization', 'security_and_privacy', 'shape_modelling', 'software_and_system_security', 'spectral_graph_theory', 'virtual_reality', 'visualization_on_the_grid', 'web_visualization', 'application_specific_processors', 'bioinformatics', 'blockchain_protocols_and_applications', 'brain_computer_interface', 'col

In [13]:
# Collect the distinct names, groups, universities and research interests
# (in first-seen order) to define the model schema.
# dict.fromkeys drops duplicates while preserving insertion order.
unique_name = list(dict.fromkeys(name))
unique_group = list(dict.fromkeys(group))
unique_university = list(dict.fromkeys(university_1d))
unique_research_interest = list(dict.fromkeys(research_interest_1d))

In [17]:
# Create a word2vec model that contains all research keywords.
# Each professor's keywords form one "sentence": the flat keyword list is regrouped
# using the per-professor counts, so research_interest_per_person must still
# describe research_interest_1d when this cell runs.
research_tokens = convert_to_2d(research_interest_1d, research_interest_per_person)
# min_count = 0 is passed so that no keyword is dropped for being rare
research_word2vec = Word2Vec(research_tokens, min_count = 0)


In [18]:
# Download the Wikipedia article for every research keyword and keep its
# whitespace-separated tokens (one token list per article found).
wiki_text = []
for keyword in research_interest_1d:
    try:
        wiki_text.append(wikipedia.page(keyword).content.split())
    except (wikipedia.exceptions.PageError, wikipedia.DisambiguationError):
        # No article, or an ambiguous title: skip this keyword
        continue




  lis = BeautifulSoup(html).find_all('li')


In [19]:
# Run every Wikipedia token through clean_text, replacing the tokens in place
for article_tokens in wiki_text:
    article_tokens[:] = [clean_text(token) for token in article_tokens]
        

In [20]:
# Mark occurrences of the research keyword phrases (n-grams) in each Wikipedia article
ngram_wiki_tokens = [
    search_for_ngrams(unique_research_interest, text) for text in wiki_text
]

In [21]:
# Extend the word2vec vocabulary with the cleaned, n-grammed Wikipedia tokens and
# continue training on them.
# `epochs` replaces the deprecated `iter` attribute, which triggered a
# deprecation warning on every run.
research_word2vec.build_vocab(ngram_wiki_tokens, update = True)
research_word2vec.train(ngram_wiki_tokens, total_examples = research_word2vec.corpus_count, epochs = research_word2vec.epochs)

  This is separate from the ipykernel package so we can avoid doing imports until


(4509269, 5687835)

In [22]:
# Read the custom corpus as one flat list of whitespace-separated words
with open('testtext.txt','r', encoding="utf-8") as f:
    testtext = [word for line in f for word in line.split()]

# Group the words into documents of 2500 words each (the last one may be shorter)
test_tokens = convert_to_2d(testtext, [2500] * math.ceil(len(testtext) / 2500))


In [23]:
# Clean every word of the custom corpus, then mark the research keyword n-grams
# inside each 2500-word document.
ngram_test_tokens = [
    search_for_ngrams(unique_research_interest, [clean_text(word) for word in text])
    for text in test_tokens
]

In [24]:
# Extend the word2vec vocabulary with the cleaned, n-grammed custom corpus tokens
# and continue training on them.
# `epochs` replaces the deprecated `iter` attribute, which triggered a
# deprecation warning on every run.
research_word2vec.build_vocab(ngram_test_tokens, update = True)
research_word2vec.train(ngram_test_tokens, total_examples = research_word2vec.corpus_count, epochs = research_word2vec.epochs)

  This is separate from the ipykernel package so we can avoid doing imports until


(168940, 219210)

In [25]:
# Vocabulary list for the word2vec model (research keyword phrases plus the
# corpus words added by the training cells above).
# NOTE(review): `wv.vocab` is understood to be unavailable in newer gensim
# releases - confirm against the installed version.
list(research_word2vec.wv.vocab)

['computer_graphics',
 'shape_modelling',
 'virtual_reality',
 'web_visualization',
 'visualization_on_the_grid',
 'cybermedicine',
 'scientific_visualization',
 'high_level_synthesis',
 'application_specific_processors',
 'heterogeneous_mpsoc',
 'synthesis_for_emerging_technology',
 'distributed_systems',
 'security_and_privacy',
 'data_analytics',
 'algorithms',
 'self_organization',
 'graphs_querying_and_mining',
 'databases',
 'data_mining',
 'machine_learning',
 'real_time_systems',
 'cyber_physical_systems',
 'formal_methods',
 'ultra_wideband_radio',
 'modulation_and_multiple_access',
 'communication_algorithms',
 'digital_signal_processing',
 'multi_agent_systems',
 'game_theory',
 'optimization',
 'intelligent__e-commerce',
 'image_coding',
 'video_coding',
 'wireless_video',
 'multimedia_networking',
 'parallel_and_distributed_simulation',
 'programming_environments_and_tools',
 'parallel_algorithms_and_architectures',
 'system_performance_analysis',
 'computer_vision',
 'int

In [26]:
# Print the 5 nearest neighbours of every research keyword, in alphabetical order
for keyword in sorted(unique_research_interest):
    print("Top 5 similar words from "+keyword+": ")
    # end="\n\n" leaves one blank line between keywords
    print(research_word2vec.wv.most_similar(keyword, topn = 5), end="\n\n")

Top 5 similar words from ad_hoc_and_mobile_networks: 
[('nonspam', 0.3338487148284912), ('polyglot', 0.322151243686676), ('cupboards', 0.3184388279914856), ('moreadvanced', 0.318264365196228), ('processingunits', 0.31678178906440735)]

Top 5 similar words from agent_oriented_software_engineering: 
[('software_engineering', 0.8922833800315857), ('assemblycode', 0.8708775043487549), ('brunelière', 0.8693846464157104), ('kit', 0.8632129430770874), ('accredited', 0.8614062070846558)]

Top 5 similar words from algorithms: 
[('techniques', 0.8492424488067627), ('methods', 0.8371645212173462), ('models', 0.786705493927002), ('problems', 0.7808295488357544), ('statistical', 0.7649778127670288)]

Top 5 similar words from algorithms_and_data_structure: 
[('subjectmatter', 0.676770806312561), ('badly', 0.6631745100021362), ('algebras', 0.6544654369354248), ('searchbased', 0.6356987953186035), ('dietest', 0.6233838796615601)]

Top 5 similar words from animation_and_visualization: 
[('archetypes', 

[('mmu', 0.3471124768257141), ('discountrate', 0.2987755835056305), ('human_robot_interaction', 0.2776610851287842), ('low_power_reconfigurable_computing', 0.26580125093460083), ('agentalso', 0.26534050703048706)]

Top 5 similar words from collaborative_virtual_environments: 
[('nonrisc', 0.728187084197998), ('equally', 0.6940321326255798), ('systemsembedded', 0.6876730918884277), ('batterypowered', 0.6776008605957031), ('yuv', 0.6669276356697083)]

Top 5 similar words from common_sense_reasoning: 
[('equilibria', 0.7952524423599243), ('provable', 0.7873103618621826), ('xrightright', 0.7841353416442871), ('metalevel', 0.7736964821815491), ('grows', 0.7723196744918823)]

Top 5 similar words from communication_algorithms: 
[('callouts', 0.2932128608226776), ('chores', 0.27867817878723145), ('canvases', 0.2743675708770752), ('conditionalindependence', 0.26048940420150757), ('populationdensities', 0.2588825225830078)]

Top 5 similar words from compositional_verification_and_synthesis: 
[('

[('entertainment_and_mobile_computing', 0.29304397106170654), ('is—even', 0.2871752977371216), ('energized', 0.2863956689834595), ('psychologysocial', 0.27965855598449707), ('senderreceiver', 0.27086302638053894)]

Top 5 similar words from fuzzy_neural_systems: 
[('simultaneoussequential', 0.35625413060188293), ('algorithmica', 0.31296640634536743), ('fame', 0.3065129220485687), ('algocracy', 0.30645325779914856), ('campbell', 0.3003383278846741)]

Top 5 similar words from game_theory: 
[('noncooperative', 0.909516453742981), ('economics', 0.868655800819397), ('computational_social_choice', 0.8605436086654663), ('game', 0.8539938926696777), ('number_theory', 0.8419334888458252)]

Top 5 similar words from geometric_modeling: 
[('geometric', 0.9156708121299744), ('julia', 0.8999641537666321), ('autocarto', 0.8999099731445312), ('armin', 0.8995251655578613), ('faceted', 0.8993490934371948)]

Top 5 similar words from geospatial_textual_and_mobility_data_management: 
[('atelierb', 0.3299170

[('datalogs', 0.9232571125030518), ('magnetometers', 0.918083131313324), ('programmingimplementations', 0.912300705909729), ('profiling', 0.9062663316726685), ('utilizes', 0.9060459136962891)]

Top 5 similar words from multimedia_annotation: 
[('exhaustible', 0.24198715388774872), ('extraneous', 0.22315728664398193), ('multimedia_computing_and_networking', 0.22125545144081116), ('novelty—three', 0.2196469008922577), ('rightslink', 0.21720756590366364)]

Top 5 similar words from multimedia_communication: 
[('messaging', 0.7557110786437988), ('spus', 0.7473167777061462), ('combines', 0.7466021776199341), ('keyboards', 0.7418762445449829), ('super', 0.737523078918457)]

Top 5 similar words from multimedia_computing_and_networking: 
[('mining', 0.33928221464157104), ('data_mining', 0.3238320052623749), ('democratizing', 0.3086342215538025), ('warehousing”', 0.3085840940475464), ('text', 0.30632704496383667)]

Top 5 similar words from multimedia_information_retrieval: 
[('information_retrie

[('multinationals', 0.347334623336792), ('profitable', 0.3385174870491028), ('sindhind', 0.3211149573326111), ('taskindependent', 0.32005733251571655), ('usercustomer', 0.3129497468471527)]

Top 5 similar words from software_engineering: 
[('agent_oriented_software_engineering', 0.8922833800315857), ('nists', 0.8869683742523193), ('sei', 0.871846079826355), ('gnu', 0.8543223142623901), ('hamilton', 0.8494508266448975)]

Top 5 similar words from software_security: 
[('livelihood', 0.4091106653213501), ('nodeswireless', 0.36943379044532776), ('everybody', 0.34122025966644287), ('addressability', 0.3330814838409424), ('connection', 0.32659170031547546)]

Top 5 similar words from sparse_convex_optimization: 
[('ksvd', 0.8800416588783264), ('reinforcement', 0.8541874885559082), ('reinforcement_learning', 0.8464661836624146), ('transfer_learning', 0.8457825183868408), ('federated', 0.8440229892730713)]

Top 5 similar words from spatio_temporal_data_management_and_mining: 
[('habituation', 0.

In [27]:
# Describe every research keyword by its word2vec similarity to all research
# keywords (itself included), listed in alphabetical keyword order.
research_interest_weight_dict = defaultdict(list)
ordered_keywords = sorted(unique_research_interest)
for keyword1 in ordered_keywords:
    # Similarity scores are rounded to 5 decimal places
    research_interest_weight_dict[keyword1] = [
        round(research_word2vec.wv.similarity(w1=keyword1, w2=keyword2), 5)
        for keyword2 in ordered_keywords
    ]
    


In [28]:
# Look up the similarity weight vector of every entry in the flat research keyword list
research_interest_weights = [
    research_interest_weight_dict[res_int] for res_int in research_interest_1d
]

In [29]:
# Each professor has several universities and research interests, but the
# encoded vectors are stored flat. Regroup them per professor using the
# per-person counts.
research_interest_onehotlist_processed = []

university_onehotlist_processed = convert_to_2d(university_onehotlist, universities_per_person)
# Note: from here on research_interest_processed holds weight vectors, not keyword strings
research_interest_processed = convert_to_2d(research_interest_weights, research_interest_per_person)

In [30]:
# Collapse each professor's group of vectors into a single vector by summing
# them element-wise.
university_2d = [
    [sum(column) for column in zip(*professor_vectors)]
    for professor_vectors in university_onehotlist_processed
]

research_interest_2d = [
    [sum(column) for column in zip(*professor_vectors)]
    for professor_vectors in research_interest_processed
]

# A professor can list the same university more than once, so cap every
# university entry at 1 (done in place).
for university_vector in university_2d:
    university_vector[:] = [1.0 if count > 1 else count for count in university_vector]


In [31]:
# Rescale every professor's research keyword vector to the range 0..1
normalised_research_interest_2d = [
    normalise_vector(research_vector) for research_vector in research_interest_2d
]

In [67]:
# Define the schema for the FusionART model: one field per dataset column, named
# after the column, whose attributes are the sorted unique values of that column.
model_schema = [
    {'name': data.columns[position], 'compl': False, 'attrib': sorted(values)}
    for position, values in enumerate(
        (unique_name, unique_group, unique_university, unique_research_interest)
    )
]

In [68]:
# Initialise the FusionART model with identical parameters for all four fields
model = FusionART(
    schema = model_schema,
    beta = [1.0] * 4,
    alpha = [0.1] * 4,
    gamma = [0.25] * 4,
    rho = [1] * 4,
)
# Display the resulting F1 field definitions
model.F1Fields

[{'name': 'name',
  'compl': False,
  'attrib': ['AS Madhukumar',
   'Alexei Sourin',
   'Anupam Chattopadhyay',
   'Anwitaman Datta',
   'Arijit Khan',
   'Arvind Easwaran',
   'Bo An',
   'Cai Jianfei',
   'Cai Wentong',
   'Cham Tat Jen',
   'Chan Syin',
   'Chia Liang Tien Clement',
   'Chng Eng Siong',
   'Deepu Rajan',
   'Deng Ruilong',
   'Douglas Leslie Maskell',
   'Dusit Niyato',
   'Eric Cambria',
   'Gao Cong',
   'Goh Wooi Boon',
   'Guan Cuntai',
   'He Ying',
   'Huang Shell Ying',
   'Hui Siu Cheung',
   'Jagath C. Rajapakse',
   'Ke Yiping Kelly',
   'Kong Wai Kin Adams',
   'Kwoh Chee Keong',
   'Lam Kwok Yan',
   'Lam Siew Kei',
   'Lau Chiew Tong',
   'Lee Bu Sung Francis',
   'Li Fang Flora',
   'Li Mo',
   'Li Yi',
   'Liang Qianhui Althea',
   'Lin Feng',
   'Lin Guosheng',
   'Lin Shang Wei',
   'Lin Weisi',
   'Liu Weichen',
   'Liu Yang',
   'Loke Yuan Ren',
   'Long Cheng',
   'Loy Chen Change (Cavan)',
   'Lu Shijian',
   'Luo Jun',
   'Mahardhika Pratama',

In [69]:
# Present every professor to the Fusion ART model, one at a time, and let it learn
for i in range(len(name)):
    # Field values in schema order: name, group, universities, research interests
    field_values = (
        name_onehotlist[i],
        group_onehotlist[i],
        university_2d[i],
        normalised_research_interest_2d[i],
    )
    model.updateF1bySchema([
        {'name': data.columns[position], 'val': value}
        for position, value in enumerate(field_values)
    ])

    print("resonance search: ")
    J = model.resSearch()
    print("selected ", J)
    if model.uncommitted(J):
        print ('uncommitted')

    model.autoLearn(J)
    model.doReadoutAllFields(J)

model.displayNetwork()

resonance search: 
selected  0
uncommitted
resonance search: 
selected  1
uncommitted
resonance search: 
selected  2
uncommitted
resonance search: 
selected  3
uncommitted
resonance search: 
selected  4
uncommitted
resonance search: 
selected  5
uncommitted
resonance search: 
selected  6
uncommitted
resonance search: 
selected  7
uncommitted
resonance search: 
selected  8
uncommitted
resonance search: 
selected  9
uncommitted
resonance search: 
selected  10
uncommitted
resonance search: 
selected  11
uncommitted
resonance search: 
selected  12
uncommitted
resonance search: 
selected  13
uncommitted
resonance search: 
selected  14
uncommitted
resonance search: 
selected  15
uncommitted
resonance search: 
selected  16
uncommitted
resonance search: 
selected  17
uncommitted
resonance search: 
selected  18
uncommitted
resonance search: 
selected  19
uncommitted
resonance search: 
selected  20
uncommitted
resonance search: 
selected  21
uncommitted
resonance search: 
selected  22
uncommitte

In [78]:
def query_by_research_with_noise(batch_name, query_research, output_file, noise_limit):
    """Query the Fusion ART model with (optionally noisy) research keywords.

    Args:
        batch_name: Name associated with the query row (not used by the query itself).
        query_research: Semicolon-separated research keywords.
        output_file: Writable text file object that receives the query report.
        noise_limit: Probability, per vector position, that the entry ``v`` is
            replaced by ``1 - v``.

    Returns:
        The last professor name written for the best-matching node(s), or
        ``None`` when no name could be retrieved.

    Side effects:
        Writes to ``output_file``, prints the indexes of the best-matching
        nodes, sets the model's ``gamma`` and ``rho`` parameters, and advances
        the global ``random`` generator once per vector position.
    """
    query_research = query_research.split(";")
    output_file.write("Research Keywords: ")
    output_file.write(', '.join(query_research)+"\n")

    # Preprocess the research keyword queries to the format used for word embedding (phrases)
    keywords = [
        keyword.replace(" ", "_").lower().replace(":", "").replace(",", "").rstrip("\n")
        for keyword in query_research
    ]

    # Decode every one-hot row once and remember the first row seen for each
    # keyword, instead of rescanning and re-decoding all rows per query keyword.
    vector_by_keyword = {}
    if research_interest_onehotlist:
        labels = research_label_encoder.inverse_transform(
            [argmax(onehot_vec) for onehot_vec in research_interest_onehotlist]
        )
        for label, onehot_vec in zip(labels, research_interest_onehotlist):
            vector_by_keyword.setdefault(str(label), onehot_vec)

    # Keywords that could not be converted to input vectors are dropped
    matched_vectors = [
        vector_by_keyword[keyword] for keyword in keywords if keyword in vector_by_keyword
    ]

    # Combine all the research keyword inputs into one vector
    research_vector = [sum(research_ints) for research_ints in zip(*matched_vectors)]

    # With a defined probability, flip the binary research keyword entries due to noise
    for index in range(len(research_vector)):
        if random.random() < noise_limit:
            research_vector[index] = 1 - research_vector[index]

    # Only the research interest field (the 4th one) takes part in the match
    model.setParam('gamma', [0,0,0,1])
    model.setParam('rho', [0,0,0,0])
    model.updateF1bySchema([{'name': data.columns[3], 'val': research_vector}])
    model.compChoice()

    # Find the nodes that have the highest F2 value.
    # NOTE(review): the last code is skipped - presumably the trailing uncommitted
    # node; confirm against fusionART.
    maxF2value = model.codes[0]['F2']
    maxF2indexes = []
    for i in range(0, len(model.codes)-1):
        F2value = model.codes[i]['F2']
        if (F2value > maxF2value):
            maxF2value = F2value
            maxF2indexes = [i]
        elif (F2value == maxF2value):
            maxF2indexes.append(i)

    print(maxF2indexes)

    # Retrieve the data of the node(s) that have the highest F2 value.
    # Start from None so that "nothing retrieved" is reported to the caller
    # instead of failing on an unbound local at the return statement.
    retrieved_name = None
    output_file.write("Professor that follows these research keywords:")
    for node in maxF2indexes:
        name_vector = model.codes[node]['weights'][0]
        for index in range(len(name_vector)):
            # Positions holding 1 identify the professor name(s) stored in the node
            if (name_vector[index] == 1):
                # Retrieve the data from the model's 'attrib' attribute in F1
                retrieved_name = model.F1Fields[0]['attrib'][index]
                output_file.write(retrieved_name)

    output_file.write("\n\n")
    return retrieved_name


In [79]:
# Research keywords query for 0% noise
correct_match = 0

# Both files are managed by `with`, so the report file is closed even if a query raises.
# The batch file holds one tab-separated "name<TAB>keywords" row per query.
with open("output_zero_noise.txt", "w+") as output_file, open('research_query.bat', 'r') as f:
    reader = csv.reader(f, delimiter = '\t')
    for batch_name, keywords in reader:
        retrieved_name = query_by_research_with_noise(batch_name, keywords, output_file, 0)
        if (retrieved_name == batch_name):
            correct_match += 1

    # NOTE(review): accuracy is measured against the number of professors, which
    # presumes one query row per professor in the batch file - confirm.
    query_accuracy = (correct_match / len(name)) * 100
    output_file.write("\nQuery Accuracy: "+str(query_accuracy)+"%")


[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
[10]
[11]
[12]
[13]
[14]
[15]
[16]
[17]
[18]
[19]
[20]
[21]
[22]
[23]
[24]
[25]
[26]
[27]
[28]
[29]
[30]
[31]
[32]
[33]
[34]
[35]
[36]
[37]
[38]
[39]
[40]
[41]
[42]
[43]
[37]
[45]
[46]
[47]
[48]
[49]
[50]
[51]
[52]
[53]
[54]
[55]
[56]
[57]
[37]
[59]
[60]
[61]
[62]
[25]
[64]
[65]
[66]
[67]
[68]
[69]
[70]
[71]
[72]
[73]
[74]
[75]
[76]
[77]
[78]
[79]
[80]
[81]
[82]
