In [1]:
# Fusion ART Model for Professor Data using Word2Vec
# Python fusionART.py code by Dr. Budhitama Subagdja, modified for this dataset by Patrick Tjahjadi

# ----- Imported Libraries -----
from fusionART import *

from collections import defaultdict

import pandas as pd

import random

from numpy import array
from numpy import argmax


from itertools import islice

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize 

import gensim
from gensim.models import Word2Vec
from gensim.test.utils import datapath
import gensim.downloader as api

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import csv
# -------------------------------

data = pd.read_csv('SCSE ProfProfile.csv', encoding='ISO-8859-1')

In [2]:
def convert_to_2d(list_to_convert, list_per_entry): 
    it = iter(list_to_convert) 
    return [list(islice(it, i)) for i in list_per_entry] 

In [3]:
# Convert each attribute of the dataset to separate lists
name = []
group = []
university = []
research_interest = []
for idx, row in data.iterrows():
    name.append(row[0])
    group.append(row[1])
    university.append(row[2])
    research_interest.append(row[3])

# Problem: research_interest and university uses semicolons to denote multiple interests and universities
# Change their format to a list of universities and research interests of a person

university_processed = []
for entry in university:
    uni = entry.split(";")
    university_processed.append(uni)

research_interest_processed = []
for entry in research_interest:
    interests = entry.split(";")
    research_interest_processed.append(interests)    

    
# Break university_processed and research_interest_processed to 1D lists for one hot encoding
university_1d = []
for universities in university_processed:
    for uni in universities:
        university_1d.append(uni)
        
research_interest_1d = []
for interests in research_interest_processed:
    for interest in interests:
        research_interest_1d.append(interest)
        
print(len(research_interest_1d))

339


In [4]:
# Determine the number of universities and research interests per professor

universities_per_person = []
for universities in university_processed:
    universities_per_person.append(len(universities))
    

research_interest_per_person = []
for research_interests in research_interest_processed:
    research_interest_per_person.append(len(research_interests))
    

In [5]:
# Research interest preprocessing, to consider each research keywords as a phrase
for index in range(0, len(research_interest_1d)):
    research_interest_1d[index] = research_interest_1d[index].replace(" ", "_")
    research_interest_1d[index] = research_interest_1d[index].lower()
    research_interest_1d[index] = research_interest_1d[index].replace(":", "")
    research_interest_1d[index] = research_interest_1d[index].replace(",", "")


In [6]:
# Create a list of unique names, groups, universities and research interests to define the model schema
# (I refrained from using sets because it would randomise the one-hot ordering)
unique_name = []
unique_group = []
unique_university = []
unique_research_interest = []
[unique_name.append(i) for i in name if i not in unique_name]
[unique_group.append(i) for i in group if i not in unique_group]
[unique_university.append(i) for i in university_1d if i not in unique_university]
[unique_research_interest.append(i) for i in research_interest_1d if i not in unique_research_interest]
pass


In [9]:
# Add an additional corpus to train the word2vec model. This is equivalent to finding a pretrained model.
corpus = api.load('text8')

text8 = []
with open('testtext.txt','r', encoding='utf-8') as f:
    for line in f:
        for word in line.split():
            text8.append(word)  
        
text8_tokens = convert_to_2d(text8, [1] * len(text8))

In [14]:
# Convert research keywords into tokens for word embedding
research_tokens = []
for keyword in research_interest_1d:
    for token in sent_tokenize(keyword):
        words_list = []
        
        for word in word_tokenize(token):
            words_list.append(word.lower())
        research_tokens.append(words_list)

# Initialise the word embedding (word2vec) model here. This will take a while
research_word2vec = gensim.models.Word2Vec(min_count = 1, size = 20) 
research_word2vec.build_vocab(text8_tokens)
research_word2vec.train(text8_tokens, total_examples = research_word2vec.corpus_count, epochs = 10, report_delay = 1)
research_word2vec.build_vocab(research_tokens, update = True)
research_word2vec.train(research_tokens, total_examples = research_word2vec.corpus_count, epochs = 10, report_delay = 1)


(2572, 3390)

In [15]:
research_word2vec["algorithms"]

  """Entry point for launching an IPython kernel.


array([ 0.0087668 ,  0.00518225,  0.01588449, -0.01902688, -0.00289596,
        0.02028284, -0.02342401,  0.01160187,  0.00939397, -0.01336284,
        0.00382263,  0.00798467,  0.02302893,  0.01601373,  0.00599154,
        0.02051213,  0.01383519,  0.00964117, -0.00962269,  0.01667475],
      dtype=float32)

In [10]:
# Map each research keyword as a weight vector that measures its similarity with all other research keywords
research_interest_weight_dict = defaultdict(list)
for keyword1 in sorted(unique_research_interest):
    keyword1_weights = []
    for keyword2 in sorted(unique_research_interest):
        keyword1_weights.append(research_word2vec.wv.similarity(w1=keyword1, w2=keyword2))
    research_interest_weight_dict[keyword1] = keyword1_weights


In [18]:
# One Hot Encoding for name, group and university
name_label_encoder = LabelEncoder()
uni_label_encoder = LabelEncoder()
group_label_encoder = LabelEncoder()
research_label_encoder = LabelEncoder()

onehot_encoder = OneHotEncoder(sparse=False)

name_integer_encoded = name_label_encoder.fit_transform(name)
name_integer_encoded = name_integer_encoded.reshape(len(name_integer_encoded), 1)
name_onehot = onehot_encoder.fit_transform(name_integer_encoded)

group_integer_encoded = group_label_encoder.fit_transform(group)
group_integer_encoded = group_integer_encoded.reshape(len(group_integer_encoded), 1)
group_onehot = onehot_encoder.fit_transform(group_integer_encoded)

university_integer_encoded = uni_label_encoder.fit_transform(array(university_1d))
university_integer_encoded = university_integer_encoded.reshape(len(university_integer_encoded), 1)
university_onehot = onehot_encoder.fit_transform(university_integer_encoded)


In [19]:
# Convert the numpy arrays to lists
name_onehotlist = name_onehot.tolist()
group_onehotlist = group_onehot.tolist()
university_onehotlist = university_onehot.tolist()



In [11]:
# Provide weight vectors for each research interest
research_interest_weights = []
for res_int in research_interest_1d:
    research_interest_weights.append(research_interest_weight_dict[res_int])
    

[-0.49677694, 0.1066994, -0.118644685, 0.40415332, -0.19832473, -0.40608254, 0.9221008, -0.55682427, -0.2817553, -0.59580547, -0.3385826, 0.67835087, 0.26444915, 0.15570839, 0.19639196, -0.7126493, -0.4452606, 0.37539482, 0.2664469, -0.5881864, -0.09841175, -0.42444468, -0.19838499, -0.022641424, 0.52507997, 0.48079786, 0.5462976, 0.9261795, 0.23366787, -0.085772865, 0.5515535, -0.32736385, 0.15046833, -0.12647739, 0.59969145, -0.52948034, -0.44073048, 0.95910805, -0.3593909, 0.8756511, 0.44077715, 0.019135168, -0.547876, -0.39659113, -0.56514364, 0.1378775, 0.2675411, -0.5256015, -0.097412735, -0.3379861, 1.0, 0.47397444, -0.8472109, -0.38374823, -0.25641125, 0.5686275, 0.6752799, 0.041542526, -0.28418386, -0.29410338, -0.7655185, 0.3796749, -0.088432655, 0.9310238, 0.038161628, 0.0062015187, -0.6506736, -0.050263662, -0.36300707, 0.040310852, 0.004231941, 0.18718244, 0.65080565, -0.8860516, 0.6587013, -0.14501646, -0.45632702, 0.71250504, -0.43784487, -0.29466188, 0.008142829, -0.343

In [21]:
# Problem: each professor has multiple universities and research interests
# Convert these 1D arrays to 2D, to associate them to each professor

university_onehotlist_processed = []
research_interest_onehotlist_processed = []

university_onehotlist_processed = convert_to_2d(university_onehotlist, universities_per_person)
research_interest_processed = convert_to_2d(research_interest_weights, research_interest_per_person)

In [22]:
# Create an array that allows multiple universities and research interests per professor by summation
university_2d = []
for universities_per_professor in university_onehotlist_processed:
    university_2d.append([sum(x) for x in zip(*universities_per_professor)])
    
research_interest_2d = []
for research_interests_per_professor in research_interest_processed:
    research_interest_2d.append([sum(x) for x in zip(*research_interests_per_professor)])

# Since universities can be duplicate, change so that the maximum value of each university is 1
for universities_per_professor in university_2d:
    for index in range(0, len(universities_per_professor)):
        if universities_per_professor[index] > 1:
            universities_per_professor[index] = 1.0


In [24]:
# Normalise the values of all research keyword vectors to the range of between 0 to 1

normalised_research_interest_2d = []
for vector in research_interest_2d:
    max_value = max(vector)
    min_value = min(vector)
    for index in range(0, len(vector)):
        vector[index] = (vector[index] - min_value) / (max_value - min_value)
    normalised_research_interest_2d.append(vector)

print(normalised_research_interest_2d[0])

[0.5847743797335204, 0.41249177316961694, 0.353667044079105, 0.8403369562922172, 0.7696029707997288, 0.41611003226564386, 0.5911853323507372, 0.3510487541795679, 0.6431556105945025, 0.7278483537434195, 0.44491530277808033, 0.6019151985935227, 0.736787271419172, 0.046443245629906954, 0.09083527013706316, 0.13864897897541537, 0.49631113527180293, 0.5298540899809486, 0.09580770132568295, 0.7460529191908084, 0.08916283391772273, 0.5473994225422364, 0.8696110175139193, 0.4963449352142971, 0.7592342514953979, 0.6783314417999922, 0.39455205981894, 0.7369633525686926, 0.22636172903385024, 0.3449804262559528, 0.7923720607840309, 0.6995979740530659, 0.9433867624180944, 0.1317438745776991, 0.7716840724112958, 0.32188044241780334, 0.8893982468822692, 0.7579560777239427, 0.19173446690657364, 0.2840718493535331, 0.19783170148909668, 0.7929950019932859, 0.3643880662527073, 0.7050284052804627, 0.08708895833577637, 0.951166367766936, 0.21742265099371602, 0.4604856608643815, 0.47913401796048555, 0.72723

In [22]:
# Define the schema for the FusionART model
model_schema = [{'name': data.columns[0], 'attrib': sorted(unique_name)}, 
                {'name': data.columns[1], 'attrib': sorted(unique_group)},
                {'name': data.columns[2], 'attrib': sorted(unique_university)},
                {'name': data.columns[3], 'attrib': sorted(unique_research_interest)}]

In [23]:
# Initialise the FusionART model
model = FusionART(schema = model_schema, beta = [1.0, 1.0, 1.0, 1.0], alpha = [0.1, 0.1, 0.1, 0.1], 
               gamma = [0.25, 0.25, 0.25, 0.25], rho = [0.2, 0.2, 0.5, 0.5])
model.F1Fields

[{'name': 'name',
  'attrib': ['AS Madhukumar',
   'Alexei Sourin',
   'Anupam Chattopadhyay',
   'Anwitaman Datta',
   'Arijit Khan',
   'Arvind Easwaran',
   'Bo An',
   'Cai Jianfei',
   'Cai Wentong',
   'Cham Tat Jen',
   'Chan Syin',
   'Chia Liang Tien Clement',
   'Chng Eng Siong',
   'Deepu Rajan',
   'Deng Ruilong',
   'Douglas Leslie Maskell',
   'Dusit Niyato',
   'Eric Cambria',
   'Gao Cong',
   'Goh Wooi Boon',
   'Guan Cuntai',
   'He Ying',
   'Huang Shell Ying',
   'Hui Siu Cheung',
   'Jagath C. Rajapakse',
   'Ke Yiping Kelly',
   'Kong Wai Kin Adams',
   'Kwoh Chee Keong',
   'Lam Kwok Yan',
   'Lam Siew Kei',
   'Lau Chiew Tong',
   'Lee Bu Sung Francis',
   'Li Fang Flora',
   'Li Mo',
   'Li Yi',
   'Liang Qianhui Althea',
   'Lin Feng',
   'Lin Guosheng',
   'Lin Shang Wei',
   'Lin Weisi',
   'Liu Weichen',
   'Liu Yang',
   'Loke Yuan Ren',
   'Long Cheng',
   'Loy Chen Change (Cavan)',
   'Lu Shijian',
   'Luo Jun',
   'Mahardhika Pratama',
   'Miao Chunyan'

In [25]:
# Store the data to the Fusion ART model

for i in range(0, len(name)):
    model.updateF1bySchema([{'name': data.columns[0], 'val': name_onehotlist[i]}, 
                            {'name': data.columns[1], 'val': group_onehotlist[i]}, 
                            {'name': data.columns[2], 'val': university_2d[i]}, 
                            {'name': data.columns[3], 'val': normalised_research_interest_2d[i]}])

    
    print("resonance search: ")
    J = model.resSearch()
    print("selected ", J)
    if model.uncommitted(J):
        print ('uncommitted')

    model.autoLearn(J)
    model.doReadoutAllFields(J)
                    
    
model.displayNetwork()

resonance search: 
selected  0
resonance search: 
selected  1
resonance search: 
selected  2
resonance search: 
selected  3
resonance search: 
selected  4
resonance search: 
selected  5
resonance search: 
selected  6
resonance search: 
selected  7
resonance search: 
selected  8
resonance search: 
selected  9
resonance search: 
selected  10
resonance search: 
selected  11
resonance search: 
selected  12
resonance search: 
selected  13
resonance search: 
selected  14
resonance search: 
selected  15
resonance search: 
selected  16
resonance search: 
selected  17
resonance search: 
selected  18
resonance search: 
selected  19
resonance search: 
selected  20
resonance search: 
selected  21
resonance search: 
selected  22
resonance search: 
selected  23
resonance search: 
selected  24
resonance search: 
selected  25
resonance search: 
selected  26
resonance search: 
selected  27
resonance search: 
selected  28
resonance search: 
selected  29
resonance search: 
selected  30
resonance search: 

Code: 39 {'F2': 0.18867483642207564, 'weights': [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0

In [19]:
# Perform the name query here

def query_by_name(query_name):
    
    name_found = 0
    
    # Find the corresponding one-hot vector for the input name
    for onehot_vec in name_onehotlist:
        if (query_name == name_label_encoder.inverse_transform([argmax(onehot_vec)])[0]):
            query_name = onehot_vec
            name_found = 1

    if (name_found == 1):
        model.setParam('gamma', [1,0,0,0])
        model.updateF1bySchema([{'name': data.columns[0], 'val': query_name}])
        model.compChoice()

        # Find the node(s) that have the highest F2 value
        maxF2value = 0
        maxF2indexes = []

        for i in range(0, len(model.codes)):
            F2value = model.codes[i]['F2']
            if (F2value > maxF2value):
                maxF2value = F2value
                maxF2indexes = [i]
            elif (F2value == maxF2value):
                maxF2indexes.append(i)

        category_vector = ["Name:", "Group:", "Universities:", "Research Interests:"]
        # Retrieve the data of the node(s) that have the highest F2 value
        for node in maxF2indexes:
            # Iterate through the node based on category (Name -> Group -> Universities -> Research Interests)
            category_order = 0
            for category in model.codes[node]['weights']:
                print(category_vector[category_order])
                # Iterate through the vector and find the indexes that has a value of 1
                if (category_order != 3):
                    for index in range(0, len(category)):
                            if (category[index]) == 1:
                                # Retrieve the data from the model's 'attrib' attribute in F1
                                print(model.F1Fields[category_order]['attrib'][index])
                else:
                    for index in range(0, len(category)):
                        max_F2value = 0
                        max_F2index = -1
                        if (category[index] > max_F2value):
                            category[index] = max_F2value
                            max_F2index = index
                    print(model.F1Fields[category_order]['attrib'][max_F2index])
                print()
                category_order += 1
    else:
        print("Professor not found!")
        
query_name = input("Input the name of the professor: ")
query_by_name(query_name)

Input the name of the professor: Alexei Sourin
Name:
Alexei Sourin

Group:
Graphics and Interactive Computing

Universities:
Moscow Engineering Physics Institute

Research Interests:
wireless_video



In [15]:
# Perform the research keywords query here

def query_by_research_with_noise(query_research, output_file, noise_limit):
    
    query_research = query_research.split(";")
    output_file.write("Research Keywords: ")
    output_file.write(', '.join(query_research)+"\n")

    # Convert all the research keywords into input vectors
    for research_index in range(0, len(query_research)):
        for onehot_vec in research_interest_onehotlist:
            if (str(query_research[research_index]).rstrip("\n") == 
                research_label_encoder.inverse_transform([argmax(onehot_vec)])[0]):
                query_research[research_index] = onehot_vec
                
    # Remove keywords that could not be converted to input vectors
    index = 0
    while (index < len(query_research)):
        if type(query_research[index]) != list:
            del query_research[index]
        else:
            index += 1


    # Combine all the research keywords inputs into one vector that contains all research keywords
    research_vector = [sum(research_ints) for research_ints in zip(*query_research)]
    
    # With a defined probability, flip the binary research keyword vectors due to noise
    for index in range(0, len(research_vector)):
        noise_value = random.random()
        if (noise_value < noise_limit):
            if (research_vector[index] == 0):
                research_vector[index] = 1
            else:
                research_vector[index] = 0

    model.setParam('gamma', [0,0,0,1])
    model.updateF1bySchema([{'name': data.columns[3], 'val': research_vector}])
    model.compChoice()

    # Find the nodes that have the highest F2 value
    maxF2value = 0
    maxF2indexes = []

    for i in range(0, len(model.codes)-1):
        F2value = model.codes[i]['F2']
        if (F2value > maxF2value):
            maxF2value = F2value
            maxF2indexes = [i]
        elif (F2value == maxF2value):
            maxF2indexes.append(i)

    # Retrieve the data of the node(s) that have the highest F2 value
    output_file.write("Professor that follows these research keywords:")    
    for node in maxF2indexes:
        name_vector = model.codes[node]['weights'][0]
        for index in range(0, len(name_vector)):
            # Iterate through the vector and find the indexes that has a value of 1
            if (name_vector[index] == 1):
                # Retrieve the data from the model's 'attrib' attribute in F1 
                retrieved_name = model.F1Fields[0]['attrib'][index]
                output_file.write(retrieved_name)
    
    output_file.write("\n\n")
    return(retrieved_name)


In [43]:
# Research keywords query for 0% noise
output_file = open("output_zero_noise.txt", "w+")
correct_match = 0

# Use a batch file as input for the research keywords query
with open('research_query.bat', 'r') as f:
    reader = csv.reader(f, delimiter = '\t')
    for batch_name, keywords in reader:
        retrieved_name = query_by_research_with_noise(keywords, output_file, 0)
        if (retrieved_name == batch_name):
            correct_match += 1
            
query_accuracy = (correct_match / len(name)) * 100
output_file.write("\nQuery Accuracy: "+str(query_accuracy)+"%")
output_file.close()


In [59]:
# Research keywords query for 10% noise
output_file = open("output_ten_noise.txt", "w+")
correct_match = 0

# Use a batch file as input for the research keywords query
with open('research_query.bat', 'r') as f:
    reader = csv.reader(f, delimiter = '\t')
    for batch_name, keywords in reader:
        retrieved_name = query_by_research_with_noise(keywords, output_file, 0.1)
        if (retrieved_name == batch_name):
            correct_match += 1
            
query_accuracy = (correct_match / len(name)) * 100
output_file.write("\nQuery Accuracy: "+str(query_accuracy)+"%")
output_file.close()

In [60]:
# Research keywords query for 20% noise
output_file = open("output_twenty_noise.txt", "w+")
correct_match = 0

# Use a batch file as input for the research keywords query
with open('research_query.bat', 'r') as f:
    reader = csv.reader(f, delimiter = '\t')
    for batch_name, keywords in reader:
        retrieved_name = query_by_research_with_noise(keywords, output_file, 0.2)
        if (retrieved_name == batch_name):
            correct_match += 1
            
query_accuracy = (correct_match / len(name)) * 100
output_file.write("\nQuery Accuracy: "+str(query_accuracy)+"%")
output_file.close()

In [61]:
# Research keywords query for 30% noise
output_file = open("output_thirty_noise.txt", "w+")
correct_match = 0

# Use a batch file as input for the research keywords query
with open('research_query.bat', 'r') as f:
    reader = csv.reader(f, delimiter = '\t')
    for batch_name, keywords in reader:
        retrieved_name = query_by_research_with_noise(keywords, output_file, 0.3)
        if (retrieved_name == batch_name):
            correct_match += 1
            
query_accuracy = (correct_match / len(name)) * 100
output_file.write("\nQuery Accuracy: "+str(query_accuracy)+"%")
output_file.close()

In [62]:
# Research keywords query for 40% noise
output_file = open("output_forty_noise.txt", "w+")
correct_match = 0

# Use a batch file as input for the research keywords query
with open('research_query.bat', 'r') as f:
    reader = csv.reader(f, delimiter = '\t')
    for batch_name, keywords in reader:
        retrieved_name = query_by_research_with_noise(keywords, output_file, 0.4)
        if (retrieved_name == batch_name):
            correct_match += 1
            
query_accuracy = (correct_match / len(name)) * 100
output_file.write("\nQuery Accuracy: "+str(query_accuracy)+"%")
output_file.close()

In [63]:
# Research keywords query for 50% noise
output_file = open("output_fifty_noise.txt", "w+")
correct_match = 0

# Use a batch file as input for the research keywords query
with open('research_query.bat', 'r') as f:
    reader = csv.reader(f, delimiter = '\t')
    for batch_name, keywords in reader:
        retrieved_name = query_by_research_with_noise(keywords, output_file, 0.5)
        if (retrieved_name == batch_name):
            correct_match += 1
            
query_accuracy = (correct_match / len(name)) * 100
output_file.write("\nQuery Accuracy: "+str(query_accuracy)+"%")
output_file.close()