<a href="https://colab.research.google.com/github/spatank/Curiosity/blob/master/v2/build_nets_for_eirene_KNOT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Switch the working directory to the KNOT project folder so that the
# relative file names used later in the notebook (the raw CSV that is read
# and the .mat files that are written) resolve inside it.
import os
os.chdir('/content/drive/My Drive/Curiosity_IGT/KNOT')

In [None]:
!ls # list the current working directory to confirm the chdir above succeeded

In [None]:
import pandas as pd
!pip install wikipedia2vec
from wikipedia2vec import Wikipedia2Vec
import numpy as np
from scipy.spatial.distance import cosine
from scipy.io import savemat
import networkx as nx

In [None]:
# Load the raw KNOT data; the path is relative to the current working directory.
wiki_df = pd.read_csv(filepath_or_buffer = 'KNOT_data_raw.csv')
# Preview the first three rows to confirm the file loaded as expected.
wiki_df.head(n = 3)

# Assign UIDs to Wikipedia Pages

In [None]:
def clean_entity_name(name):
  """Convert a raw Wikipedia page path into a readable page title.

  Every occurrence of '/wiki/' is removed first, then every underscore is
  replaced by a space, e.g. '/wiki/Albert_Einstein' -> 'Albert Einstein'.

  Args:
    name: raw page string (must support str.replace).

  Returns:
    The cleaned page title as a new string.
  """
  return name.replace('/wiki/', '').replace('_', ' ')

First, we create unique identifiers (UIDs) for each page so that they can be used as nodes in a network representation. Then we clean the strings associated with each page by removing the `/wiki/` path component and replacing underscores (`_`) with spaces. The UIDs and clean names are appended to the data frame as new columns.

In [None]:
# create UID for each page
source_nodes = set(wiki_df['SourceName'].tolist())
target_nodes = set(wiki_df['TargetName'].tolist())
# after this update, source_nodes holds every page seen as a source or a target
source_nodes.update(target_nodes)
# Enumerate the pages in sorted order. Iterating the set directly depends on
# Python's per-process string hash randomization, so the page -> UID mapping
# would differ from one kernel session to the next.
# NOTE(review): sorting assumes every page name is a string; missing (NaN)
# names would need to be handled before this step.
node_set = {page: uid for uid, page in enumerate(sorted(source_nodes))}
# attach the UID and the cleaned, human-readable name of each source/target page
wiki_df['SourceUID'] = wiki_df['SourceName'].map(node_set)
wiki_df['SrcNameClean'] = wiki_df['SourceName'].apply(clean_entity_name)
wiki_df['TargetUID'] = wiki_df['TargetName'].map(node_set)
wiki_df['TgtNameClean'] = wiki_df['TargetName'].apply(clean_entity_name)
wiki_df.head(3)

# Measures of Trait Curiosity

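Extract the five trait-curiosity subscale scores (joyous exploration, deprivation sensitivity, stress tolerance, social curiosity, and thrill seeking) for each participant by averaging each score column over that participant's rows.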

In [None]:
# One row per participant ID; each subscale column is averaged over all of
# that participant's rows in wiki_df.
five_D = (
  wiki_df
  .groupby('ID', as_index = False)
  [[
    'JE_5D',  # joyous exploration
    'DS_5D',  # deprivation sensitivity
    'ST_5D',  # stress tolerance
    'SC_5D',  # social curiosity
    'TS_5D',  # thrill seeking
  ]]
  .mean()
)

In [None]:
# Preview the per-participant curiosity scores (first five participants).
five_D.head(n = 5)

In [None]:
# Optional export (currently disabled): writes each column of five_D to
# 'five_D.mat' as its own variable. Uncomment the lines below to regenerate it.
# filename = 'five_D.mat'
# mdic = {name: col.values for name, col in five_D.items()}
# savemat(filename, mdic)

# Create Individual Networks

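Next, we split the data set by individual and, for each participant, build a network of their Wikipedia exploration in time order: nodes are the cleaned page names (`SrcNameClean`, `TgtNameClean`) and each edge is weighted by the `SemanticDist` between the two pages. The adjacency matrix is recorded after every added edge, and the results are saved to one `.mat` file per participant.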

In [None]:
# Build one growing network per participant and save it to 'subj_<ID>.mat'.
# split the data by individual
ID_groups = wiki_df.groupby('ID')
for ID, group in ID_groups:
  # enforce time ordering; bind the sorted result to a new frame rather than
  # sorting the groupby slice in place, which can raise SettingWithCopyWarning
  group = group.sort_values(by = ['TimeOrder'])
  network_df = group[['TimeOrder', 'SourceUID', 'SrcNameClean', 'TargetUID', 'TgtNameClean', 'SemanticDist']].reset_index(drop = True)
  # create an empty network
  G = nx.Graph()
  all_adj = []    # one adjacency-matrix snapshot per visited edge, in time order
  edge_info = []  # one {'from', 'to', 'weight'} record per visited edge
  # incrementally add nodes and edges to the network
  edge_columns = zip(network_df['SrcNameClean'], network_df['TgtNameClean'], network_df['SemanticDist'])
  for from_node, to_node, edge_weight in edge_columns:
    edge_info_dict = {'from': from_node, 'to': to_node, 'weight': edge_weight}
    edge_info.append(edge_info_dict)
    # add edge to the network; nodes are the cleaned page names, and because
    # G is an undirected nx.Graph, re-adding an existing pair only updates
    # that edge's weight
    G.add_edge(from_node, to_node, weight = edge_weight)
    # snapshot the network after this step; with no nodelist given, rows and
    # columns follow the order of G.nodes(), so the matrix grows as new pages
    # are visited
    adj_G = nx.adjacency_matrix(G, weight = 'weight')
    all_adj.append(adj_G)
  # save subject data to .mat file
  filename = 'subj_' + str(ID) + '.mat'
  mdic = {'subj': ID, 'all_adj': all_adj, 'edge_info': edge_info}
  savemat(filename, mdic)