<a href="https://colab.research.google.com/github/spatank/Curiosity/blob/master/v2/build_nets_for_eirene_KNOT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir('/content/drive/My Drive/Curiosity_IGT/KNOT')

Mounted at /content/drive


In [2]:
!ls # run !ls to verify location 

five_D.mat		 KNOT_data_processed	     KNOT_data_raw.csv
growing_KNOT_nets.ipynb  KNOT_data_processed_Eirene  subj_101.mat


In [3]:
import pandas as pd
# !pip install wikipedia2vec
# from wikipedia2vec import Wikipedia2Vec
import numpy as np
# from scipy.spatial.distance import cosine
import scipy
from scipy.io import savemat
import networkx as nx

In [4]:
wiki_df = pd.read_csv('KNOT_data_raw.csv')
wiki_df.head(3)

Unnamed: 0,ID,SourceName,TargetName,Day,TimeOrder,Hyperlink,DistanceWeights,AgeYears,SexOrient,Race,GenderFactor,EducDeg,Income,JE_5D,DS_5D,ST_5D,SC_5D,TS_5D,Count,Weight
0,101,/wiki/Jeff_Bezos,/wiki/Cloud_infrastructure,1,1,no,1.0,23.27945,Heterosexual,AsiaAm,0,BachDegree,20to49k,4.4,4.25,1.6,2.8,2.0,1,0.0
1,101,/wiki/Cloud_infrastructure,/wiki/Cloud_computing_security,1,2,yes,0.2,23.27945,Heterosexual,AsiaAm,0,BachDegree,20to49k,4.4,4.25,1.6,2.8,2.0,2,0.8
2,101,/wiki/Cloud_computing_security,/wiki/Cloud_infrastructure,1,3,no,0.2,23.27945,Heterosexual,AsiaAm,0,BachDegree,20to49k,4.4,4.25,1.6,2.8,2.0,3,0.8


# Assign UIDs to Wikipedia Pages

In [5]:
def clean_entity_name(name):
  name = name.replace('/wiki/', '')
  name = name.replace('_', ' ')
  return name

First, we create unique identifiers (UIDs) for each page so that they can be used as nodes in a network representation. Then we clean the strings associated with each page by stripping redundant information such as `wiki/` and `_`. The UIDs and clean names are appended to the data frame as new columns.

In [6]:
# create UID for each page
source_nodes = set(wiki_df['SourceName'].tolist())
target_nodes = set(wiki_df['TargetName'].tolist())
source_nodes.update(target_nodes)
node_set = {entity: name for name, entity in enumerate(source_nodes)}
wiki_df['SourceUID'] = wiki_df['SourceName'].apply(lambda x: node_set[x])
wiki_df['SrcNameClean'] = wiki_df['SourceName'].apply(lambda x: clean_entity_name(x))
wiki_df['TargetUID'] = wiki_df['TargetName'].apply(lambda x: node_set[x])
wiki_df['TgtNameClean'] = wiki_df['TargetName'].apply(lambda x: clean_entity_name(x))
wiki_df.head(3)

Unnamed: 0,ID,SourceName,TargetName,Day,TimeOrder,Hyperlink,DistanceWeights,AgeYears,SexOrient,Race,GenderFactor,EducDeg,Income,JE_5D,DS_5D,ST_5D,SC_5D,TS_5D,Count,Weight,SourceUID,SrcNameClean,TargetUID,TgtNameClean
0,101,/wiki/Jeff_Bezos,/wiki/Cloud_infrastructure,1,1,no,1.0,23.27945,Heterosexual,AsiaAm,0,BachDegree,20to49k,4.4,4.25,1.6,2.8,2.0,1,0.0,3778,Jeff Bezos,6710,Cloud infrastructure
1,101,/wiki/Cloud_infrastructure,/wiki/Cloud_computing_security,1,2,yes,0.2,23.27945,Heterosexual,AsiaAm,0,BachDegree,20to49k,4.4,4.25,1.6,2.8,2.0,2,0.8,6710,Cloud infrastructure,3623,Cloud computing security
2,101,/wiki/Cloud_computing_security,/wiki/Cloud_infrastructure,1,3,no,0.2,23.27945,Heterosexual,AsiaAm,0,BachDegree,20to49k,4.4,4.25,1.6,2.8,2.0,3,0.8,3623,Cloud computing security,6710,Cloud infrastructure


# Measures of Trait Curiosity

Extract all curiosity measures from the dataframe for each participant.

In [7]:
# joyous exploration
# deprivation sensitivity
# stress tolerance
# social curiosity
# thrill seeking
# five_D = wiki_df.groupby('ID', as_index = False)[['JE_5D', 'DS_5D', 'ST_5D', 'SC_5D', 'TS_5D']].mean()

In [8]:
# five_D.head(5)

In [9]:
# filename = 'five_D.mat'
# mdic = {name: col.values for name, col in five_D.items()}
# savemat(filename, mdic)

# Create Individual Networks

Next, we split the data set by individual, and use the `SourceUID`, `TargetUID`, and `SemanticDist` columns to generate network representations of participants' Wikipedia exploration.

In [10]:
# split the data by individual
ID_groups = wiki_df.groupby('ID')
for ID, group in ID_groups:
  print("Subject %d" % ID)
  # enforce time ordering
  group.sort_values(by = ['TimeOrder'], inplace = True)
  network_df = group[['TimeOrder', 'SourceUID', 'SrcNameClean', 'TargetUID', 'TgtNameClean']].reset_index(drop = True)
  # create an empty network
  G = nx.Graph()
  edge_info = []
  for index, row in network_df.iterrows():
    from_node = row.get('SrcNameClean')
    to_node = row.get('TgtNameClean')
    edge_info_dict = {'from': from_node, 'to': to_node}
    edge_info.append(edge_info_dict)
    # add nodes to the network
    G.add_node(from_node)
    G.add_node(to_node)
    # add edge to the network
    G.add_edge(from_node, to_node)
  adj_G = nx.linalg.graphmatrix.adjacency_matrix(G, weight = 'weight')
  # save subject data to .mat file
  filename = 'KNOT_data_processed_Eirene/' + 'subj_' + str(ID) + '.mat'
  mdic = {'subj': ID, 'adj': scipy.sparse.csr_matrix.todense(adj_G), 
          'nodes': list(G.nodes), 'edge_info': edge_info}
  savemat(filename, mdic)

Subject 101
Subject 104
Subject 105
Subject 106
Subject 107
Subject 108
Subject 109
Subject 112
Subject 114
Subject 115
Subject 117
Subject 119
Subject 120
Subject 121
Subject 122
Subject 126
Subject 127
Subject 128
Subject 130
Subject 131
Subject 132
Subject 135
Subject 138
Subject 139
Subject 140
Subject 141
Subject 146
Subject 150
Subject 153
Subject 154
Subject 155
Subject 156
Subject 157
Subject 158
Subject 159
Subject 162
Subject 164
Subject 165
Subject 167
Subject 169
Subject 171
Subject 173
Subject 174
Subject 176
Subject 177
Subject 179
Subject 183
Subject 185
Subject 188
Subject 189
Subject 190
Subject 191
Subject 192
Subject 194
Subject 196
Subject 197
Subject 198
Subject 199
Subject 201
Subject 204
Subject 206
Subject 207
Subject 208
Subject 209
Subject 210
Subject 211
Subject 212
Subject 214
Subject 216
Subject 217
Subject 219
Subject 220
Subject 221
Subject 223
Subject 224
Subject 225
Subject 226
Subject 228
Subject 229
Subject 231
Subject 232
Subject 234
Subject 235
Subj