<a href="https://colab.research.google.com/github/spatank/Curiosity/blob/master/growing_KNOT_nets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Make data available to Colab by mounting your Drive.

In [None]:
# Mount Google Drive inside the Colab VM, then switch to the KNOT project
# folder so that the relative file names used in later cells resolve.
import os

from google.colab import drive

drive.mount('/content/drive')
os.chdir('/content/drive/My Drive/Curiosity_IGT/KNOT')

Mounted at /content/drive


In [None]:
# IPython shell escape: list the current directory to confirm the working
# directory is the KNOT project folder.
!ls # run !ls to verify location 

growing_KNOT_nets.ipynb  KNOT_data_processed  KNOT_data_raw.csv


# Import Packages

In [None]:
import pandas as pd
!pip install wikipedia2vec
from wikipedia2vec import Wikipedia2Vec
import numpy as np
from scipy.spatial.distance import cosine
from scipy.io import savemat
import networkx as nx

Collecting wikipedia2vec
[?25l  Downloading https://files.pythonhosted.org/packages/d8/88/751037c70ca86581d444824e66bb799ef9060339a1d5d1fc1804c422d7cc/wikipedia2vec-1.0.4.tar.gz (1.2MB)
[K     |████████████████████████████████| 1.2MB 9.2MB/s 
Collecting marisa-trie
[?25l  Downloading https://files.pythonhosted.org/packages/20/95/d23071d0992dabcb61c948fb118a90683193befc88c23e745b050a29e7db/marisa-trie-0.7.5.tar.gz (270kB)
[K     |████████████████████████████████| 276kB 40.2MB/s 
[?25hCollecting mwparserfromhell
[?25l  Downloading https://files.pythonhosted.org/packages/c6/00/03ccc2676e592f73ce455fd0343eb38d3779878332ba01ef4c0281a7d2a9/mwparserfromhell-0.6-cp36-cp36m-manylinux1_x86_64.whl (174kB)
[K     |████████████████████████████████| 184kB 18.8MB/s 
Building wheels for collected packages: wikipedia2vec, marisa-trie
  Building wheel for wikipedia2vec (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia2vec: filename=wikipedia2vec-1.0.4-cp36-cp36m-linux_x86_64.whl size=

# Data Wrangling

This section pre-processes the KNOT data to get it into a format suited to network analysis. The data contains the names of Wikipedia pages visited by a participant in two columns: `SourceName` and `TargetName`. 


In [None]:
# Load the raw KNOT browsing log; each row holds one SourceName -> TargetName
# page transition together with participant-level columns.
wiki_df = pd.read_csv(filepath_or_buffer = 'KNOT_data_raw.csv')
# preview the first three rows
wiki_df.head(n = 3)

Unnamed: 0,ID,SourceName,TargetName,Day,TimeOrder,Hyperlink,DistanceWeights,AgeYears,SexOrient,Race,GenderFactor,EducDeg,Income,JE_5D,DS_5D,ST_5D,SC_5D,TS_5D,Count,Weight
0,101,/wiki/Jeff_Bezos,/wiki/Cloud_infrastructure,1,1,no,1.0,23.27945,Heterosexual,AsiaAm,0,BachDegree,20to49k,4.4,4.25,1.6,2.8,2.0,1,0.0
1,101,/wiki/Cloud_infrastructure,/wiki/Cloud_computing_security,1,2,yes,0.2,23.27945,Heterosexual,AsiaAm,0,BachDegree,20to49k,4.4,4.25,1.6,2.8,2.0,2,0.8
2,101,/wiki/Cloud_computing_security,/wiki/Cloud_infrastructure,1,3,no,0.2,23.27945,Heterosexual,AsiaAm,0,BachDegree,20to49k,4.4,4.25,1.6,2.8,2.0,3,0.8


In [None]:
def clean_entity_name(name):
  """Turn a raw page path such as '/wiki/Jeff_Bezos' into a plain title.

  Every occurrence of '/wiki/' is removed and every underscore becomes a
  space, e.g. '/wiki/Jeff_Bezos' -> 'Jeff Bezos'.
  """
  # drop the '/wiki/' marker wherever it occurs
  without_prefix = ''.join(name.split('/wiki/'))
  # underscores stand in for spaces in page paths
  return ' '.join(without_prefix.split('_'))

First, we create unique identifiers (UIDs) for each page so that they can be used as nodes in a network representation. Then we clean the string associated with each page by removing the `/wiki/` prefix and replacing each underscore (`_`) with a space. The UIDs and clean names are appended to the data frame as new columns.

In [None]:
# create UID for each page
# A page is a node whether it was visited as a source or as a target.
source_nodes = set(wiki_df['SourceName'].tolist())
target_nodes = set(wiki_df['TargetName'].tolist())
source_nodes.update(target_nodes)  # source_nodes now holds every distinct page
# Enumerate the pages in sorted order: the iteration order of a set of strings
# changes between Python sessions, which would otherwise hand a page a
# different UID on every run.
node_set = {page: uid for uid, page in enumerate(sorted(source_nodes))}
# attach the UID and the cleaned title of both endpoints to every transition
wiki_df['SourceUID'] = wiki_df['SourceName'].map(node_set)
wiki_df['SrcNameClean'] = wiki_df['SourceName'].map(clean_entity_name)
wiki_df['TargetUID'] = wiki_df['TargetName'].map(node_set)
wiki_df['TgtNameClean'] = wiki_df['TargetName'].map(clean_entity_name)
wiki_df.head(3)

Unnamed: 0,ID,SourceName,TargetName,Day,TimeOrder,Hyperlink,DistanceWeights,AgeYears,SexOrient,Race,GenderFactor,EducDeg,Income,JE_5D,DS_5D,ST_5D,SC_5D,TS_5D,Count,Weight,SourceUID,SrcNameClean,TargetUID,TgtNameClean
0,101,/wiki/Jeff_Bezos,/wiki/Cloud_infrastructure,1,1,no,1.0,23.27945,Heterosexual,AsiaAm,0,BachDegree,20to49k,4.4,4.25,1.6,2.8,2.0,1,0.0,15859,Jeff Bezos,1065,Cloud infrastructure
1,101,/wiki/Cloud_infrastructure,/wiki/Cloud_computing_security,1,2,yes,0.2,23.27945,Heterosexual,AsiaAm,0,BachDegree,20to49k,4.4,4.25,1.6,2.8,2.0,2,0.8,1065,Cloud infrastructure,4050,Cloud computing security
2,101,/wiki/Cloud_computing_security,/wiki/Cloud_infrastructure,1,3,no,0.2,23.27945,Heterosexual,AsiaAm,0,BachDegree,20to49k,4.4,4.25,1.6,2.8,2.0,3,0.8,4050,Cloud computing security,1065,Cloud infrastructure


Extract all curiosity measures from the dataframe for each participant.

In [None]:
# Curiosity measure columns, one per dimension:
curiosity_columns = [
  'JE_5D',  # joyous exploration
  'DS_5D',  # deprivation sensitivity
  'ST_5D',  # stress tolerance
  'SC_5D',  # social curiosity
  'TS_5D',  # thrill seeking
]
# One row per participant ID; each score is averaged over that participant's
# rows (the scores appear to be repeated on every row of a participant).
five_D = wiki_df.groupby('ID', as_index = False)[curiosity_columns].mean()

In [None]:
# Display the per-participant curiosity scores.
five_D

Unnamed: 0,ID,JE_5D,DS_5D,ST_5D,SC_5D,TS_5D
0,101,4.4,4.25,1.6,2.8,2.00
1,104,3.2,2.50,2.4,4.0,0.00
2,105,3.0,3.00,3.0,2.4,1.00
3,106,5.6,3.50,1.4,5.2,1.25
4,107,3.4,2.75,2.5,2.4,0.75
...,...,...,...,...,...,...
144,355,4.6,3.20,3.2,4.8,1.00
145,356,4.6,4.40,1.4,4.2,3.00
146,359,5.6,5.00,1.8,5.8,4.40
147,363,4.6,4.80,2.6,4.4,2.80


In [None]:
# Export the curiosity scores to a MATLAB file: one variable per data-frame
# column, named after the column and holding that column's values.
filename = 'five_D.mat'
mdic = {column: five_D[column].values for column in five_D.columns}
savemat(filename, mdic)

# Wikipedia2Vec

Each row in the data frame represents a transition made by a participant from one Wikipedia page to another. The associated edge weight can be obtained as the semantic distance between the contents of the two pages. In order to quantify the semantic distance between Wikipedia entities, we use a pre-trained model that represents each page as an n-dimensional vector. The distance between two pages, `SemanticDist`, is then computed as the cosine distance (one minus the cosine similarity) between their vector representations.

In [None]:
# Load the pre-trained Wikipedia2Vec model from the current working directory.
# NOTE(review): the file name suggests 300-dimensional English Wikipedia
# embeddings — confirm. The file is not among those shown by the `!ls` output
# earlier in the notebook, so it has to be placed in this folder first.
model_file = 'enwiki_20180420_300d.pkl'
wiki2vec = Wikipedia2Vec.load(model_file)

In [None]:
def check_entity_vector(entity):
  """Flag whether `entity` lacks an embedding in the loaded model.

  Args:
    entity: cleaned page title to look up in `wiki2vec`.

  Returns:
    1 if the lookup raises KeyError (no vector for this title), otherwise 0.
    Integer flags are used so that summing them counts the missing pages.
  """
  try:
    # only the lookup itself can raise; the vector is not needed here
    wiki2vec.get_entity_vector(entity)
  except KeyError:
    return 1
  return 0

In [None]:
# One flag per distinct page (1 = no embedding available, 0 = embedding found),
# in the iteration order of node_set. Raw page paths are cleaned into titles
# before the lookup.
no_vec_entities = [
  check_entity_vector(clean_entity_name(page))
  for page in node_set
]

In [None]:
# Number of distinct pages checked (one flag per entry of node_set).
len(no_vec_entities)

18378

In [None]:
# Number of pages without an embedding (each such page contributes a 1).
sum(no_vec_entities)

2207

12% of the pages visited by participants in the KNOT data do not have corresponding vector embeddings. We represent these pages by a random vector.

In [None]:
def _entity_vector(entity, dim = 300):
  """Return the embedding of `entity`, or a reproducible random stand-in.

  Pages without an embedding get a vector drawn uniformly from [0, 1) in each
  of `dim` coordinates. The generator is seeded from the page title, so a
  given page always maps to the same vector, both within a run and across
  runs. Drawing a fresh unseeded vector on every call would represent the
  same page differently on each edge and give a self-transition a non-zero
  distance.

  NOTE(review): the stand-in has only non-negative coordinates; whether that
  is comparable to the model's embeddings should be confirmed.
  """
  try:
    return wiki2vec.get_entity_vector(entity)
  except KeyError:
    import zlib  # local import: only needed on the fallback path
    # crc32 is stable across sessions (unlike the built-in hash of a str)
    seed = zlib.crc32(entity.encode('utf-8'))
    return np.random.RandomState(seed).random_sample(dim)


def semantic_dist(entity_1, entity_2):
  """Cosine distance between the vector representations of two pages.

  Args:
    entity_1: cleaned title of the first page.
    entity_2: cleaned title of the second page.

  Returns:
    The value of scipy's `cosine` for the two vectors. A page that has no
    embedding in `wiki2vec` is represented by a fixed pseudo-random vector
    (see `_entity_vector`).
  """
  v1 = _entity_vector(entity_1)
  v2 = _entity_vector(entity_2)
  return cosine(v1, v2)

In [None]:
def _transition_distance(row):
  """Semantic distance between the two pages of one transition (one row)."""
  return semantic_dist(row['SrcNameClean'], row['TgtNameClean'])

# one distance per transition, aligned with the rows of wiki_df
wiki_df['SemanticDist'] = wiki_df.apply(_transition_distance, axis = 1)

# Create Individual Networks

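Next, we split the data set by individual and generate a network representation of each participant's Wikipedia exploration. Transitions are added in `TimeOrder`; the cleaned page names (`SrcNameClean`, `TgtNameClean`) serve as node labels and `SemanticDist` as the edge weight. The adjacency matrix is recorded after every added transition, so each participant's file holds the full sequence of growing networks.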

In [None]:
# split the data by individual
ID_groups = wiki_df.groupby('ID')
for ID, group in ID_groups:
  # enforce time ordering
  # Sort into a new frame instead of in place: `group` is handed out by
  # groupby, and mutating it can trigger pandas' SettingWithCopyWarning. The
  # stable sort keeps rows with equal TimeOrder in their original order.
  # NOTE(review): if TimeOrder restarts on each Day, the sort key should be
  # ['Day', 'TimeOrder'] — confirm against the data.
  group = group.sort_values(by = ['TimeOrder'], kind = 'mergesort')
  network_df = group[['TimeOrder', 'SourceUID', 'SrcNameClean', 'TargetUID', 'TgtNameClean', 'SemanticDist']].reset_index(drop = True)
  # create an empty network
  G = nx.Graph()
  all_adj = []  # adjacency-matrix snapshot taken after each transition
  edge_info = []  # the transition that produced each snapshot
  # incrementally add nodes and edges to the network
  transitions = zip(network_df['SrcNameClean'], network_df['TgtNameClean'], network_df['SemanticDist'])
  for from_node, to_node, edge_weight in transitions:
    edge_info.append({'from': from_node, 'to': to_node, 'weight': edge_weight})
    # add edge to the network; the graph is undirected, so revisiting a pair of
    # pages in either direction updates the weight of the existing edge
    G.add_edge(from_node, to_node, weight = edge_weight)
    adj_G = nx.adjacency_matrix(G, weight = 'weight')
    all_adj.append(adj_G)
  # save subject data to .mat file
  filename = f'subj_{ID}.mat'
  mdic = {'subj': ID, 'all_adj': all_adj, 'edge_info': edge_info}
  savemat(filename, mdic)