In [1]:
from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir('/content/drive/My Drive/GraphRL/')

Mounted at /content/drive


In [2]:
#@title Imports

import sys
import pandas as pd
import codecs
import copy
import re
import numpy as np
import scipy
from scipy.spatial.distance import cosine
from scipy.io import savemat
import networkx as nx
from networkx.readwrite import json_graph
from urllib.parse import unquote
import time
import gzip
import csv
from collections import defaultdict
from copy import deepcopy
from tqdm import tqdm
import json

def process_URL(name):
  
  name = name.replace('/wiki/', '') # remove leading URL sub-string
  name = name.split("_")
  processed_string = []

  for token in name:
    decoded_token = unquote(token)
    processed_string.append(decoded_token.lower())

  return ' '.join(processed_string)

In [3]:
base_path = '/content/drive/My Drive/GraphRL_v2/KNOT/'

In [4]:
#@title Extract nodes from KNOT data

wiki_df = pd.read_csv(os.path.join(base_path, 'KNOT_data_raw.csv'))
source_nodes = set(wiki_df['SourceName'].tolist())
target_nodes = set(wiki_df['TargetName'].tolist())
source_nodes.update(target_nodes)
node_set = {entity: name for name, entity in enumerate(source_nodes)}
wiki_df['SourceUID'] = wiki_df['SourceName'].apply(lambda x: node_set[x])
wiki_df['SrcNameClean'] = wiki_df['SourceName'].apply(lambda x: process_URL(x))
wiki_df['TargetUID'] = wiki_df['TargetName'].apply(lambda x: node_set[x])
wiki_df['TgtNameClean'] = wiki_df['TargetName'].apply(lambda x: process_URL(x))
nodes_KNOT = set(wiki_df['SrcNameClean'].unique())
tgt_nodes_KNOT = set(wiki_df['TgtNameClean'].unique())
nodes_KNOT.update(tgt_nodes_KNOT)

In [5]:
wiki_df.head(150)

Unnamed: 0,ID,SourceName,TargetName,Day,TimeOrder,Hyperlink,DistanceWeights,AgeYears,SexOrient,Race,...,DS_5D,ST_5D,SC_5D,TS_5D,Count,Weight,SourceUID,SrcNameClean,TargetUID,TgtNameClean
0,101,/wiki/Jeff_Bezos,/wiki/Cloud_infrastructure,1,1,no,1.0,23.27945,Heterosexual,AsiaAm,...,4.25,1.6,2.8,2.0,1,0.0,17081,jeff bezos,15486,cloud infrastructure
1,101,/wiki/Cloud_infrastructure,/wiki/Cloud_computing_security,1,2,yes,0.2,23.27945,Heterosexual,AsiaAm,...,4.25,1.6,2.8,2.0,2,0.8,15486,cloud infrastructure,2171,cloud computing security
2,101,/wiki/Cloud_computing_security,/wiki/Cloud_infrastructure,1,3,no,0.2,23.27945,Heterosexual,AsiaAm,...,4.25,1.6,2.8,2.0,3,0.8,2171,cloud computing security,15486,cloud infrastructure
3,101,/wiki/Cloud_infrastructure,/wiki/Information_technology,1,4,yes,0.8,23.27945,Heterosexual,AsiaAm,...,4.25,1.6,2.8,2.0,4,0.2,15486,cloud infrastructure,2665,information technology
4,101,/wiki/Information_technology,/wiki/Computer_language,1,5,no,0.6,23.27945,Heterosexual,AsiaAm,...,4.25,1.6,2.8,2.0,5,0.4,2665,information technology,11338,computer language
5,101,/wiki/Computer_language,/wiki/Programming_language,1,6,yes,0.6,23.27945,Heterosexual,AsiaAm,...,4.25,1.6,2.8,2.0,6,0.4,11338,computer language,12201,programming language
6,101,/wiki/Programming_language,/wiki/Java_(programming_language),1,7,yes,0.8,23.27945,Heterosexual,AsiaAm,...,4.25,1.6,2.8,2.0,7,0.2,12201,programming language,18106,java (programming language)
7,101,/wiki/Java_(programming_language),/wiki/Java_compiler,1,8,yes,0.2,23.27945,Heterosexual,AsiaAm,...,4.25,1.6,2.8,2.0,8,0.8,18106,java (programming language),8077,java compiler
8,101,/wiki/Java_compiler,/wiki/Java_class_file,1,9,yes,0.5,23.27945,Heterosexual,AsiaAm,...,4.25,1.6,2.8,2.0,9,0.5,8077,java compiler,2304,java class file
9,101,/wiki/Java_class_file,/wiki/Class_(programming),1,10,yes,0.8,23.27945,Heterosexual,AsiaAm,...,4.25,1.6,2.8,2.0,10,0.2,2304,java class file,12951,class (programming)


In [6]:
avg_session_length = wiki_df.groupby(['ID','Day'])['Day'].count().mean()
avg_session_length

10.487064116985376

In [134]:
temp_df = pd.DataFrame(wiki_df.groupby(['ID', 'Hyperlink'])['Hyperlink'].count().reset_index(name = "Count"))
temp_df_2 = temp_df.groupby(['ID', 'Hyperlink']).agg({'Count': 'sum'})
temp_df_3 = temp_df_2.groupby(level = 0).apply(lambda x: x / (x.sum())).reset_index()
temp_df_3.groupby(['Hyperlink'])['Count'].mean()

Hyperlink
no     0.432162
yes    0.567838
Name: Count, dtype: float64

In [None]:
avg_unique_src_pages = wiki_df.groupby(['ID','Day'])['SrcNameClean'].nunique().mean()
avg_unique_src_pages

8.574053243344581

In [None]:
avg_unique_tgt_pages = wiki_df.groupby(['ID','Day'])['TgtNameClean'].nunique().mean()
avg_unique_tgt_pages

8.7461567304087

In [None]:
#@title Helpers

def get_neighbors(set_of_nodes, filepath_WikiLinks):
  """
  Build a dictionary of the neighbors of nodes in the supplied set; keys 
  correspond to pages and values correspond to the pages that the keys connect
  to with hyperlinks.
  """

  nodes_neighbors = defaultdict(list) # store neighbors of nodes in lists

  with gzip.open(filepath_WikiLinks, mode = "rt") as f:
    file_reader = csv.reader(f, delimiter = '\t')

    for line in file_reader:
      from_node_id = line[0]
      from_node = line[1].lower()
      to_node_id = line[2]
      to_node = line[3].lower()

      if from_node in set_of_nodes:
        nodes_neighbors[from_node].append(to_node)

  return nodes_neighbors

def merge_defaultdicts(d1, d2):

  d_merged = d1.copy()

  for k, v in d2.items():
    if (k in d_merged):
      # Note: this does not extend/replace older value/s
      assert set(d_merged[k]) == set(d2[k])
    else:
      d_merged[k] = d2[k]
      
  return d_merged

def build_KNOT_graph(visited_nodes, nodes_WikiLinks):
  """
  From a list `visited_nodes' and a dictionary of nodes --> connections,
  build a NetworkX graph object. Also add edges from connections --> visited_nodes 
  wherever they may exist.
  """

  G = nx.Graph()
  nodes_not_found = []
  neighbors_not_found = []

  for idx, node in enumerate(visited_nodes):

    if node not in nodes_WikiLinks:
      nodes_not_found.append(node)
      continue

    G.add_node(node) # add the visited node
    neighbors = nodes_WikiLinks[node] # get the visited node's neighbors
    
    for neighbor in neighbors:
      G.add_node(neighbor) # add each neighbor
      G.add_edge(node, neighbor) # add an edge between visited node and neighbor

      # check if neighbor is connected to any of the visited nodes
      
      if neighbor not in nodes_WikiLinks:
        neighbors_not_found.append((node, neighbor))
        continue

      neighbors_of_neighbor = nodes_WikiLinks[neighbor]
      edge_stubs = list(set(neighbors_of_neighbor).intersection(set(visited_nodes)))

      if edge_stubs:

        for edge_stub in edge_stubs:
          G.add_edge(neighbor, edge_stub)

  return G, nodes_not_found, neighbors_not_found

In [None]:
#@title Build dictionary of all edges for KNOT nodes and their neighbors

# filepath_WikiLinks = os.path.join(base_path, 'enwiki.wikilink_graph.2018-03-01.csv.gz')

# nodes_and_neighbors_KNOT = get_neighbors(nodes_KNOT, filepath_WikiLinks)

# neighbors_KNOT = set()

# for neighbors in nodes_and_neighbors_KNOT.values():
#   neighbors_KNOT.update(neighbors)

In [None]:
# print(len(nodes_KNOT)) # 18374

In [None]:
# len(set(nodes_and_neighbors_KNOT)) # 18193

In [None]:
# len(nodes_KNOT) - len(set(nodes_and_neighbors_KNOT)) # 181 --> ~1% nodes not found

In [None]:
# nodes_KNOT.difference(set(nodes_and_neighbors_KNOT))

In [None]:
# neighbors_and_neighbors_KNOT = get_neighbors(neighbors_KNOT, filepath_WikiLinks)

In [None]:
# len(neighbors_KNOT) # 807180

In [None]:
# len(set(neighbors_and_neighbors_KNOT)) # 806322

In [None]:
# len(neighbors_KNOT) - len(set(neighbors_and_neighbors_KNOT)) # 858 --> ~0.1% neighbors not found

In [None]:
# neighbors_KNOT.difference(set(neighbors_and_neighbors_KNOT))

In [None]:
# len(set(nodes_and_neighbors_KNOT).union(set(neighbors_and_neighbors_KNOT))) # 807812

In [None]:
# all_KNOT_edges = dict(merge_defaultdicts(nodes_and_neighbors_KNOT, neighbors_and_neighbors_KNOT))

# save_filename = os.path.join(base_path, 'all_KNOT_edges.json')
# with open(save_filename, 'w') as f:
#   json.dump(all_KNOT_edges, f)

load_filename = os.path.join(base_path, 'all_KNOT_edges.json')
with open(load_filename, 'r') as f:
  all_KNOT_edges = json.load(f)

In [None]:
len(all_KNOT_edges)

807812

In [None]:
#@title Build JSON file with processed data for all KNOT participants

# # split data by individual
# ID_groups = wiki_df.groupby('ID')
# subj_dicts = {}

# for subj_num, (ID, group) in enumerate(ID_groups):

#   subj_dict = {}

#   # enforce time ordering
#   group.sort_values(by = ['TimeOrder'], inplace = True)
#   visited_nodes = set()
#   transitions = []
#   for index, row in group.iterrows():
#       from_node = row['SrcNameClean']
#       to_node = row['TgtNameClean']
#       hyperlink = row['Hyperlink']
#       day = row['Day']
#       transitions.append((from_node, to_node, hyperlink, day))
#       visited_nodes.update([from_node, to_node])

#   visited_nodes = list(visited_nodes)
#   G, nodes_not_found, neighbors_not_found = build_KNOT_graph(visited_nodes, all_KNOT_edges)

#   subj_dict['ID'] = row['ID']
#   subj_dict['AgeYears'] = row['AgeYears']
#   subj_dict['SexOrient'] = row['SexOrient']
#   subj_dict['Race'] = row['Race']
#   subj_dict['GenderFactor'] = row['GenderFactor']
#   subj_dict['EducDeg'] = row['EducDeg']
#   subj_dict['Income'] = row['Income']
#   subj_dict['JE_5D'] = row['JE_5D']
#   subj_dict['DS_5D'] = row['DS_5D']
#   subj_dict['ST_5D'] = row['ST_5D']
#   subj_dict['SC_5D'] = row['SC_5D']
#   subj_dict['TS_5D'] = row['TS_5D']
#   subj_dict['visited_nodes'] = visited_nodes
#   subj_dict['transitions'] = transitions
#   subj_dict['graph_data'] = json_graph.node_link_data(G)
#   subj_dict['nodes_not_found'] = nodes_not_found
#   subj_dict['neighbors_not_found'] = neighbors_not_found

#   subj_dicts[subj_num] = subj_dict

In [None]:
# save_filename = os.path.join(base_path, 'all_KNOT_data.json')
# with open(save_filename, 'w') as f:
#   json.dump(subj_dicts, f)

In [None]:
load_filename = os.path.join(base_path, 'all_KNOT_data.json')
with open(load_filename, 'r') as f:
  all_KNOT_data = json.load(f)

In [None]:
len(all_KNOT_data)

149

In [None]:
type(all_KNOT_data)

dict