In [1]:
import pandas as pd

In [2]:
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from graph_tool.all import *

import heapq
import requests
import json
import time
import csv
import itertools

Format: <[1970-01-01, 1970-01-01]>

In [4]:
def hex_to_int(id):
    '''Map patent id (string) to graph id (int).'''
    return int(id, 16)

def int_to_hex(id):
    '''Map graph id (int) to patent id (string).'''
    return '{0:0x}'.format(id)

In [5]:
DATA = './data/'
graph = load_graph(DATA + 'citations_graph_reversed.xml.gz')
graph

<Graph object, directed, with 8274991 vertices and 86284396 edges at 0x7f30cc3e3e10>

In [6]:
class ShortestPathVisitor(BFSVisitor):
    def __init__(self, name, dist, edge_set, vertex_set, max_dist):
        self.name = name
        self.dist = dist
        self.edge_set = edge_set
        self.vertex_set = vertex_set
        self.max_dist = max_dist
            
    def examine_edge(self, e):
        # Getting distance of next node
        next_dist = self.dist[e.source()] + 1
        self.dist[e.target()] = next_dist
        
        if next_dist <= self.max_dist:
            self.edge_set.add((int(e.source()), int(e.target())))
            self.vertex_set.add(int(e.target()))

In [39]:
dist = graph.new_vertex_property("int")
edge_set = set()
vertex_set = set()

source = '6285999'
source_vertex = find_vertex(graph, graph.vertex_properties.id, hex_to_int(source))[0]
vertex_set.add(int(source_vertex))

bfs_search(graph, int(source_vertex), ShortestPathVisitor('SPV_' + source, dist, edge_set, vertex_set, 3))

In [41]:
len(edge_set)

33021

In [9]:
reader = csv.reader(open(DATA + 'patent_dates.csv', 'r'))
dates = {}
for row in reader:
    k, v = row
    dates[k] = v

In [42]:
dates['6285999']

'2001-09-04'

In [49]:
def vertices_to_csv(vertex_set, filename):
    vertices = []
    
    for v in vertex_set: # For all the vertices in that path
        patent_id = int_to_hex(graph.vp.id[int(v)])
        if patent_id in dates:
            year = '[' + dates[patent_id][:4] + ',2017]'
        else:
            year = '[0,0]'
        vertices.append([patent_id, year])
            
    output_file = open(filename, 'w')
    with output_file:  
        writer = csv.writer(output_file)
        writer.writerow(['Id', 'Timestamp'])
        writer.writerows(vertices)

    return vertices

In [50]:
dna_vertices = vertices_to_csv(vertex_set, './pr_vertices.csv')

In [45]:
def edges_to_csv(edge_set, filename):
    edges = []
    
    for e in edge_set: # For all the edges in that path
        source_index = int(e[0])
        source_id = int_to_hex(graph.vp.id[source_index])
        target_index = int(e[1])
        target_id = int_to_hex(graph.vp.id[target_index])
            
        edges.append([target_id, source_id])
            
    output_file = open(filename, 'w')
    with output_file:  
        writer = csv.writer(output_file)
        writer.writerow(['Source', 'Target'])
        writer.writerows(edges)
        
    return edges

In [46]:
dna_edges = edges_to_csv(edge_set, './pr_edges.csv')