In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from graph_tool.all import *

import heapq
import requests
import json
import time

from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD

from matplotlib.mlab import PCA

In [2]:
def hex_to_int(id):
    '''Convert a patent id given as a hexadecimal string into its integer graph id.'''
    return int(id, base=16)

def int_to_hex(id):
    '''Render an integer graph id as its patent id string (lowercase hex, no prefix).'''
    hex_template = '{0:0x}'
    return hex_template.format(id)

In [3]:
# Directory prefix under which the input files live.
DATA = './data/'

# Load the citation graph from 'citations_graph.xml.gz' and end the cell with
# the graph object so its summary is displayed.
citations_path = DATA + 'citations_graph.xml.gz'
graph = load_graph(citations_path)
graph

<Graph object, directed, with 8274991 vertices and 86284396 edges at 0x7f2b1818b438>

In [None]:
# PageRank over the citation graph; every tunable is spelled out explicitly so
# the settings used for this run are visible in the notebook.
pr = pagerank(
    graph,
    damping=0.85,
    pers=None,
    weight=None,
    prop=None,
    epsilon=1e-06,
    max_iter=None,
    ret_iter=False,
)

In [None]:
# Extract the PageRank result as an array via get_array()
# (presumably one value per vertex, indexed by vertex index — confirm).
pr_array = pr.get_array()

In [None]:
# Keep the ten (array index, PageRank) pairs that have the highest PageRank.
max_pageranks = heapq.nlargest(
    10,
    enumerate(pr_array),
    key=lambda pair: pair[1],
)

In [None]:
# Endpoint used to look up patent titles, and how long (in seconds) to wait for
# each lookup before giving up instead of hanging the notebook.
PATENTSVIEW_QUERY_URL = 'http://www.patentsview.org/api/patents/query'
PATENT_LOOKUP_TIMEOUT = 10

print("Top 10 patents of largest page rank:\n")
# Each entry of max_pageranks is an (array index, PageRank value) pair.
for rank, (vertex_idx, score) in enumerate(max_pageranks, start=1):
    patent_id = int_to_hex(graph.vp.id[vertex_idx])

    # The title is a nice-to-have: a failed or empty lookup must not hide the
    # ranking itself, so fall back to a placeholder rather than skip or crash.
    patent_title = '<title unavailable>'
    try:
        response = requests.get(
            PATENTSVIEW_QUERY_URL,
            # Let requests URL-encode the JSON query instead of splicing it
            # into the URL by hand.
            params={'q': json.dumps({'patent_number': patent_id},
                                    separators=(',', ':'))},
            timeout=PATENT_LOOKUP_TIMEOUT,
        )
        response.raise_for_status()
        patent_info = response.json()
        patents = patent_info.get('patents') if isinstance(patent_info, dict) else None
        if patents:
            patent_title = patents[0].get('patent_title') or patent_title
    except (requests.RequestException, ValueError) as err:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # error is not a RequestException.
        print('\t(title lookup failed for {}: {})'.format(patent_id, err))

    print('Nr {}:\t{}'.format(rank, patent_title))
    print('\tPage rank = {}, Patent ID = {}\n'.format(score, patent_id))

In [None]:
# Betweenness of the citation graph; the call's result is unpacked into two
# values (presumably the vertex and edge betweenness maps — confirm).
# NOTE(review): the loaded graph has 8,274,991 vertices and 86,284,396 edges,
# so an exact computation is likely very expensive — confirm this cell is
# feasible to run, or restrict it to a subset of the graph.
vp, ep = betweenness(graph)

## Largest Components

In [68]:
# Label the largest component of the graph, with directed=False
# (presumably edge direction is ignored when forming components — confirm).
comp_prop = label_largest_component(graph, directed=False)

In [69]:
# Extract the component labelling as an array via get_array().
comp_arr = comp_prop.get_array()

In [70]:
# Show the shape of the label array (its length matches the vertex count
# reported when the graph was loaded).
comp_arr.shape

(8274991,)

In [74]:
# Select the entries labelled 1 and show how many there are.
in_largest = comp_arr == 1
comp_arr[in_largest].shape

(8261059,)

## SVD, PCA

In [None]:
# Build the Laplacian of the citation graph with deg='in'
# (presumably the in-degree is used for the degree term — confirm).
lp = laplacian(graph, deg='in')

In [None]:
# Display the Laplacian object to inspect what was built.
lp

In [None]:
# Project the graph Laplacian `lp` onto its leading components and plot them.
#
# matplotlib.mlab.PCA (imported at the top of this notebook) was removed in
# Matplotlib 3.1 and only handled dense 2-D arrays, so it cannot be applied to
# the Laplacian, which graph_tool returns as a sparse matrix. TruncatedSVD
# (already imported) works directly on sparse input and does not densify it.
# The plotting code that used to follow referenced `mlabPCA` and `all_samples`,
# neither of which is defined in this notebook; it now plots the projection
# computed here instead.
SVD_COMPONENTS = 2          # number of components to keep
SVD_SEED = 0                # fixed seed so the randomized solver is reproducible
MAX_POINTS_TO_PLOT = 50000  # cap on scattered points; the graph has millions of vertices

svd = TruncatedSVD(n_components=SVD_COMPONENTS, random_state=SVD_SEED)
lp_projected = svd.fit_transform(lp)

print('Explained variance ratio per component:', svd.explained_variance_ratio_)

# Scatter a reproducible random subset of rows: drawing every row would be
# slow and unreadable.
n_rows = lp_projected.shape[0]
rng = np.random.RandomState(SVD_SEED)
shown = rng.choice(n_rows, size=min(MAX_POINTS_TO_PLOT, n_rows), replace=False)

fig, ax = plt.subplots()
ax.plot(lp_projected[shown, 0], lp_projected[shown, 1],
        'o', markersize=3, color='blue', alpha=0.5)
ax.set_xlabel('component 1')
ax.set_ylabel('component 2')
ax.set_title('Graph Laplacian projected onto its first two SVD components')

plt.show()