In [1]:
import os
import numpy as np
import networkx as nx
import pandas as pd
from ast import literal_eval as make_tuple

In [49]:
# Directory holding the homology output. The default is the original
# machine-specific location; set the PATENTS_DATA_LOC environment variable to
# point the notebook at the data on another machine without editing this cell.
data_loc = os.environ.get(
    'PATENTS_DATA_LOC',
    '/home/gebhart/projects/rfunklab/data/patents_20190722',
)
# Name of the homology output file inside `data_loc`.
data_file = 'collaboration.homology'

In [50]:
def read_txt(data_loc):
    """Return the full contents of the text file at ``data_loc`` as one string.

    The file is opened in text mode for reading and closed again before the
    function returns.
    """
    with open(data_loc, mode="r") as handle:
        contents = handle.read()
    return contents

def parse_interval_text(txt):
    """Extract the persistence-interval sections from homology output text.

    Sections are introduced by headers of the form
    ``# persistence intervals in dimension N`` for N = 0, 1, 2, ... and the
    last section is terminated by the line
    ``The remaining homology groups are trivial.``.

    Parameters
    ----------
    txt : str
        Full text of the homology output.

    Returns
    -------
    list of list of str
        One inner list per dimension, in increasing dimension order. Each
        inner list holds the section's lines with all spaces removed. A
        section with no content yields ``['']``.

    Raises
    ------
    IndexError
        If the dimension-0 header is missing, or if the terminating line
        cannot be found after the last header.
    """
    pi_txt = '# persistence intervals in dimension {}'
    end_txt = 'The remaining homology groups are trivial.'

    def _section_lines(start, stop):
        # Trim surrounding whitespace, drop every space, one entry per line.
        return txt[start:stop].strip().replace(' ', '').split('\n')

    dim = 0
    header = pi_txt.format(dim)
    header_idx = txt.find(header)
    if header_idx == -1:
        raise IndexError('Cannot find interval {}'.format(dim))

    intervals = []
    while True:
        body_start = header_idx + len(header)
        next_header = pi_txt.format(dim + 1)
        # Search forward from the current section only: a marker that also
        # occurs earlier in the text must not be mistaken for the section end.
        next_idx = txt.find(next_header, body_start)
        if next_idx == -1:
            # Last section: it runs up to the terminating line.
            end_idx = txt.find(end_txt, body_start)
            if end_idx == -1:
                raise IndexError('Cannot find final interval break `{}`'.format(end_txt))
            intervals.append(_section_lines(body_start, end_idx))
            return intervals
        # Internal section: it runs up to the next dimension's header.
        intervals.append(_section_lines(body_start, next_idx))
        dim, header, header_idx = dim + 1, next_header, next_idx

def parse_betti_text(txt):
    """Extract the Betti numbers from homology output text.

    The section starts after the ``# Betti numbers:`` header and ends at the
    following ``# Cell counts:`` header, or at the end of the text when that
    header is absent. Within the section, the line prefix made of ``#``
    followed by two tab characters is removed, and the integer after the
    first ``=`` on each non-blank line is taken.

    Parameters
    ----------
    txt : str
        Full text of the homology output.

    Returns
    -------
    list of int
        The parsed values, in the order they appear in the text.

    Raises
    ------
    IndexError
        If the ``# Betti numbers:`` header is missing.
    ValueError
        If a non-blank line in the section does not end in an integer.
    """
    st_txt = '# Betti numbers:'
    end_txt = '# Cell counts:'
    st_idx = txt.find(st_txt)
    if st_idx == -1:
        # Without this check find() returns -1 and the slice below would
        # silently start at a meaningless offset.
        raise IndexError('Cannot find section header `{}`'.format(st_txt))
    body_start = st_idx + len(st_txt)
    end_idx = txt.find(end_txt, body_start)
    if end_idx == -1:
        # No following section: read to the end of the text instead of
        # slicing to -1, which would drop the final character.
        end_idx = len(txt)
    lines = txt[body_start:end_idx].replace('#\t\t', '').split('\n')
    # Skip blank and whitespace-only lines; int() tolerates padding around digits.
    return [int(line[line.find('=')+1:]) for line in lines if line.strip() != '']

def parse_cell_count_text(txt):
    """Extract the cell counts from homology output text.

    The section starts after the ``# Cell counts:`` header and runs to the
    end of the text. Within it, the line prefix made of ``#`` followed by two
    tab characters is removed, and the integer after the first ``=`` on each
    non-blank line is taken.

    Parameters
    ----------
    txt : str
        Full text of the homology output.

    Returns
    -------
    list of int
        The parsed values, in the order they appear in the text.

    Raises
    ------
    IndexError
        If the ``# Cell counts:`` header is missing.
    ValueError
        If a non-blank line in the section does not end in an integer.
    """
    st_txt = '# Cell counts:'
    st_idx = txt.find(st_txt)
    if st_idx == -1:
        # Without this check find() returns -1 and the slice below would
        # silently start at a meaningless offset.
        raise IndexError('Cannot find section header `{}`'.format(st_txt))
    lines = txt[st_idx + len(st_txt):].replace('#\t\t', '').split('\n')
    # Skip blank and whitespace-only lines; int() tolerates padding around digits.
    return [int(line[line.find('=')+1:]) for line in lines if line.strip() != '']

def parse_euler_characteristic_text(txt):
    """Extract the Euler characteristic from homology output text.

    Reads the integer that follows ``# Euler characteristic: `` up to the end
    of that line (or the end of the text when no newline follows).

    Parameters
    ----------
    txt : str
        Full text of the homology output.

    Returns
    -------
    int
        The parsed Euler characteristic.

    Raises
    ------
    IndexError
        If the ``# Euler characteristic: `` marker is missing.
    ValueError
        If the text after the marker is not an integer.
    """
    st_txt = '# Euler characteristic: '
    st_idx = txt.find(st_txt)
    if st_idx == -1:
        raise IndexError('Cannot find `{}`'.format(st_txt))
    value_start = st_idx + len(st_txt)
    line_end = txt.find('\n', value_start)
    if line_end == -1:
        # The value is the last thing in the text; slicing to -1 would cut
        # off its final digit.
        line_end = len(txt)
    return int(txt[value_start:line_end])

def tuplefy(interval):
    """Turn an interval string such as ``'[0,1)'`` into a Python tuple.

    Every ``[`` is swapped for ``(`` so the text becomes a tuple literal,
    which is then evaluated with ``make_tuple``.
    """
    tuple_literal = '('.join(interval.split('['))
    return make_tuple(tuple_literal)

def parse_intervals(data_loc):
    """Read the file at ``data_loc`` and return its persistence intervals as tuples.

    The result has one list per section returned by ``parse_interval_text``;
    each non-empty entry is converted with ``tuplefy`` and each empty entry
    becomes ``()``.
    """
    sections = parse_interval_text(read_txt(data_loc))
    parsed = []
    for section in sections:
        parsed.append([() if entry == '' else tuplefy(entry) for entry in section])
    return parsed

def parse_betti(data_loc):
    """Read the file at ``data_loc`` and return the result of ``parse_betti_text`` on it."""
    contents = read_txt(data_loc)
    return parse_betti_text(contents)

def parse_cell_counts(data_loc):
    """Read the file at ``data_loc`` and return the result of ``parse_cell_count_text`` on it."""
    contents = read_txt(data_loc)
    return parse_cell_count_text(contents)
    
def parse_euler_characteristic(data_loc):
    """Read the file at ``data_loc`` and return the result of ``parse_euler_characteristic_text`` on it."""
    contents = read_txt(data_loc)
    return parse_euler_characteristic_text(contents)

In [51]:
# Parse the persistence intervals from the homology file into lists of tuples.
intervals = parse_intervals(os.path.join(data_loc, data_file))

In [52]:
# Parse the Betti numbers from the same homology file (the file is read again).
bettis = parse_betti(os.path.join(data_loc,data_file))

In [53]:
# Parse the cell counts from the same homology file (the file is read again).
cell_counts = parse_cell_counts(os.path.join(data_loc,data_file))

In [54]:
# Parse the Euler characteristic from the same homology file (the file is read again).
euler_characteristic = parse_euler_characteristic(os.path.join(data_loc,data_file))

In [55]:
# Display the parsed Euler characteristic.
euler_characteristic

391

In [56]:
# Display the parsed intervals.
# NOTE(review): this prints the whole nested list; a slice such as
# intervals[0][:10] would keep the cell output short.
intervals

[[(0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),
  (0, 1),


In [57]:
# Display the parsed Betti numbers.
bettis

[471, 84, 4, 0]

In [58]:
# Display the parsed cell counts.
cell_counts

[5009, 5001, 422, 39, 0]

In [60]:
# Display the Euler characteristic again (same expression as the In [55] cell).
euler_characteristic

391