In [7]:
# generate graph data for clustered vectors from '/Volumes/Storage/Programming/dissertation-research/apps/opentext/public/data/graphData/clusters_top-edges-only-nofilter.json'

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

def generateCosineSimilaritiesUsingSKLearn(vectors):
    vectorsWithoutHeaders = vectors[1:]
    vectorIds = [vector[0] for vector in vectorsWithoutHeaders]
    vectorsWithoutStageId = [vector[1:] for vector in vectorsWithoutHeaders]
    numpy_string_vectors = np.array(vectorsWithoutStageId)
    numpy_vectors = numpy_string_vectors.astype(np.float)
    sparse_vectors = sparse.csr_matrix(numpy_vectors)
    similarities = cosine_similarity(sparse_vectors)
    similarities_trimmed = [value[0:2] for value in similarities]
    # Add IDs back into vector array
    similaritiesWithIds = []
    for count, vector in enumerate(similarities):
        similaritiesWithIds.append([str(vectorIds[count]), *[round(value,2) for value in vector]])
    return similaritiesWithIds

def sumVectorsFromCluster(listOfVectorsToCombine):
    vectorsWithoutIDs = [vector[1:] for vector in listOfVectorsToCombine]
    numpy_string_vectors = np.array(vectorsWithoutIDs)
    vectorLabels = [vector[0] for vector in listOfVectorsToCombine]
    numpy_vectors = numpy_string_vectors.astype(np.float)
    outputVector = [int(value) for value in numpy_vectors[0]]
    for vector in numpy_vectors[1:]:
        outputVector = np.add(outputVector, vector)
    return (vectorLabels, outputVector)

In [10]:
# Map cluster numbers from gephi export to xml file for episodes
import json
with open('/Users/ryderwishart/gephi-data/temporally-sequenced-similarities-modular-83-all-nodes-included.json', encoding='utf-8') as inputFile:
    clusterDictionary = json.load(inputFile)
nodes = clusterDictionary['nodes']

from lxml import etree
with open('/Volumes/Storage/Programming/dissertation-research/python/episodes-ranges.xml', encoding='utf-8') as episodeXMLFile:
    episodeTree = etree.parse(episodeXMLFile)
episodes = episodeTree.xpath("//episode")

print('number of nodes', len(nodes))
print('number of episodes', len(episodes))

count = 0
matchDict = dict()
for node in nodes:
    nodeID = '-'.join(node['label'].split('§')).split(' ')[0]
    for episode in episodes:
        episodeID = (episode.attrib['section']).split(' ')[0]
        if nodeID == episodeID:
            matchDict[nodeID] = episodeID
            episode.attrib['cluster'] = ''
            episode.attrib['cluster'] = node['attributes']['modularity_class']
            count += 1
print(count)
print(len(matchDict.keys()))
for node in nodes:
    if node not in list(matchDict.keys()):
        print('missing', node['label'])
    # episode.attrib['cluster'] = 

number of nodes 282
number of episodes 627
267
267
missing 02§30 The Last Discourses of Christ in Galilee
missing 03§61 The Lord’s Third Prediction of His Passion
missing 01§24 The Call of Matthew and His Feast
missing 01§34 The Woe upon the Galilean Cities
missing 03§72a The Walk to Gethsemane and the Agony
missing 01§88b Events at Gethsemane
missing 04§14 The Feeding of the Five Thousand
missing 03§66b Parable of the Wicked Tenants
missing 01§33 Christ’s Testimony Concerning John
missing 01§63 01§64 The Reward of the Apostles; Parable of the Laborers in the Vineyard
missing 03§39 03§40 Christ Casts Out a Demon and Rebukes The Generation
missing 03§77 03§78 The Emmaus Disciples and the Last Appearances of Christ
missing 02§46 The Capture of Jesus
missing 04§12 04§13 Jesus Defends Himself Against Jews Persecuting Him
missing 02§33 The Rich Young Man
missing 02§39 The Parable of the Vineyard
missing 03§12 The Beginning of Christ’s Ministry and His Teaching in Nazareth
missing 02§52 The 

In [11]:
with open('episodes-ranges-update.xml', 'a', encoding='utf-8') as out:
    for episode in episodes:
        out.write(f'{etree.tostring(episode)}\n')

In [12]:
# create a lookup table of which node ids belong in which clusters
clusterDict = {str(number): [node['id'] for node in nodes if node['attributes']['modularity_class'] == str(number)] for number in range(0,59)}

In [5]:
def generateNodeFromCosineSimilarities(cosineSimilaritiesRow):
    cells = cosineSimilaritiesRow.split(',')
    id = cells[0]
    label = cells[0]
    averageSimilarity = sum([float(cell) for cell in cells[1:]]) / len(cells[1:])
    nodeFields = (id, label, averageSimilarity)
    node = '\t'.join([str(field) for field in nodeFields])
    return node

def generateEdgesFromCosineSimilarities(columnHeaders, cosineSimilaritiesRow):
    # 'ID\tSOURCE\tTARGET\tWEIGHT\n'
    cells = cosineSimilaritiesRow.split(',')
    edges = []
    columnHeadersWithoutIDColumn = columnHeaders[1:]
    source = cells[0]
    for count, cell in enumerate(cells):
        if count == 0:
            pass
        else:
            cell = cells[count]
            if float(cell) > 0:
                target = columnHeaders[count]
                if target == source:
                    pass
                else:
                    # edgesSets.append(targetAndSource)
                    edge = (f'{source}-{target}', source, target, cell)
                    edgeString = '\t'.join(edge)
                    if len(edgeString.split('\t')) != 4:
                        print('FAIL', edgeString.split('\t'))
                    edges.append(edgeString)
    return edges

def generateTopEdgesFromCosineSimilarities(columnHeaders, cosineSimilaritiesRow):
    # 'ID\tSOURCE\tTARGET\tWEIGHT\n'
    cells = cosineSimilaritiesRow.split(',')
    edges = []
    edgeTuples = []
    columnHeadersWithoutIDColumn = columnHeaders[1:]
    source = cells[0]
    for count, cell in enumerate(cells):
        if count == 0:
            pass
        else:
            cell = cells[count]
            if float(cell) > 0:
                target = columnHeaders[count]
                if target == source:
                    pass
                else:
                    # edgesSets.append(targetAndSource)
                    # edge = (f'{source}-{target}', source, target, cell)
                    # edgeString = '\t'.join(edge)
                    # if len(edgeString.split('\t')) != 4:
                        # print('FAIL', edgeString.split('\t'))
                    edgeTuples.append((source,target,float(cell)))
    edgeTuples.sort(key=lambda edgeTuple: edgeTuple[2])
    topEdgeValue = edgeTuples[-1]
    topEdges = [edge for edge in edgeTuples if edge[2] == topEdgeValue[2] ]
    edges = ['\t'.join((f'{edge[0]}-{edge[1]}', edge[0], edge[1], str(edge[2]))) for edge in topEdges]
    return edges


In [6]:
# Generate cluster-based cosine similarities

inputFilePath = '/Volumes/Storage/Programming/dissertation-research/python/all-dynamic-features-vectors-gospels.tsv'

nodes = []
edges = []

with open(inputFilePath, 'r', encoding='utf8') as featuresCSVData:
    with open('cluster-cosineValues.tsv', 'a', encoding='utf-8') as outputFile:
        featuresData = [line for line in featuresCSVData]
        columnHeaders = featuresData[0].split(',')
        
        vectorsByCluster = {str(number): [line.strip().split(',') for line in featuresData if line.split(',')[0] in clusterDict[str(number)]] for number in range(0,59)} 
        clusterVectors = []
        for count in range(0,59):

            vectorLabels, combinedVector = sumVectorsFromCluster(vectorsByCluster[str(count)])
            labels = ';'.join(vectorLabels)
            vectorWithID = [f'{count} {labels}', *combinedVector]
            clusterVectors.append(vectorWithID)
        
            numberOfTextsInCluster = len(vectorsByCluster[str(count)])
            node = f'{count}\tCluster {count}\t{numberOfTextsInCluster}\t{vectorLabels}'
            nodes.append(node)
        similarities = generateCosineSimilaritiesUsingSKLearn(clusterVectors)
        
        for line in similarities:
            outputFile.write(line[0] + ',')
        
        for line in similarities:
            print(line)
            outputData = [str(value) for value in line]
            outputFile.write(','.join(outputData))
            outputFile.write('\n')
            # rowEdges = generateTopEdgesFromCosineSimilarities(columnHeaders, line)
            # for edge in rowEdges:
            #     cells = edge.split('\t')
            #     targetAndSourceArray = [
            #         cells[2], 
            #         cells[1]
            #     ]
            #     if targetAndSourceArray[0] != targetAndSourceArray[1]:
            #         targetAndSource = set(targetAndSourceArray)
            #         pairIndex = '-'.join(targetAndSource)
            #         edgesDict[pairIndex] = edge




IndexError: index 0 is out of bounds for axis 0 with size 0

In [63]:
def generateEdgesFromCosineSimilarities(columnHeaders, cosineSimilaritiesRow):
    # 'ID\tSOURCE\tTARGET\tWEIGHT\n'
    cells = cosineSimilaritiesRow.split(',')
    print('CELLS:', cells)
    edges = []
    columnHeadersWithoutIDColumn = columnHeaders[1:]
    source = cells[0]
    for count, cell in enumerate(cells):
        if count == 0:
            pass
        else:
            cell = cells[count]
            if float(cell) > 0:
                target = columnHeaders[count]
                if target == source:
                    pass
                else:
                    # edgesSets.append(targetAndSource)
                    edge = (f'{source}-{target}', source, target, cell)
                    edgeString = '\t'.join(edge)
                    if len(edgeString.split('\t')) != 4:
                        print('FAIL', edgeString.split('\t'))
                    edges.append(edgeString)
    return edges

def generateClusterNodeFromCosineSimilarities(cosineSimilaritiesRow):
    cells = cosineSimilaritiesRow.split(',')
    id = cells[0]
    label = cells[0]
    averageSimilarity = sum([float(cell) for cell in cells[1:]]) / len(cells[1:])
    
    nodeFields = (id, label, averageSimilarity)
    
    node = '\t'.join([str(field) for field in nodeFields])
    return node

In [67]:
# Generate cluster-based cosine graph data
cosineSimilaritiesInputFilePath = '/Volumes/Storage/Programming/dissertation-research/python/cluster-cosineValues.tsv'

nodes = []
edges = []
edgesDict = dict()
with open(cosineSimilaritiesInputFilePath, 'r', encoding='utf8') as cosineCSVData:
    isFirstRow = True
    for line in cosineCSVData:
        if isFirstRow:
            columnHeaders = line.split(',')
            isFirstRow = False
        else:
            node = generateClusterNodeFromCosineSimilarities(line)
            nodes.append(node)
            rowEdges = generateEdgesFromCosineSimilarities(columnHeaders, line)
            for edge in rowEdges:
                cells = edge.split('\t')
                targetAndSourceArray = [
                    cells[2], 
                    cells[1]
                ]
                if targetAndSourceArray[0] != targetAndSourceArray[1]:
                    targetAndSource = set(targetAndSourceArray)
                    pairIndex = '-'.join(targetAndSource)
                    edgesDict[pairIndex] = edge

with open('nodes_clusters.tsv', 'a', encoding='utf8') as nodeFile:
    nodeFile.write('id\tlabel\taverage_similarity\n')
    for node in nodes:
        nodeFile.write(node)
        nodeFile.write('\n')
with open('edges_clusters.tsv', 'a', encoding='utf8') as edgeFile:
    edgeFile.write('id\tsource\ttarget\tweight\n')
    # for edge in prunedEdges:
    for count,edgePair in enumerate(edgesDict):
        edgeFile.write(edgesDict[edgePair])
        edgeFile.write('\n')

.75', '0.6', '0.72', '0.82', '0.55', '0.7', '1.0', '0.9', '0.81', '0.91', '0.89', '0.85', '0.68', '0.73', '0.62', '0.53', '0.84', '0.54', '0.65', '0.71', '0.76', '0.6', '0.5', '0.47', '0.83', '0.58', '0.87', '0.76', '0.74', '0.69', '0.81', '0.52', '0.63', '0.53', '0.69', '0.74', '0.84\n']
CELLS: ['29 01§46 Christ Walks on the Sea;02§12 Christ Stilling the Tempest;03§25 The Storm on the Sea;03§37 Mary and Martha', '0.61', '0.75', '0.48', '0.59', '0.62', '0.58', '0.37', '0.57', '0.7', '0.7', '0.63', '0.68', '0.84', '0.81', '0.79', '0.77', '0.78', '0.76', '0.81', '0.7', '0.79', '0.77', '0.63', '0.71', '0.9', '0.67', '0.79', '0.9', '1.0', '0.84', '0.85', '0.81', '0.79', '0.73', '0.7', '0.77', '0.68', '0.85', '0.46', '0.64', '0.7', '0.77', '0.61', '0.6', '0.57', '0.82', '0.74', '0.91', '0.78', '0.7', '0.82', '0.82', '0.53', '0.65', '0.68', '0.67', '0.71', '0.87\n']
CELLS: ['30 01§52 “Christ the Son of the Living God”;04§18d The Jewish Elites Disbelieve;04§34 04§35 04§36 04§37 04§38 04§39 04

In [72]:
def generateTopEdgesFromCosineSimilarities(columnHeaders, cosineSimilaritiesRow):
    # 'ID\tSOURCE\tTARGET\tWEIGHT\n'
    cells = cosineSimilaritiesRow.split(',')
    edges = []
    edgeTuples = []
    columnHeadersWithoutIDColumn = columnHeaders[1:]
    source = cells[0]
    for count, cell in enumerate(cells):
        if count == 0:
            pass
        else:
            cell = cells[count]
            if float(cell) > 0:
                target = columnHeaders[count]
                if target == source:
                    pass
                else:
                    edgeTuples.append((source,target,float(cell)))
    edgeTuples.sort(key=lambda edgeTuple: edgeTuple[2])
    topEdgeValues = edgeTuples[-2:]
    topEdges = [edge for edge in edgeTuples if edge[2] == topEdgeValues[0][2] or edge[2] == topEdgeValues[1][2]]
    edges = ['\t'.join((f'{edge[0]}-{edge[1]}', edge[0], edge[1], str(edge[2]))) for edge in topEdges]
    print('Edges saved: ',len(edges))
    return edges
    

nodes = []
edges = []
edgesDict = dict()
with open(cosineSimilaritiesInputFilePath, 'r', encoding='utf8') as cosineCSVData:
    isFirstRow = True
    for line in cosineCSVData:
        if isFirstRow:
            columnHeaders = line.split(',')
            isFirstRow = False
        else:
            node = generateNodeFromCosineSimilarities(line)
            nodes.append(node)
            rowEdges = generateTopEdgesFromCosineSimilarities(columnHeaders, line)
            for edge in rowEdges:
                cells = edge.split('\t')
                targetAndSourceArray = [
                    cells[2], 
                    cells[1]
                ]
                if targetAndSourceArray[0] != targetAndSourceArray[1]:
                    targetAndSource = set(targetAndSourceArray)
                    pairIndex = '-'.join(targetAndSource)
                    edgesDict[pairIndex] = edge

with open('nodes_clusters_topEdgesOnly.tsv', 'a', encoding='utf8') as nodeFile:
    nodeFile.write('ID\tLABEL\tAVERAGE_SIMILARITY\n')
    for node in nodes:
        nodeFile.write(node)
        nodeFile.write('\n')
with open('edges_clusters_topEdgesOnly.tsv', 'a', encoding='utf8') as edgeFile:
    edgeFile.write('ID\tSOURCE\tTARGET\tWEIGHT\n')
    # for edge in prunedEdges:
    for count,edgePair in enumerate(edgesDict):
        edgeFile.write(edgesDict[edgePair])
        edgeFile.write('\n')

Edges saved:  3
Edges saved:  3
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  3
Edges saved:  2
Edges saved:  3
Edges saved:  2
Edges saved:  3
Edges saved:  4
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  3
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  3
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  2
Edges saved:  2
