In [2]:
# %load processing.py
from sklearn.metrics.pairwise import cosine_similarity as cos
import pickle
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import copy
import statistics 
import glob
import csv
from scipy.stats import pearsonr
import numpy as np
import pandas as pd


"""
===================== SCU ======================
"""

class SCU():
    """Container for one SCU: its id, its weight and its segment embeddings."""

    def __init__(self, scu_id, weight, segment_embeddings):
        self.id = scu_id
        self.weight = weight
        self.embeddings = segment_embeddings

    def averageSimilarity(self, segment_embedding):
        """Return ``[mean similarity, weight]`` for a candidate segment.

        The mean is taken over ``cos(own, segment_embedding)[0][0]`` for
        every embedding stored on this SCU.
        """
        total = sum(cos(own, segment_embedding)[0][0] for own in self.embeddings)
        return [total / len(self.embeddings), self.weight]


"""
============== Sentence Graph ====================
"""

class SentenceGraph():
    """Graph of one sentence: one hypernode (list of vertices) per segmentation."""

    def __init__(self, sentence_id, segmentations, scus):
        self.sentence_id = sentence_id
        self.segmentations = segmentations
        self.graph = self.buildGraph(scus)

    def buildGraph(self, scus):
        """Return a list of hypernodes, one per segmentation.

        Every hypernode is the list of ``Vertex`` objects created from that
        segmentation's ``{segment_id: embedding}`` mapping.
        """
        return [
            [Vertex(seg_id, self.findSCUs(seg_embedding, scus))
             for seg_id, seg_embedding in segments.items()]
            for segments in self.segmentations.values()
        ]

    def findSCUs(self, segment_embedding, scus):
        """Return up to two ``(scu_id, similarity)`` pairs, most similar first."""
        by_id = {scu.id: scu.averageSimilarity(segment_embedding) for scu in scus}
        ranked = sorted(by_id.items(), key=lambda item: item[1][0], reverse=True)
        return [(entry[0], entry[1][0]) for entry in ranked[:2]]

class Vertex():
    """One candidate segment together with the SCUs it may express.

    ``scu_list`` holds ``(scu_id, similarity)`` pairs as produced by
    ``SentenceGraph.findSCUs``. ``neighbors`` lists the vertices that
    conflict with this one, and ``useMe`` is cleared once the vertex (or
    one of its neighbors) has been consumed by ``delete``.
    """

    def __init__(self, segment_id, scu_list):
        self.id = segment_id
        self.scu_list = scu_list
        self.neighbors = []
        self.useMe = True

    def getWeight(self):
        """Return the vertex weight: the sum of its SCU similarities.

        Each ``scu_list`` entry is ``(scu_id, similarity)``, so the
        similarity lives at index 1. The previous code summed index 0,
        i.e. the SCU ids themselves.

        Alternative schemes that were tried on the same index:
          * mean: ``sum(...) / len(self.scu_list)``
          * max:  ``max(...)``
        """
        return sum(scu[1] for scu in self.scu_list)

    def add_neighbor(self, neighbor):
        """Connect this vertex and ``neighbor`` in both directions."""
        self.neighbors.append(neighbor)
        neighbor.we_are_neighbors(self)

    def add_neighbors(self, *args):
        """Connect this vertex with every vertex passed in ``args``."""
        for neighbor in args:
            self.add_neighbor(neighbor)

    def we_are_neighbors(self, neighbor):
        """Record the reverse direction of an edge created by ``add_neighbor``."""
        self.neighbors.append(neighbor)

    def delete(self):
        """Mark this vertex and all of its neighbors as no longer usable."""
        self.useMe = False
        for neighbor in self.neighbors:
            neighbor.useMe = False


class SummaryGraph():
    """Conflict graph over all segments of a summary plus a greedy independent set.

    Building the object creates one ``SentenceGraph`` per sentence, links
    conflicting vertices inside each sentence and across sentences, and
    finally stores the greedily selected vertices in ``independentSet``.
    """

    def __init__(self, sentences, scus):
        self.sentences = [
            SentenceGraph(n, segmentations, scus)
            for n, segmentations in sentences.items()
        ]
        for sentence_graph in self.sentences:
            self.buildInnerEdgesList(sentence_graph.graph)
        self.vertices = self.buildOuterEdgesList()
        self.independentSet = self.buildIndependentSet()

    def buildInnerEdgesList(self, sentenceGraph):
        """Link every vertex with all vertices of the later hypernodes of the sentence."""
        hypernodes = list(sentenceGraph)
        for position, hypernode in enumerate(hypernodes):
            later_hypernodes = hypernodes[position + 1:]
            for vertex in hypernode:
                for other_hypernode in later_hypernodes:
                    for other in other_hypernode:
                        vertex.add_neighbor(other)

    def buildOuterEdgesList(self):
        """Link vertices of different sentences that share an ``scu_list`` entry.

        Returns every vertex of the summary, in sentence / hypernode order.
        """
        all_sentences = list(self.sentences)
        collected = []
        for position, sentence in enumerate(all_sentences):
            later_sentences = all_sentences[position + 1:]
            for hypernode in sentence.graph:
                for vertex in hypernode:
                    collected.append(vertex)
                    own_entries = set(vertex.scu_list)
                    for other_sentence in later_sentences:
                        for other_hypernode in other_sentence.graph:
                            for other in other_hypernode:
                                # NOTE(review): entries are (scu_id, similarity) tuples, so
                                # this matches on id AND similarity together - confirm that
                                # matching on the id alone was not the intent.
                                if own_entries & set(other.scu_list):
                                    vertex.add_neighbor(other)
        return collected

    def buildIndependentSet(self):
        """Greedily pick the heaviest usable vertex until none is left.

        Works on a deep copy, so the ``useMe`` flags of ``self.vertices``
        are not touched; the returned vertices are the copies.
        """
        pool = copy.deepcopy([v for v in self.vertices if v.useMe == True])
        picked = []
        while pool:
            heaviest = max(pool, key=lambda v: v.getWeight())
            picked.append(heaviest)
            # delete() also disables the neighbors of the picked vertex
            heaviest.delete()
            pool = [v for v in pool if v.useMe == True]
        return picked

"""
============= Function Definitions ===============
"""

def buildSCUlist(scu_pickle):
    """Load a pickled pyramid and return its SCUs as a list of ``SCU`` objects.

    The pickle must hold a mapping ``scu_id -> (weight, embeddings)``; the
    weight is converted with ``int`` and the embeddings are passed through
    unchanged.

    scu_pickle: path of the pickle file.
    """
    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # only load pyramid files that come from a trusted source.
    with open(scu_pickle, 'rb') as f:
        raw_scus = pickle.load(f)
    # the ``with`` block has already closed the file; no explicit close needed
    return [
        SCU(scu_id, int(weight_embeddings[0]), weight_embeddings[1])
        for scu_id, weight_embeddings in raw_scus.items()
    ]

def sentencesFromSegmentations(fname):
    """Parse a segmentation file into ``{index: {segmentation_id: {segment_id: embedding}}}``.

    Each non-blank line has at least five ``&``-separated fields. Field 1
    is the sentence id, field 2 the segmentation id, the first four fields
    joined by ``&`` form the segment id, and field 4 is the embedding
    written as a bracketed, comma-separated list of floats.

    Sentences are ordered by their integer sentence id and then re-keyed
    ``0..n-1`` in that order.

    fname: path of the segmentation file.
    """
    by_sentence = {}
    with open(fname, 'r') as f:
        for line in f:
            if not line.strip():
                # tolerate blank lines (e.g. a trailing empty line)
                continue
            fields = line.split('&')
            embedding_text = fields[4].strip('\n').replace('[', '').replace(']', '')
            embedding = [float(value) for value in embedding_text.split(',')]
            segment_id = '&'.join(fields[:4])
            by_sentence.setdefault(fields[1], []).append((segment_id, embedding))

    sents = []
    for sentence_id in sorted(by_sentence, key=int):
        segmentations = {}
        for segment_id, embedding in by_sentence[sentence_id]:
            segmentation_id = segment_id.split('&')[2]
            segmentations.setdefault(segmentation_id, {})[segment_id] = embedding
        sents.append(segmentations)
    return dict(enumerate(sents))

def buildSCUcandidateList(vertices):
    """Invert vertices into ``{scu_id: {segment_id: similarity}}``.

    Vertices are visited in ascending order of the integer stored in the
    second ``&``-separated field of their id.
    """
    candidates = {}
    for vertex in sorted(vertices, key=lambda v: int(v.id.split('&')[1])):
        for entry in vertex.scu_list:
            # entry[0] is the SCU id, entry[1] the value stored for this segment
            candidates.setdefault(entry[0], {})[vertex.id] = entry[1]
    return candidates

def processResults(scu_and_segments, independentSet):
    """Assign segments to SCUs and return ``{segment_id: scu_id}``.

    SCUs are visited in the iteration order of ``scu_and_segments``. For
    each SCU the segments already assigned to an earlier SCU are ignored;
    among the remaining ones, every segment whose value equals the high
    median of the remaining values is assigned to the SCU.

    scu_and_segments: ``{scu_id: {segment_id: value}}``; it is not modified.
    independentSet: unused, kept for call compatibility.

    The previous implementation deleted entries from the dictionaries it
    was iterating over, which raises ``RuntimeError`` on Python 3.
    """
    assigned = set()
    segment_and_scu = {}
    for scu, segments in scu_and_segments.items():
        remaining = {
            segment: value
            for segment, value in segments.items()
            if segment not in assigned
        }
        if not remaining:
            continue
        median = statistics.median_high(remaining.values())
        for segment, value in remaining.items():
            if value == median:
                segment_and_scu[segment] = scu
                assigned.add(segment)
    return segment_and_scu

def scusBySentences(segment_scu):
    """Group ``{segment_id: scu}`` by sentence id.

    The sentence id is the second ``&``-separated field of the segment id.
    Returns ``{sentence_id: {segment_id: scu}}``.
    """
    grouped = {}
    for segment, scu in segment_scu.items():
        grouped.setdefault(segment.split('&')[1], {})[segment] = scu
    return grouped

def getScore(sentences, scus):
    """Return ``(total weight, number of matches)`` for the assigned SCUs.

    sentences: ``{sentence_id: {segment_id: scu_id}}``.
    scus: objects exposing ``id`` and ``weight``.

    Every ``scus`` entry whose ``id`` equals an assigned SCU id contributes
    its weight to the total and counts as one match.
    """
    per_sentence = {}
    matched = 0
    for sentence, segments in sentences.items():
        hits = [s.weight for scu in segments.values() for s in scus if scu == s.id]
        matched += len(hits)
        per_sentence[sentence] = sum(hits)
    return sum(per_sentence.values()), matched

def filename(fname):
    """Return the last ``/``-separated component of ``fname`` without its extension.

    Only the text after the final ``.`` of that component is removed, so
    ``'a/b.tar.gz'`` gives ``'b.tar'``. A component without any ``.`` is
    returned unchanged; previously its last character was cut off, and a
    ``.`` inside a directory name could produce an empty result.
    """
    base = fname[fname.rfind('/') + 1:]
    dot = base.rfind('.')
    if dot == -1:
        return base
    return base[:dot]



'''
================== Scores and Results ================
'''

def recall(results, fname):
    """Unfinished: scan the ``pan`` file that belongs to ``fname``.

    NOTE(review): as written this function has no observable result. It
    never uses ``results``, only fills the local list ``orig_scus`` and
    then falls off the end, so it always returns ``None``.
    """
    # NOTE(review): no '.' is inserted before 'pan' - confirm the real file naming.
    path = 'pan/op_' + filename(fname) + 'pan'
    orig_scus = []
    with open(path, 'r') as f:
        for line in f:
            # NOTE(review): the result of split() is discarded, so ``line``
            # is still the raw text line below.
            line.split('\t')
            # ``line[0]`` is a one-character str, so this test is never true
            # and nothing is ever appended to ``orig_scus``.
            if type(line[0]) == int:
                if line[0] not in orig_scus:
                    orig_scus.append(line[0])


'''
=================== Pipeline ===========================
'''

# #fnames = sys.argv[1:]
# name_score = {}

# #WMIN_types = {'max': max, 'min': min, 'median_high':statistics.median_high, 'median_low': statistics.median_low}

# results_file = '../results.csv'

# f = open(results_file, 'w')
# f.close()

# fnames = list(glob.iglob('peer_summaries/*'))
# pyramids = list(glob.iglob('pyramids/*'))

# summary_score = {}
# for fname in fnames:
#     f = filename(fname)
#     summary_score[f] = []

# for pyramid in pyramids:
#     print('Pyrmid: {}'.format(pyramid))

#     for fname in fnames: 
#         print('\tFilename: {}'.format(fname))

#         scu_pickle = pyramid 

#         sentences = sentencesFromSegmentations(fname)
#         scus = buildSCUlist(scu_pickle)

#         Graph = SummaryGraph(sentences, scus)
#         independentSet = Graph.independentSet

#         candidates = buildSCUcandidateList(independentSet)
#         results = processResults(candidates, independentSet)
#         rearranged_results = scusBySentences(results)

#         scores, score = getScore(rearranged_results, scus)

#         #print('\n\nScores by Sentences for {}:'.format(fname))
#         #for sentence, s in scores.items():
#             #print('\tSentence {}: {}'.format(sentence, s))
#         #print('Overall Score: {}'.format(score))

#         summary_score[filename(fname)].append(score)

#         #print('\n')

# with open(results_file, 'a') as f:
#     w = csv.writer(f)
#     w.writerow(['Summary'] + [pyramid[pyramid.rfind('/')+ 1:] for pyramid in pyramids])
#     for summary, scores in summary_score.items():
#         w.writerow([s for s in scores])
#         print([s for s in scores])



# Human reference scores, 20 values per list. thresholdTableCell uses them as
# the default reference vectors for its Pearson correlations.
# NOTE(review): the scoring loop orders its automatic scores by ascending
# integer summary name, so these lists presumably follow the same order - confirm.
raw_HUMAN_SCORES = [47,38,38,22,
                    46,51,34,60,
                    13,10,22,19,
                    54,26,44,43,
                    16,36,25,40]

quality_HUMAN_SCORES = [0.7705,0.6552,0.5938,0.4231,
                        0.7188,0.8361,0.7907,0.8571,
                        0.3333,0.2857,0.5641,0.7037,
                        0.7714,0.6667,0.8269,0.8,
                        0.5161,0.6923,0.641,0.8163]

coverage_HUMAN_SCORES = [0.5222,0.4222,0.4222,0.2444,
                        0.5111,0.5667,0.3778,0.6667,
                        0.1444,0.1111,0.2442,0.2111,
                        0.6,0.2889,0.4778,0.4889,
                        0.1778,0.4,0.2778,0.4444,]

# NOTE(review): these values look like the mean of the matching quality and
# coverage entries (e.g. (0.7705 + 0.5222) / 2 is about 0.6463) - confirm.
comprehension_HUMAN_SCORES = [0.6463,0.5387,0.508,0.3337,
                            0.6149,0.7014,0.5842,0.7619,
                            0.2389,0.1984,0.4042,0.4574,
                            0.6857,0.4778,0.6523,0.6444,
                            0.3469,0.5462,0.4594,0.6303]


class thresholdTable():
    """Four per-threshold tables, one for each correlation type.

    Every table maps a cell's ``a`` value to a numpy array holding the
    chosen correlation of all cells with that ``a``, ordered by ``b``.
    The ``threshold`` argument is accepted but not stored.
    """

    def __init__(self, threshold, thresholdCells):
        for kind in ('raw', 'quality', 'coverage', 'comprehension'):
            setattr(self, kind + 'ScoreTable', self.buildTable(thresholdCells, kind))

    def buildTable(self, thresholdCells, tableType):
        """Return ``{a: np.array(values sorted by b)}`` for attribute ``tableType``."""
        pairs_by_a = {}
        for cell in thresholdCells:
            pairs_by_a.setdefault(cell.a, []).append((cell.b, getattr(cell, tableType)))
        return {
            a: np.array([pair[1] for pair in sorted(pairs, key=lambda pair: pair[0])])
            for a, pairs in pairs_by_a.items()
        }


class thresholdTableCell():
    """One (threshold, a, b) cell holding four Pearson correlation coefficients.

    Each coefficient compares a list of automatic scores with the matching
    human reference list (the module-level ``*_HUMAN_SCORES`` lists unless
    overridden) and keeps element 0 of ``pearsonr``'s result.
    """

    def __init__(self, threshold, a, b, raw_scores, quality_scores, coverage_scores, comprehension_scores,
                 raw_human_scores=raw_HUMAN_SCORES, quality_human_scores=quality_HUMAN_SCORES,
                 coverage_human_scores=coverage_HUMAN_SCORES, comprehension_human_scores=comprehension_HUMAN_SCORES):
        self.threshold = threshold
        self.a = a
        self.b = b

        # reference vectors must be stored before the correlations are computed
        self.__rawHumanScores = raw_human_scores
        self.__qualityHumanScores = quality_human_scores
        self.__coverageHumanScores = coverage_human_scores
        self.__comprehensionHumanScores = comprehension_human_scores

        self.raw = self.__getRawCorrelation(raw_scores)
        self.quality = self.__getQualityCorrelation(quality_scores)
        self.coverage = self.__getCoverageCorrelation(coverage_scores)
        self.comprehension = self.__getComprehensionCorrelation(comprehension_scores)

    @staticmethod
    def __coefficient(human_scores, scores):
        """Return the first element of ``pearsonr(human_scores, scores)``."""
        return pearsonr(human_scores, scores)[0]

    def __getRawCorrelation(self, raw_scores):
        return self.__coefficient(self.__rawHumanScores, raw_scores)

    def __getQualityCorrelation(self, quality_scores):
        return self.__coefficient(self.__qualityHumanScores, quality_scores)

    def __getCoverageCorrelation(self, coverage_scores):
        return self.__coefficient(self.__coverageHumanScores, coverage_scores)

    def __getComprehensionCorrelation(self, comprehension_scores):
        return self.__coefficient(self.__comprehensionHumanScores, comprehension_scores)

class specialCell():
    """Table cell whose four score attributes all hold ``operation(data)``.

    a, b: cell coordinates, stored unchanged.
    operation: callable applied to ``data`` (e.g. ``statistics.mean``).
    data: the values to aggregate.
    """

    def __init__(self, a, b, operation, data):
        self.a = a
        self.b = b
        # The four attributes always received the same value; compute it once.
        value = self.TableByOperationType(operation, data)
        self.raw = value
        self.quality = value
        self.coverage = value
        self.comprehension = value

    @staticmethod
    def TableByOperationType(operation, data):
        """Apply ``operation`` to ``data`` and return the result.

        Declared static because it takes no ``self``; without the decorator
        every ``self.TableByOperationType(operation, data)`` call raised
        ``TypeError`` (three arguments passed to a two-parameter function).
        """
        return operation(data)



def maxRawScore(count_by_weight, num):
    """Return the largest total weight obtainable from ``num`` picks.

    count_by_weight: ``{weight: how many items have that weight}``.
    num: number of picks available (may be fractional).

    Picks are spent on the highest weights first; a weight level that
    cannot be taken completely receives whatever is left of ``num``.
    """
    best = 0
    remaining = num
    for weight in sorted(count_by_weight, reverse=True):
        # take the whole level if possible, otherwise only what is left
        taken = min(count_by_weight[weight], remaining)
        best = best + weight * taken
        remaining = remaining - taken
    return best

def getLayerSizes(fname):
    """Read layer sizes from a file and return ``(count_by_weight, avg)``.

    Line ``i`` (1-based) of the file holds the integer number of entries
    with weight ``i``. ``count_by_weight`` maps each weight to that number
    and ``avg`` is ``sum(weight * count)`` divided by the number of lines.

    fname: path of the size file.

    Raises ValueError if the file has no lines (the previous code failed
    with a NameError on the unbound loop variable) or if a line is not an
    integer.
    """
    with open(fname, 'r') as f:
        sizes = [int(line.strip()) for line in f]
    if not sizes:
        raise ValueError('no layer sizes found in {}'.format(fname))

    count_by_weight = dict(enumerate(sizes, 1))
    total = sum(weight * size for weight, size in count_by_weight.items())
    # NOTE(review): both operands are ints, so under Python 2 this division
    # floors the average - confirm whether a true mean is wanted there.
    avg = total / len(sizes)
    return count_by_weight, avg

def setDataFrame(table):
    """Replace the four score tables of ``table`` with pandas DataFrames, in place.

    Each table is passed to ``pd.DataFrame`` with the fixed row labels
    '1.0' .. '3.0', so every column needs exactly five values.
    """
    row_labels = ['1.0', '1.5', '2.0', '2.5', '3.0']
    table_names = ('rawScoreTable', 'qualityScoreTable',
                   'coverageScoreTable', 'comprehensionScoreTable')
    for name in table_names:
        frame = pd.DataFrame(getattr(table, name), index=row_labels)
        setattr(table, name, frame)

# Collect the pyramid files to sweep over and the peer summaries to score.
pyramids = list(glob.iglob('pyramids/readable_pyramid_t80_a250_*'))
summaries = list(glob.iglob('peer_summaries/*'))

# Group the pyramids by threshold: the two digits that follow the last 't'
# of the path. ``acc`` gets one (initially empty) entry per threshold.
pyramids_by_threshold = {}
acc = {}
for pyramid in pyramids:
    threshold_indicator = pyramid.rfind('t') + 1
    threshold = int(pyramid[threshold_indicator:(threshold_indicator + 2)])
    if threshold not in pyramids_by_threshold:
        pyramids_by_threshold[threshold] = []
        acc[threshold] = {}
    pyramids_by_threshold[threshold].append(pyramid)

table_types = ['raw', 'quality', 'coverage', 'comprehension']
a_vals = [125, 150, 175, 200, 225, 250]
b_vals = [1.0, 1.5, 2.0, 2.5, 3.0]

# acc[threshold][table type][a][b] -> list of correlations for that setting
for threshold in acc:
    for t_type in table_types:
        acc[threshold][t_type] = {}
        for a in a_vals:
            acc[threshold][t_type][a] = {}
            for b in b_vals:
                acc[threshold][t_type][a][b] = []

tables = {}
# A separate loop name keeps the module-level ``pyramids`` list intact.
for threshold, threshold_pyramids in pyramids_by_threshold.items():
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print('THRESHOLD {}'.format(threshold))
    tableCells = []

    for pyramid in threshold_pyramids:
        # a: three digits after the last 'a'; b: three characters after the last 'b'
        a_ind = pyramid.rfind('a') + 1
        a = int(pyramid[a_ind:(a_ind + 3)])
        b_ind = pyramid.rfind('b') + 1
        b = float(pyramid[b_ind:(b_ind + 3)])
        print('\ta: {}\tb: {}'.format(a, b))

        # Everything below depends only on the pyramid, so it is loaded once
        # per pyramid instead of once per summary.
        scus = buildSCUlist(pyramid)
        size_file = pyramid.replace('.p', '.size').replace('pyramids/', 'sizes/')
        count_by_weight, avg = getLayerSizes(size_file)
        max_coverage_score = maxRawScore(count_by_weight, avg)

        raw_scores = {}
        quality_scores = {}
        coverage_scores = {}
        comprehension_scores = {}
        for summary in summaries:
            # Summary files are named by an integer id; strip directory and
            # extension (a name without extension is used as it is).
            summary_base = summary[summary.rfind('/') + 1:]
            summary_dot = summary_base.rfind('.')
            if summary_dot != -1:
                summary_base = summary_base[:summary_dot]
            summary_name = int(summary_base)
            print('\t\t{}'.format(summary_name))

            sentences = sentencesFromSegmentations(summary)
            Graph = SummaryGraph(sentences, scus)
            independentSet = Graph.independentSet
            candidates = buildSCUcandidateList(independentSet)
            results = processResults(candidates, independentSet)
            rearranged_results = scusBySentences(results)
            score, matched_cus = getScore(rearranged_results, scus)

            # A summary that matched nothing has a maximum of 0; score it 0.0
            # instead of dividing by zero.
            max_quality_score = maxRawScore(count_by_weight, matched_cus)
            quality = float(score) / max_quality_score if max_quality_score else 0.0
            coverage = float(score) / max_coverage_score if max_coverage_score else 0.0
            comprehension = float(quality + coverage) / 2

            raw_scores[summary_name] = score
            quality_scores[summary_name] = quality
            coverage_scores[summary_name] = coverage
            comprehension_scores[summary_name] = comprehension

        # Turn the per-summary dicts into lists ordered by ascending summary
        # name, the order in which they are correlated with the human scores.
        ordered_names = sorted(raw_scores)
        raw_scores = [raw_scores[name] for name in ordered_names]
        quality_scores = [quality_scores[name] for name in ordered_names]
        coverage_scores = [coverage_scores[name] for name in ordered_names]
        comprehension_scores = [comprehension_scores[name] for name in ordered_names]

        thresholdCell = thresholdTableCell(threshold, a, b, raw_scores, quality_scores, coverage_scores, comprehension_scores)
        tableCells.append(thresholdCell)
        acc[threshold]['raw'][a][b].append(thresholdCell.raw)
        acc[threshold]['quality'][a][b].append(thresholdCell.quality)
        acc[threshold]['coverage'][a][b].append(thresholdCell.coverage)
        acc[threshold]['comprehension'][a][b].append(thresholdCell.comprehension)
    tables[threshold] = thresholdTable(threshold, tableCells)

for threshold, table in tables.items():
    setDataFrame(table)

















    












THRESHOLD 80
	a: 250	b: 1.0
		881197544
		881520246
		881745597
		883076719
		883178560
		883690042
		883707584
		884203758
		885640486
		885925399
		887016576
		887051477
		887582679
		887645490
		887897589
		887898669
		888277107
		888717500
		889198081
		889920032
	a: 250	b: 1.5
		881197544
		881520246
		881745597
		883076719
		883178560
		883690042
		883707584
		884203758
		885640486
		885925399
		887016576
		887051477
		887582679
		887645490
		887897589
		887898669
		888277107
		888717500
		889198081
		889920032
	a: 250	b: 2.0
		881197544
		881520246
		881745597
		883076719
		883178560
		883690042
		883707584
		884203758
		885640486
		885925399
		887016576
		887051477
		887582679
		887645490
		887897589
		887898669
		888277107
		888717500
		889198081
		889920032
	a: 250	b: 2.5
		881197544
		881520246
		881745597
		883076719
		883178560
		883690042
		883707584
		884203758
		885640486
		885925399
		887016576
		887051477
		887582679
		887645490
		887897589
		887898669
		888277107
		8

In [None]:
# Aggregate the correlation lists collected in ``acc`` into specialCell objects.
# NOTE(review): the 'variance' key maps to statistics.stdev, i.e. the standard
# deviation rather than the variance - confirm which one is wanted.
operations = {'average': statistics.mean, 'variance': statistics.stdev}
averageCells = []
varianceCells = []
# NOTE(review): cells of every threshold and every table type end up in the
# same two flat lists; neither the threshold nor the table type is kept on
# the cell - confirm that is intended.
for threshold, t_type in acc.items():
    # the inner loop reuses the name ``t_type`` for the table-type key
    for t_type, a_b in t_type.items():
        for a, b_list in a_b.items():
            for b, values in b_list.items():
                # NOTE(review): statistics.mean needs at least one value and
                # statistics.stdev at least two; (a, b) lists that received
                # fewer entries will raise StatisticsError here.
                averageCells.append(specialCell(a, b, operations['average'], values))
                varianceCells.append(specialCell(a, b, operations['variance'], values))