# FlexPlot a tool to display the accuracy of the FlexGrams algorithm

This tester has been written to provide an analytical tool to measure the accuracy of the FlexGrams algorithm on the basis of a list of known quotations [= masterlist]. The list contains exact citations, adaptations, allusions, and indeterminable quotations of the Gospel according to John as found in the Paedagogus of Clement of Alexandria.

This tool provides several functions to automate the creation of plots and tables on the basis of predefined parameter ranges for the algorithm. Its main purpose is to detect those parameters that output the best results:

1) All (or nearly all) exact citations [marked by 'C'] are detected,
2) All (or nearly all) adaptations [marked by 'Ad'] are found ('adaptations' are quotations marked by interpolations, omissions, or adaptations compared to the text quoted),
3) Allusions [marked by 'All'] are found as much as possible (however, allusions are many times very loose and difficult to trace by any qualititive tools like the FlexGrams algorithm)
4) Fewer false negatives (quotations that are not found, but present in the masterlist) are better,
5) Fewer false positives (matches found by the algorithm, but not present in the masterlist) are better.

### Imports of Python libraries

In [None]:
from collections import namedtuple, OrderedDict
from pprint import pprint
import pandas as pd

# import plotly
# import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, plot, iplot #, download_plotlyjs, 

# Run plotly offline...
init_notebook_mode(connected=False)

# If plot() or iplot() does not run properly, try to download plotly.js by saying:
# init_notebook_mode(connected=True)


### Definition of helper functions
Several functions are defined to process the citation schemes properly (`refToTuple()`, `tupleToRef()`, and `addSimpleRefs()`). Finally, a function (`orderResults()`) has been defined that provides the functionality to sort the pandas DataFrames (produced by the FlexGrams algorithm) on the basis of the full citation schemes of the sources.

In [None]:
# Sample values for trying out the converters below: three reference strings
# (single reference, short range, full range) followed by tuple forms.
ref1, ref2, ref3 = '1.2.4', '1.2.4-5', '1.2.4-1.3.1'

tup1 = (
    ('1', '2', '4'),
)
tup2 = (
    ('1', '2', '4'),
    ('1', '2', '5'),
)
tup3 = (
    ('1', '2', '4'),
    ('1', '3', '1'),
)
tup4 = (
    ('6', '64'),
    ('6', '65'),
)
# A flat tuple: one reference without the enclosing range tuple.
tup5 = ('1', '2', '3')
       
def refToTuple(ref, single=False):
    """Convert a dotted reference string into its tuple representation.

    Parameters
    ----------
    ref : str
        A plain reference ('1.2.4'), a short range whose end only names the
        last level ('1.2.4-5'), or a full range ('1.2.4-1.3.1').
    single : bool
        When true, `ref` is treated as one plain reference and a flat tuple
        of its levels is returned, e.g. ('1', '2', '4').

    Returns
    -------
    tuple
        * single=True: a flat tuple of level strings.
        * plain reference: a 1-tuple holding the level tuple.
        * short range: one level tuple for every reference in the range,
          both ends included.
        * full range: a 2-tuple (start levels, stop levels).

    Raises
    ------
    ValueError
        If the last level of a short range is not an integer.
    """
    if single:
        # The original computed this tuple but forgot to return it.
        return tuple(ref.split('.'))

    if '-' not in ref:
        return (tuple(ref.split('.')),)

    parts = ref.split('-')
    start, stop = parts[0], parts[1]

    if '.' in stop:
        # Full range: only the two end points are returned.
        return tuple(start.split('.')), tuple(stop.split('.'))

    # Short range: enumerate the last level from start to stop inclusive,
    # keeping the higher levels of the start reference.
    *prefix, first = start.split('.')
    return tuple(tuple(prefix + [str(number)])
                 for number in range(int(first), int(stop) + 1))

def tupleToRef(tup, single=False):
    """Convert a tuple representation back into a dotted reference string.

    Parameters
    ----------
    tup : tuple
        With single=False: a tuple of level tuples. One element yields a
        plain reference; several elements yield a range from the first to
        the last element.
        With single=True: one flat tuple of level strings.
    single : bool
        Whether `tup` is a flat level tuple rather than a tuple of them.

    Returns
    -------
    str
        '1.2.4' for a single reference, '1.2.4-5' when both ends of the
        range share all levels but the last, otherwise the full form
        '1.2.4-1.3.1'.

    Raises
    ------
    IndexError
        If `tup` is empty (single=False).
    """
    if single:
        return '.'.join(tup)

    first, last = tup[0], tup[-1]
    if len(tup) == 1:
        return '.'.join(first)

    # The range always ends at the LAST element, so expanded ranges of
    # three or more references round-trip correctly. The short form is
    # only valid when every level above the last one is identical.
    if first[:-1] == last[:-1]:
        return '.'.join(first) + '-' + last[-1]
    return '.'.join(first) + '-' + '.'.join(last)

# Show the converters at work on the sample values, one result per line.
print(*map(refToTuple, (ref1, ref2, ref3)), sep='\n')

print(*map(tupleToRef, (tup1, tup2, tup3, tup4)), sep='\n')
print(tupleToRef(tup5, single=True))

# data is expected to be a pandas DataFrame!
def addSimpleRefs(dataframe):
    """Add the columns `bibl_simple` and `patr_simple` to `dataframe`.

    Each new value is the matching `bibl_start` / `patr_start` reference
    with its last dot-separated level removed ('1.2.3' -> '1.2'); a
    reference without a dot becomes the empty string.

    The frame is modified in place and also returned.
    """
    def _withoutLastLevel(reference):
        # Everything before the final '.', or '' when there is no '.'.
        return reference.rpartition('.')[0]

    biblParents = [_withoutLastLevel(ref) for ref in dataframe.bibl_start]
    patrParents = [_withoutLastLevel(ref) for ref in dataframe.patr_start]

    dataframe['bibl_simple'] = biblParents
    dataframe['patr_simple'] = patrParents
    return dataframe

# Order pandas DataFrames on the basis of column values
def orderResults(data, sortIndex1, sortIndex2, column=None):
    """Sort `data` in place by the mapped values of two of its columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to sort; it is modified in place and returned.
    sortIndex1, sortIndex2 : mapping
        Passed to `Series.map` to turn the values of `column[0]` and
        `column[1]` into sort keys. The first key has priority.
    column : sequence of two column names
        Required in practice: the default of None raises a TypeError
        because it cannot be indexed.

    Returns
    -------
    pandas.DataFrame
        The same object as `data`, sorted ascending on both keys.
    """
    data['sort_index1'] = data[column[0]].map(sortIndex1)
    data['sort_index2'] = data[column[1]].map(sortIndex2)
    data.sort_values(['sort_index1', 'sort_index2'], ascending=[True, True], inplace=True)
    # Drop the helper columns by keyword: passing the axis positionally
    # (`drop(name, 1)`) is no longer accepted by pandas 2.x.
    data.drop(columns=['sort_index1', 'sort_index2'], inplace=True)
    return data

### Import and run the FlexGrams algorithm on the defined texts
The flexgrams module returns the `matches`-object that contains all the results and methods produced by the algorithm. By calling the `matches.refResult()` method, a pandas DataFrame will be returned together with the full citation schemes of both sources. These citation schemes will be used during later steps to process the references of the results properly. The `matches.refResult()` method has several parameters. The `order` parameter defines which input text will be used to sort the results. The `b_levels` and `c_levels` parameters define how the references of the base_path and the comp_path of the FlexGrams algorithm will be returned.

In [None]:
from flexgrams import FlexGrams
from tf.fabric import Timestamp
import os

# Root of the corpus data: <REPO>/tf/<VERSION> with '~' expanded.
# VERSION carries its own trailing slash, so TF_DIR ends in '/' and the
# relative corpus paths below are appended to it directly.
REPO = '~/github/pthu/patristics'
VERSION = '1.0/'
TF_DIR = os.path.expanduser(f'{REPO}/tf/{VERSION}')

# Created before the run and reported on via tm.info() afterwards
# (presumably elapsed time -- confirm against tf.fabric.Timestamp).
tm = Timestamp()

# Run FlexGrams with the Gospel of John as base text and the Paedagogus
# as comparison text. The resulting object is kept in `matches`; its
# parameters are read back later for the plot title.
matches = FlexGrams(base_path=TF_DIR + 'new_testament/Brooke Foss Westcott, Fenton John Anthony Hort/New Testament - John', 
                    comp_path=TF_DIR + 'patristics/Clement Of Alexandria/Paedagogus',
                    ngram=4, skip=1, number=1, ngram_type='unordered', context=5, 
                    distance_base=5, distance_comp=5, self_match=False, mode=2)

# parallels: the results as a pandas DataFrame; refsBase / refsComp: the full
# citation schemes of the base and comparison text, used for sorting and
# range look-ups in later cells.
parallels, refsBase, refsComp = matches.refResult(order='base', c_levels=(0, 2, 3))

tm.info('This is what it takes...')

In [None]:
# To activate plotly online, you have to fill in your username and api_key!
# from plotly import tools
# tools.set_credentials_file(username='...', api_key='...')

# Load the tab-separated masterlist; the column names are supplied explicitly
# and the reference columns are kept as plain strings.
masterlist = pd.read_csv(
    'data/masterlist_clement_range',
    delimiter='\t',
    names=[
        'bibl_start', 'bibl_stop', 'patr_start', 'patr_stop',
        'typ', 'conf', 'source', 'found',
    ],
    index_col=False,
    dtype={
        'bibl_start': 'object',
        'bibl_stop': 'object',
        'patr_start': 'object',
        'patr_stop': 'object',
        'found': 'bool',
    },
)

# Map every reference to its position in the citation scheme of its text,
# then sort both frames by position in the comparison text first and in
# the base text second.
sortIndexBase = {ref: position for position, ref in enumerate(refsBase)}
sortIndexComp = {ref: position for position, ref in enumerate(refsComp)}
masterlist = orderResults(
    masterlist, sortIndexComp, sortIndexBase,
    column=['patr_start', 'bibl_start'],
)
parallels = orderResults(
    parallels, sortIndexComp, sortIndexBase,
    column=['patr_start', 'bibl_start'],
)
# print(parallels)

def checkMatches(masterlist, parallels, refs_base=None, refs_comp=None):
    """Compare the FlexGrams results with the masterlist.

    A masterlist entry and a parallel count as the same quotation when
    their reference ranges overlap in BOTH texts (at least one shared
    reference in the base text and one in the comparison text).

    Parameters
    ----------
    masterlist : pandas.DataFrame
        Known quotations with the columns `bibl_start`, `bibl_stop`,
        `patr_start`, `patr_stop` and `typ`.
    parallels : pandas.DataFrame
        Results of the algorithm with the same four reference columns.
    refs_base, refs_comp : sequence of str, optional
        Full citation schemes of the base and comparison text. When
        omitted, the module-level `refsBase` / `refsComp` are used.

    Returns
    -------
    (masterlist, parallels)
        The same two frames, modified in place:
        * masterlist['found'] -- True when some parallel overlaps the entry;
        * parallels['found']  -- True when some masterlist entry overlaps it;
        * parallels['typ']    -- `typ` of the first overlapping masterlist
          entry, or 'undefined' when there is none.

    Raises
    ------
    ValueError
        If a start/stop reference does not occur in its citation scheme.
    """
    if refs_base is None:
        refs_base = refsBase
    if refs_comp is None:
        refs_comp = refsComp

    def _spans(frame):
        # Per row: (set of base refs, set of comparison refs) covered by
        # the row's start..stop range. Computed once per row rather than
        # once per masterlist/parallel pair.
        spans = []
        for row in frame.itertuples():
            bibl = set(refs_base[refs_base.index(row.bibl_start):
                                 refs_base.index(row.bibl_stop) + 1])
            patr = set(refs_comp[refs_comp.index(row.patr_start):
                                 refs_comp.index(row.patr_stop) + 1])
            spans.append((bibl, patr))
        return spans

    def _overlap(left, right):
        # True when the two spans share a reference in both texts.
        return bool(left[0] & right[0]) and bool(left[1] & right[1])

    masterSpans = _spans(masterlist)
    parallelSpans = _spans(parallels)
    masterTyps = [match.typ for match in masterlist.itertuples()]

    # Masterlist side: was the known quotation found by the algorithm?
    matchList = [
        any(_overlap(masterSpan, parSpan) for parSpan in parallelSpans)
        for masterSpan in masterSpans
    ]

    # Result side: is the parallel a known quotation, and of which type?
    parList = []
    typList = []
    for parSpan in parallelSpans:
        for masterSpan, typ in zip(masterSpans, masterTyps):
            if _overlap(parSpan, masterSpan):
                parList.append(True)
                typList.append(typ)
                break
        else:
            parList.append(False)
            typList.append('undefined')

    # Use item assignment: attribute assignment (`frame.found = ...`) does
    # not create a column when the frame does not have one yet.
    masterlist['found'] = matchList
    parallels['found'] = parList
    parallels['typ'] = typList

    return masterlist, parallels


### Define the prime plot function `flexPlot()` 

In [None]:
# Mark which quotations were found, then add the simple references to be
# used as tick labels in the plots.
masterlist, parallels = checkMatches(masterlist, parallels)
masterlist, parallels = addSimpleRefs(masterlist), addSimpleRefs(parallels)

def flexPlot(masterlist, parallels, refs_base, refs_comp, categories=None, ignore_typs=None, table=False):
    """Write a scatter plot of the FlexGrams results against the masterlist.

    The x axis holds the references of the base text (`bibl_start`), the
    y axis those of the comparison text (`patr_start`). The plot title is
    built from the parameters of the module-level `matches` object.

    Parameters
    ----------
    masterlist, parallels : pandas.DataFrame
        Frames with a boolean `found` column and a `typ` column.
        `parallels` also needs `base_text` and `comp_text` for the hover
        text of the true positives.
    refs_base, refs_comp : sequence of str
        Full citation schemes; they determine the order of the axis values.
    categories : collection of str, optional
        Which traces to draw: 'truePos', 'falsePos', 'falseNeg'. None
        selects all of them. The 'falsePos' trace is currently disabled
        in the code below, so that entry has no effect.
    ignore_typs : collection of str, optional
        Rows whose `typ` is in this collection are left out.
    table : bool
        When true, also write both frames as HTML tables.

    Side effects
    ------------
    Writes 'styled-scatter.html' and, with table=True,
    'jupyter-table1.html' (masterlist) and 'jupyter-table2.html'
    (parallels).
    """
    if categories is None:
        categories = ('truePos', 'falsePos', 'falseNeg')

    # Filter masterlist and parallels on the basis of ignore_typs
    if ignore_typs is not None:
        masterlist = masterlist[~masterlist['typ'].isin(ignore_typs)]    #(['Indet', 'All', 'Ad'])]
        parallels = parallels[~parallels['typ'].isin(ignore_typs)]    #(['Indet', 'All', 'Ad'])]

    # Define categories to be plotted (explicit comparison kept so the
    # selection does not depend on the dtype of the `found` column)
    truePosit = parallels[parallels['found'] == True]
    falsePosit = parallels[parallels['found'] == False]
    falseNegat = masterlist[masterlist['found'] == False]

    # Collect the references that occur in any category once, then keep
    # them in the order of the citation schemes for use on the axes.
    plottedBase = (set(truePosit.bibl_start)
                   | set(falsePosit.bibl_start)
                   | set(falseNegat.bibl_start))
    plottedComp = (set(truePosit.patr_start)
                   | set(falsePosit.patr_start)
                   | set(falseNegat.patr_start))
    xaxisList = [ref for ref in refs_base if ref in plottedBase]
    yaxisList = [ref for ref in refs_comp if ref in plottedComp]

    # Define the data to be plotted
    data = []
    if 'truePos' in categories:
        truePositives = go.Scatter(
                            x = truePosit.bibl_start,
                            y = truePosit.patr_start,
                            name = 'True Positive',
                            mode = 'markers',
                            marker = dict(
                                size = 15,
                                color = 'green',
                                ),
                            text = 'type: ' + truePosit.typ + '<br> John: ' + truePosit.base_text + '<br> Clem: ' + truePosit.comp_text,
                        )
        data.append(truePositives)
    # The false-positive trace is disabled; requesting 'falsePos' is a no-op.
#     if 'falsePos' in categories:
#         falsePositives = go.Scatter(
#                             x = falsePosit.bibl_start,
#                             y = falsePosit.patr_start,
#                             name = 'False Positive',
#                             mode = 'markers',
#                             marker = dict(
#                                 size = 9,
#                                 color = 'orange',
#                                 ),
#                             text = 'type: ' + falsePosit.typ + '<br> John: ' + falsePosit.base_text + '<br> Clem: ' + falsePosit.comp_text,
#                         )
#         data.append(falsePositives)
    if 'falseNeg' in categories:
        falseNegatives = go.Scatter(
                            x = falseNegat.bibl_start,
                            y = falseNegat.patr_start,
                            name = 'False Negative',
                            mode = 'markers',
                            marker = dict(
                                size = 9,
                                color = 'red',
                                ),
                            text = 'type: ' + falseNegat.typ, #<br>{falseNegat.base_text}<br>{falseNegat.comp_text}',
                        )
        data.append(falseNegatives)

    # Define some layout parameters
    layout = go.Layout(
        title = f'Ngram = {matches.ngram}   Skip = {matches.skip}   Ngram-type = {matches.ngram_type}   Number = {matches.number}   Mode = {matches.mode}<br>Distance base = {matches.distance_base}   Distance comp = {matches.distance_comp}<br>',
        xaxis = dict(
            title = 'Gospel According to John',
#             categoryorder = 'array',
#             categoryarray = xaxisList,
#             type = 'category',
            # NOTE(review): `dtick` normally takes a number or string, not a
            # dict holding `ticktext`. Left unchanged because the rendered
            # effect has not been verified here -- confirm against the
            # plotly axis reference before changing it.
            dtick = dict(
                ticktext = xaxisList),
        ),
        yaxis = dict(
            categoryorder = 'array',
            categoryarray = yaxisList,
            type = 'category',
            showgrid = True,
            title = 'Paedagogus - Clement of Alexandria',
#             dtick = dict(
#                 tickval = yaxisList,
#                 ticktext = ['.'.join(ref.split('.')[:-1]) for ref in yaxisList],
#             ),
        ),
    )
    # Create the plot
    fig = dict(data=data, layout=layout)
    plot(fig, filename='styled-scatter.html')
#     iplot(fig, filename='scatter-mode')

    # If table == True: create tables of the results
    if table:
        tableMaster = ff.create_table(masterlist)
        tablePar = ff.create_table(parallels)
        plot(tableMaster, filename='jupyter-table1.html')
        # Separate file: writing both tables to the same name made the
        # second table overwrite the first.
        plot(tablePar, filename='jupyter-table2.html')

Execute the `flexPlot()` function

In [None]:
# Plot the current run without type filtering and without tables.
# NOTE: 'falsePos' is requested, but that trace is commented out in flexPlot().
flexPlot(
    masterlist,
    parallels,
    refsBase,
    refsComp,
    categories=['truePos', 'falsePos', 'falseNeg'],
    ignore_typs=[],
    table=False,
)
# fig = dict(data=data, layout=layout)
# iplot(fig, filename='styled-scatter')

In [None]:
# Read the masterlist from disk again (same file and column layout as in
# the earlier cell).
masterlist = pd.read_csv(
    'data/masterlist_clement_range',
    delimiter='\t',
    names=[
        'bibl_start', 'bibl_stop', 'patr_start', 'patr_stop',
        'typ', 'conf', 'source', 'found',
    ],
    index_col=False,
    dtype={
        'bibl_start': 'object',
        'bibl_stop': 'object',
        'patr_start': 'object',
        'patr_stop': 'object',
        'found': 'bool',
    },
)

def flexAnalytics(
    masterlist, range_ngram=(4, 5), range_skip=(0, 1),
    range_number=(1, 3), range_order=('ordered',), range_base_dist=(1, 1),
    range_comp_dist=(1, 1), range_mode=(2, 2)):
    """Run FlexGrams for every parameter combination and tabulate the scores.

    Parameters
    ----------
    masterlist : pandas.DataFrame
        Known quotations. Its `found` column is rewritten by
        `checkMatches()` on every run.
    range_ngram, range_skip, range_number, range_base_dist,
    range_comp_dist, range_mode : (first, last)
        Inclusive integer ranges for the corresponding FlexGrams parameter.
    range_order : iterable of str
        The `ngram_type` values to try.

    Returns
    -------
    pandas.DataFrame
        One row per combination: the parameters, followed by the total and
        the numbers of true positives, false positives and false negatives;
        true positives and false negatives are also split per quotation
        type ('C', 'Ad', 'All', 'Indet'). The frame is printed as well.

    Notes
    -----
    `checkMatches()` is called with two arguments and therefore uses the
    module-level `refsBase` / `refsComp`, not the reference lists returned
    by the individual runs.
    """
    from itertools import product

    def _inclusive(bounds):
        # (first, last) -> range that includes both ends
        return range(bounds[0], bounds[1] + 1)

    paramNames = ('ngram', 'skip', 'number', 'order', 'base_dist', 'comp_dist', 'mode')
    typs = ('C', 'Ad', 'All', 'Indet')
    columns = (
        list(paramNames)
        + ['total', 'truePosit']
        + ['truePosit' + typ for typ in typs]
        + ['falsePosit', 'falseNegat']
        + ['falseNegat' + typ for typ in typs]
    )
    data = OrderedDict((name, []) for name in columns)

    # product() iterates like the equivalent nested for-loops: the last
    # argument varies fastest.
    combinations = product(
        _inclusive(range_ngram), _inclusive(range_skip), _inclusive(range_number),
        range_order, _inclusive(range_base_dist), _inclusive(range_comp_dist),
        _inclusive(range_mode),
    )

    for params in combinations:
        n, s, num, o, bd, cd, m = params
        matches = FlexGrams(base_path=TF_DIR + 'new_testament/Brooke Foss Westcott, Fenton John Anthony Hort/New Testament - John',
                            comp_path=TF_DIR + 'patristics/Clement Of Alexandria/Paedagogus',
                            ngram=n, skip=s, number=num, ngram_type=o, context=0,
                            distance_base=bd, distance_comp=cd, self_match=False, mode=m)

        # Only the result frame is needed here; see Notes about the refs.
        parallels, _refsBase, _refsComp = matches.refResult(order='base', c_levels=(0, 2, 3))

        masterlist, parallels = checkMatches(masterlist, parallels)

        truePosit = parallels[parallels['found'] == True]
        falsePosit = parallels[parallels['found'] == False]
        falseNegat = masterlist[masterlist['found'] == False]

        for name, value in zip(paramNames, params):
            data[name].append(value)

        data['total'].append(len(truePosit) + len(falsePosit) + len(falseNegat))
        data['truePosit'].append(len(truePosit))
        data['falsePosit'].append(len(falsePosit))
        data['falseNegat'].append(len(falseNegat))
        for typ in typs:
            data['truePosit' + typ].append(len(truePosit[truePosit['typ'] == typ]))
            data['falseNegat' + typ].append(len(falseNegat[falseNegat['typ'] == typ]))

    analytics = pd.DataFrame(data)
    print(analytics)
    return analytics

In [None]:
# Run the parameter sweep with the default ranges of flexAnalytics(); the
# resulting DataFrame is kept in `data` for the bar chart below.
data = flexAnalytics(masterlist)

In [None]:
# One bar trace per outcome, with one bar for every row of `data`; the
# value is also shown as a label on the bar.
plotData = [
    go.Bar(
        x=data.index,
        y=data[column],
        text=data[column],
        textposition='auto',
        name=label,
        marker=dict(color=colour),
    )
    for column, label, colour in (
        ('truePosit', 'true positives', 'green'),
        ('falseNegat', 'false negatives', 'red'),
        ('falsePosit', 'false positives', 'orange'),
    )
]

# Bars of one row are drawn side by side ('group'); 'stack' is the
# alternative the author left disabled.
layout = go.Layout(barmode='group', title='results FlexGrams')

fig = go.Figure(data=plotData, layout=layout)

plot(fig, filename='pandas-bar-chart-layout.html')