In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import os.path
import sys

In [None]:
def errorfill(x, y, yerr, color=None, alpha_fill=0.1, ax=None, linestyle=None):
    """Plot ``y`` against ``x`` and shade an error band around the line.

    Parameters
    ----------
    x : array-like
        X coordinates, passed to matplotlib unchanged.
    y : array-like
        Central values; converted to a float array.
    yerr : scalar or sequence
        * A scalar, or a sequence with the same length as ``y``, is treated
          as a symmetric error. The lower bound ``y - yerr`` is clipped at 0
          and the upper bound ``y + yerr`` is clipped at 1.
        * A sequence of length 2 is unpacked as explicit ``(ymin, ymax)``
          bounds, which are used as given (no clipping).
    color : optional
        Colour for both line and band. When ``None``, matplotlib picks the
        line colour and the band reuses it.
    alpha_fill : float
        Opacity of the shaded band.
    ax : optional
        Axes to draw on; defaults to the current axes (``plt.gca()``).
    linestyle : optional
        Line style forwarded to ``ax.plot``.

    Raises
    ------
    ValueError
        If ``yerr`` has none of the supported shapes. The check happens
        before anything is drawn.
    """
    ax = ax if ax is not None else plt.gca()
    y = np.asarray(y, dtype=float)

    if np.isscalar(yerr) or len(yerr) == len(y):
        # One-sided clipping on each bound, as before: the lower bound never
        # drops below 0 and the upper bound never exceeds 1.
        ymin = np.clip(y - yerr, 0.0, None)
        ymax = np.clip(y + yerr, None, 1.0)
    elif len(yerr) == 2:
        ymin, ymax = yerr
    else:
        raise ValueError(
            "yerr must be a scalar, a sequence of len(y) == %d, or a "
            "(ymin, ymax) pair; got a sequence of length %d" % (len(y), len(yerr))
        )

    # Let matplotlib resolve a missing colour from its property cycle, then
    # read it back from the line so that the band matches. This avoids the
    # private, version-dependent colour-cycle API.
    line, = ax.plot(x, y, color=color, linestyle=linestyle)
    ax.fill_between(x, ymax, ymin, color=line.get_color(), alpha=alpha_fill)

# DM3

In [None]:
# Draw one F1-vs-labelled-examples figure per task for the 'dm3' scenario and
# save it as <scenario>/graphs/<task>.pdf.
scenario = 'dm3'
tasks = ['dbpedia_dnb','dbpedia_viaf','abt_buy','amazon_google','wdc_watches','wdc_cameras','wdc_phones','wdc_headphones']

# Per-task constants: (PL, PL2000, PL1000, ylim).
# The first three are the heights of the horizontal black reference lines;
# for wdc_phones / wdc_headphones the last two are labelled PL200 / PL100.
# Looking settings up by key (instead of a chain of independent ifs) makes an
# unknown task fail loudly rather than silently reusing the previous task's
# values.
task_settings = {
    'dbpedia_viaf':   (0.954, 0.851, 0.804, (0.2, 1)),
    'dbpedia_dnb':    (0.92,  0.779, 0.700, (0.2, 1)),
    'abt_buy':        (0.667, 0.496, 0.345, (0.1, 0.8)),
    'amazon_google':  (0.733, 0.558, 0.442, (0, 0.8)),
    'wdc_cameras':    (0.746, 0.653, 0.621, (0.4, 0.9)),
    'wdc_watches':    (0.917, 0.529, 0.448, (0.2, 1)),
    'wdc_phones':     (0.922, 0.69,  0.513, (0, 1)),
    'wdc_headphones': (0.914, 0.778, 0.530, (0.2, 1)),
}

for task in tasks:
    if task not in task_settings:
        raise KeyError("no plot settings defined for task %r in scenario %r" % (task, scenario))
    pl_value, pl2000_value, pl1000_value, ylim = task_settings[task]

    # Result tables; each is expected to provide 'F1 Mean' and 'F1 Std' columns.
    task_dir = os.path.join(scenario, task)
    al_random_init = pd.read_csv(os.path.join(task_dir, 'al_random_init_' + task + '.csv'))
    al_tl = pd.read_csv(os.path.join(task_dir, 'al_tl_' + task + '.csv'))
    al_tl_da = pd.read_csv(os.path.join(task_dir, 'al_tl_da_' + task + '.csv'))
    al_tl_include_source = pd.read_csv(os.path.join(task_dir, 'al_tl_include_source_' + task + '.csv'))

    # X positions (number of labelled target examples) depend on the task.
    small_task = task in ['wdc_phones', 'wdc_headphones']
    if small_task:
        query_num = np.arange(0, 220, 20)
    elif task in ['wdc_watches']:
        query_num = np.arange(0, 4100, 100)
    else:
        query_num = np.arange(0, 2100, 100)

    n_points = len(al_random_init['F1 Mean'])
    if len(query_num) != n_points:
        raise ValueError(
            "%s/%s: expected %d rows in al_random_init, found %d"
            % (scenario, task, len(query_num), n_points)
        )

    # Constant series so that the reference lines span the same x range as
    # the learning curves.
    passive_learning = [pl_value] * n_points
    pl2000 = [pl2000_value] * n_points
    pl1000 = [pl1000_value] * n_points

    fig, ax = plt.subplots()

    # Reference lines. Each legend handle is bound to its own series (the
    # PL2000 / PL1000 handles used to be drawn from the PL values and the real
    # lines were added separately afterwards).
    pl2_label, pl1_label = ("PL200", "PL100") if small_task else ("PL2000", "PL1000")
    pas_, = ax.plot(query_num, passive_learning, color="#000000", label="PL")
    pl2_, = ax.plot(query_num, pl2000, color="#000000", label=pl2_label, linestyle='--')
    pl1_, = ax.plot(query_num, pl1000, color="#000000", label=pl1_label, linestyle='dotted')
    handles = [pas_, pl2_, pl1_]

    # (results, colour, line style, legend label) for every learning curve.
    curves = [
        (al_random_init, "#D55E00", '--', "al_random_init"),
        (al_tl, "#2F73B2", 'dotted', "al_tl"),
        (al_tl_da, "#4E9E73", 'dashdot', "al_tl_da"),
        (al_tl_include_source, "#6B00D7", 'dotted', "al_tl_include_source"),
    ]
    for results, curve_color, curve_style, curve_label in curves:
        # The labelled line supplies the legend handle; errorfill redraws the
        # same line in the same style and adds the +/- std band. Using one
        # style for both keeps the legend and the plotted curve in agreement.
        handle, = ax.plot(query_num, results['F1 Mean'], color=curve_color,
                          label=curve_label, linestyle=curve_style)
        handles.append(handle)
        errorfill(query_num, results['F1 Mean'].to_numpy(), results['F1 Std'].to_numpy(),
                  color=curve_color, linestyle=curve_style, ax=ax)

    ax.set_yticks(np.arange(0, 1.1, step=0.1))
    ax.set_ylim(ylim)

    ax.grid(True)
    gridlines = ax.get_xgridlines()
    for line in gridlines:
        line.set_linestyle('-.')
    ax.set_xlabel("# Labeled Target Examples", fontsize=10)
    ax.set_ylabel("F1", fontsize=10)
    ax.tick_params(axis='both', labelsize=10)

    # Legend sits outside the axes, to the right of the plot area.
    ax.legend(handles=handles, fontsize=9, bbox_to_anchor=(1.01, 1), loc='upper left')

    # Create the output directory on demand so that a fresh checkout can run
    # this cell without manual setup.
    graphs_dir = os.path.join(scenario, 'graphs')
    os.makedirs(graphs_dir, exist_ok=True)
    fig.savefig('%s.pdf' % (os.path.join(graphs_dir, task)), bbox_inches='tight', format='pdf')

# Ditto

In [None]:
# Draw one F1-vs-labelled-examples figure per task for the 'ditto' scenario
# and save it as <scenario>/graphs/<task>.pdf.
scenario = 'ditto'
# Alternative task selections:
#tasks = ['dbpedia_dnb','dbpedia_viaf','abt_buy','amazon_google','wdc_watches','wdc_cameras','wdc_phones','wdc_headphones']
#tasks = ['dbpedia_dnb','amazon_google','wdc_watches','wdc_cameras']
tasks = ['dbpedia_viaf']

# Per-task constants: (PL, PL2000, PL1000, ylim).
# The first three are the heights of the horizontal black reference lines.
# Looking settings up by key (instead of a chain of independent ifs) makes a
# task without settings fail loudly rather than silently reusing the values
# (already expanded to full-length lists) of the previous iteration.
task_settings = {
    'dbpedia_viaf':   (0.992, 0.964, 0.950, (0.2, 1)),
    'dbpedia_dnb':    (0.989, 0.977, 0.964, (0.6, 1)),
    'abt_buy':        (0.957, 0.887, 0.830, (0.1, 1)),
    'amazon_google':  (0.821, 0.801, 0.721, (0.1, 1)),
    'wdc_cameras':    (0.901, 0.849, 0.808, (0.3, 1)),
    'wdc_watches':    (0.969, 0.864, 0.749, (0.2, 1)),
    # Not enabled for this scenario:
    #'wdc_phones':     (0.919, 0.69,  0.513, (0, 1)),
    #'wdc_headphones': (0.963, 0.778, 0.530, (0.2, 1)),
}

for task in tasks:
    if task not in task_settings:
        raise KeyError("no plot settings defined for task %r in scenario %r" % (task, scenario))
    pl_value, pl2000_value, pl1000_value, ylim = task_settings[task]

    # Result tables; each is expected to provide 'F1 Mean' and 'F1 Std' columns.
    # The al_tl_da variant is not loaded or plotted for this scenario.
    task_dir = os.path.join(scenario, task)
    al_random_init = pd.read_csv(os.path.join(task_dir, 'al_random_init_' + task + '.csv'))
    al_tl = pd.read_csv(os.path.join(task_dir, 'al_tl_' + task + '.csv'))
    al_tl_include_source = pd.read_csv(os.path.join(task_dir, 'al_tl_include_source_' + task + '.csv'))

    # X positions (number of labelled target examples) depend on the task.
    small_task = task in ['wdc_phones', 'wdc_headphones']
    if small_task:
        query_num = np.arange(0, 220, 20)
    elif task in ['wdc_watches']:
        query_num = np.arange(0, 4200, 200)
    else:
        query_num = np.arange(0, 2100, 100)

    n_points = len(al_random_init['F1 Mean'])
    if len(query_num) != n_points:
        raise ValueError(
            "%s/%s: expected %d rows in al_random_init, found %d"
            % (scenario, task, len(query_num), n_points)
        )

    # Constant series so that the reference lines span the same x range as
    # the learning curves.
    passive_learning = [pl_value] * n_points
    pl2000 = [pl2000_value] * n_points
    pl1000 = [pl1000_value] * n_points

    fig, ax = plt.subplots()

    # Reference lines. Each legend handle is bound to its own series (the
    # PL2000 / PL1000 handles used to be drawn from the PL values and the real
    # lines were added separately afterwards).
    pl2_label, pl1_label = ("PL200", "PL100") if small_task else ("PL2000", "PL1000")
    pas_, = ax.plot(query_num, passive_learning, color="#000000", label="PL")
    pl2_, = ax.plot(query_num, pl2000, color="#000000", label=pl2_label, linestyle='--')
    pl1_, = ax.plot(query_num, pl1000, color="#000000", label=pl1_label, linestyle='dotted')
    handles = [pas_, pl2_, pl1_]

    # (results, colour, line style, legend label) for every learning curve.
    curves = [
        (al_random_init, "#D55E00", '--', "al_random_init"),
        (al_tl, "#2F73B2", 'dotted', "al_tl"),
        (al_tl_include_source, "#6B00D7", 'dotted', "al_tl_include_source"),
    ]
    for results, curve_color, curve_style, curve_label in curves:
        # The labelled line supplies the legend handle; errorfill redraws the
        # same line in the same style and adds the +/- std band.
        handle, = ax.plot(query_num, results['F1 Mean'], color=curve_color,
                          label=curve_label, linestyle=curve_style)
        handles.append(handle)
        errorfill(query_num, results['F1 Mean'].to_numpy(), results['F1 Std'].to_numpy(),
                  color=curve_color, linestyle=curve_style, ax=ax)

    ax.set_yticks(np.arange(0, 1.1, step=0.1))
    ax.set_ylim(ylim)

    ax.grid(True)
    gridlines = ax.get_xgridlines()
    for line in gridlines:
        line.set_linestyle('-.')
    ax.set_xlabel("# Labeled Target Examples", fontsize=10)
    ax.set_ylabel("F1", fontsize=10)
    ax.tick_params(axis='both', labelsize=10)

    # Legend sits outside the axes, to the right of the plot area.
    ax.legend(handles=handles, fontsize=9, bbox_to_anchor=(1.01, 1), loc='upper left')

    # Create the output directory on demand so that a fresh checkout can run
    # this cell without manual setup.
    graphs_dir = os.path.join(scenario, 'graphs')
    os.makedirs(graphs_dir, exist_ok=True)
    fig.savefig('%s.pdf' % (os.path.join(graphs_dir, task)), bbox_inches='tight', format='pdf')

In [None]:
# Print a LaTeX table of the data-augmentation labels averaged over three runs.
import json

# Columns holding the JSON-encoded per-run label sequences.
_da_run_columns = ('Run 1: da labels', 'Run 2: da labels', 'Run 3: da labels')


def _mean_da_labels(row):
    """Return the position-wise mean over the three runs, each rounded with ``round``."""
    per_run = [json.loads(row[column]) for column in _da_run_columns]
    totals = [sum(values) for values in zip(*per_run)]
    return [round(total / 3) for total in totals]


df_da = pd.read_csv(os.path.join('final/deepmatcher','amazon_google','al_tl_da_amazon_google.csv'))
mean_cm = df_da.apply(_mean_da_labels, axis=1)

# One table column per CSV row (keyed by its index), one table row per position
# in the averaged sequences.
latex_source = pd.DataFrame.from_dict(dict(zip(mean_cm.index, mean_cm.values))).to_latex(index=False)
print(latex_source)