In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter

%matplotlib inline
# Grid settings: enabled on every axes, drawn below the plotted artists,
# as thin black ':' lines.
mpl.rcParams.update({
    'axes.axisbelow': True,
    'axes.grid': True,
    'grid.color': 'k',
    'grid.linestyle': ':',
    'grid.linewidth': 0.5,
})

# Line settings: explicit dash patterns that are not scaled by the line width.
mpl.rcParams.update({
    'lines.linewidth': 1.5,  # was 1.0
    'lines.dashed_pattern': [6, 6],
    'lines.dashdot_pattern': [3, 5, 1, 5],
    'lines.dotted_pattern': [1, 3],
    'lines.scale_dashes': False,
})

# Patch settings,
# via https://matplotlib.org/users/dflt_style_changes.html#patch-edges-and-color
mpl.rcParams.update({
    'patch.force_edgecolor': True,
    'patch.facecolor': 'b',
})

# Global font size,
# via https://stackoverflow.com/questions/3899980/how-to-change-the-font-size-on-a-matplotlib-plot
mpl.rcParams['font.size'] = 13

In [None]:
import numpy as np
import pandas as pd
import subprocess
import json

Although this notebook runs on macOS, we use GNU grep for searching. It is installed via Homebrew, which prefixes the command with a `g` (i.e. `ggrep`).
See https://apple.stackexchange.com/a/193300/224617

In [None]:
def get_allstats_files(static_positive_search_strings, static_negative_search_strings, dynamic_search_strings, if_two_take_first = False):
    """Find one ``all_stats.json`` file per dynamic search string using GNU grep.

    A shell pipeline of ``ggrep`` calls is built and executed once per entry of
    ``dynamic_search_strings``:

    1. ``ggrep -l`` over all candidate files for the first positive string,
    2. ``xargs ggrep -l`` for every further positive string (file must match),
    3. ``xargs ggrep -L`` for every negative string (file must NOT match),
    4. ``xargs ggrep -l`` for the current dynamic string.

    Parameters
    ----------
    static_positive_search_strings : list of str
        Patterns every result file must contain. Must hold at least one entry
        (the first one seeds the pipeline; an empty list raises ``IndexError``).
    static_negative_search_strings : list of str
        Patterns no result file may contain.
    dynamic_search_strings : list of str
        One additional required pattern per desired result file.
    if_two_take_first : bool, optional
        If true, exactly two matches for a dynamic string are tolerated and the
        first one reported by grep is used.

    Returns
    -------
    list of str
        One file path per dynamic search string, in the same order.

    Raises
    ------
    RuntimeError
        If grep wrote anything to stderr, if no file matched, or if the number
        of matching files is ambiguous.
    """
    search_folder = "<PROJECT_DIR>/output/XX_runs"
    search_files = "{}/**/all_stats.json".format(search_folder)

    # NOTE(review): the search strings are interpolated into a shell command
    # inside double quotes without further escaping. Callers rely on this
    # (they pass shell-escaped quotes), so only use trusted patterns here.
    static_grep_string = 'ggrep -l "{}" {}'.format(static_positive_search_strings[0], search_files)
    for s in static_positive_search_strings[1:]:
        static_grep_string += ' | xargs ggrep -l "{}"'.format(s)
    for s in static_negative_search_strings:
        static_grep_string += ' | xargs ggrep -L "{}"'.format(s)

    result_files = []
    for s in dynamic_search_strings:
        grep_string = static_grep_string + ' | xargs ggrep -l "{}"'.format(s)
        # stderr has to be piped explicitly; otherwise communicate() returns
        # None for it and grep failures could never be detected.
        grep_process = subprocess.Popen(
            grep_string,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            shell=True,
        )
        grep_out, grep_err = grep_process.communicate()

        # Report grep failures first: a failed pipeline usually also yields no
        # output, which would otherwise be misreported as a count problem.
        if grep_err:
            raise RuntimeError("grep_err: {}".format(grep_err.decode(errors="replace")))

        grep_result_files = grep_out.decode().splitlines()
        n_results = len(grep_result_files)

        if n_results == 1:
            result_files.append(grep_result_files[0])
            continue
        if n_results == 2 and if_two_take_first:
            print("Two grep results, took first because if_two_take_first was set")
            result_files.append(grep_result_files[0])
            continue

        # Anything else is an error; distinguish "nothing found" from "ambiguous".
        headline = "No results" if n_results == 0 else "Multiple results"
        raise RuntimeError("{} for \npos: {}, \nneg: {}, \ndynamic: {}, \nfiles: {} ".format(
            headline,
            static_positive_search_strings,
            static_negative_search_strings,
            s,
            grep_result_files
        ))

    return result_files

In [None]:
def get_allstats_data(files):
    """Collect the ``mean_accuracy`` value of every given JSON stats file.

    Parameters
    ----------
    files : iterable of str
        Paths of JSON files that each hold a top-level ``"mean_accuracy"`` key.

    Returns
    -------
    dict
        ``{"mean_accuracies": [...]}`` with one value per file, in input order.
    """
    def _read_mean_accuracy(path):
        # Parse one stats file and pick out the single value of interest.
        with open(path) as handle:
            stats = json.load(handle)
        return stats["mean_accuracy"]

    return {"mean_accuracies": [_read_mean_accuracy(path) for path in files]}

In [None]:
def plot_df(df, xlabel, ylabel):
    """Draw every column of ``df`` as a dotted line plot on a new figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; its index supplies both the x tick positions and labels.
    xlabel : str
        Label for the x axis.
    ylabel : str
        Label for the y axis.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the data was drawn on.
    """
    figure, axes = plt.subplots()
    df.plot.line(style='.-', ax=axes)

    # One tick per index entry, labelled with the index value itself.
    tick_values = df.index
    axes.set_xticks(tick_values)
    axes.set_xticklabels(tick_values)

    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)

    figure.tight_layout()
    return axes

In [None]:
# fastText: mean accuracy per iteration count for label propagation and label
# spreading. Runs selected with "n_neighbors" are labelled k=10, runs selected
# without it are labelled k=3.
iterations = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,20]

# One grep pattern per iteration count. The trailing \" becomes a literal
# double quote inside the shell's double-quoted pattern - presumably so that
# e.g. iter=1 does not also match iter=10.
iter_search_strings = ['phase7.params.options.iter={}\\"'.format(i) for i in iterations]

ft_lp_iter = get_allstats_files(
    ["fasttext", "label_propagation"],
    ["size", "n_neighbors"],
    iter_search_strings)
ft_lp_iter_mean_accuracies = get_allstats_data(ft_lp_iter)["mean_accuracies"]

ft_ls_iter = get_allstats_files(
    ["fasttext", "label_spreading"],
    ["size", "n_neighbors"],
    iter_search_strings)
ft_ls_iter_mean_accuracies = get_allstats_data(ft_ls_iter)["mean_accuracies"]

# NOTE(review): only this query tolerates two matching files per iteration
# (if_two_take_first=True) - confirm the duplicate runs are interchangeable.
ft_knn10_lp_iter = get_allstats_files(
    ["fasttext", "label_propagation", "n_neighbors"],
    ["size"],
    iter_search_strings, if_two_take_first=True)
ft_knn10_lp_iter_mean_accuracies = get_allstats_data(ft_knn10_lp_iter)["mean_accuracies"]

ft_knn10_ls_iter = get_allstats_files(
    ["fasttext", "label_spreading", "n_neighbors"],
    ["size"],
    iter_search_strings)
ft_knn10_ls_iter_mean_accuracies = get_allstats_data(ft_knn10_ls_iter)["mean_accuracies"]

df = pd.DataFrame({
    'Label Spreading, k=10': ft_knn10_ls_iter_mean_accuracies,
    'Label Propagation, k=10': ft_knn10_lp_iter_mean_accuracies,
    'Label Spreading, k=3': ft_ls_iter_mean_accuracies,
    'Label Propagation, k=3': ft_lp_iter_mean_accuracies,
}, index=iterations)
ax = plot_df(df, "Iterations", "Accuracy")
ax.set_ylim(0.15, 0.55)

# Show the accuracy fractions as whole percentages. A formatter keeps labels
# tied to the actual tick positions, unlike set_yticklabels() on unfixed ticks.
ax.yaxis.set_major_formatter(PercentFormatter(xmax=1, decimals=0))

plt.savefig('05_lp_technology_comparison_fasttext_accuracy.png', dpi = 300, bbox_inches='tight')

In [None]:
# word2vec: mean accuracy per iteration count for label propagation and label
# spreading. Runs selected with "n_neighbors" are labelled k=10, runs selected
# without it are labelled k=3.
iterations = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,20]

# One grep pattern per iteration count. The trailing \" becomes a literal
# double quote inside the shell's double-quoted pattern - presumably so that
# e.g. iter=1 does not also match iter=10.
iter_search_strings = ['phase7.params.options.iter={}\\"'.format(i) for i in iterations]

w2v_lp_iter = get_allstats_files(
    ["word2vec", "label_propagation"],
    ["size", "n_neighbors"],
    iter_search_strings)
w2v_lp_iter_mean_accuracies = get_allstats_data(w2v_lp_iter)["mean_accuracies"]

w2v_ls_iter = get_allstats_files(
    ["word2vec", "label_spreading"],
    ["size", "n_neighbors"],
    iter_search_strings)
w2v_ls_iter_mean_accuracies = get_allstats_data(w2v_ls_iter)["mean_accuracies"]

w2v_knn10_lp_iter = get_allstats_files(
    ["word2vec", "label_propagation", "n_neighbors"],
    ["size"],
    iter_search_strings)
w2v_knn10_lp_iter_mean_accuracies = get_allstats_data(w2v_knn10_lp_iter)["mean_accuracies"]

w2v_knn10_ls_iter = get_allstats_files(
    ["word2vec", "label_spreading", "n_neighbors"],
    ["size"],
    iter_search_strings)
w2v_knn10_ls_iter_mean_accuracies = get_allstats_data(w2v_knn10_ls_iter)["mean_accuracies"]

df = pd.DataFrame({
    'Label Spreading, k=10': w2v_knn10_ls_iter_mean_accuracies,
    'Label Propagation, k=10': w2v_knn10_lp_iter_mean_accuracies,
    'Label Propagation, k=3': w2v_lp_iter_mean_accuracies,
    'Label Spreading, k=3': w2v_ls_iter_mean_accuracies
}, index=iterations)
ax = plot_df(df, "Iterations", "Accuracy")
ax.set_ylim(0.15, 0.55)

# Show the accuracy fractions as whole percentages. A formatter keeps labels
# tied to the actual tick positions, unlike set_yticklabels() on unfixed ticks.
ax.yaxis.set_major_formatter(PercentFormatter(xmax=1, decimals=0))

plt.savefig('05_lp_technology_comparison_word2vec_accuracy.png', dpi = 300, bbox_inches='tight')