# Introduction

The purpose of this notebook is to easily reproduce the results of the paper "A Comparison of Machine Learning-Based Text Classifiers for Mapping Source Code to Architectural Modules" published at the "Eighth Workshop on Software Architecture Erosion and Architectural Consistency (SAEroCon 2021)".

If you are running the notebook for the first time, you should edit the last two lines in the next cell:
- Set STEP_PREPROCESS to True
- Set STEP_EVALUATE to True

These variables can be used to skip the computationally expensive steps of preprocessing and evaluation, respectively, in later runs and use previously computed results instead.

Similarly, TEST_ALL_PREPROCESSING_SETTINGS can be set to False to use only a couple of preprocessing settings from a configuration file instead of all possible combinations (see the paper for details).

In [None]:
# Imports and scripts
#import pandas as pd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import plot_confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
import pathmagic
%matplotlib inline
with pathmagic.context():
    import Preprocess as Prep
    import RelativePaths as RP
    import Evaluation as Eva
    import GatherData as Gather
    import Graphs
    import Utils
    import Metrics
#import ray
#ray.shutdown()
#ray.init()
from IPython.display import display
from itertools import chain, combinations
import concurrent.futures
import os
import itertools
import sys
import threading

# Identifiers of the systems under study; used as keys for every per-system dict below.
systems = ['jabref', 'prom', 'team', 'ant', 'lucene']
# Display names used in printed messages and plot titles.
systemNames = {'jabref' : 'JabRef', 'prom' : 'ProM', 'team' : 'TeamMates', 'ant' : 'Ant', 'lucene' : 'Lucene'}
# Uncomment the two lines below to restrict a run to a single system.
#systems = ['jabref']
#systemNames = {'jabref' : 'JabRef'}
# True: enumerate every combination of code parts and preprocessing steps.
# False: use only the settings listed in the configuration file.
TEST_ALL_PREPROCESSING_SETTINGS = True

# When False, the preprocessing cell is skipped and its previously written files are reused.
STEP_PREPROCESS = False
# When False, the evaluation cell is skipped and the stored comparison tables are read instead.
STEP_EVALUATE = False

In [None]:
# Load the experiment configuration (file locations and preprocessing settings).
path_to_yaml = '../config.yaml'
config = Utils.read_yaml_file(path_to_yaml)

# Configured file locations for each system, keyed by system identifier.
files = {system: config['file locations'][system] for system in systems}

# Named preprocessing settings; only consulted when TEST_ALL_PREPROCESSING_SETTINGS is False.
preprocess_settings = config['preprocess settings list']

In [None]:
# Resolve the configured locations against the parent of the current working
# directory, so the notebook does not depend on machine-specific absolute paths.
from pathlib import Path

raw_data_csv = {}
system_folder = {}
tmp_csv = {}
table_file = {}
for system in systems:
    base_dir = Path.cwd().parent
    locations = files[system]
    raw_data_csv[system] = str(base_dir / locations['raw data'])
    system_folder[system] = str(base_dir / locations['system folder'])
    tmp_csv[system] = str(base_dir / locations['tmp data'])
    table_file[system] = str(base_dir / locations['preprocess comparisons'])

In [None]:
# Set up logging: all messages of level INFO and above go to a log file.
import logging
import sys
import datetime

# UTC timestamp of this run (not referenced by the file name below).
timestamp = datetime.datetime.utcnow().strftime('%Y%m%d_%H-%M-%S')
logfilename = '../log/experiments.log'

# logging.FileHandler does not create missing directories, so make sure the
# log folder exists before the handler tries to open the file.
os.makedirs(os.path.dirname(logfilename), exist_ok=True)

file_handler = logging.FileHandler(filename=logfilename)
file_handler.setLevel(logging.INFO)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    handlers=[
        file_handler
    ]
)
logger = logging.getLogger()
logger.info("Logfile created")

## Extracting the architectural data

In [None]:
# Run the data extraction once per system. The call receives the system's source
# folder and the path of its raw-data CSV, which the next cell loads with pandas
# (presumably the CSV is written by this call - confirm in GatherData).
for system in systems:
    Gather.gather_architectural_concerns_data(system_folder[system], raw_data_csv[system])

In [None]:
# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8-*";
# use whichever name this installation provides.
style_name = "seaborn-whitegrid"
if style_name not in plt.style.available:
    style_name = "seaborn-v0_8-whitegrid"
plt.style.use(style_name)

dataset_df = {}
for system in systems:
    dataset_df[system] = pd.read_csv(raw_data_csv[system])

    # Number of files per module (label), smallest first so that the largest
    # module ends up at the top of the horizontal bar chart.
    files_per_module = dataset_df[system]['Label'].value_counts().sort_values()

    fig, ax = plt.subplots()
    ax.barh(y=list(files_per_module.index), width=files_per_module.values)
    # Write the exact count next to the end of each bar.
    for position, quantity in enumerate(files_per_module.values):
        ax.text(quantity, position, str(quantity), color='black', fontweight='bold', ha='left', va='center')

    # Horizontal bars: the x-axis carries the counts, the y-axis the modules.
    ax.set_xlabel('Number of files')
    ax.set_ylabel('Modules')
    ax.set_title('Files per module for ' + systemNames[system])
    plt.show()

## Comparison of preprocessing settings

In [None]:
def int_to_bin_array(i, dim):
    """Return the binary representation of ``i`` as an integer array of shape ``dim``.

    The bits are written right-aligned into a flat array of ``dim[0] * dim[1]``
    zeros (least significant bit in the last cell), which is then reshaped.

    Parameters:
        i: non-negative integer to encode.
        dim: two-element shape ``(rows, columns)`` of the resulting array.

    Returns:
        Integer numpy array of shape ``dim`` containing only 0s and 1s.

    Raises:
        IndexError: if ``i`` needs more bits than the array has cells.
    """
    flat = np.zeros(dim[0] * dim[1])
    remaining = i
    offset = 1  # distance from the end of the flat array
    while remaining != 0:
        remaining, bit = divmod(remaining, 2)
        flat[-offset] = bit
        offset += 1
    return flat.reshape(dim).astype(int)

def add_default_preprocessing(settings):
    """Append the default preprocessing identifiers to every entry of ``settings``.

    The inner lists are extended in place and the same outer list is returned.
    """
    default_steps = ('lc', 'sc', 'sw', 'jk', 'tow')
    for step_list in settings:
        step_list.extend(default_steps)
    return settings

def create_setting(df):
    """Turn a 0/1 table (rows: code parts, columns: steps) into a preprocessing setting.

    Every 1 is replaced by the name of its column, the row label is put in front
    of each row, and all remaining 0 entries are dropped. The default
    preprocessing identifiers are appended to each resulting list.

    Returns:
        One list per row of ``df``, or ``None`` if ``df`` has no rows.
    """
    if not len(df.index):
        return None
    # Series mapping each column to its own name: the replacement for 1 is column-specific.
    column_names = pd.Series(df.columns, df.columns)
    labelled = df.replace(1, column_names)
    labelled.insert(0, 'step', labelled.index)
    rows = labelled.to_numpy().tolist()
    result = [[entry for entry in row if entry != 0] for row in rows]
    return add_default_preprocessing(result)
  
# Step identifiers that can be switched on or off individually for each code part.
pp_steps = ['scw', 'stem']
# Identifiers of the code parts that a setting can include.
pp_parts = ['pac', 'lib', 'c', 'pm', 'com']
# Every non-empty combination of code parts, ordered by size.
pp_parts_powerset = [
    subset
    for size in range(1, len(pp_parts) + 1)
    for subset in combinations(pp_parts, size)
]

available_settings = []
if TEST_ALL_PREPROCESSING_SETTINGS:
    # For each combination of parts, enumerate every on/off assignment of the
    # steps by counting through all bit patterns of the (parts x steps) grid.
    for parts in pp_parts_powerset:
        grid_shape = (len(parts), len(pp_steps))
        for pattern in range(2 ** (len(pp_steps) * len(parts))):
            on_off_table = pd.DataFrame(int_to_bin_array(pattern, grid_shape), index=parts, columns=pp_steps)
            available_settings.append(create_setting(on_off_table))
else:
    # Use only the settings listed in the configuration file.
    available_settings.extend(preprocess_settings.values())
# Pair every setting with a running id: [(0, setting), (1, setting), ...]
available_settings = list(enumerate(available_settings))

We rank the available preprocessing settings for all classifiers per system by accuracy. Evaluation at a training set size of 0.1 and Monte Carlo CV with 100 iterations.

In [None]:
if STEP_PREPROCESS:
    # Upper bound on the number of preprocessing tasks handed to the executor at once.
    MAX_TASKS = 16

    def _submit_preprocessing(executor, system, settings_iterator, count):
        """Submit up to ``count`` further settings; return a {future: setting_id} map."""
        return {
            executor.submit(Prep.lazy_preprocess_settings,
                            setting,
                            raw_data_csv[system],
                            tmp_csv[system] + os.path.sep + str(setting_id) + ".csv"): setting_id
            for setting_id, setting in itertools.islice(settings_iterator, count)
        }

    for system in systems:
        print("Processing system " + systemNames[system] + "...")
        Prep.clear_preprocessing_cache()
        failed_settings = []
        try:
            with concurrent.futures.ThreadPoolExecutor() as executor:
                settings_iterator = iter(available_settings)
                pending = _submit_preprocessing(executor, system, settings_iterator, MAX_TASKS)
                while pending:
                    done, not_done = concurrent.futures.wait(list(pending), return_when=concurrent.futures.FIRST_COMPLETED)
                    for future in done:
                        setting_id = pending.pop(future)
                        # Retrieve the outcome explicitly: an exception raised inside a
                        # worker is stored on the future and is lost if never queried.
                        error = future.exception()
                        if error is not None:
                            failed_settings.append(setting_id)
                            logger.error("Preprocessing of setting %s for system %s failed: %r",
                                         setting_id, system, error)
                    # Refill the queue with as many new tasks as have just finished.
                    pending.update(_submit_preprocessing(executor, system, settings_iterator, len(done)))
        except Exception as e:
            logger.exception("Error: {0}".format(e))
        if failed_settings:
            print("  " + str(len(failed_settings)) + " setting(s) failed for " + systemNames[system]
                  + " - see the log file for details.")
    print("Preprocessing complete.")
else:
    print("Preprocessing deactivated - reusing previous results.")

In [None]:
# Columns of the per-system result table.
df_columns = [
    'classifier',
    'setting_id',
    'settings', 
    'accuracy', 
    'w_avg_precision', 
    'w_avg_recall'
]

# Upper bound on the number of evaluation tasks handed to the executor at once.
MAX_TASKS = 16
# Number of worker threads of the evaluation executor.
WORKER_THREADS = 2
# Guards updates of main_table, which is shared between the worker threads.
table_lock  = threading.Lock()
# Result table per system, keyed by system identifier.
main_table = {}
# Passed to Eva.Evaluation as third and fourth argument - presumably the test-set
# fraction and the number of cross-validation iterations (confirm in Evaluation).
test_size=0.8
fold_quantity = 10

def evaluate_concurrently(system, setting_id, setting, test_size, n_splits):
    """Evaluate one preprocessing setting for one system and record the results.

    Reads the preprocessed CSV of ``setting_id`` for ``system``, runs
    ``Eva.Evaluation(...).evaluate_all()`` on it and appends one row per
    returned metric object to ``main_table[system]``. Intended to be run from
    several worker threads; the shared table is updated under ``table_lock``.

    Parameters:
        system: system identifier (key of ``tmp_csv`` and ``main_table``).
        setting_id: running id of the setting; selects the preprocessed CSV.
        setting: the preprocessing setting itself; nothing is done if it is None.
        test_size: forwarded to ``Eva.Evaluation``.
        n_splits: forwarded to ``Eva.Evaluation``.
    """
    if setting is None:
        return

    tmp_df = pd.read_csv(tmp_csv[system] + os.path.sep + str(setting_id) + ".csv")
    df_sliced = Utils.remove_concerns_under_quantity_threshold(tmp_df)
    # Train and gather evaluation metrics
    logging.getLogger().info("Training started for setting " + str(setting_id))
    # NOTE(review): the meaning of the trailing 10 is not visible from this notebook.
    evaluate = Eva.Evaluation(df_sliced, CountVectorizer(), test_size, n_splits, 10)
    metrics = evaluate.evaluate_all()
    # The context manager releases the lock even if building or appending a row
    # raises; otherwise the remaining workers would block forever.
    with table_lock:
        for m in metrics:
            row = Utils.make_dataframe_row(m, setting, "s"+str(setting_id))
            # NOTE(review): DataFrame.append was removed in pandas 2.0; switching to
            # pd.concat requires knowing the type returned by make_dataframe_row.
            main_table[system] = main_table[system].append(row, ignore_index=True)
    logging.getLogger().info("Training done for setting " + str(setting_id))

if STEP_EVALUATE:

    def _submit_evaluations(executor, system, settings_iterator, count):
        """Submit up to ``count`` further settings; return a {future: setting_id} map."""
        return {
            executor.submit(evaluate_concurrently,
                            system,
                            setting_id,
                            setting,
                            test_size,
                            fold_quantity): setting_id
            for setting_id, setting in itertools.islice(settings_iterator, count)
        }

    for system in systems:
        print("Evaluating settings for system " + systemNames[system] + "...")
        main_table[system] = pd.DataFrame(columns=df_columns)
        failed_settings = []
        try:
            with concurrent.futures.ThreadPoolExecutor(max_workers=WORKER_THREADS) as executor:
                settings_iterator = iter(available_settings)
                pending = _submit_evaluations(executor, system, settings_iterator, MAX_TASKS)
                while pending:
                    done, not_done = concurrent.futures.wait(list(pending), return_when=concurrent.futures.FIRST_COMPLETED)
                    for future in done:
                        setting_id = pending.pop(future)
                        # Retrieve the outcome explicitly: an exception raised inside a
                        # worker is stored on the future and is lost if never queried.
                        error = future.exception()
                        if error is not None:
                            failed_settings.append(setting_id)
                            logger.error("Evaluation of setting %s for system %s failed: %r",
                                         setting_id, system, error)
                    # Refill the queue with as many new tasks as have just finished.
                    pending.update(_submit_evaluations(executor, system, settings_iterator, len(done)))
            # All tasks have finished here; store whatever results were collected.
            main_table[system].to_csv(table_file[system], index=False)
        except Exception as e:
            logger.exception("Error: {0}".format(e))
        if failed_settings:
            print("  " + str(len(failed_settings)) + " setting(s) failed for " + systemNames[system]
                  + " - the stored table is incomplete, see the log file for details.")
    print("Evaluation of preprocessing settings complete.")
else:
    print("Evaluation of preprocessing settings deactivated.")

In [None]:
# Add one 0/1 indicator column per code part. A part counts as used when its
# quoted identifier occurs in the textual `settings` column.
code_elems = {'package_decl' : "'pac'", 'imports' : "'lib'", 'class_decl' : "'c'", 'public_methods' : "'pm'", 'comments' : "'com'"}
for system in systems:
    main_table[system] = pd.read_csv(table_file[system])
    table = main_table[system]
    for elem, quoted_part in code_elems.items():
        # Comparing with True also maps missing values to False.
        uses_part = table.settings.str.contains(quoted_part) == True
        table[elem] = 0
        table.loc[uses_part, elem] = 1

## Results per system

In [None]:
# Settings excluded from the result tables. With TEST_ALL_PREPROCESSING_SETTINGS
# enabled, ids 0-3 are the four settings built from the first part combination,
# ('pac',), alone.
simple_settings = ['s0', 's1', 's2', 's3']

for system in systems:
    is_simple = main_table[system]['setting_id'].isin(simple_settings)
    # Filter and add the column in one step: assigning a column to a filtered
    # slice afterwards triggers pandas' SettingWithCopyWarning.
    main_table[system] = main_table[system].loc[~is_simple].assign(system=systemNames[system])
    print(systemNames[system])
    display(main_table[system].sort_values(by='accuracy', ascending=False))

## Results by classifier and setting

In [None]:
# Combine the tables of all systems and average the metrics per classifier and
# combination of code parts (i.e. across systems and across the step variants).
df_total = pd.concat(main_table, axis=0, ignore_index=True)
classifier_and_parts = ['classifier', 'package_decl', 'imports', 'class_decl', 'public_methods', 'comments']
df_grouped = (
    df_total
    .drop(labels=['system', 'settings', 'setting_id'], axis=1)
    .groupby(by=classifier_and_parts)
    .mean()
)
df_grouped.sort_values(by='accuracy', ascending=[False])

In [None]:
# Heatmap of the mean accuracy: one row per classifier, one column per
# combination of code parts. Uses df_grouped as computed in the previous cell.
df_h = df_grouped.reset_index()
df_h = df_h.set_index([c for c in code_elems.keys()]).sort_index()
# Pivot to (part combination x classifier), then transpose so classifiers become rows.
df_h = pd.pivot_table(df_h, values='accuracy', columns='classifier', index=code_elems.keys()).transpose()
plt.figure(figsize=(12,2))
ax = sns.heatmap(df_h, annot=True, cbar=False, annot_kws={'rotation': 90, 'fontsize' : 13})
ax.set_xlabel("Package declaration - import statement - class declaration -\npublic methods - comments extracted (0=no/1=yes)", fontsize=14)
ax.set_ylabel("")
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
# NOTE(review): hard-coded labels - assumes the heatmap rows are ordered Logistic
# Regression, Naive Bayes, Support Vector Machines; verify against df_h.index.
ax.set_yticklabels(['Log. Regression', 'Naive Bayes', 'SVM'], fontsize=13)

In [None]:
# Heatmap of the spread of the accuracy within each combination of code parts.
df_sd = df_total.drop(labels=['system'], axis=1)
# Step 1: average each individual setting over the systems.
df_sd = df_sd.groupby(by=['classifier', 'package_decl', 'imports', 'class_decl', 'public_methods', 'comments', 'settings', 'setting_id']).mean()
df_sd = df_sd.reset_index().set_index([c for c in code_elems.keys()]).sort_index()
df_sd = df_sd.drop(labels=['settings','setting_id'], axis=1)
# Step 2: standard deviation of those per-setting means for every classifier and part combination.
df_sd = df_sd.groupby(by=['classifier', 'package_decl', 'imports', 'class_decl', 'public_methods', 'comments']).std()
df_sd = df_sd.reset_index().set_index([c for c in code_elems.keys()]).sort_index()

# Pivot to (part combination x classifier), then transpose so classifiers become rows.
df_sd = pd.pivot_table(df_sd, values='accuracy', columns='classifier', index=code_elems.keys()).transpose()
plt.figure(figsize=(12,2))
ax = sns.heatmap(df_sd, annot=True, cbar=False, fmt=".3f", annot_kws={'rotation': 90, 'fontsize' : 12})
ax.set_xlabel("Package declaration - import statement - class declaration -\npublic methods - comments extracted (0=no/1=yes)", fontsize=14)
ax.set_ylabel("")
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
# NOTE(review): hard-coded labels - assumes the heatmap rows are ordered Logistic
# Regression, Naive Bayes, Support Vector Machines; verify against df_sd.index.
ax.set_yticklabels(['Log. Regression', 'Naive Bayes', 'SVM'], fontsize=13)

In [None]:
# Ranking of the individual settings, shown separately for each classifier.
classifiers = ['Support Vector Machines', 'Logistic Regression', 'Naive Bayes']
df_total = pd.concat(main_table, axis=0, ignore_index=True)
df_grouped = df_total.drop(labels=['system'], axis=1)
# `settings` is a text column that is not part of the grouping keys. Restrict the
# average to numeric columns explicitly: older pandas dropped such columns
# silently, pandas >= 2.0 raises a TypeError instead.
df_grouped = df_grouped.groupby(
    by=['setting_id', 'classifier', 'package_decl', 'imports', 'class_decl', 'public_methods', 'comments']
).mean(numeric_only=True)

# Flatten the group keys back into columns once, outside the loop.
df_by_setting = df_grouped.reset_index()
for cl in classifiers:
    df_cl = df_by_setting[df_by_setting['classifier'] == cl].sort_values(by='accuracy', ascending=False)
    print("Results for " + cl)
    display(df_cl)

## Results per setting

In [None]:
# Mean metrics per combination of code parts, averaged over systems,
# classifiers and step variants; show the 50 most accurate combinations.
part_columns = list(code_elems.keys())
df_grouped = (
    df_total
    .drop(labels=['system', 'classifier', 'setting_id', 'settings'], axis=1)
    .groupby(by=part_columns)
    .mean()
)
df_grouped.sort_values(by='accuracy', ascending=[False]).head(50)

In [None]:
# Mean metrics per individual setting, averaged over systems and classifiers;
# show the 15 most accurate settings.
df_grouped = (
    df_total
    .drop(labels=['system', 'classifier'], axis=1)
    .groupby(by=['setting_id', 'settings'])
    .mean()
)
df_grouped.sort_values(by='accuracy', ascending=[False]).head(15)

In [None]:
# Best settings that do not use the package declaration. Relies on df_grouped
# from the previous cell (mean metrics per setting), where package_decl is 0 or 1.
df_no_pkg = df_grouped[df_grouped['package_decl'] == 0]
df_no_pkg.sort_values(by='accuracy', ascending=[False]).head(15)

In [None]:
#41: best with package declarations
#643: best without
# NOTE(review): the index refers to the position in available_settings and is
# presumably only meaningful when TEST_ALL_PREPROCESSING_SETTINGS is True.
BEST_SETTING_ID = 41
setting = available_settings[BEST_SETTING_ID][1]
print(setting)
n_splits = 10
df_overview_columns = ['system', 'classifier', 'accuracy',
                       'precision macro', 'precision weighted',
                       'recall macro', 'recall weighted',
                       'f1 macro', 'f1 weighted']

# Collect one record per (system, classifier) and build the table once at the
# end; DataFrame.append is deprecated and was removed in pandas 2.0.
overview_rows = []

for system in systems:
    setting_csv = tmp_csv[system] + os.path.sep + str(BEST_SETTING_ID) + ".csv"
    Prep.preprocess_settings(setting, raw_data_csv[system], setting_csv)
    tmp_df = pd.read_csv(setting_csv)
    df_sliced = Utils.remove_concerns_under_quantity_threshold(tmp_df)

    feature_representation = CountVectorizer()
    results = {}
    # test_size is the module-level value defined together with the evaluation constants.
    evaluate = Eva.Evaluation(df_sliced, feature_representation, test_size, n_splits, 10)
    results["Logistic Regression"] = Metrics.get_average_classification_report(evaluate.evaluate_MaxEnt())
    results["Support Vector Machines"] = Metrics.get_average_classification_report(evaluate.evaluate_SVM())
    results["Naive Bayes"] = Metrics.get_average_classification_report(evaluate.evaluate_Naive_Bayes())
    for key, value in results.items():
        overview_rows.append({
            'system' : systemNames[system],
            'classifier' : key,
            # First entry of the 'accuracy' row of the averaged report.
            'accuracy' : value.loc['accuracy'][0],
            'precision macro' : value.loc['macro avg']['precision'],
            'precision weighted' : value.loc['weighted avg']['precision'],
            'recall macro' : value.loc['macro avg']['recall'],
            'recall weighted' : value.loc['weighted avg']['recall'],
            'f1 macro' : value.loc['macro avg']['f1-score'],
            'f1 weighted' : value.loc['weighted avg']['f1-score']
        })

df_overview = pd.DataFrame(overview_rows, columns=df_overview_columns)
df_overview
    

In [None]:
# Average every metric over the systems, separately for each classifier.
df_cl = df_overview.drop(columns=['system']).groupby(by="classifier").mean()
df_cl

In [None]:
# One panel per metric: horizontal bars per system, coloured by classifier.
fig, axs = plt.subplots(1,3, figsize=(12,5))
metrics = ["accuracy", "precision weighted", "f1 weighted"]

for panel, metric in zip(axs, metrics):
    bars = sns.barplot(data=df_overview,
                       y="system",
                       x=metric,
                       palette="colorblind",
                       alpha=0.8,
                       hue="classifier",
                       ax=panel)
    # Print the value just inside the right end of every bar.
    for bar in bars.patches:
        label_x = bar.get_x() + bar.get_width() - 0.01
        label_y = bar.get_y() + bar.get_height()/2
        bars.annotate(f"{bar.get_width():.3f}",
                      (label_x, label_y),
                      ha='right', va='center',
                      textcoords='offset points',
                      xytext=(0, -1),
                      fontsize=10)

# Common value range, labels only on the outer panels, and a single shared
# legend below the figure instead of one legend per panel.
for ax in axs:
    ax.set(xlim=(0.4, 1.0))
    ax.label_outer()
    ax.get_legend().remove()
handles, labels = axs[2].get_legend_handles_labels()
fig.legend(handles, labels, bbox_to_anchor=(0.5, -0.1, 0, 0), loc='lower center', ncol=3)

## Analysis of impact of relative training set size.

At fixed preprocessing setting s0, we look at how training set size affects the performance of the classifiers. The selected relative sizes are stratified which means that each module is represented in the training set according to its size.

### Preprocessing

In [None]:
# NOTE(review): `setting` is not assigned in this cell, so the value left over
# from an earlier cell is used. The commented-out line below would select 's0'
# from the configuration file instead - confirm which setting is intended.
#setting = preprocess_settings['s0']
# Per system: preprocessed data after Utils.remove_concerns_under_quantity_threshold.
df_sliced = {}
# Per system: path of the preprocessed CSV for this experiment.
processed_data_csv = {}
for system in systems:
    processed_data_csv[system] = str(Path.cwd().parent / files[system]['data size percentage'])
    Prep.preprocess_settings(setting, raw_data_csv[system], processed_data_csv[system])
    processed_data_df = pd.read_csv(processed_data_csv[system])
    df_sliced[system] = Utils.remove_concerns_under_quantity_threshold(processed_data_df)

### Training and Evaluation

In [None]:
# Relative test-set sizes to evaluate; the training share is 1 - test size.
test_sizes = [0.95, 0.9, 0.85, 0.8, 0.75]
n_splits = 100
# Per system: one averaged classification report per entry of test_sizes.
maxEnt_reports = {}
svm_reports = {}
naive_reports = {}

for system in systems:
    print("Processing system " + systemNames[system])
    maxEnt_reports[system] = []
    svm_reports[system] = []
    naive_reports[system] = []
    # The loop variable is deliberately not called `test_size`: that name is a
    # module-level constant read by earlier cells, and overwriting it here would
    # silently change their results when they are re-run.
    for relative_test_size in test_sizes:
        feature_representation = CountVectorizer()
        # Train and gather evaluation metrics
        evaluate = Eva.Evaluation(df_sliced[system], feature_representation, relative_test_size, n_splits)
        metrics_max_ent = evaluate.evaluate_MaxEnt()
        metrics_svm = evaluate.evaluate_SVM()
        metrics_naive = evaluate.evaluate_Naive_Bayes()
        maxEnt_reports[system].append(Metrics.get_average_classification_report(metrics_max_ent))
        svm_reports[system].append(Metrics.get_average_classification_report(metrics_svm))
        naive_reports[system].append(Metrics.get_average_classification_report(metrics_naive))

### Results and Visualization

In [None]:
def line_plot(title: str, x_axis, y_axis: dict, x_axis_name, y_axis_name, ax):
    """Draw the three classifier curves into ``ax``.

    Parameters:
        title: title of the axes.
        x_axis: x values shared by all three curves.
        y_axis: dict with the keys 'maxEnt', 'naive' and 'svm', each holding the
            y values of one classifier.
        x_axis_name: label of the x-axis.
        y_axis_name: label of the y-axis.
        ax: axes to draw into.

    Each curve is drawn as labelled markers connected by a thin line of the same
    colour. No legend is created here; the caller builds it from the labels.
    """
    ax.set_title(title, fontsize=14)
    ax.set_ylim(0.4, 1)
    ax.set_ylabel(y_axis_name, fontsize=12)
    ax.set_xlabel(x_axis_name, fontsize=10)

    # (key in y_axis, colour, marker, legend label), in drawing order
    curve_styles = (
        ("maxEnt", "r", "o", "Log. Regr."),
        ("naive", "b", "x", "Naive Bayes"),
        ("svm", "y", "^", "SVM"),
    )
    for key, colour, marker, legend_label in curve_styles:
        values = y_axis[key]
        ax.scatter(x_axis, values, s=40, c=colour, marker=marker, label=legend_label)
        ax.plot(x_axis, values, c=colour, linewidth=0.8)
 

# x-axis: relative training-set size, i.e. 1 - test size, with two decimals.
train_size = [f"{1 - size:.2f}" for size in test_sizes]
fig, axs = plt.subplots(4, len(systems), figsize=(10, 6), sharex=True, sharey=True)

# One grid row per metric: (y-axis label, how to read the value from a report).
metric_rows = [
    ("Accuracy", lambda report: report.loc['accuracy'][0]),
    ("W. avg. prec.", lambda report: report.loc['weighted avg']['precision']),
    ("Avg. prec.", lambda report: report.loc['macro avg']['precision']),
    ("Avg. recall", lambda report: report.loc['macro avg']['recall']),
]

# One grid column per system; the system name is shown above the top row only
# and the metric name to the left of the first column only.
for col, system in enumerate(systems):
    for row, (row_label, read_value) in enumerate(metric_rows):
        y_axis = {
            'maxEnt': [read_value(report) for report in maxEnt_reports[system]],
            'naive': [read_value(report) for report in naive_reports[system]],
            'svm': [read_value(report) for report in svm_reports[system]]
        }
        line_plot(systemNames[system] if row == 0 else "", train_size, y_axis,
                  "", row_label if col == 0 else "", axs[row, col])
for ax in axs.flat:
    ax.label_outer()
handles, labels = axs[0,0].get_legend_handles_labels()
fig.legend(handles, labels, bbox_to_anchor=(0.5, -0.05, 0, 0), loc='lower center', ncol=3, fontsize=14, markerscale=1.5)
plt.tight_layout()

## Analysis of impact of absolute training set sizes

At fixed preprocessing s0, we look at the scenario that, for each module, k files can be used for training. This corresponds to asking a system expert to map k files for each module before we can automatically map. In general, we don't know how large modules are, hence this scenario is more realistic than the previous consideration of relative sizes.

### Preprocessing

We can reuse the preprocessing from above.

In [None]:
# Per system: the preprocessed data for the absolute-size experiment.
# Relies on `setting` and the `processed_data_csv` dict created in earlier cells;
# the entries of `processed_data_csv` are overwritten with the paths used here.
processed_data_df = {}
for system in systems:
    processed_data_csv[system] = str(Path.cwd().parent / files[system]['data size abs num'])
    Prep.preprocess_settings(setting, raw_data_csv[system], processed_data_csv[system])
    processed_data_df[system] = pd.read_csv(processed_data_csv[system])


### Training and Evaluation

In [None]:
# Numbers of training files per module to evaluate.
nr_of_training_files = [3, 5, 10, 15, 20, 25]
n_splits = 100
# Per system: one averaged classification report per entry of nr_of_training_files.
maxEnt_reports = {}
svm_reports = {}
naive_reports = {}

for system in systems:
    maxEnt_reports[system] = []
    svm_reports[system] = []
    naive_reports[system] = []    
    for training_files in nr_of_training_files:
        feature_representation = CountVectorizer()
        # Train and gather evaluation metrics
        df_sliced = Utils.remove_concerns_under_quantity_threshold(processed_data_df[system], training_files)
        # NOTE(review): n_splits is passed as the third positional argument here, whereas
        # the other Eva.Evaluation calls in this notebook pass test_size third and
        # n_splits fourth - confirm against the constructor that 100 iterations are
        # really used in this experiment.
        evaluate = Eva.Evaluation(df_sliced, feature_representation, n_splits, numberOfFiles=training_files)
        metrics_max_ent = evaluate.evaluate_MaxEnt(type = 'custom')
        metrics_svm = evaluate.evaluate_SVM(type = 'custom')
        metrics_naive = evaluate.evaluate_Naive_Bayes(type = 'custom')
        maxEnt_reports[system].append(Metrics.get_average_classification_report(metrics_max_ent))
        svm_reports[system].append(Metrics.get_average_classification_report(metrics_svm))
        naive_reports[system].append(Metrics.get_average_classification_report(metrics_naive))

### Results and Visualization

In [None]:
# x-axis: number of training files per module.
x_axis = [str(i) for i in nr_of_training_files]
fig, axs = plt.subplots(4, len(systems), figsize=(10, 6), sharex=True, sharey=True)

# One grid row per metric: (y-axis label, how to read the value from a report).
metric_rows = [
    ("Accuracy", lambda report: report.loc['accuracy'][0]),
    ("W. avg. prec.", lambda report: report.loc['weighted avg']['precision']),
    ("Avg. prec.", lambda report: report.loc['macro avg']['precision']),
    ("Avg. recall", lambda report: report.loc['macro avg']['recall']),
]

# One grid column per system. As in the figure for relative training set sizes,
# the system name is shown above the top row only (repeating it in every row
# clutters the shared grid) and the metric name left of the first column only.
for col, system in enumerate(systems):
    for row, (row_label, read_value) in enumerate(metric_rows):
        y_axis = {
            'maxEnt': [read_value(report) for report in maxEnt_reports[system]],
            'naive': [read_value(report) for report in naive_reports[system]],
            'svm': [read_value(report) for report in svm_reports[system]]
        }
        line_plot(systemNames[system] if row == 0 else "", x_axis, y_axis,
                  "", row_label if col == 0 else "", axs[row, col])
for ax in axs.flat:
    ax.label_outer()
handles, labels = axs[0,0].get_legend_handles_labels()
fig.legend(handles, labels, bbox_to_anchor=(0.5, -0.05, 0, 0), loc='lower center', ncol=3, fontsize=14, markerscale=1.5)
plt.tight_layout()