# Introduction
This is a Jupyter notebook for re-running the experiments described in the paper titled "Using InMap to Create Seed Mapping for Machine Learning-Based Code-to-Architecture Mappers". This is only for double-blind review purposes; it will be made more easily accessible via GitHub upon acceptance and publication of the paper.

## Manage imports

In [None]:
import sys
sys.path.append('./script')
import pandas as pd
import os
import re
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import plot_confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
%matplotlib inline
import Preprocess as Prep
import RelativePaths as RP
import Evaluation as Eva
import GatherData as Gather
import Graphs as Graphs
import Utils as Utils
import Metrics as Metrics
from IPython.display import display
from itertools import chain, combinations
import concurrent.futures
import itertools
import threading



## Define constants

In [None]:
# Column names given to one row of InMap recommendation data
# (used by load_inmap_results when building its DataFrames).
inmap_result_cols = [
    "pos",
    "name",
    "x",
    "y",
    "recommendation",
    "score",
    "module",
    "consistent",
    "explicit",
]

# Per-system replacement table that transform_inmap_results applies to the
# 'module' and 'recommendation' columns (key = name found in the InMap data,
# value = name it is replaced with).
inmap_renaming = {
    'ant': {
        'compil': 'compilers',
        'condit': 'condition',
        'listen': 'listeners',
        'option': 'optional',
    },
    'argouml': {
        'applicat': 'application',
        'code-Generat': 'code-generation',
        'configurat': 'configuration',
        'explor': 'explorer',
        'internationalizat': 'internationalization',
        'java-Code-Generat': 'java-code-generation',
        'notat': 'notation',
        'persist': 'persistence',
        'reverseEngineer': 'reverse-engineering',
        'swing-Extens': 'swing-extension',
        'task-Manage': 'task-management',
    },
    'jabref': {
        'gui ui': 'gui',
    },
    'jittac': {
        'eclipse-resource-mapp': 'eclipse-resource-mapping',
        'impl-model implmodel': 'impl-model',
        'resource-mapp': 'resource-mapping',
    },
    'teammates': {
        'common-data-Transf': 'data-transfer',
        'common-except': 'exceptions',
        'common-util': 'util',
        'logic-back-door': 'logic-backdoor',
        'storage-entity': 'entity',
        'storage-search': 'search',
        'ui-automate': 'ui-automated',
        'ui-controll': 'ui-controller',
    },
}

# Directories holding the InMap result pages, one sub-folder per system.
INMAP_DATA_PATH = './data/inmap'
INMAP_DATA_PATH_EXP2 = './data/inmap_exp2'

# Systems processed by the notebook and their display names for plots.
systems = ['ant', 'argouml', 'jabref', 'jittac', 'teammates']
systemNames = {
    'ant': 'Ant',
    'argouml': 'ArgoUML',
    'jabref': 'JabRef',
    'jittac': 'Jittac',
    'prom': 'ProM',
    'teammates': 'TeamMates',
}

# Pipeline switches (not referenced in the visible part of this notebook).
TEST_ALL_PREPROCESSING_SETTINGS = False
STEP_PREPROCESS = True
STEP_EVALUATE = True

## Read configuration file

In [None]:
# File containing settings
path_to_yaml = './config.yaml'
# Parsed configuration; the keys 'file locations' and 'preprocess settings list'
# are read from it below.
config = Utils.read_yaml_file(path_to_yaml)
# files[system] -> that system's entry under 'file locations'
# (indexed later by keys such as 'raw data' and 'system folder').
files = {}

for system in systems:
    files[system] = config['file locations'][system]
# Sequence of preprocessing settings; later cells use its first entry.
preprocess_settings = config['preprocess settings list']

In [None]:
# Resolve the configured, relative file locations against the current
# working directory so that they are valid on the machine in use.
from pathlib import Path

working_dir = Path.cwd()
raw_data_csv, system_folder, tmp_csv, table_file = {}, {}, {}, {}
for system in systems:
    locations = files[system]
    raw_data_csv[system] = str(working_dir / locations['raw data'])
    system_folder[system] = str(working_dir / locations['system folder'])
    tmp_csv[system] = str(working_dir / locations['tmp data'])
    table_file[system] = str(working_dir / locations['preprocess comparisons'])

## Configure logging

In [None]:
# Set up logging: every record of level INFO or above is written to
# ./log/experiments.log through the root logger.
#
# The cell body used to be pasted twice, which opened a second FileHandler
# that was never attached to any logger; it is configured exactly once now.
import datetime
import logging
import os
import sys

# UTC time at which this cell ran (timezone-aware replacement for the
# deprecated datetime.utcnow(); the formatted string is the same).
timestamp = datetime.datetime.now(datetime.timezone.utc).strftime('%Y%m%d_%H-%M-%S')
logfilename = './log/experiments.log'
#formatter = logging.Formatter('[%(asctime)s] %(name)s %(levelname)s - %(message)s')

# logging.FileHandler does not create missing parent directories.
os.makedirs(os.path.dirname(logfilename), exist_ok=True)

file_handler = logging.FileHandler(filename=logfilename)
file_handler.setLevel(logging.INFO)

# NOTE: basicConfig does nothing if the root logger already has handlers
# (e.g. when this cell is re-run in the same kernel).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    handlers=[file_handler],
)
logger = logging.getLogger()
logger.info("Logfile created")

In [None]:
def load_inmap_results(data_dir, system):
    """Load all InMap recommendation pages of one system into a DataFrame.

    The pages are expected in ``{data_dir}/{system}`` as
    ``page-1-recommendations-data.json`` ... ``page-N-recommendations-data.json``
    where N is the number of files in that folder matching the page pattern.
    Each file must provide a ``data`` entry holding one list per recommendation.

    Args:
        data_dir: Directory containing one sub-folder per system.
        system: Name of the system sub-folder.

    Returns:
        A DataFrame with the columns ``inmap_result_cols``; ``pos`` is
        overwritten with the 1-based position of the row across all pages.
        An empty frame with those columns is returned if no page is found.
    """
    system_dir = os.path.join(data_dir, system)
    # Dots are escaped, at least one digit is required and the name must end
    # after ".json", so look-alike files are not counted as pages.
    page_pattern = re.compile(r"page-[0-9]+-recommendations-data\.json$")
    page_count = sum(1 for entry in os.listdir(system_dir) if page_pattern.match(entry))

    frames = []
    for page in range(1, page_count + 1):
        page_df = pd.read_json(os.path.join(system_dir, f"page-{page}-recommendations-data.json"))
        rows = page_df['data'].values.tolist()
        # Rows with 10 fields carry an extra value at index 7 that has no
        # column in inmap_result_cols; it is dropped. The emptiness check
        # avoids an IndexError on a page without rows.
        if rows and len(rows[0]) == 10:
            rows = [row[0:7] + row[8:] for row in rows]
        frames.append(pd.DataFrame(rows, columns=inmap_result_cols))

    # Concatenate once instead of growing a frame page by page.
    if frames:
        df_total = pd.concat(frames, ignore_index=True)
    else:
        df_total = pd.DataFrame(columns=inmap_result_cols)
    df_total['pos'] = df_total.index + 1
    return df_total

def transform_inmap_results(df_inmap, rename_dict):
    """Normalise a raw InMap result frame; modifies ``df_inmap`` in place.

    * ``name`` is cut down to the part after the last ``.`` and gets the
      suffix ``.java``.
    * ``consistent`` and ``explicit`` are mapped from the strings
      ``'true'``/``'false'`` to booleans (any other value becomes NaN).
    * The columns ``x`` and ``y`` are dropped.
    * Values of ``module`` and ``recommendation`` that are keys of
      ``rename_dict`` are replaced by the corresponding dictionary value.

    Args:
        df_inmap: Frame as returned by ``load_inmap_results``.
        rename_dict: Mapping from old to new module names.

    Returns:
        The same (mutated) ``df_inmap`` object.
    """
    simple_names = df_inmap['name'].str.rsplit('.', n=1).str[-1]
    df_inmap['name'] = simple_names.astype(str) + '.java'

    flag_values = {'true': True, 'false': False}
    for flag_column in ('consistent', 'explicit'):
        df_inmap[flag_column] = df_inmap[flag_column].map(flag_values)

    df_inmap.drop(columns=['x', 'y'], inplace=True)
    # A previous extra `df[[...]].replace(..., inplace=True)` call worked on a
    # temporary copy and never changed df_inmap; this call does the renaming.
    df_inmap.replace({'module': rename_dict, 'recommendation': rename_dict}, inplace=True)
    return df_inmap

def get_correct_inmap_recommendations(df_inmap, n = 5, per_module = 0):
    """Select correct (``consistent == True``) recommendations in ``pos`` order.

    Args:
        df_inmap: Frame with at least the columns ``consistent``, ``pos`` and
            (when ``per_module`` is used) ``module``.
        n: Number of rows to return when ``per_module`` is 0.
        per_module: If non-zero, return the first ``per_module`` rows of every
            ``module`` group instead of the first ``n`` rows overall.

    Returns:
        The selected rows, ordered by ``pos``.
    """
    is_correct = df_inmap['consistent'].eq(True)
    ranked = df_inmap.loc[is_correct].sort_values(by='pos')
    if per_module != 0:
        return ranked.groupby('module').head(per_module)
    return ranked.head(n)

def line_plot(title: str, x_axis, y_axis: dict, x_axis_name, y_axis_name, ax):
    """Draw the three classifier curves (markers plus thin line) on ``ax``.

    Args:
        title: Title of the axes.
        x_axis: x values shared by all three curves.
        y_axis: Dict with the keys ``"maxEnt"``, ``"naive"`` and ``"svm"``,
            each holding the y values of one classifier.
        x_axis_name: Label of the x-axis.
        y_axis_name: Label of the y-axis.
        ax: Axes object to draw on; its y-range is fixed to [0, 1].
    """
    ax.set_title(title, fontsize=14)
    ax.set_ylim(0.0, 1)
    ax.set_ylabel(y_axis_name, fontsize=14)
    ax.set_xlabel(x_axis_name, fontsize=14)

    # (key in y_axis, colour, marker, legend label) for every classifier
    curve_styles = (
        ("maxEnt", "r", "o", "Log. Regr."),
        ("naive", "b", "x", "Naive Bayes"),
        ("svm", "y", "^", "SVM"),
    )
    for key, colour, marker, legend_label in curve_styles:
        values = y_axis[key]
        ax.scatter(x_axis, values, s=40, c=colour, marker=marker, label=legend_label)
        ax.plot(x_axis, values, c=colour, linewidth=0.8)

In [None]:
# Run the data gathering step for every system, passing the system's source
# folder and the path of its raw-data CSV.
# NOTE(review): presumably this writes raw_data_csv[system] (the next cell
# reads that file with pd.read_csv) -- confirm in GatherData.
for system in systems:
    print(system)
    Gather.gather_architectural_concerns_data(system_folder[system], raw_data_csv[system])

In [None]:
# Show, for every system, a horizontal bar chart with the number of files
# per module (column 'Label' of the raw-data CSV), smallest module first.

# Select the plot style once instead of once per system. matplotlib >= 3.6
# ships the seaborn styles under a "seaborn-v0_8-" prefix; the old name is
# kept as a fallback for older installations.
for style_name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid"):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

dataset_df = {}
for system in systems:
    dataset_df[system] = pd.read_csv(raw_data_csv[system])
    label_counts = dataset_df[system]['Label'].value_counts().sort_values()
    tmp_df = pd.DataFrame({
        'Labels' : label_counts.index,
        'Quantity' : label_counts.values
    })
    plt.barh(y=tmp_df.Labels, width=tmp_df.Quantity)
    # Write the count at the end of each bar.
    for i, v in enumerate(tmp_df.Quantity):
        plt.text(v, i, str(v), color='black', fontweight='bold', ha='left', va='center')

    # The bars are horizontal: counts run along x, modules along y
    # (the two labels used to be swapped).
    plt.xlabel('Number of files')
    plt.ylabel('Modules')
    plt.title('Files per module for ' + systemNames[system])
    plt.show()

## Overall InMap performance

In [None]:
# Overall InMap performance per system:
#   precision = correct recommendations / all recommendations
#   recall    = correct recommendations / all entities (rows of the raw CSV)
df_inmap = [
    transform_inmap_results(load_inmap_results(INMAP_DATA_PATH, s), inmap_renaming[s])
    for s in systems
]

entity_counts = [pd.read_csv(raw_data_csv[s]).shape[0] for s in systems]
recommendation_counts = [df.shape[0] for df in df_inmap]
correct_counts = [int((df['consistent'] == True).sum()) for df in df_inmap]

data = {
    'system': list(systems),
    'entities': entity_counts,
    'recommendations': recommendation_counts,
    'correct': correct_counts,
}

df_inmap_results = pd.DataFrame(data=data)
df_inmap_results['precision'] = df_inmap_results['correct'] / df_inmap_results['recommendations']
df_inmap_results['recall'] = df_inmap_results['correct'] / df_inmap_results['entities']

df_inmap_results
    

In [None]:
# Print the overall InMap result table as LaTeX source.
s = df_inmap_results.style
print(s.to_latex())

## Relative training set sizes

### Preprocessing

In [None]:
# Preprocess every system with the first configured setting, then remove
# concerns via Utils.remove_concerns_under_quantity_threshold (minNumOfFiles=2).
setting = preprocess_settings[0]
df_sliced, processed_data_csv = {}, {}
for system in systems:
    output_csv = str(Path.cwd() / files[system]['data size percentage'])
    processed_data_csv[system] = output_csv
    Prep.preprocess_settings(setting, raw_data_csv[system], output_csv)
    processed_data_df = pd.read_csv(output_csv)
    sliced, removed_labels = Utils.remove_concerns_under_quantity_threshold(processed_data_df, minNumOfFiles=2)
    df_sliced[system] = sliced

### Training and evaluation

In [None]:
# Experiment with relative training set sizes: for every system and every
# test size, evaluate the three classifiers and keep one averaged
# classification report per (system, test size).
test_sizes = [0.95, 0.9, 0.85, 0.8, 0.75]
n_splits = 100
# <classifier>_reports[system] is a list aligned with test_sizes.
maxEnt_reports = {}
svm_reports = {}
naive_reports = {}

for system in systems:
    print("Processing system " + systemNames[system])
    maxEnt_reports[system] = []
    svm_reports[system] = []
    naive_reports[system] = []
    for test_size in test_sizes:
        # A fresh CountVectorizer is created for every evaluation.
        feature_representation = CountVectorizer()
        # Train and gather evaluation metrics
        # NOTE(review): test_size and n_splits are handed to Eva.Evaluation;
        # how it splits the data is defined there -- not visible here.
        evaluate = Eva.Evaluation(df_sliced[system], feature_representation, test_size, n_splits)
        metrics_max_ent = evaluate.evaluate_MaxEnt()
        metrics_svm = evaluate.evaluate_SVM()
        metrics_naive = evaluate.evaluate_Naive_Bayes()
        maxEnt_reports[system].append(Metrics.get_average_classification_report(metrics_max_ent))
        svm_reports[system].append(Metrics.get_average_classification_report(metrics_svm))
        naive_reports[system].append(Metrics.get_average_classification_report(metrics_naive))

### Visualization

In [None]:
# One row of plots per system: weighted average precision (left column) and
# the value of the 'accuracy' report row, labelled "W. avg. recall" (right
# column), over the training share 1 - test_size.
x_axis = [f"{1 - test_size:.2f}" for test_size in test_sizes]
reports_by_classifier = {'maxEnt': maxEnt_reports, 'naive': naive_reports, 'svm': svm_reports}

fig, axs = plt.subplots(len(systems), 2, figsize=(8, 10), sharex=True, sharey=True, squeeze=False)

for i, system in enumerate(systems):
    is_last_row = i == len(systems) - 1

    y_axis = {
        # .iloc[0] takes the first column of the 'accuracy' row explicitly;
        # the former `[0]` relied on positional lookup in a label-indexed
        # Series, which pandas has deprecated.
        classifier: [report.loc['accuracy'].iloc[0] for report in reports[system]]
        for classifier, reports in reports_by_classifier.items()
    }
    line_plot("", x_axis, y_axis, "W. avg. recall" if is_last_row else "", "", axs[i, 1])

    y_axis = {
        classifier: [report.loc['weighted avg']['precision'] for report in reports[system]]
        for classifier, reports in reports_by_classifier.items()
    }
    line_plot("", x_axis, y_axis, "W. avg. prec" if is_last_row else "", systemNames[system], axs[i, 0])

# Keep axis labels and tick labels only on the outer edge of the grid.
for ax in axs.flat:
    ax.label_outer()
# All subplots draw the same three curves, so one figure-level legend is enough.
handles, labels = axs[0,0].get_legend_handles_labels()
fig.legend(handles, labels, bbox_to_anchor=(0.5, -0.05, 0, 0), loc='lower center', ncol=3, fontsize=14, markerscale=1.5)
plt.tight_layout()

In [None]:
# Summary table: weighted average precision/recall per system, classifier
# and training share (1 - test size).
reports_by_classifier = {'maxent': maxEnt_reports, 'naive': naive_reports, 'svm': svm_reports}
summary_frames = []
for system in systems:
    for i, test_size in enumerate(test_sizes):
        for classifier, reports in reports_by_classifier.items():
            df_tmp = reports[system][i].loc[['weighted avg'], ['precision', 'recall']].assign(
                classifier=classifier,
                # Rounded because e.g. 1 - 0.95 is 0.050000000000000044 in
                # floating point, which would end up as a column header.
                seed=round(1 - test_size, 2),
                system=system,
            )
            summary_frames.append(df_tmp)
# Concatenate once instead of growing the table inside the loop.
df_summary_table = pd.concat(summary_frames, ignore_index=True) if summary_frames else pd.DataFrame()

df_summary_table.pivot_table(index=['system','classifier'], columns=['seed'], values=['precision', 'recall']).round(2)

In [None]:
# Print the summary pivot table as LaTeX source, values with two decimals.
df_pivot = df_summary_table.pivot_table(index=['system','classifier'], columns=['seed'], values=['precision', 'recall'])
s = df_pivot.style
s.format('{:.2f}')
print(s.to_latex())

## Initial Mapping with InMap

### Preprocessing

In [None]:
# Preprocess every system with the first configured setting and keep the
# resulting data set per system (no concerns are removed here).
setting = preprocess_settings[0]
df_sliced, processed_data_csv, processed_data_df = {}, {}, {}

for system in systems:
    output_csv = str(Path.cwd() / files[system]['data inmap relative seed'])
    processed_data_csv[system] = output_csv
    Prep.preprocess_settings(setting, raw_data_csv[system], output_csv)
    processed_data_df[system] = pd.read_csv(output_csv)

### Training

In [None]:
# Experiment: use the first correct InMap recommendations as training (seed)
# set. The seed holds ceil(seed_size * number of rows of the processed data)
# correct recommendations, taken in 'pos' order.
seed_sizes = [0.05, 0.1, 0.15, 0.2, 0.25]
# <classifier>_reports[system] is a list aligned with seed_sizes.
maxEnt_reports = {}
svm_reports = {}
naive_reports = {}

for system in systems:
    print(f"Processing {systemNames[system]}...")
    maxEnt_reports[system] = []
    svm_reports[system] = []
    naive_reports[system] = []  
    
    # NOTE: df_inmap is overwritten per system; after the loop it holds the
    # recommendations of the last system only.
    df_inmap = transform_inmap_results(load_inmap_results(INMAP_DATA_PATH, system), inmap_renaming[system])
    
    for seed_size in seed_sizes:
        df_inmap_seed = get_correct_inmap_recommendations(df_inmap, n = math.ceil(seed_size * processed_data_df[system].shape[0]))
        
        feature_representation = CountVectorizer()
        # Train and gather evaluation metrics
        #df_sliced, removed_labels = Utils.remove_concerns_under_quantity_threshold(processed_data_df[system])
        df_sliced = processed_data_df[system]
        # NOTE(review): the seed is passed as df_training and the evaluations
        # run with type='split'; their exact semantics live in Evaluation.
        evaluate = Eva.Evaluation(dataFrame = df_sliced, feature_vector = feature_representation, df_training = df_inmap_seed)
        metrics_max_ent = evaluate.evaluate_MaxEnt(type = 'split')
        metrics_svm = evaluate.evaluate_SVM(type = 'split')
        metrics_naive = evaluate.evaluate_Naive_Bayes(type = 'split')
        maxEnt_reports[system].append(Metrics.get_average_classification_report(metrics_max_ent))
        svm_reports[system].append(Metrics.get_average_classification_report(metrics_svm))
        naive_reports[system].append(Metrics.get_average_classification_report(metrics_naive))


## Effort for Seed

In [None]:
# Effort needed to obtain the seed: how many InMap recommendations have to be
# inspected (in 'pos' order) to collect the correct mappings of the seed, and
# the resulting precision = mappings / recommendations.
columns = ['system', 'rel_seed', 'recommendations', 'mappings', 'precision']
effort_rows = []
for system in systems:
    # Load the recommendations of *this* system. Reusing the df_inmap left
    # behind by an earlier cell would evaluate every system against the
    # recommendations of whichever system was loaded last.
    df_inmap = transform_inmap_results(load_inmap_results(INMAP_DATA_PATH, system), inmap_renaming[system])
    n_entities = processed_data_df[system].shape[0]
    for seed_size in seed_sizes:
        df_inmap_seed = get_correct_inmap_recommendations(df_inmap, n = math.ceil(seed_size * n_entities))
        effort_rows.append({
            'system': system,
            'rel_seed': seed_size,
            # load_inmap_results numbers 'pos' starting at 1, so the largest
            # position in the seed already equals the number of
            # recommendations inspected (no "+ 1").
            'recommendations': df_inmap_seed['pos'].max(),
            'mappings': df_inmap_seed.shape[0],
        })
# Build the frame from plain rows so the numeric columns get numeric dtypes.
df_inmap_effort = pd.DataFrame(effort_rows, columns=columns)
df_inmap_effort['precision'] = df_inmap_effort['mappings'] / df_inmap_effort['recommendations']
df_inmap_effort.pivot_table(index='system', columns='rel_seed', values='precision')

In [None]:
# Print the seed-effort precision table as LaTeX source, two decimals.
s = df_inmap_effort.pivot_table(index='system', columns='rel_seed', values='precision').style
s.format('{:.2f}')
print(s.to_latex())

### Visualization: Label distribution in Seed Set

In [None]:
# Grid of count plots (rows = systems, columns = seed sizes) showing how many
# seed mappings fall into each module; the figure is saved to ./seed_dist.png.
fig, axs = plt.subplots(len(systems), len(seed_sizes), figsize = (30, 25), squeeze=False)
for i, system in enumerate(systems):
    df_inmap = transform_inmap_results(load_inmap_results(INMAP_DATA_PATH, system), inmap_renaming[system])
    for j, seed_size in enumerate(seed_sizes):
        df_inmap_seed = get_correct_inmap_recommendations(df_inmap, n = math.ceil(seed_size * processed_data_df[system].shape[0]))
        sns.countplot(data = df_inmap_seed, y = 'module', orient = 'h', ax = axs[i, j])
        # The number in parentheses is the 'pos' of the last seed mapping.
        axs[i, j].set_xlabel(f"Seed mapping size at {seed_size} of the overall system ({df_inmap_seed['pos'].iat[-1]}).")
        # Only the first column carries the system name.
        axs[i, j].set_ylabel(systemNames[system] if j == 0 else "")
        # Annotate every bar with its count.
        axs[i, j].bar_label(axs[i, j].containers[0])
fig.savefig('./seed_dist.png')

### Visualization

In [None]:
# One row of plots per system: weighted average precision (left column) and
# the value of the 'accuracy' report row, labelled "W. avg. recall" (right
# column), over the relative seed size.
x_axis = [f"{seed_size:.2f}" for seed_size in seed_sizes]
reports_by_classifier = {'maxEnt': maxEnt_reports, 'naive': naive_reports, 'svm': svm_reports}

fig, axs = plt.subplots(len(systems), 2, figsize=(8, 10), sharex=True, sharey=True, squeeze=False)

for i, system in enumerate(systems):
    is_last_row = i == len(systems) - 1

    y_axis = {
        # .iloc[0] takes the first column of the 'accuracy' row explicitly;
        # the former `[0]` relied on positional lookup in a label-indexed
        # Series, which pandas has deprecated.
        classifier: [report.loc['accuracy'].iloc[0] for report in reports[system]]
        for classifier, reports in reports_by_classifier.items()
    }
    line_plot("", x_axis, y_axis, "W. avg. recall" if is_last_row else "", "", axs[i, 1])

    y_axis = {
        classifier: [report.loc['weighted avg']['precision'] for report in reports[system]]
        for classifier, reports in reports_by_classifier.items()
    }
    line_plot("", x_axis, y_axis, "W. avg. prec" if is_last_row else "", systemNames[system], axs[i, 0])

# Keep axis labels and tick labels only on the outer edge of the grid.
for ax in axs.flat:
    ax.label_outer()
# All subplots draw the same three curves, so one figure-level legend is enough.
handles, labels = axs[0,0].get_legend_handles_labels()
fig.legend(handles, labels, bbox_to_anchor=(0.5, -0.05, 0, 0), loc='lower center', ncol=3, fontsize=14, markerscale=1.5)
plt.tight_layout()

In [None]:
# Summary table: weighted average precision/recall per system, classifier
# and relative seed size.
reports_by_classifier = {'maxent': maxEnt_reports, 'naive': naive_reports, 'svm': svm_reports}
summary_frames = []
for system in systems:
    for i, seed in enumerate(seed_sizes):
        for classifier, reports in reports_by_classifier.items():
            df_tmp = reports[system][i].loc[['weighted avg'], ['precision', 'recall']].assign(
                classifier=classifier,
                seed=seed,
                system=system,
            )
            summary_frames.append(df_tmp)
df_summary_table = pd.concat(summary_frames, ignore_index=True) if summary_frames else pd.DataFrame()

df_summary_table.pivot_table(index=['system','classifier'], columns=['seed'], values=['precision', 'recall'])

In [None]:
# Print the summary pivot table as LaTeX source, values with two decimals.
df_pivot = df_summary_table.pivot_table(index=['system','classifier'], columns=['seed'], values=['precision', 'recall'])
s = df_pivot.style
s.format('{:.2f}')
print(s.to_latex())

In [None]:
# Build, per system, one long table with a row for every
# (module, classifier, seed size, metric) where metric is 'precision' or
# 'recall' and the number is stored in the column 'value'.
aggregate_rows = ['accuracy', 'macro avg', 'weighted avg', 'micro avg']
reports_by_classifier = {'maxent': maxEnt_reports, 'naive': naive_reports, 'svm': svm_reports}

merged_report = {}
for system in systems:
    per_module_frames = []
    for i, seed in enumerate(seed_sizes):
        for classifier, reports in reports_by_classifier.items():
            # Drop the aggregate rows so that only per-module rows remain.
            # errors='ignore' and 'micro avg' match the per-module-seed variant
            # of this cell: a report without an 'accuracy' row no longer raises
            # a KeyError, and a 'micro avg' row is not kept as a "module".
            df_tmp = reports[system][i].drop(index=aggregate_rows, errors='ignore').reset_index()
            df_tmp['classifier'] = classifier
            df_tmp['seed_size'] = seed
            per_module_frames.append(df_tmp)
    merged = pd.concat(per_module_frames, ignore_index=True) if per_module_frames else pd.DataFrame()
    merged.rename(columns={'index' : 'module'}, inplace = True)

    # Stack precision and recall on top of each other (long format).
    merged_report_p = merged.drop(columns = ['recall', 'f1-score'])
    merged_report_r = merged.drop(columns = ['precision', 'f1-score'])
    merged_report_p['metric'] = 'precision'
    merged_report_p.rename(columns={'precision':'value'}, inplace=True)
    merged_report_r['metric'] = 'recall'
    merged_report_r.rename(columns={'recall':'value'}, inplace=True)
    merged_report[system] = pd.concat([merged_report_p, merged_report_r], ignore_index=True)

In [None]:
# Grid (rows = systems, columns = seed sizes): bars show the number of seed
# mappings per module (bottom x-axis); markers show the SVM precision and
# recall of the same modules (top x-axis, created with twiny()).
fig, axs = plt.subplots(len(systems), len(seed_sizes), figsize = (35,25), squeeze=False)
for i, system in enumerate(systems):
    df_inmap = transform_inmap_results(load_inmap_results(INMAP_DATA_PATH, system), inmap_renaming[system])
    report = merged_report[system]
    for j, seed_size in enumerate(seed_sizes):
        # isclose instead of == because seed sizes are floats.
        df_prec = report[(report['classifier'] == 'svm') &
                         (np.isclose(report['seed_size'], seed_size))]
        df_inmap_seed = get_correct_inmap_recommendations(df_inmap, n = math.ceil(seed_size * processed_data_df[system].shape[0]))
        # Only modules that are present in the seed are shown.
        df_prec = df_prec[df_prec['module'].isin(df_inmap_seed.module)]
        ax2 = axs[i, j].twiny()
        ax2.grid(False)
        sns.scatterplot(data=df_prec.sort_values('module'), x='value', style='metric', y='module',  ax = ax2)
        sns.countplot(data = df_inmap_seed.sort_values('module'), y = 'module', orient = 'h', ax = axs[i,j], saturation=0.5, palette='light:#5A9')
        axs[i,j].set_ylabel(systemNames[system] if j == 0 else "")
        axs[i,j].set_xlabel(f"Mappings for seed size {seed_sizes[j]}" if i == len(systems) - 1 else "")
        ax2.set_xlabel("Precision/recall per module" if i == 0 else "")
        # A subplot without data gets no legend and no bar container; guard
        # both so that an empty cell does not abort the whole figure.
        subplot_legend = ax2.get_legend()
        if subplot_legend is not None:
            subplot_legend.remove()
        if axs[i,j].containers:
            axs[i,j].bar_label(axs[i,j].containers[0])
# One shared legend, taken from the last subplot that was drawn.
handles, labels = ax2.get_legend_handles_labels()
fig.legend(handles, labels, bbox_to_anchor=(0.5, 0.0, 0, 0), loc='lower center', ncol=2, fontsize=14, markerscale=1.5)
        

## Initial Mapping with InMap and Minimal Number of Mappings per Module

### Preprocessing

In [None]:
# Preprocess every system with the first configured setting and keep the
# resulting data set per system (no concerns are removed here).
setting = preprocess_settings[0]
df_sliced, processed_data_csv, processed_data_df = {}, {}, {}

for system in systems:
    output_csv = str(Path.cwd() / files[system]['data inmap module seed'])
    processed_data_csv[system] = output_csv
    Prep.preprocess_settings(setting, raw_data_csv[system], output_csv)
    processed_data_df[system] = pd.read_csv(output_csv)

### Training

In [None]:
# Experiment: seed built from the first correct InMap recommendations of
# every module (data from INMAP_DATA_PATH_EXP2).
seed_sizes = [0.05, 0.1, 0.15, 0.2, 0.25]
# pm[system][k]: mappings per module for seed_sizes[k], i.e.
# ceil(seed_size * number of rows / number of distinct labels).
pm = {}
# <classifier>_reports[system] is a list aligned with seed_sizes.
maxEnt_reports = {}
svm_reports = {}
naive_reports = {}

for system in systems:
    maxEnt_reports[system] = []
    svm_reports[system] = []
    naive_reports[system] = []  
    pm[system] = [math.ceil(seed_size * processed_data_df[system].shape[0] / processed_data_df[system].Label.nunique()) for seed_size in seed_sizes]
    
    df_inmap = transform_inmap_results(load_inmap_results(INMAP_DATA_PATH_EXP2, system), inmap_renaming[system])
    
    for i, seed_size in enumerate(seed_sizes):
        # Up to pm[system][i] correct recommendations per module, by 'pos'.
        df_inmap_seed = get_correct_inmap_recommendations(df_inmap, per_module=pm[system][i])
         
        feature_representation = CountVectorizer()
        # Train and gather evaluation metrics
        #df_sliced, removed_labels = Utils.remove_concerns_under_quantity_threshold(processed_data_df[system])
        df_sliced = processed_data_df[system]
        # NOTE(review): the seed is passed as df_training and the evaluations
        # run with type='split'; their exact semantics live in Evaluation.
        evaluate = Eva.Evaluation(dataFrame = df_sliced, feature_vector = feature_representation, df_training = df_inmap_seed)
        metrics_max_ent = evaluate.evaluate_MaxEnt(type = 'split')
        metrics_svm = evaluate.evaluate_SVM(type = 'split')
        metrics_naive = evaluate.evaluate_Naive_Bayes(type = 'split')
        maxEnt_reports[system].append(Metrics.get_average_classification_report(metrics_max_ent))
        svm_reports[system].append(Metrics.get_average_classification_report(metrics_svm))
        naive_reports[system].append(Metrics.get_average_classification_report(metrics_naive))


In [None]:
# Effort needed to obtain the per-module seed: for every module, count the
# recommendations of that module up to the position of its last seed mapping;
# precision = seed mappings / counted recommendations. The resulting table is
# printed as LaTeX source.
columns = ['system', 'rel_seed', 'recommendations', 'mappings', 'precision']
df_inmap_effort = pd.DataFrame(columns=columns)
for system in systems:
    # Same per-module seed sizes as in the training cell (recomputed here).
    pm[system] = [math.ceil(seed_size * processed_data_df[system].shape[0] / processed_data_df[system].Label.nunique()) for seed_size in seed_sizes]
    df_inmap = transform_inmap_results(load_inmap_results(INMAP_DATA_PATH_EXP2, system), inmap_renaming[system])
    
    for i, seed_size in enumerate(seed_sizes):
        df_entry = pd.DataFrame(columns=columns)
        nr_of_rec = 0
        df_inmap_seed = get_correct_inmap_recommendations(df_inmap, per_module=pm[system][i])
        for m in df_inmap_seed['module'].unique():
            # Get the max position for the module in the seed -> at that
            # position we are done finding mappings of this module for the seed.
            # NOTE(review): m is taken from 'module' but the seed is filtered on
            # 'recommendation'; this presumes both agree for consistent rows --
            # confirm.
            max_pos = df_inmap_seed[df_inmap_seed['recommendation'] == m]['pos'].max()
            # Count all recommendations in df_inmap that recommend this module
            # at a position up to and including max_pos.
            nr_of_rec += df_inmap[(df_inmap['recommendation'] == m) & (df_inmap['pos'] <= max_pos)].shape[0]
        df_entry['system']  = [system]
        df_entry['rel_seed'] = seed_size
        df_entry['recommendations'] = nr_of_rec
        df_entry['mappings'] = df_inmap_seed.shape[0]
        df_inmap_effort = pd.concat([df_inmap_effort, df_entry], ignore_index=True)
df_inmap_effort['precision'] = df_inmap_effort['mappings'] / df_inmap_effort['recommendations']
df_pivot = df_inmap_effort.pivot_table(index='system', columns='rel_seed', values='precision')
s = df_pivot.style
s.format('{:.2f}')
print(s.to_latex())

In [None]:
# Summary table: weighted average precision/recall per system, classifier
# and relative seed size.
reports_by_classifier = {'maxent': maxEnt_reports, 'naive': naive_reports, 'svm': svm_reports}
summary_frames = []
for system in systems:
    for i, seed in enumerate(seed_sizes):
        for classifier, reports in reports_by_classifier.items():
            df_tmp = reports[system][i].loc[['weighted avg'], ['precision', 'recall']].assign(
                classifier=classifier,
                seed=seed,
                system=system,
            )
            summary_frames.append(df_tmp)
df_summary_table = pd.concat(summary_frames, ignore_index=True) if summary_frames else pd.DataFrame()

df_summary_table.pivot_table(index=['system','classifier'], columns=['seed'], values=['precision', 'recall'])

In [None]:
# Print the summary pivot table as LaTeX source, values with two decimals.
df_pivot = df_summary_table.pivot_table(index=['system','classifier'], columns=['seed'], values=['precision', 'recall'])
s = df_pivot.style
s.format('{:.2f}')
print(s.to_latex())

### Visualization

In [None]:
# One row of plots per system: weighted average precision (left column) and
# weighted average recall (right column) over the number of seed mappings
# per module. Axes are not shared because that number differs per system.
fig, axs = plt.subplots(len(systems), 2, figsize=(8, 10), sharex=False, sharey=False, squeeze=False)

reports_by_classifier = {'maxEnt': maxEnt_reports, 'naive': naive_reports, 'svm': svm_reports}

for i, system in enumerate(systems):
    is_last_row = i == len(systems) - 1
    x_axis = [str(format(per_module, '.0f')) for per_module in pm[system]]

    y_axis = {
        classifier: [report.loc['weighted avg']['recall'] for report in reports[system]]
        for classifier, reports in reports_by_classifier.items()
    }
    line_plot("", x_axis, y_axis, "W. avg. recall" if is_last_row else "", "", axs[i, 1])

    y_axis = {
        classifier: [report.loc['weighted avg']['precision'] for report in reports[system]]
        for classifier, reports in reports_by_classifier.items()
    }
    line_plot("", x_axis, y_axis, "W. avg. prec" if is_last_row else "", systemNames[system], axs[i, 0])

# All subplots draw the same three curves, so one figure-level legend is enough.
handles, labels = axs[0,0].get_legend_handles_labels()
fig.legend(handles, labels, bbox_to_anchor=(0.5, -0.05, 0, 0), loc='lower center', ncol=3, fontsize=14, markerscale=1.5)
plt.tight_layout()

In [None]:
# Summary table: weighted average precision/recall per system, classifier
# and relative seed size, displayed rounded to two decimals.
reports_by_classifier = {'maxent': maxEnt_reports, 'naive': naive_reports, 'svm': svm_reports}
summary_frames = []
for system in systems:
    for i, seed in enumerate(seed_sizes):
        for classifier, reports in reports_by_classifier.items():
            df_tmp = reports[system][i].loc[['weighted avg'], ['precision', 'recall']].assign(
                classifier=classifier,
                seed=seed,
                system=system,
            )
            summary_frames.append(df_tmp)
df_summary_table = pd.concat(summary_frames, ignore_index=True) if summary_frames else pd.DataFrame()

df_summary_table.pivot_table(index=['system','classifier'], columns=['seed'], values=['precision', 'recall']).round(2)

In [None]:
# Build, per system, one long table with a row for every
# (module, classifier, mappings per module, metric) where metric is
# 'precision' or 'recall' and the number is stored in the column 'value'.
aggregate_rows = ['accuracy', 'macro avg', 'weighted avg', 'micro avg']
reports_by_classifier = {'maxent': maxEnt_reports, 'naive': naive_reports, 'svm': svm_reports}

merged_report = {}
for system in systems:
    per_module_frames = []
    # 'seed_size' holds the mappings-per-module value from pm here.
    for i, seed in enumerate(pm[system]):
        for classifier, reports in reports_by_classifier.items():
            # Drop the aggregate rows so that only per-module rows remain.
            df_tmp = reports[system][i].drop(index=aggregate_rows, errors='ignore').reset_index()
            df_tmp['classifier'] = classifier
            df_tmp['seed_size'] = seed
            per_module_frames.append(df_tmp)
    merged = pd.concat(per_module_frames, ignore_index=True) if per_module_frames else pd.DataFrame()
    merged.rename(columns={'index' : 'module'}, inplace = True)

    # Stack precision and recall on top of each other (long format).
    merged_report_p = merged.drop(columns = ['recall', 'f1-score'])
    merged_report_r = merged.drop(columns = ['precision', 'f1-score'])
    merged_report_p['metric'] = 'precision'
    merged_report_p.rename(columns={'precision':'value'}, inplace=True)
    merged_report_r['metric'] = 'recall'
    merged_report_r.rename(columns={'recall':'value'}, inplace=True)
    merged_report[system] = pd.concat([merged_report_p, merged_report_r], ignore_index=True)

In [None]:
# Grid (rows = systems, columns = seed sizes): bars show the number of seed
# mappings per module (bottom x-axis); markers show the SVM precision and
# recall of the same modules (top x-axis, created with twiny()).
fig, axs = plt.subplots(len(systems), len(seed_sizes), figsize = (35,25), squeeze=False)
for i, system in enumerate(systems):
    df_inmap = transform_inmap_results(load_inmap_results(INMAP_DATA_PATH_EXP2, system), inmap_renaming[system])
    report = merged_report[system]
    # seed_size is the mappings-per-module value here, not the relative size.
    for j, seed_size in enumerate(pm[system]):
        # NOTE(review): if pm[system] contains the same value for two seed
        # sizes, this filter selects the report rows of both -- confirm that
        # this cannot happen for the evaluated systems.
        df_prec = report[(report['classifier'] == 'svm') &
                         (np.isclose(report['seed_size'], seed_size))]
        df_inmap_seed = get_correct_inmap_recommendations(df_inmap, per_module=seed_size)
        # Only modules that are present in the seed are shown.
        df_prec = df_prec[df_prec['module'].isin(df_inmap_seed.module)]
        ax2 = axs[i, j].twiny()
        ax2.grid(False)

        sns.scatterplot(data=df_prec.sort_values('module'), x='value', style='metric', y='module',  ax = ax2)
        sns.countplot(data = df_inmap_seed.sort_values('module'), y = 'module', orient = 'h', ax = axs[i,j], saturation=0.5, palette='light:#5A9')
        axs[i,j].set_ylabel(systemNames[system] if j == 0 else "")
        axs[i,j].set_xlabel(f"Mappings for seed size {seed_sizes[j]}" if i == len(systems) - 1 else "")
        ax2.set_xlabel("Precision/recall per module" if i == 0 else "")
        # A subplot without data gets no legend and no bar container; guard
        # both so that an empty cell does not abort the whole figure.
        subplot_legend = ax2.get_legend()
        if subplot_legend is not None:
            subplot_legend.remove()
        if axs[i,j].containers:
            axs[i,j].bar_label(axs[i,j].containers[0])
# One shared legend, taken from the last subplot that was drawn.
handles, labels = ax2.get_legend_handles_labels()
fig.legend(handles, labels, bbox_to_anchor=(0.5, -0.1, 0, 0), loc='lower center', ncol=2, fontsize=14, markerscale=1.5)