# Experiment 2: Part Type Collision Analysis

This notebook will contain gathered results from experiment 2.

# Part Type Collision Analysis
## Methodology
For each part type we have, run the experiment many times over many different hyperparameters. Specifically, isolate one hyperparameter, run the experiment over a range of values, tracking the computed collision rate each time. Repeat this for each hyperparameter and each part type.
## Deliverables
Graphs and analysis for the impact of different values of the hyperparmeters. How do they affect the final collision rate? Why are the effecting the collision rate like that? What does this tell us? 
Graphs and analysis for comparing the results across different part types. Are different part types affected in the same way by the same change in hyperperamters? How close are their collision rates? What does this tell us about the relative importance of both hyperparameters and part types. 

## Source Code

The below sections contains all of our source codes.

In [None]:
import mlflow
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import os 

user_path = '~/GitHub/matcher'  # CHANGE THIS LINE AS NEEDED FOR YOUR ENVIRONMENT
os.chdir(os.path.expanduser(user_path))

In [None]:
def get_metrics_series(mlruns_path: str, experiment_id: str, run_id: str, metric_name: str) -> list:
    """Get a series of metric values for a given metric name."""
    with open(f'{mlruns_path}/{experiment_id}/{run_id}/metrics/{metric_name}') as f:
        file_lines = f.readlines()
    return [float(line.split()[1]) for line in file_lines]

In [None]:
experiment_id = mlflow.get_experiment_by_name(name='Experiment 3').experiment_id
runs_df = mlflow.search_runs(experiment_ids=experiment_id, max_results=10_000)
runs_df['monte_carlo_upper_collision_rate_series'] = runs_df.apply(
    lambda row: get_metrics_series(
            mlruns_path='mlruns', 
            experiment_id=experiment_id, 
            run_id=row['run_id'], 
            metric_name='monte_carlo_upper_collision_rate'), 
    axis=1)

print(runs_df.head(1)['monte_carlo_upper_collision_rate_series'])
print(runs_df['params.confidence_bound'])


In [None]:
base_meta_pdf_ci = 0.995
base_part_pdf_ci = 0.995
base_confidence_bound = 0.995

runs_df['params.meta_pdf_ci'] = runs_df['params.meta_pdf_ci'].astype(float)
runs_df['params.part_pdf_ci'] = runs_df['params.part_pdf_ci'].astype(float)
runs_df['params.confidence_bound'] = runs_df['params.confidence_bound'].astype(float)



In [None]:
mlflow.end_run()
mlflow.start_run()


meta_pdf_ci_analysis_df = runs_df.loc[
    (runs_df['params.confidence_bound'] == base_confidence_bound) &
    (runs_df['params.part_pdf_ci'] == base_part_pdf_ci)]
part_pdf_ci_analysis_df = runs_df.loc[
    (runs_df['params.confidence_bound'] == base_confidence_bound) &
    (runs_df['params.meta_pdf_ci'] == base_meta_pdf_ci)]
confidence_bound_analysis_df = runs_df.loc[
    (runs_df['params.meta_pdf_ci'] == base_meta_pdf_ci) &
    (runs_df['params.part_pdf_ci'] == base_part_pdf_ci)]

meta_pdf_ci_part_groups = meta_pdf_ci_analysis_df.groupby('params.part_type')
part_pdf_ci_part_groups = part_pdf_ci_analysis_df.groupby('params.part_type')
confidence_bound_part_groups = confidence_bound_analysis_df.groupby('params.part_type')


def run_experiment(df_groups, param_col: str):
    
    for part_type, part_group in df_groups:
    
        part_group.sort_values(by=param_col, inplace=True)
        vars = [
            np.var(np.array(param_collision_rates)) 
            for param_collision_rates in part_group['monte_carlo_upper_collision_rate_series'].to_numpy()]    
        plt.plot(part_group[param_col], vars, label=f'{part_type} - Correlation: {np.corrcoef(part_group[param_col], vars)[0,1]:.2f}')
    
    plt.legend()
    plt.title(f'Variance of Collision Rates vs {param_col}')
    plt.xlabel(f'{param_col}')
    plt.ylabel('Variance of Collision Rates')
    plt.savefig(f'psig_matcher/experiments/graphs/variance_of_collision_rates_vs_{param_col}.png')
    mlflow.log_artifact(f'psig_matcher/experiments/graphs/variance_of_collision_rates_vs_{param_col}.png')
    plt.clf()
        
run_experiment(meta_pdf_ci_part_groups, 'params.meta_pdf_ci')
run_experiment(part_pdf_ci_part_groups, 'params.part_pdf_ci')
run_experiment(confidence_bound_part_groups, 'params.confidence_bound')
    

In [81]:
meta_pdf_ci_analysis_df.sort_values(by='params.meta_pdf_ci', inplace=True)
meta_pdf_ci_param_groups = meta_pdf_ci_analysis_df.groupby('params.meta_pdf_ci')

x_vals = []
averaged_varainces = []
for meta_pdf_ci, df in meta_pdf_ci_param_groups:
    x_vals.append(meta_pdf_ci)
    averaged_varainces.append(np.mean([
        np.var(np.array(param_collision_rates)) 
        for param_collision_rates in df['monte_carlo_upper_collision_rate_series'].to_numpy()]))

plt.plot(x_vals, averaged_varainces, label=f'Average Variance of Collision Rates Across all Part Types')
plt.legend()
plt.xlabel('Meta PDF Confidence Interval')
plt.ylabel('Variance of Collision Rates')
plt.title('Average Variance of Collision Rates vs Meta PDF Confidence Interval')
plt.savefig(f'psig_matcher/experiments/graphs/average_variance_of_collision_rates_vs_meta_pdf_ci.png')
mlflow.log_artifact(f'psig_matcher/experiments/graphs/average_variance_of_collision_rates_vs_meta_pdf_ci.png')
plt.clf()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta_pdf_ci_analysis_df.sort_values(by='params.meta_pdf_ci', inplace=True)


<Figure size 640x480 with 0 Axes>