In [None]:
%pip install numpy pandas matplotlib scipy

In [None]:
import pandas as pd
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator, AutoMinorLocator

# Render all figure text with LaTeX and request Helvetica as the font family.
plt.rcParams["text.usetex"] = True
plt.rcParams["font.family"] = "Helvetica"

# Font size read by the plotting cells below (later cells reassign it).
fontSize = 18

In [None]:
# Load the kNN grid results and keep the rows with P == 1 and distance
# weighting, ordered by N_neighbors so each curve is drawn left to right.
df = pd.read_csv('data/kNN_parameters_grid.csv')
keep = (df['P'] == 1) & (df['Weights'] == 'distance')
df = df[keep].sort_values(by='N_neighbors')
x = df['N_neighbors']

plt.figure(figsize=(10, 6))

# Minor ticks every 10 on the x axis and every 0.05 on the y axis.
ax = plt.gca()
ax.xaxis.set_minor_locator(MultipleLocator(10))
ax.yaxis.set_minor_locator(MultipleLocator(0.05))

# One curve per metric: (column, colour, legend label, marker).
curves = (
    ('Recall', '#1f77b4', 'Recall', 'o'),
    ('Precision', '#7f7f7f', 'Precision', '^'),
    ('F1 score', '#d62728', '$F_1$ score', 's'),
)
for column, colour, legend_label, marker in curves:
    plt.plot(x, df[column], c=colour, label=legend_label, marker=marker, linestyle='-', linewidth=1, markersize=2)

plt.xticks(range(0, 101, 20), fontsize=18)
plt.yticks([tick / 10 for tick in range(11)], fontsize=18)

plt.grid(True, which='both', linestyle='--', linewidth=0.3)

plt.xlabel('k, Number of nearest neighbours', fontsize=18)
plt.ylabel('Metrics', fontsize=18)
plt.title('Metrics vs GRID k ($p=1$, with $n$ only)', fontsize=18)
plt.legend(fontsize=16)

# Zoom in on the region of interest.
plt.ylim(0.8, 1.01)
plt.xlim(3, 99)

plt.savefig('output/kNN_vs_k2.pdf', format='pdf', bbox_inches='tight')
plt.show()

In [None]:
# Load the kNN feature-comparison results.
df = pd.read_csv('data/kNN_features.csv')

# Pull the integer hyper-parameters k, p and w out of the 'Meth. & Params'
# text column. The patterns are raw strings so that `\d` is passed to the
# regex engine instead of being an invalid Python string escape.
for param in ('k', 'p', 'w'):
    df[param] = df['Meth. & Params'].str.extract(rf'{param}=(\d+)', expand=False).astype(int)

# Keep only the runs with 16280 'Train NR' and 50 'Train R' samples.
df_selected = df[(df['Train NR'] == 16280) & (df['Train R'] == 50)]

selected_columns = ['k', 'p', 'w', 'Features', 'TP', 'FP', 'TN', 'FN', 'Acc.', 'Prec.', 'Rec.', 'F1']
df_selected = df_selected[selected_columns]

# Within each (k, p, w) group, put the best row first: highest F1, then
# highest recall, with the feature name as the final tie-breaker.
df_selected = df_selected.sort_values(
    by=['k', 'p', 'w', 'F1', 'Rec.', 'Features'],
    ascending=[True, True, True, False, False, True],
)

# Show the top row of every (k, p, w) group.
df_selected.groupby(['k', 'p', 'w']).head(1)


# df_selected.head(10)
# df.head(3)

In [None]:
# Narrow the table to the k=3, p=1, w=1 configuration.
is_target = df_selected['k'].eq(3) & df_selected['p'].eq(1) & df_selected['w'].eq(1)
df_selected = df_selected[is_target]

# The hyper-parameter columns are constant after the filter, so leave them
# out of the LaTeX table.
df_latex = df_selected.drop(columns=['k', 'p', 'w'])

latex_table = df_latex.to_latex(index=False)
print(latex_table)

In [None]:
# Load the kNN hyper-parameter sweep.
df = pd.read_csv('data/kNN_parameters.csv')

# Parse the integer hyper-parameters out of the 'Meth. & Params' text column.
# Raw strings keep `\d` a regex digit class rather than an invalid Python
# string escape; the pattern for p additionally accepts a leading minus sign.
param_patterns = {
    'k': r'k=(\d+)',
    'p': r'p=(-?\d+)',
    'w': r'w=(\d+)',
}
for column, pattern in param_patterns.items():
    df[column] = df['Meth. & Params'].str.extract(pattern, expand=False).astype(int)

selected_columns = ['k', 'p', 'w', 'Features', 'TP', 'FP', 'TN', 'FN', 'Acc.', 'Prec.', 'Rec.', 'F1']
df_selected = df[selected_columns]

df_selected.head(15)

In [None]:
# Runs that use only the feature 'n', with p=1 and w=1.
df = df_selected[(df_selected['Features'] == 'n') & (df_selected['p'] == 1) & (df_selected['w'] == 1)]
x = df['k']

plt.figure(figsize=(10, 6))

# Minor ticks every 10 on the x axis and every 0.05 on the y axis.
ax = plt.gca()
ax.xaxis.set_minor_locator(MultipleLocator(10))
ax.yaxis.set_minor_locator(MultipleLocator(0.05))

# One curve per metric: (column, colour, legend label, marker).
curves = (
    ('Rec.', '#1f77b4', 'Recall', 'o'),
    ('Prec.', '#7f7f7f', 'Precision', '^'),
    ('F1', '#d62728', '$F_1$ score', 's'),
)
for column, colour, legend_label, marker in curves:
    plt.plot(x, df[column], c=colour, label=legend_label, marker=marker, linestyle='-', linewidth=1, markersize=2)

plt.xticks(range(0, 101, 20), fontsize=18)
plt.yticks([tick / 10 for tick in range(11)], fontsize=18)

plt.grid(True, which='both', linestyle='--', linewidth=0.3)

plt.xlabel('k, Number of nearest neighbours', fontsize=18)
plt.ylabel('Metrics', fontsize=18)
plt.title('Metrics vs the number of neighbours ($p=1$, $n$ only)', fontsize=18)
plt.legend(fontsize=16)

plt.ylim(0.5, 1.01)
plt.xlim(3, 99)

plt.savefig('output/kNN_vs_k.pdf', format='pdf', bbox_inches='tight')
# The figure is the cell's only output; the stray trailing `x` that used to
# dump the raw k values after the plot was leftover debugging and is removed.
plt.show()

In [None]:
# p=1, w=1 runs for each of the three feature sets being compared.
base = (df_selected['p'] == 1) & (df_selected['w'] == 1)
df_n = df_selected[(df_selected['Features'] == 'n') & base]
df_a = df_selected[(df_selected['Features'] == 'a') & base]
df_a_n = df_selected[(df_selected['Features'] == 'a, n') & base]

fig, axs = plt.subplots(3, 1, figsize=(10, 8), sharey=False, sharex=True)

dfs = [df_n, df_a, df_a_n]
labels = ['n', 'a', 'a, n']

# One curve per metric: (column, legend label, colour, marker).
curves = (
    ('Rec.', 'Recall', '#1f77b4', 'o'),
    ('Prec.', 'Precision', '#7f7f7f', '^'),
    ('F1', '$F_1$ score', '#d62728', 's'),
)

# One panel per feature set; pairing the axes with their data and title via
# zip avoids a hand-maintained counter.
for ax, panel, feature_label in zip(axs, dfs, labels):
    for column, legend_label, colour, marker in curves:
        ax.plot(panel['k'], panel[column], label=legend_label, c=colour, marker=marker, linestyle='-', linewidth=1, markersize=2)
    ax.grid(True, which='both', linestyle='--', linewidth=0.3)
    ax.set_ylabel('Metrics', fontsize=fontSize)
    ax.set_title(f"Features: {feature_label}", fontsize=fontSize)
    ax.set_ylim(0.5, 1.01)
    ax.set_xlim(3, 99)
    ax.tick_params(axis='both', which='major', labelsize=fontSize)
    # No per-axes legend is created: a single figure-level legend is added below.

# The x axis is shared, so only the bottom panel carries its label.
axs[-1].set_xlabel('k, the number of neighbours', fontsize=fontSize)

plt.setp(axs, xticks=range(10, 100, 10), yticks=[0.6, 0.7, 0.8, 0.9, 1.0])

# Spelling fixed: "neigbours" -> "neighbours".
fig.suptitle('Metrics vs the number of neighbours, $p=1$', fontsize=fontSize+5, y=1.1)

# All panels share the same three curves, so the handles of the last panel
# are enough to build one legend for the whole figure.
handles, labels = axs[-1].get_legend_handles_labels()
fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, 1.05), ncols=3, fontsize=fontSize)
plt.subplots_adjust(wspace=0.0, hspace=0.0)
plt.tight_layout()

plt.savefig('output/kNN_parameters_all.pdf', format='pdf', bbox_inches='tight')
plt.show()


In [None]:
# Load the kNN results for varying training-set sizes.
df = pd.read_csv('data/kNN_initial_data.csv')

# Parse k and p out of the 'Meth. & Params' text column. Raw strings keep
# `\d` a regex digit class rather than an invalid Python string escape.
params = df['Meth. & Params']
df['k'] = params.str.extract(r'k=(\d+)', expand=False).astype(int)
df['p'] = params.str.extract(r'p=(-?\d+)', expand=False).astype(int)

selected_columns = ['Train R', 'k', 'p', 'Features', 'TP', 'FP', 'TN', 'FN', 'Acc.', 'Prec.', 'Rec.', 'F1']
df_selected = df[selected_columns]

df_selected.head(15)

In [None]:
# For every (Features, p, k) combination among the p=1 runs, keep the three
# rows with the largest F1 and flatten the result back into a plain table.
p1_runs = df_selected[df_selected['p'] == 1]
top_3_results = (
    p1_runs
    .groupby(['Features', 'p', 'k'])
    .apply(lambda group: group.nlargest(3, 'F1'))
    .reset_index(drop=True)
)
top_3_results

In [None]:
# Print the top-3 table as LaTeX, leaving out the 'p' column.
df_latex = top_3_results.drop(columns=['p'])
latex_table = df_latex.to_latex(index=False)
print(latex_table)

In [None]:
# Three p=1 configurations to compare: (Features, k) = ('n', 3), ('a, n', 3)
# and ('a, n', 16).
is_p1 = df_selected['p'] == 1
df_k3 = df_selected[(df_selected['Features'] == 'n') & is_p1 & (df_selected['k'] == 3)]
df_k3d = df_selected[(df_selected['Features'] == 'a, n') & is_p1 & (df_selected['k'] == 3)]
df_k16 = df_selected[(df_selected['Features'] == 'a, n') & is_p1 & (df_selected['k'] == 16)]

fig, axs = plt.subplots(3, 1, figsize=(10, 8), sharey=False, sharex=True)

# Panel order, top to bottom, with the matching titles.
dfs = [df_k16, df_k3, df_k3d]
labels = ['Features: $a$, $n$; $k=16$', 'Features: $n$; $k=3$', 'Features: $a$, $n$; $k=3$']

# One curve per metric: (column, legend label, colour, marker).
curves = (
    ('Rec.', 'Recall', '#1f77b4', 'o'),
    ('Prec.', 'Precision', '#7f7f7f', '^'),
    ('F1', '$F_1$ score', '#d62728', 's'),
)
for i, ax in enumerate(axs):
    panel = dfs[i]
    for column, legend_label, colour, marker in curves:
        ax.plot(panel['Train R'], panel[column], label=legend_label, c=colour, marker=marker, linestyle='-', linewidth=1, markersize=2)
    ax.grid(True, which='both', linestyle='--', linewidth=0.3)
    ax.set_ylabel('Metrics', fontsize=fontSize)
    # Only the bottom panel gets the (shared) x-axis label.
    if i == 2:
        ax.set_xlabel('n, the number of resonant asteroids in the training set', fontsize=fontSize)
    ax.set_title(labels[i], fontsize=fontSize)
    ax.set_ylim(0.5, 1.01)
    ax.set_xlim(3, 99)
    ax.tick_params(axis='both', which='major', labelsize=fontSize)
    ax.legend().remove()

plt.setp(axs, xticks=range(10, 100, 10), yticks=[0.6, 0.7, 0.8, 0.9, 1.0])
plt.subplots_adjust(wspace=0.0)

fig.suptitle('Metrics vs the size of the training set', fontsize=fontSize+5, y=1.1)

# Build one figure-level legend from the last panel's curves.
handles, labels = ax.get_legend_handles_labels()
fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, 1.05), ncols=3, fontsize=fontSize)
plt.subplots_adjust(wspace=0.0, hspace=0.0)
plt.tight_layout()

plt.savefig('output/kNN_initial_data.pdf', format='pdf', bbox_inches='tight')
plt.show()


In [None]:
# Load the decision-tree hyper-parameter sweep.
df = pd.read_csv('data/DT_parameters.csv')

# Parse the integer hyper-parameters out of the 'Meth. & Params' text column.
# Raw strings keep `\d` a regex digit class rather than an invalid Python
# string escape; the pattern for p additionally accepts a leading minus sign.
int_param_patterns = {
    'd': r'd=(\d+)',
    'p': r'p=(-?\d+)',
    'w': r'w=(\d+)',
    's': r's=(\d+)',
    'l': r'l=(\d+)',
}
for column, pattern in int_param_patterns.items():
    df[column] = df['Meth. & Params'].str.extract(pattern, expand=False).astype(int)

# 'f' is an alphanumeric token and is kept as text.
df['f'] = df['Meth. & Params'].str.extract(r'f=([a-zA-Z0-9]+)', expand=False)

selected_columns = ['Train R', 'd', 'p', 'w', 's', 'l', 'f', 'Features', 'TP', 'FP', 'TN', 'FN', 'Acc.', 'Prec.', 'Rec.', 'F1']
df = df[selected_columns]
df.head(5)

In [None]:
# Five best rows: highest F1 first, ties broken by recall and then precision.
df.sort_values(by=['F1', 'Rec.', 'Prec.'], ascending=False).head()

In [None]:
# Spearman rank correlations between recall and the numeric tree parameters,
# rounded to four decimals. A copy is used so `df` itself stays untouched.
df2 = df.copy()
rank_columns = ['Rec.', 'd', 'w', 's', 'l']
correlations = df2[rank_columns].corr(method='spearman', numeric_only=False).round(4)
print(correlations)


In [None]:
import scipy.stats as stats

# One-way ANOVA of the F1 column of `df` across the levels of several settings.
metric = 'F1'


def _anova(frame, column, levels):
    """Run scipy's one-way ANOVA on `metric`, one sample per `column == level` subset of `frame`."""
    samples = [frame[frame[column] == level][metric] for level in levels]
    return stats.f_oneway(*samples)


fvalue, pvalue = _anova(df, 'w', [0, 1])
print(f"weights: F-Value: {fvalue}, P-Value: {pvalue.round(4)}")

# This p-value is printed unrounded, unlike the others.
fvalue, pvalue = _anova(df, 'p', [1, 2, 3])
print(f"impurity: F-Value: {fvalue}, P-Value: {pvalue}")

fvalue, pvalue = _anova(df, 'f', ['log2', 'None', 'sqrt'])
print(f"max_features: F-Value: {fvalue}, P-Value: {pvalue.round(4)}")

fvalue, pvalue = _anova(df, 'Features', ['a', 'a, n, sinI', 'a, e', 'a, sinI'])
print(f"Features: F-Value: {fvalue}, P-Value: {pvalue.round(4)}")

In [None]:
# Mean recall, precision and F1 for each value of p, best F1 first.
# The metric columns are selected *before* aggregating: `df` also holds the
# text columns 'Features' and 'f', and averaging those raises a TypeError on
# pandas >= 2.0 (older versions silently dropped them with a warning).
metric_columns = ['Rec.', 'Prec.', 'F1']
average_values = df.groupby(['p'])[metric_columns].mean()
average_values = average_values.round(4)
average_values.sort_values(by='F1', ascending=False)

In [None]:
# Mean recall, precision and F1 for each (w, p, Features, f) combination.
# Only the metric columns are aggregated, so the result does not depend on
# every remaining column of `df` being numeric and no unused means are computed.
metric_columns = ['Rec.', 'Prec.', 'F1']
average_values = df.groupby(['w', 'p', 'Features', 'f'])[metric_columns].mean()
average_values = average_values.round(4)
# average_values.sort_values(by='F1', ascending=False).to_csv('res.csv')

In [None]:
# Print the ten best rows (by F1, then recall, then precision) as a LaTeX table.
top_10 = df.sort_values(by=['F1', 'Rec.', 'Prec.'], ascending=False).head(10)
print(top_10.to_latex(index=False))

In [None]:
# Rows with F1 above 0.910, ordered by their parameter values.
sort_keys = ['d', 'p', 'w', 's', 'l', 'Features', 'f']
filtered_df = df.loc[df['F1'].gt(0.910)].sort_values(by=sort_keys)
# filtered_df.to_csv('filtered_records.csv', index=False)

In [None]:
# First ten rows when ordered by the p, w, s, l and f parameters.
param_order = ['p', 'w', 's', 'l', 'f']
df.sort_values(by=param_order).head(10)

In [None]:
# Show the rows that match one specific parameter combination.
wanted = {'w': 0, 's': 6, 'l': 5, 'Features': 'a, n, sinI', 'd': 20}
row_mask = pd.Series(True, index=df.index)
for column, value in wanted.items():
    row_mask &= df[column] == value
df[row_mask]

In [None]:
# Take the 500 rows with the highest F1 and count how often each feature set
# appears among them, most frequent first.
df_ordered = df.sort_values(by='F1', ascending=False).head(500)
grouped_df = df_ordered.groupby('Features').size().sort_values(ascending=False)
print(grouped_df)

In [None]:
# Parameters that must agree for two rows to count as the same configuration.
param_cols = ['d', 'p', 'w', 's', 'l', 'f']

# Rows of the top-F1 subset whose feature set is exactly 'sinI, n'.
df_filtered = df_ordered[df_ordered['Features'] == 'sinI, n']

for _, row in df_filtered.iterrows():
    # Collect every row of the subset that shares all six parameter values.
    same_params = pd.Series(True, index=df_ordered.index)
    for col in param_cols:
        same_params &= df_ordered[col] == row[col]
    matching_rows = df_ordered[same_params]
    # Print the reference row first, then all rows matching its parameters.
    print(row.to_dict())
    print(matching_rows)

In [None]:
# Load the decision-tree sweep over d.
df = pd.read_csv('data/DT_parameters_vs_d.csv')

params = df['Meth. & Params']
# d is either a number or the text 'None'; 'None' is encoded as -1 so the
# column can be stored as integers.
df['d'] = params.str.extract(r'd=([a-zA-Z0-9]+)', expand=False).replace('None', -1).astype(int)
# Raw string keeps `\d` a regex digit class rather than an invalid Python
# string escape.
df['s'] = params.str.extract(r's=(\d+)', expand=False).astype(int)
df['f'] = params.str.extract(r'f=([a-zA-Z0-9]+)', expand=False)

df.head(5)

In [None]:
# Preview the rows for features 'a, n, sinI' with d == -1 and f == 'sqrt'.
row_mask = df['Features'].eq('a, n, sinI') & df['d'].eq(-1) & df['f'].eq('sqrt')
df2 = df[row_mask]
df2.head()

In [None]:
# Sweep over s for the feature set 'a, n, sinI' with d == -1 and f == 'sqrt'.
df2 = df[(df['Features'] == 'a, n, sinI') & (df['d'] == -1) & (df['f'] == 'sqrt')]
x = df2['s']

fontSize = 24
plt.figure(figsize=(10, 6))

# Minor ticks every 1 on the x axis and every 0.05 on the y axis.
ax = plt.gca()
ax.xaxis.set_minor_locator(MultipleLocator(1))
ax.yaxis.set_minor_locator(MultipleLocator(0.05))

# One curve per metric: (column, colour, legend label, marker).
curves = (
    ('Rec.', '#1f77b4', 'Recall', 'o'),
    ('Prec.', '#7f7f7f', 'Precision', '^'),
    ('F1', '#d62728', '$F_1$ score', 's'),
)
for column, colour, legend_label, marker in curves:
    plt.plot(x, df2[column], c=colour, label=legend_label, marker=marker, linestyle='-', linewidth=1, markersize=2)

plt.xticks(range(0, 24, 5), fontsize=fontSize)
plt.yticks([tick / 10 for tick in range(11)], fontsize=fontSize)

plt.grid(True, which='both', linestyle='--', linewidth=0.3)

plt.xlabel('s, The minimum number of samples to split', fontsize=fontSize)
plt.ylabel('Metrics', fontsize=fontSize)
# Raw string: the LaTeX tokens `\{`, `\sin` and `\}` are not valid Python
# escape sequences, so a plain string literal triggers an invalid-escape warning.
plt.title(r'Metrics vs s ($d=-1$, $\{a, n, \sin{I}\}$, $f=$ sqrt)', fontsize=fontSize)
plt.legend(fontsize=fontSize-4)

plt.ylim(0.7, 1.01)
plt.xlim(2, 20)

plt.savefig('output/DT_vs_s.pdf', format='pdf', bbox_inches='tight')
plt.show()

In [None]:
# Sweep over d for the feature set 'a, n, sinI' with s == 3 and f == 'sqrt'.
df2 = df[(df['Features'] == 'a, n, sinI') & (df['s'] == 3) & (df['f'] == 'sqrt')]
x = df2['d']

plt.figure(figsize=(10, 6))

# Minor ticks every 10 on the x axis and every 0.05 on the y axis.
ax = plt.gca()
ax.xaxis.set_minor_locator(MultipleLocator(10))
ax.yaxis.set_minor_locator(MultipleLocator(0.05))

# One curve per metric: (column, colour, legend label, marker).
curves = (
    ('Rec.', '#1f77b4', 'Recall', 'o'),
    ('Prec.', '#7f7f7f', 'Precision', '^'),
    ('F1', '#d62728', '$F_1$ score', 's'),
)
for column, colour, legend_label, marker in curves:
    plt.plot(x, df2[column], c=colour, label=legend_label, marker=marker, linestyle='-', linewidth=1, markersize=2)

plt.xticks(range(0, 101, 20), fontsize=fontSize)
plt.yticks([tick / 10 for tick in range(11)], fontsize=fontSize)

plt.grid(True, which='both', linestyle='--', linewidth=0.3)

plt.xlabel('d, The maximum depth', fontsize=fontSize)
plt.ylabel('Metrics', fontsize=fontSize)
# Raw string: the LaTeX tokens `\{`, `\sin` and `\}` are not valid Python
# escape sequences, so a plain string literal triggers an invalid-escape warning.
plt.title(r'Metrics vs d ($s=3$, $\{a, n, \sin{I}\}$, $f=$ sqrt)', fontsize=fontSize)
plt.legend(fontsize=fontSize-4)

plt.ylim(0.9, 1.01)
plt.xlim(2, 100)

plt.savefig('output/DT_vs_d_s3.pdf', format='pdf', bbox_inches='tight')
plt.show()

In [None]:
# Load the decision-tree results for varying training-set sizes and parse the
# integer s parameter out of the 'Meth. & Params' text column. The raw string
# keeps `\d` a regex digit class rather than an invalid Python string escape.
df = pd.read_csv('data/DT_initial_data.csv')
df['s'] = df['Meth. & Params'].str.extract(r's=(\d+)', expand=False).astype(int)
df.head(5)

In [None]:
# Feature set 'a, n, sinI' with s == 3, ordered by 'Train R' so each curve is
# drawn left to right.
df2 = df[(df['Features'] == 'a, n, sinI') & (df['s'] == 3)].sort_values(by='Train R')

fontSize = 20

x = df2['Train R']

plt.figure(figsize=(10, 6))

# Minor ticks every 10 on the x axis and every 0.05 on the y axis.
ax = plt.gca()
ax.xaxis.set_minor_locator(MultipleLocator(10))
ax.yaxis.set_minor_locator(MultipleLocator(0.05))

# One curve per metric: (column, colour, legend label, marker).
curves = (
    ('Rec.', '#1f77b4', 'Recall', 'o'),
    ('Prec.', '#7f7f7f', 'Precision', '^'),
    ('F1', '#d62728', '$F_1$ score', 's'),
)
for column, colour, legend_label, marker in curves:
    plt.plot(x, df2[column], c=colour, label=legend_label, marker=marker, linestyle='-', linewidth=1, markersize=2)

plt.xticks(range(0, 101, 20), fontsize=fontSize)
plt.yticks([tick / 10 for tick in range(11)], fontsize=fontSize)

plt.grid(True, which='both', linestyle='--', linewidth=0.3)

plt.xlabel('n, The number of resonant asteroids in the training set', fontsize=fontSize)
plt.ylabel('Metrics', fontsize=fontSize)
# Raw string: the LaTeX tokens `\{`, `\sin` and `\}` are not valid Python
# escape sequences, so a plain string literal triggers an invalid-escape warning.
plt.title(r'Metrics vs n ($s=3$, $\{a, n, \sin{I}\}$, $f=$ sqrt)', fontsize=fontSize)
plt.legend(fontsize=fontSize-4)

plt.ylim(0.65, 1.01)
plt.xlim(2, 100)

plt.savefig('output/DT_vs_n_s3.pdf', format='pdf', bbox_inches='tight')
plt.show()

In [None]:
# Load the second training-size sweep, ordered by 'Train R' so each curve is
# drawn left to right.
df2 = pd.read_csv('data/DT_initial_data_100.csv').sort_values(by='Train R')
x = df2['Train R']

plt.figure(figsize=(10, 6))

# Minor ticks every 10 on the x axis and every 0.05 on the y axis.
ax = plt.gca()
ax.xaxis.set_minor_locator(MultipleLocator(10))
ax.yaxis.set_minor_locator(MultipleLocator(0.05))

# One curve per metric: (column, colour, legend label, marker).
curves = (
    ('Rec.', '#1f77b4', 'Recall', 'o'),
    ('Prec.', '#7f7f7f', 'Precision', '^'),
    ('F1', '#d62728', '$F_1$ score', 's'),
)
for column, colour, legend_label, marker in curves:
    plt.plot(x, df2[column], c=colour, label=legend_label, marker=marker, linestyle='-', linewidth=1, markersize=2)

plt.xticks(range(0, 101, 20), fontsize=fontSize)
plt.yticks([tick / 10 for tick in range(11)], fontsize=fontSize)

plt.grid(True, which='both', linestyle='--', linewidth=0.3)

plt.xlabel('n, The number of resonant asteroids in the training set', fontsize=fontSize)
plt.ylabel('Metrics', fontsize=fontSize)
plt.title('Metrics vs the size of the training set (increased test set)', fontsize=fontSize)
plt.legend(fontsize=fontSize-4)

plt.ylim(0.65, 1.01)
plt.xlim(2, 100)

plt.savefig('output/DT_vs_n_100000.pdf', format='pdf', bbox_inches='tight')
plt.show()

In [None]:
# Load the gradient-boosting grid; a missing Max_depth is stored as 0 so the
# column can be kept as integers.
df = pd.read_csv('data/GB_grid_vs_s100.csv').assign(
    Max_depth=lambda frame: frame['Max_depth'].fillna(0).astype(int)
)
# Twenty best configurations by F1 score.
best_first = df.sort_values(by='F1 score', ascending=False)
best_first.head(20)

In [None]:
# df2 = df[(df['Features'] == 'a, sinI') & (df['Learning_rate'] == 0.20) & (df['Loss'] == 'exponential') & (df['Max_features'] == 'sqrt') & (df['Min_samples_split'] == 9) & (df['N_estimators'] == 95)]
# df2 = df[(df['Features'] == 'a, e, n') & (df['Learning_rate'] == 0.15) & (df['Loss'] == 'exponential') & (df['Max_features'] == 'sqrt') & (df['Min_samples_split'] == 9) & (df['N_estimators'] == 95)]
# df2 = df[(df['Features'] == 'a, e') & (df['Learning_rate'] == 0.20) & (df['Loss'] == 'exponential') & (df['Max_features'] == 'sqrt') & (df['Max_depth'] == 8) & (df['N_estimators'] == 95)]
# df2 = df[(df['Features'] == 'a, sinI') & (df['Learning_rate'] == 0.20) & (df['Loss'] == 'exponential') & (df['Max_features'] == 'sqrt') & (df['Max_depth'] == 8) & (df['N_estimators'] == 95)]
# One gradient-boosting configuration (all four features, learning rate 0.10,
# exponential loss, sqrt max-features, depth 6, 55 estimators), ordered by
# Min_samples_split so each curve is drawn left to right.
df2 = df[(df['Features'] == 'a, e, n, sinI') & (df['Learning_rate'] == 0.10) & (df['Loss'] == 'exponential') & (df['Max_features'] == 'sqrt') & (df['Max_depth'] == 6) & (df['N_estimators'] == 55)]
df2 = df2.sort_values(by='Min_samples_split')

x = df2['Min_samples_split']

plt.figure(figsize=(10, 6))

# Minor ticks every 10 on the x axis and every 0.05 on the y axis.
ax = plt.gca()
ax.xaxis.set_minor_locator(MultipleLocator(10))
ax.yaxis.set_minor_locator(MultipleLocator(0.05))

# One curve per metric: (column, colour, legend label, marker).
curves = (
    ('Recall', '#1f77b4', 'Recall', 'o'),
    ('Precision', '#7f7f7f', 'Precision', '^'),
    ('F1 score', '#d62728', '$F_1$ score', 's'),
)
for column, colour, legend_label, marker in curves:
    plt.plot(x, df2[column], c=colour, label=legend_label, marker=marker, linestyle='-', linewidth=1, markersize=2)

plt.xticks(range(0, 101, 20), fontsize=fontSize)
plt.yticks([tick / 10 for tick in range(11)], fontsize=fontSize)

plt.grid(True, which='both', linestyle='--', linewidth=0.3)

plt.xlabel('s, the minimum number of samples required to split a node', fontsize=fontSize)
plt.ylabel('Metrics', fontsize=fontSize)
# The title now states d=6, matching the Max_depth == 6 filter above (it used
# to say d=8). Raw string: `\{`, `\sin` and `\}` are LaTeX, not Python escapes.
plt.title(r'Metrics vs s ($d=6$, $\{a, e, n, \sin{I}\}$)', fontsize=fontSize)
plt.legend(fontsize=fontSize-4)

plt.ylim(0.9, 1.01)
plt.xlim(0, 100)

plt.savefig('output/GB_vs_s1.pdf', format='pdf', bbox_inches='tight')
plt.show()

In [None]:
# Filters shared by all four panels.
common = (df['Learning_rate'] == 0.10) & (df['Loss'] == 'exponential') & (df['Max_features'] == 'sqrt')

# Per-panel settings: (Features, Max_depth, N_estimators, panel title).
# The titles are raw strings because `\sin` is LaTeX, not a Python escape.
panel_specs = [
    ('a, sinI', 0, 55, r'Features: $a$, $\sin{I}$; $n=55$; $d=0$'),
    ('a, e, n', 6, 55, r'Features: $a$, $e$, $n$; $n=55$; $d=6$'),
    # This title used to say n=55 although the panel selects N_estimators == 95.
    ('a, e, n', 6, 95, r'Features: $a$, $e$, $n$; $n=95$; $d=6$'),
    ('a, e, n', 0, 95, r'Features: $a$, $e$, $n$; $n=95$; $d=0$'),
]

# One frame per panel, ordered by Min_samples_split so curves run left to right.
dfs = [
    df[(df['Features'] == features) & common & (df['Max_depth'] == depth) & (df['N_estimators'] == estimators)].sort_values(by='Min_samples_split')
    for features, depth, estimators, _ in panel_specs
]
labels = [title for _, _, _, title in panel_specs]

fontSize=18

fig, axs = plt.subplots(4, 1, figsize=(10, 8), sharey=False, sharex=True)

# One curve per metric: (column, legend label, colour, marker).
curves = (
    ('Recall', 'Recall', '#1f77b4', 'o'),
    ('Precision', 'Precision', '#7f7f7f', '^'),
    ('F1 score', '$F_1$ score', '#d62728', 's'),
)
for ax, panel, panel_title in zip(axs, dfs, labels):
    for column, legend_label, colour, marker in curves:
        ax.plot(panel['Min_samples_split'], panel[column], label=legend_label, c=colour, marker=marker, linestyle='-', linewidth=1, markersize=2)
    ax.grid(True, which='both', linestyle='--', linewidth=0.3)
    ax.set_ylabel('Metrics', fontsize=fontSize)
    ax.set_title(panel_title, fontsize=fontSize)
    ax.set_ylim(0.9, 1.01)
    ax.set_xlim(2, 100)
    ax.tick_params(axis='both', which='major', labelsize=fontSize)

# The x axis is shared, so only the bottom panel carries its label.
axs[-1].set_xlabel('s, the minimum number of samples required to split a node', fontsize=fontSize)

plt.setp(axs, xticks=range(10, 100, 10), yticks=[0.90, 0.95, 1.00])

fig.suptitle('Metrics vs s', fontsize=fontSize+5, y=1.1)

# All panels share the same three curves, so one figure-level legend suffices.
handles, labels = axs[-1].get_legend_handles_labels()
fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, 1.05), ncols=3, fontsize=fontSize)
plt.subplots_adjust(wspace=0.0, hspace=0.0)
plt.tight_layout()

plt.savefig('output/GB_vs_s.pdf', format='pdf', bbox_inches='tight')
plt.show()

In [None]:
# For every panel's data, print up to 60 rows ranked by F1 score.
summary_columns = ['Min_samples_split', 'Recall', 'Precision', 'F1 score']
for panel in dfs:
    ranked = panel.sort_values(by='F1 score', ascending=False)
    print(ranked.head(60)[summary_columns])

In [None]:
# Load the learning-rate grid; a missing Max_depth is stored as 0 so the
# column can be kept as integers.
df = pd.read_csv('data/GB_grid_vs_lr30.csv').assign(
    Max_depth=lambda frame: frame['Max_depth'].fillna(0).astype(int)
)
# Twenty best configurations by F1 score.
best_first = df.sort_values(by='F1 score', ascending=False)
best_first.head(20)

In [None]:
# Filters shared by all four panels.
common = (df['Loss'] == 'exponential') & (df['Max_features'] == 'sqrt')

# Per-panel settings: (Features, Min_samples_split, Max_depth, N_estimators, title).
# Panels 2 and 3 differ only in Min_samples_split, so s is part of every title
# (previously those two panels had identical titles). The titles are raw
# strings because `\sin` is LaTeX, not a Python escape.
panel_specs = [
    ('a, sinI', 7, 0, 55, r'Features: $a$, $\sin{I}$; $n=55$; $d=0$; $s=7$'),
    ('a, e, n', 7, 6, 55, r'Features: $a$, $e$, $n$; $n=55$; $d=6$; $s=7$'),
    ('a, e, n', 10, 6, 55, r'Features: $a$, $e$, $n$; $n=55$; $d=6$; $s=10$'),
    ('a, e, n', 7, 0, 95, r'Features: $a$, $e$, $n$; $n=95$; $d=0$; $s=7$'),
]

# One frame per panel, ordered by Learning_rate so curves run left to right.
dfs = [
    df[(df['Features'] == features) & (df['Min_samples_split'] == split) & common & (df['Max_depth'] == depth) & (df['N_estimators'] == estimators)].sort_values(by='Learning_rate')
    for features, split, depth, estimators, _ in panel_specs
]
labels = [spec[-1] for spec in panel_specs]

fontSize=18

fig, axs = plt.subplots(4, 1, figsize=(10, 8), sharey=False, sharex=True)

# One curve per metric: (column, legend label, colour, marker).
curves = (
    ('Recall', 'Recall', '#1f77b4', 'o'),
    ('Precision', 'Precision', '#7f7f7f', '^'),
    ('F1 score', '$F_1$ score', '#d62728', 's'),
)
for ax, panel, panel_title in zip(axs, dfs, labels):
    for column, legend_label, colour, marker in curves:
        ax.plot(panel['Learning_rate'], panel[column], label=legend_label, c=colour, marker=marker, linestyle='-', linewidth=1, markersize=2)
    ax.grid(True, which='both', linestyle='--', linewidth=0.3)
    ax.set_ylabel('Metrics', fontsize=fontSize)
    ax.set_title(panel_title, fontsize=fontSize)
    ax.set_ylim(0.0, 1.01)
    ax.set_xlim(0.01, 0.30)
    ax.tick_params(axis='both', which='major', labelsize=fontSize)

# The x axis is shared, so only the bottom panel carries its label.
axs[-1].set_xlabel('lr, learning rate', fontsize=fontSize)

# x ticks at 0.05, 0.10, ..., 0.30.
plt.setp(axs, xticks=[(step + 1) / 100 for step in range(4, 31, 5)], yticks=[0.0, 0.5, 1.00])

fig.suptitle('Metrics vs learning rate', fontsize=fontSize+5, y=1.1)

# All panels share the same three curves, so one figure-level legend suffices.
handles, labels = axs[-1].get_legend_handles_labels()
fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, 1.05), ncols=3, fontsize=fontSize)
plt.subplots_adjust(wspace=0.0, hspace=0.0)
plt.tight_layout()

plt.savefig('output/GB_vs_lr.pdf', format='pdf', bbox_inches='tight')
plt.show()

In [None]:
# Load the 'NB_initial_data' results and preview the first five rows.
nb_path = 'data/NB_initial_data.csv'
df = pd.read_csv(nb_path)
df.head()

In [None]:
# One frame per feature set, with 'Train R' cast to int and rows ordered by
# it. `astype` returns a new frame, so nothing is assigned into a filtered
# slice of `df` (which used to raise SettingWithCopyWarning).
df1, df2, df3 = (
    df[df['Features'] == features]
    .astype({'Train R': int})
    .sort_values(by='Train R', ascending=True)
    for features in ('n', 'a, n', 'a, e, n')
)
df1.head(20)

In [None]:
fontSize = 18
fig, axs = plt.subplots(3, 1, figsize=(10, 6), sharey=True, sharex=True)

# Panel data, top to bottom, with the matching titles.
dfs = [df1, df2, df3]
labels = ['Features: $n$', 'Features: $a$, $n$', 'Features: $a$, $e$, $n$']

# One curve per metric: (column, legend label, colour, marker).
curves = (
    ('Rec.', 'Recall', '#1f77b4', 'o'),
    ('Prec.', 'Precision', '#7f7f7f', '^'),
    ('F1', '$F_1$ score', '#d62728', 's'),
)
for i, ax in enumerate(axs):
    panel = dfs[i]
    for column, legend_label, colour, marker in curves:
        ax.plot(panel['Train R'], panel[column], label=legend_label, c=colour, marker=marker, linestyle='-', linewidth=1, markersize=2)
    ax.grid(True, which='both', linestyle='--', linewidth=0.3)
    ax.set_ylabel('Metrics', fontsize=fontSize)
    # Only the bottom panel gets the (shared) x-axis label.
    if i == 2:
        ax.set_xlabel('n, the number of resonant asteroids in the training set', fontsize=fontSize)
    ax.set_title(labels[i], fontsize=fontSize)
    ax.set_ylim(0.7, 1)
    ax.set_xlim(5, 100)
    ax.tick_params(axis='both', which='major', labelsize=fontSize)
    ax.legend().remove()

plt.setp(axs, xticks=range(10, 100, 10), yticks=[0.7, 0.8, 0.9, 1.0])
plt.subplots_adjust(wspace=0.0)

fig.suptitle('Metrics vs the size of the training set', fontsize=fontSize+5, y=1.1)

# Build one figure-level legend from the last panel's curves.
handles, labels = ax.get_legend_handles_labels()
fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, 1.05), ncols=3, fontsize=fontSize)
plt.subplots_adjust(wspace=0.0, hspace=0.0)
plt.tight_layout()

plt.savefig('output/NB_initial_data.pdf', format='pdf', bbox_inches='tight')
plt.show()


In [None]:
# For every panel's data, print the ten rows with the highest F1.
summary_columns = ['Train R', 'Rec.', 'Prec.', 'F1']
for panel in dfs:
    ranked = panel.sort_values(by='F1', ascending=False)
    print(ranked.head(10)[summary_columns])

In [None]:
# Load the max-depth grid; a missing Max_depth is stored as 0 so the column
# can be kept as integers.
df = pd.read_csv('data/GB_grid_vs_d100.csv').assign(
    Max_depth=lambda frame: frame['Max_depth'].fillna(0).astype(int)
)
# Twenty best configurations by F1 score.
best_first = df.sort_values(by='F1 score', ascending=False)
best_first.head(20)

In [None]:
# Filters shared by all three panels.
common = (df['Loss'] == 'exponential') & (df['Max_features'] == 'sqrt')

# Per-panel settings: (Features, Min_samples_split, N_estimators, panel title).
# The titles are raw strings because `\sin` is LaTeX, not a Python escape.
panel_specs = [
    ('a, e', 9, 95, r'Features: $a$, $e$; $n=95$; $s=9$'),
    ('a, sinI', 3, 55, r'Features: $a$, $\sin{I}$; $n=55$; $s=3$'),
    # This title used to say n=55 although the panel selects N_estimators == 95.
    ('a, e, n', 5, 95, r'Features: $a$, $e$, $n$; $n=95$; $s=5$'),
]

# One frame per panel, ordered by Max_depth so curves run left to right.
dfs = [
    df[(df['Features'] == features) & (df['Min_samples_split'] == split) & common & (df['N_estimators'] == estimators)].sort_values(by='Max_depth')
    for features, split, estimators, _ in panel_specs
]
# Rows copied from the results file for reference:
# 0,1,"a, e",1.000,0.956,0.970,0.962,0.1,exponential,5,sqrt,9,95
# 1,2,"a, sinI",1.000,0.962,0.962,0.962,0.1,exponential,5,sqrt,3,55
# 7,8,"a, e, n",1.000,0.963,0.963,0.961,0.1,exponential,5,sqrt,5,95

fontSize=18

fig, axs = plt.subplots(3, 1, figsize=(10, 6), sharey=False, sharex=True)

# One title per panel (a stale fourth entry that no panel used was dropped).
labels = [title for _, _, _, title in panel_specs]

# One curve per metric: (column, legend label, colour, marker).
curves = (
    ('Recall', 'Recall', '#1f77b4', 'o'),
    ('Precision', 'Precision', '#7f7f7f', '^'),
    ('F1 score', '$F_1$ score', '#d62728', 's'),
)
for ax, panel, panel_title in zip(axs, dfs, labels):
    for column, legend_label, colour, marker in curves:
        ax.plot(panel['Max_depth'], panel[column], label=legend_label, c=colour, marker=marker, linestyle='-', linewidth=1, markersize=2)
    ax.grid(True, which='both', linestyle='--', linewidth=0.3)
    ax.set_ylabel('Metrics', fontsize=fontSize)
    ax.set_title(panel_title, fontsize=fontSize)
    ax.set_ylim(0.9, 1.00)
    ax.set_xlim(1, 100)
    ax.tick_params(axis='both', which='major', labelsize=fontSize)

# The x axis is shared, so only the bottom panel carries its label.
axs[-1].set_xlabel('d, the maximum depth', fontsize=fontSize)

plt.setp(axs, xticks=range(0, 101, 10), yticks=[0.90, 0.95, 1.00])

fig.suptitle('Metrics vs the maximum depth', fontsize=fontSize+5, y=1.1)

# All panels share the same three curves, so one figure-level legend suffices.
handles, labels = axs[-1].get_legend_handles_labels()
fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, 1.05), ncols=3, fontsize=fontSize)
plt.subplots_adjust(wspace=0.0, hspace=0.0)
plt.tight_layout()

plt.savefig('output/GB_vs_d100.pdf', format='pdf', bbox_inches='tight')
plt.show()

In [None]:
# Preview the first five rows of the first panel's data.
first_panel = dfs[0]
first_panel.head()

In [None]:
# Load the two final gradient-boosting result files.
df = pd.read_csv('data/GB_final_ae.csv')
df2 = pd.read_csv('data/GB_final_ai.csv')

# A missing Max_depth is stored as 0 so the column can be kept as integers.
df['Max_depth'] = df['Max_depth'].fillna(0).astype(int)
df2['Max_depth'] = df2['Max_depth'].fillna(0).astype(int)

# A cell only renders its last bare expression, so the first preview used to
# be computed and thrown away. `display` (provided by IPython/Jupyter) shows both.
display(df.sort_values(by='F1 score', ascending=False).head(20))
display(df2.sort_values(by='F1 score', ascending=False).head(20))

In [None]:
# One frame per panel, ordered by N_estimators so curves run left to right:
# 'a, e' with s=9 from `df`, and 'a, sinI' with s=3 from `df2`; both with
# exponential loss, sqrt max-features and depth 5.
dfs = [
    df[(df['Features'] == 'a, e') & (df['Min_samples_split'] == 9) & (df['Loss'] == 'exponential') & (df['Max_features'] == 'sqrt') & (df['Max_depth'] == 5)].sort_values(by='N_estimators'),
    df2[(df2['Features'] == 'a, sinI') & (df2['Min_samples_split'] == 3) & (df2['Loss'] == 'exponential') & (df2['Max_features'] == 'sqrt') & (df2['Max_depth'] == 5)].sort_values(by='N_estimators'),
]

fontSize=18

fig, axs = plt.subplots(2, 1, figsize=(10, 4), sharey=False, sharex=True)

# One title per panel (two stale entries that no panel used were dropped).
# Raw strings because `\sin` is LaTeX, not a Python escape.
labels = [
    r'Features: $a$, $e$; $d=5$; $s=9$',
    r'Features: $a$, $\sin{I}$; $d=5$; $s=3$',
]

# One curve per metric: (column, legend label, colour, marker).
curves = (
    ('Recall', 'Recall', '#1f77b4', 'o'),
    ('Precision', 'Precision', '#7f7f7f', '^'),
    ('F1 score', '$F_1$ score', '#d62728', 's'),
)
for ax, panel, panel_title in zip(axs, dfs, labels):
    for column, legend_label, colour, marker in curves:
        ax.plot(panel['N_estimators'], panel[column], label=legend_label, c=colour, marker=marker, linestyle='-', linewidth=1, markersize=2)
    ax.grid(True, which='both', linestyle='--', linewidth=0.3)
    ax.set_ylabel('Metrics', fontsize=fontSize)
    ax.set_title(panel_title, fontsize=fontSize)
    ax.set_ylim(0.0, 1.00)
    ax.set_xlim(1, 100)
    ax.tick_params(axis='both', which='major', labelsize=fontSize)

# The x axis is shared, so only the bottom panel carries its label.
axs[-1].set_xlabel('n, the number of estimators', fontsize=fontSize)

plt.setp(axs, xticks=range(0, 101, 10), yticks=[0.0, 0.50, 1.00])

fig.suptitle('Metrics vs the number of estimators', fontsize=fontSize+5, y=1.1)

# Both panels share the same three curves, so one figure-level legend suffices.
handles, labels = axs[-1].get_legend_handles_labels()
fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, 1.05), ncols=3, fontsize=fontSize)
plt.subplots_adjust(wspace=0.0, hspace=0.0)
plt.tight_layout()

plt.savefig('output/GB_vs_n_est.pdf', format='pdf', bbox_inches='tight')
plt.show()

In [None]:
# Load data/GB_initial_data.csv into `df` and preview its first ten rows.
df = pd.read_csv('data/GB_initial_data.csv')
df.head(10)

In [None]:
# Rows of the 'GB1' configuration on the 'a, e' features, ordered by 'Train R'.
is_gb1_ae = (df['Features'] == 'a, e') & (df['Meth. & Params'] == 'GB1')
df2 = df[is_gb1_ae].sort_values(by='Train R')
# Alternative selection: (df['Features'] == 'a, sinI') & (df['Meth. & Params'] == 'GB2')

fontSize = 24

plt.figure(figsize=(10, 6))

ax = plt.gca()
ax.xaxis.set_minor_locator(MultipleLocator(10))
ax.yaxis.set_minor_locator(MultipleLocator(0.05))

# (data column, legend label, colour, marker), drawn in this order.
for column, curve_label, colour, marker in (
    ('Rec.', 'Recall', '#1f77b4', 'o'),
    ('Prec.', 'Precision', '#7f7f7f', '^'),
    ('F1', '$F_1$ score', '#d62728', 's'),
):
    plt.plot(
        df2['Train R'], df2[column],
        c=colour, label=curve_label, marker=marker,
        linestyle='-', linewidth=1, markersize=2,
    )

# Major ticks: every 20 on x, every 0.1 on y.
plt.xticks(range(0, 101, 20), fontsize=fontSize)
plt.yticks([tick / 10 for tick in range(11)], fontsize=fontSize)

plt.grid(True, which='both', linestyle='--', linewidth=0.3)

plt.xlabel('n, The number of resonant asteroids in the training set', fontsize=fontSize)
plt.ylabel('Metrics', fontsize=fontSize)
# No title is drawn on this figure.
plt.legend(fontsize=fontSize-4)

plt.ylim(0.65, 1.01)
plt.xlim(2, 100)

plt.savefig('output/GB_initial_data.pdf', format='pdf', bbox_inches='tight')
plt.show()

In [None]:
df = pd.read_csv('data/Ada_grid_vs_n_est.csv')

# For every value of 'Features', the index label of the row with the highest F1 score.
best_f1_scores = df.groupby('Features')['F1 score'].idxmax()

# Keep only the reporting columns of those rows, best F1 score first.
best_df = (
    df.loc[
        best_f1_scores,
        ['Features', 'N_estimators', 'Accuracy', 'Precision', 'Recall', 'F1 score'],
    ]
    .sort_values(by='F1 score', ascending=False)
)
best_df

In [None]:
# Render `best_df` as LaTeX table source (without the index column) and print it.
latex_table = best_df.to_latex(index=False)
print(latex_table)

In [None]:
# One feature set per panel, each ordered by the number of estimators.
dfs = [
    df[df['Features'] == 'a, e, sinI'].sort_values(by='N_estimators'),
    df[df['Features'] == 'a, n'].sort_values(by='N_estimators'),
]

fontSize = 18

# Raw strings: the '\s' of '\sin' is not a valid Python escape and raises a
# warning inside a normal string literal.
panel_titles = [
    r'Features: $a$, $e$, $\sin{I}$',
    r'Features: $a$, $n$',
]

# (data column, legend label, colour, marker) for the curves drawn in every panel.
curve_specs = [
    ('Recall', 'Recall', '#1f77b4', 'o'),
    ('Precision', 'Precision', '#7f7f7f', '^'),
    ('F1 score', '$F_1$ score', '#d62728', 's'),
]

fig, axs = plt.subplots(2, 1, figsize=(10, 4), sharey=False, sharex=True)

for ax, panel, panel_title in zip(axs, dfs, panel_titles):
    for column, curve_label, colour, marker in curve_specs:
        ax.plot(
            panel['N_estimators'], panel[column],
            label=curve_label, c=colour, marker=marker,
            linestyle='-', linewidth=1, markersize=2,
        )
    ax.grid(True, which='both', linestyle='--', linewidth=0.3)
    ax.set_ylabel('Metrics', fontsize=fontSize)
    ax.set_title(panel_title, fontsize=fontSize)
    ax.set_ylim(0.9, 1.00)
    ax.set_xlim(100, 2000)
    ax.tick_params(axis='both', which='major', labelsize=fontSize)

# Only the bottom panel carries the x-axis label (the x axis is shared).
axs[-1].set_xlabel('n, the number of estimators', fontsize=fontSize)

plt.setp(axs, xticks=range(100, 2001, 200), yticks=[0.90, 0.950, 1.00])

fig.suptitle('Metrics vs the number of estimators', fontsize=fontSize+5, y=1.1)

# A single figure-level legend is used instead of per-panel legends. Dedicated
# names keep the legend entries from overwriting the list of panel titles.
legend_handles, legend_labels = axs[-1].get_legend_handles_labels()
fig.legend(legend_handles, legend_labels, loc='upper center', bbox_to_anchor=(0.5, 1.05), ncols=3, fontsize=fontSize)

# NOTE(review): tight_layout() runs afterwards and recomputes the subplot
# spacing, so the zero spacing requested here may not survive it.
plt.subplots_adjust(wspace=0.0, hspace=0.0)
plt.tight_layout()

plt.savefig('output/Ada_vs_n_est.pdf', format='pdf', bbox_inches='tight')
plt.show()

In [None]:
# Load data/Ada_initial_data.csv into `df` and preview its first ten rows.
df = pd.read_csv('data/Ada_initial_data.csv')
df.head(10)

In [None]:
# Every row of `df` is plotted, ordered by 'Train R'.
df2 = df.sort_values(by='Train R')

fontSize = 20

plt.figure(figsize=(10, 6))

ax = plt.gca()
ax.xaxis.set_minor_locator(MultipleLocator(10))
ax.yaxis.set_minor_locator(MultipleLocator(0.05))

# (data column, legend label, colour, marker), drawn in this order.
for column, curve_label, colour, marker in (
    ('Rec.', 'Recall', '#1f77b4', 'o'),
    ('Prec.', 'Precision', '#7f7f7f', '^'),
    ('F1', '$F_1$ score', '#d62728', 's'),
):
    plt.plot(
        df2['Train R'], df2[column],
        c=colour, label=curve_label, marker=marker,
        linestyle='-', linewidth=1, markersize=2,
    )

# Major ticks: every 20 on x, every 0.1 on y.
plt.xticks(range(0, 101, 20), fontsize=fontSize)
plt.yticks([tick / 10 for tick in range(11)], fontsize=fontSize)

plt.grid(True, which='both', linestyle='--', linewidth=0.3)

plt.xlabel('n, The number of resonant asteroids in the training set', fontsize=fontSize)
plt.ylabel('Metrics', fontsize=fontSize)
# No title is drawn on this figure.
plt.legend(fontsize=fontSize-4)

plt.ylim(0.65, 1.01)
plt.xlim(2, 100)

plt.savefig('output/Ada_initial_data.pdf', format='pdf', bbox_inches='tight')
plt.show()

In [None]:
df = pd.read_csv('data/RF_grid.csv')

# In both columns a missing value becomes the literal text 'None' and the
# whole column is converted to strings.
for column in ('Max_depth', 'Class_weight'):
    df[column] = df[column].fillna('None').astype(str)

df.head(10)

In [None]:
# Columns reported in the table, in output order.
report_columns = [
    'Features', 'Max_depth', 'N_estimators', 'Class_weight',
    'Accuracy', 'Precision', 'Recall', 'F1 score',
]

# Ten rows with the highest F1 score, rendered as LaTeX without the index column.
top_by_f1 = df[report_columns].sort_values(by='F1 score', ascending=False).head(10)
latex_table = top_by_f1.to_latex(index=False)
print(latex_table)

In [None]:
df1 = pd.read_csv('data/RF_grid_aen_vs_n_est.csv')
df2 = pd.read_csv('data/RF_grid_aein_vs_n_est.csv')

# The loop variable is deliberately not named `df`: that would rebind the
# notebook-level `df` to the last frame handled here and leave stale state
# behind for any cell that is re-run afterwards.
for rf_frame in (df1, df2):
    # Missing Max_depth -> 0 (integer column); missing Class_weight -> text 'None'.
    rf_frame['Max_depth'] = rf_frame['Max_depth'].fillna(0).astype(int)
    rf_frame['Class_weight'] = rf_frame['Class_weight'].fillna('None').astype(str)

# Preview the rows with features 'a, e, n', Max_depth 0 and Class_weight 'None',
# ordered by the number of estimators.
df1[(df1['Features'] == 'a, e, n') & (df1['Max_depth'] == 0) & (df1['Class_weight'] == 'None')].sort_values(by='N_estimators').head(5)

In [None]:
# One panel per table; only rows with Max_depth == 0 are kept, ordered by the
# number of estimators.
dfs = [
    df1[(df1['Features'] == 'a, e, n') & (df1['Max_depth'] == 0)].sort_values(by='N_estimators'),
    df2[df2['Max_depth'] == 0].sort_values(by='N_estimators'),
]

fontSize = 18

# Raw strings: the '\s' of '\sin' is not a valid Python escape and raises a
# warning inside a normal string literal.
panel_titles = [
    r'Features: $a$, $e$, $n$',
    r'Features: $a$, $e$, $n$, $\sin{I}$',
]

# (Class_weight value, legend label, colour, marker) for the two F1 curves.
weight_specs = [
    ('None', 'None', '#d62728', 's'),
    ('balanced', 'Balanced', '#7f7f7f', '^'),
]

fig, axs = plt.subplots(2, 1, figsize=(10, 4), sharey=False, sharex=True)

for ax, panel, panel_title in zip(axs, dfs, panel_titles):
    for class_weight, curve_label, colour, marker in weight_specs:
        # Filter once per curve instead of once per plotted column.
        subset = panel[panel['Class_weight'] == class_weight]
        ax.plot(
            subset['N_estimators'], subset['F1 score'],
            label=curve_label, c=colour, marker=marker,
            linestyle='-', linewidth=1, markersize=2,
        )
    ax.grid(True, which='both', linestyle='--', linewidth=0.3)
    ax.set_ylabel('$F_1$ score', fontsize=fontSize)
    ax.set_title(panel_title, fontsize=fontSize)
    ax.set_ylim(0.93, 0.97)
    ax.set_xlim(1, 100)
    ax.tick_params(axis='both', which='major', labelsize=fontSize)

# Only the bottom panel carries the x-axis label (the x axis is shared).
axs[-1].set_xlabel('n, the number of estimators', fontsize=fontSize)

# The stop value is 101 so the tick at the right axis limit (100) is included.
plt.setp(axs, xticks=range(0, 101, 10), yticks=[0.93, 0.950, 0.97])

fig.suptitle('$F_1$ score vs the class weights and the number of estimators', fontsize=fontSize+2, y=1.1)

# A single figure-level legend is used instead of per-panel legends. Dedicated
# names keep the legend entries from overwriting the list of panel titles.
legend_handles, legend_labels = axs[-1].get_legend_handles_labels()
fig.legend(legend_handles, legend_labels, loc='upper center', bbox_to_anchor=(0.5, 1.05), ncols=3, fontsize=fontSize)

# NOTE(review): tight_layout() runs afterwards and recomputes the subplot
# spacing, so the zero spacing requested here may not survive it.
plt.subplots_adjust(wspace=0.0, hspace=0.0)
plt.tight_layout()

plt.savefig('output/RF_vs_n_est_weights.pdf', format='pdf', bbox_inches='tight')
plt.show()

# Show the first rows of the data behind the top panel.
dfs[0].head(5)

In [None]:
# Load data/RF_initial_data.csv into `df` and preview its first five rows.
df = pd.read_csv('data/RF_initial_data.csv')
df.head(5)

In [None]:
# Every row of `df` is plotted, ordered by 'Train R'.
df2 = df.sort_values(by='Train R')

fontSize = 20

plt.figure(figsize=(10, 6))

ax = plt.gca()
ax.xaxis.set_minor_locator(MultipleLocator(10))
ax.yaxis.set_minor_locator(MultipleLocator(0.05))

# (data column, legend label, colour, marker), drawn in this order.
for column, curve_label, colour, marker in (
    ('Rec.', 'Recall', '#1f77b4', 'o'),
    ('Prec.', 'Precision', '#7f7f7f', '^'),
    ('F1', '$F_1$ score', '#d62728', 's'),
):
    plt.plot(
        df2['Train R'], df2[column],
        c=colour, label=curve_label, marker=marker,
        linestyle='-', linewidth=1, markersize=2,
    )

# Major ticks: every 20 on x, every 0.1 on y.
plt.xticks(range(0, 101, 20), fontsize=fontSize)
plt.yticks([tick / 10 for tick in range(11)], fontsize=fontSize)

plt.grid(True, which='both', linestyle='--', linewidth=0.3)

plt.xlabel('n, The number of resonant asteroids in the training set', fontsize=fontSize)
plt.ylabel('Metrics', fontsize=fontSize)
# No title is drawn on this figure.
plt.legend(fontsize=fontSize-4)

plt.ylim(0.65, 1.01)
plt.xlim(2, 100)

plt.savefig('output/RF_initial_data.pdf', format='pdf', bbox_inches='tight')
plt.show()

In [None]:
# Load the table; a missing Max_depth is replaced by 0 and the column is cast to int.
df = pd.read_csv('data/BRF_grid_sm.csv').assign(
    Max_depth=lambda frame: frame['Max_depth'].fillna(0).astype(int)
)
df.head(10)

In [None]:
# Columns reported in the table, in output order.
report_columns = [
    'Features', 'Max_depth', 'Min_samples_split', 'N_estimators',
    'Accuracy', 'Precision', 'Recall', 'F1 score',
]

# First ten rows in their current order, rendered as LaTeX without the index column.
first_rows = df[report_columns].head(10)
latex_table = first_rows.to_latex(index=False)
print(latex_table)

In [None]:
# Load data/final.csv into `df` and preview its first ten rows.
df = pd.read_csv('data/final.csv')
df.head(10)


In [None]:
# Columns reported in the table, in output order.
report_columns = [
    'Train R', 'Meth. & Params', 'Features',
    'TP', 'FP', 'TN', 'FN',
    'Acc.', 'Prec.', 'Rec.', 'F1',
]

# First ten rows in their current order, rendered as LaTeX without the index column.
first_rows = df[report_columns].head(10)
latex_table = first_rows.to_latex(index=False)
print(latex_table)