# Unsupervised Machine Learning for the Classification of Astrophysical X-ray Sources
###### *Víctor Samuel Pérez Díaz<sup>1</sup>, Rafael Martinez-Galarza<sup>2</sup>, Alexander Caicedo-Dorado<sup>1</sup>, Raffaele D'Abrusco<sup>2</sup>*

*1. Universidad del Rosario, 2. Center for Astrophysics | Harvard & Smithsonian*

---

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from pylab import cm
import math
import seaborn as sns

In [None]:
# Global plot styling: typeface, base font size, and axes border width.
# (plt.rcParams and mpl.rcParams are the same object, so one update covers all.)
mpl.rcParams.update({
    'font.family': 'Avenir LT Std',
    'font.size': 14,
    'axes.linewidth': 2,
})

#### Histogram plots

Histogram plots for all features in the X-ray-only table and in the X-ray + optical table.

In [None]:
# Load the X-ray + optical table and discard the 'Unnamed: 0.1' and
# 'Unnamed: 0' columns in a single call.
df_xray_optical = pd.read_csv('new_sim_data_colors.csv')
df_xray_optical = df_xray_optical.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])

In [None]:
# Load the X-ray table and discard its 'col1' column.
df_xray = pd.read_csv('gmm6_allvar_0_simbad.csv').drop(columns='col1')

# X-ray feature columns, grouped by kind.
# NOTE(review): this list has no 'theta' entry although features_lognorm_xray
# below does, and features_xray is redefined later in the notebook with
# 'theta' included -- confirm which version is intended here.
features_xray = [
    'src_area_b',
    'hard_hm', 'hard_hs', 'hard_ms',
    'powlaw_gamma', 'bb_kt',
    'var_prob_b', 'var_sigma_b', 'var_mean_b', 'var_min_b', 'var_max_b',
    'var_prob_h', 'var_sigma_h', 'var_mean_h', 'var_min_h', 'var_max_h',
    'var_prob_m', 'var_sigma_m', 'var_mean_m', 'var_min_m', 'var_max_m',
    'var_prob_s', 'var_sigma_s', 'var_mean_s', 'var_min_s', 'var_max_s',
    'ks_prob_b', 'ks_prob_h', 'ks_prob_m', 'ks_prob_s',
    'kp_prob_b', 'kp_prob_h', 'kp_prob_m', 'kp_prob_s',
]

# Subset of columns listed for log-scale treatment.
features_lognorm_xray = [
    'theta', 'src_area_b', 'bb_kt',
    'var_sigma_b', 'var_mean_b', 'var_min_b', 'var_max_b',
    'var_sigma_h', 'var_mean_h', 'var_min_h', 'var_max_h',
    'var_sigma_m', 'var_mean_m', 'var_min_m', 'var_max_m',
    'var_sigma_s', 'var_mean_s', 'var_min_s', 'var_max_s',
]


In [None]:
# Preview the first rows of the X-ray table.
df_xray.head()

In [None]:
def move_legend(ax, new_loc, **kws):
    '''Re-create the existing legend of ``ax`` at a new location.

    Workaround taken from https://github.com/mwaskom/seaborn/issues/2280:
    the handles, labels and title of the current legend are read back and
    passed to ``ax.legend`` again with the new placement.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes whose ``legend_`` attribute holds the legend to move.
    new_loc : str
        Value forwarded as ``loc`` to ``ax.legend``.
    **kws
        Extra keyword arguments forwarded to ``ax.legend``
        (e.g. ``bbox_to_anchor``, ``ncol``).

    Does nothing when the axes has no legend.
    '''
    old_legend = ax.legend_
    if old_legend is None:
        # Nothing to move (e.g. the plot produced no hue entries).
        return

    # Matplotlib 3.7 renamed ``legendHandles`` to ``legend_handles`` and 3.9
    # removed the old attribute; support both spellings.
    handles = getattr(old_legend, 'legend_handles', None)
    if handles is None:
        handles = old_legend.legendHandles

    labels = [t.get_text() for t in old_legend.get_texts()]
    title = old_legend.get_title().get_text()
    ax.legend(handles, labels, loc=new_loc, title=title, **kws)

def hist_plots(df):
    '''Draw per-cluster step histograms of the X-ray features on a 12x3 grid.

    One panel is drawn per entry of the internal ``features`` list, coloured
    by the ``cluster`` column with the 'Set1' colormap; unused panels are
    switched off. Features listed in ``features_lognorm`` are shifted by one
    tenth of their smallest non-zero value and drawn on a logarithmic x axis
    (presumably so that zero entries stay plottable).

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing every column named in ``features`` plus ``cluster``.
        It is not modified: the log-scale offset is applied to a private copy.
    '''
    # plt.get_cmap works on every Matplotlib version; cm.get_cmap was removed in 3.9.
    colors = plt.get_cmap('Set1')

    features = [
        'theta', 'src_area_b',
        'hard_hm', 'hard_hs', 'hard_ms',
        'powlaw_gamma', 'bb_kt',
        'var_prob_b', 'var_sigma_b', 'var_mean_b', 'var_min_b', 'var_max_b',
        'var_prob_h', 'var_sigma_h', 'var_mean_h', 'var_min_h', 'var_max_h',
        'var_prob_m', 'var_sigma_m', 'var_mean_m', 'var_min_m', 'var_max_m',
        'var_prob_s', 'var_sigma_s', 'var_mean_s', 'var_min_s', 'var_max_s',
        'ks_prob_b', 'ks_prob_h', 'ks_prob_m', 'ks_prob_s',
        'kp_prob_b', 'kp_prob_h', 'kp_prob_m', 'kp_prob_s',
    ]

    features_lognorm = {
        'theta', 'src_area_b', 'bb_kt',
        'var_sigma_b', 'var_mean_b', 'var_min_b', 'var_max_b',
        'var_sigma_h', 'var_mean_h', 'var_min_h', 'var_max_h',
        'var_sigma_m', 'var_mean_m', 'var_min_m', 'var_max_m',
        'var_sigma_s', 'var_mean_s', 'var_min_s', 'var_max_s',
    }

    # Work on a copy: writing the offset into the caller's frame would shift
    # the data again on every call and leak into later statistics.
    plot_df = df.copy()

    nrow, ncol = 12, 3
    fig, axs = plt.subplots(nrows=nrow, ncols=ncol, figsize=(16, 60))
    fig.tight_layout(h_pad=10, w_pad=2)
    for i, ax in enumerate(axs.reshape(-1)):
        if i >= len(features):
            # More panels than features: hide the leftovers.
            ax.set_axis_off()
            continue
        feature = features[i]

        # y ticks point inwards, x ticks outwards.
        for axis, direction in ((ax.yaxis, 'in'), (ax.xaxis, 'out')):
            axis.set_tick_params(which='major', size=6, width=0.5, direction=direction)
            axis.set_tick_params(which='minor', size=3, width=0.5, direction=direction)

        if feature in features_lognorm:
            values = plot_df[feature]
            nonzero = values[values != 0]
            minval = np.min(nonzero) / 10

            plot_df[feature] = values + minval
            sns.histplot(data=plot_df, x=feature, hue='cluster', ax=ax, palette=colors, bins=30, element="step", log_scale=True)
            ax.set_xlabel('log({})'.format(feature))
        else:
            sns.histplot(data=plot_df, x=feature, hue='cluster', ax=ax, palette=colors, bins=30, element="step")

        # Put the legend above the panel, centred, in three columns.
        move_legend(ax,
            new_loc="lower center",
            bbox_to_anchor=(.5, 1), ncol=3
        )
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=90)

    #plt.savefig('figures/xray_hists.pdf', dpi=300, transparent=False, bbox_inches='tight')

In [None]:
# For saving just one plot: per-cluster step histogram of 'powlaw_gamma_CI_per'.
# plt.get_cmap works on every Matplotlib version; cm.get_cmap was removed in 3.9.
colors = plt.get_cmap('Set1')
fig, ax = plt.subplots(nrows=1, ncols=1)
# y ticks point inwards, x ticks outwards.
for axis, direction in ((ax.yaxis, 'in'), (ax.xaxis, 'out')):
    axis.set_tick_params(which='major', size=6, width=0.5, direction=direction)
    axis.set_tick_params(which='minor', size=3, width=0.5, direction=direction)
axsns = sns.histplot(data=df_xray, x='powlaw_gamma_CI_per', hue='cluster', ax=ax, palette=colors, bins=30, element="step")
# Put the legend above the panel, centred, in three columns.
move_legend(ax,
    new_loc="lower center",
    bbox_to_anchor=(.5, 1), ncol=3
)
plt.setp(ax.xaxis.get_majorticklabels(), rotation=90)
#plt.savefig('figures/xray_hist__.svg', dpi=300, transparent=False, bbox_inches='tight')

In [None]:
#hist_plots(df_xray)

#### Mean and std tables

Compute per-cluster mean and standard-deviation tables for all features.

In [None]:
# Feature columns summarised per cluster, grouped by kind for readability.
features_xray = (
    ['theta', 'src_area_b']
    + ['hard_hm', 'hard_hs', 'hard_ms']
    + ['powlaw_gamma', 'bb_kt']
    + ['var_prob_b', 'var_sigma_b', 'var_mean_b', 'var_min_b', 'var_max_b']
    + ['var_prob_h', 'var_sigma_h', 'var_mean_h', 'var_min_h', 'var_max_h']
    + ['var_prob_m', 'var_sigma_m', 'var_mean_m', 'var_min_m', 'var_max_m']
    + ['var_prob_s', 'var_sigma_s', 'var_mean_s', 'var_min_s', 'var_max_s']
    + ['ks_prob_b', 'ks_prob_h', 'ks_prob_m', 'ks_prob_s']
    + ['kp_prob_b', 'kp_prob_h', 'kp_prob_m', 'kp_prob_s']
)
# The X-ray + optical table adds the four optical colour columns.
features_xray_optical = [*features_xray, 'u-g', 'g-r', 'r-i', 'i-z']

In [None]:
# Per-cluster mean and standard deviation of every X-ray feature.
# The feature columns are selected *before* aggregating: pandas >= 2.0 no
# longer drops non-numeric columns silently, so aggregating the whole table
# raises TypeError if it holds any non-numeric column, and it also wastes
# work on columns that are discarded afterwards.
xray_summary = df_xray.groupby(['cluster'])[features_xray].agg(['mean', 'std'])

In [None]:
# Per-cluster mean and standard deviation of every X-ray + optical feature.
# The feature columns are selected *before* aggregating: pandas >= 2.0 no
# longer drops non-numeric columns silently, so aggregating the whole table
# raises TypeError if it holds any non-numeric column, and it also wastes
# work on columns that are discarded afterwards.
xray_optical_summary = df_xray_optical.groupby(['cluster'])[features_xray_optical].agg(['mean', 'std'])

In [None]:
# Display the per-cluster mean/std table for the X-ray + optical features.
xray_optical_summary