# Unsupervised Machine Learning for the Classification of Astrophysical X-ray Sources
###### *Víctor Samuel Pérez Díaz<sup>1,2</sup>, Rafael Martinez-Galarza<sup>1</sup>, Alexander Caicedo-Dorado<sup>2</sup>, Raffaele D'Abrusco<sup>1</sup>*

*1. Center for Astrophysics | Harvard & Smithsonian, 2. Universidad del Rosario*

Contact ```vperezdiaz@cfa.harvard.edu``` for questions or comments.


#### Classification

---

In [2]:
%load_ext autoreload
%autoreload 2

from umlcaxs_lib import votable_to_pandas, lognorm, mahalanobis, mahal_classifier_cl, create_summary_tables, mahal_classifier_all
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
import matplotlib as mpl
import itertools
import seaborn as sns

from collections import Counter

%matplotlib inline

In [2]:
# Global matplotlib look: font family, base font size, and axes border width.
# `mpl.rcParams` and `plt.rcParams` are the same settings object, so one update covers all three.
mpl.rcParams.update({
    'font.family': 'Avenir LT Std',
    'font.size': 14,
    'axes.linewidth': 2,
})

In [3]:
# `main_type` labels handed to the classifier as `uks`
# (presumably the uninformative/ambiguous types — confirm against umlcaxs_lib).
uks = [
    'Star', 'X', 'Radio', 'IR', 'Blue', 'UV', 'gamma', 'PartofG', '**',
]

# Labelled classes; each one is also a probability column of the classified table.
ltypes = [
    'QSO', 'AGN', 'Seyfert_1', 'Seyfert_2',
    'HMXB', 'LMXB', 'XB',
    'YSO', 'TTau*', 'Orion_V*',
]

# Mapping from a fine-grained class to its aggregated class
# (classes that are absent from the mapping are left as they are by `replace`).
grouped_replace = {
    'QSO': 'AGN',
    'Seyfert_1': 'Seyfert',
    'Seyfert_2': 'Seyfert',
    'HMXB': 'XB',
    'LMXB': 'XB',
    'TTau*': 'YSO',
    'Orion_V*': 'YSO',
}

In [3]:
# Load the clustered CSC x SIMBAD table; the first CSV column becomes the row index.
# NOTE(review): the recorded output echoes this line, which looks like an emitted warning
# (possibly about mixed column dtypes) — confirm and pin dtypes if so.
df_cluster_csc_simbad = pd.read_csv(
    filepath_or_buffer='./out_data/cluster_csc_simbad.csv',
    index_col=0,
)

  df_cluster_csc_simbad = pd.read_csv('./out_data/cluster_csc_simbad.csv', index_col=0)


In [5]:
# Work on an independent copy and replace missing `main_type` values with the
# literal string 'NaN', so they can be matched like any other label.
nans_df_csc_simbad = df_cluster_csc_simbad.copy(deep=True)
nans_df_csc_simbad = nans_df_csc_simbad.fillna({'main_type': 'NaN'})

In [6]:
# Every feature column used in the analysis
features = [
    'hard_hm', 'hard_hs', 'hard_ms',
    'powlaw_gamma', 'bb_kt',
    'var_prob_b', 'var_ratio_b',
    'var_prob_h', 'var_ratio_h',
    'var_prob_s', 'var_ratio_s',
    'var_newq_b',
]

# Subset that is log-transformed and then normalized
features_lognorm = [
    'bb_kt',
    'var_ratio_h',
    'var_ratio_b',
    'var_ratio_s',
    'var_newq_b',
]

# Subset that is only normalized
features_norm = ['powlaw_gamma']

# Keep only the rows in which every analysis feature is present
df_cscs_out = nans_df_csc_simbad.dropna(how='any', subset=features)

In [7]:
# Scale the feature columns: `features_norm` are normalized, `features_lognorm`
# are log-transformed and normalized (per the list names; see umlcaxs_lib.lognorm).
df_cscs_lognorm = lognorm(
    df_cscs_out,
    features,
    features_norm,
    features_lognorm,
)

In [8]:
# Number of rows whose `main_type` is one of `uks` or the 'NaN' placeholder
unlabeled_types = uks + ['NaN']
is_unlabeled = df_cscs_lognorm.main_type.isin(unlabeled_types)
sum(is_unlabeled)

14507

In [9]:
# Run the classifier over all clusters (the recorded output shows one
# `***Cluster k***` line per cluster) and preview the first ten rows.
classified_df = mahal_classifier_all(
    df_cscs_lognorm,
    df_cscs_out,
    features,
    ltypes,
    uks=uks,
)
classified_df.head(10)

***Cluster 0***
***Cluster 1***
***Cluster 2***
***Cluster 3***
***Cluster 4***
***Cluster 5***


Unnamed: 0,name,obsid,cluster,main_type,QSO,AGN,Seyfert_1,Seyfert_2,HMXB,LMXB,...,R_simbad,J,H,K,u,g,r_simbada,i,z,angDist
0,2CXO J000010.0-501526,11997,1,QSO,0.7109412,0.1998591,0.03817763,0.00119828,0.02087573,0.004708968,...,,,,,,,,,,
1,2CXO J000019.8-245030,13394,2,QSO,0.7841575,0.2115698,0.004237418,8.312877e-06,2.696935e-05,9.165581e-15,...,,,,,,,,,,
2,2CXO J000025.4-245419,13394,1,Seyfert_1,4.459738e-05,0.0001858106,0.6535472,0.3462222,3.457804e-08,1.228568e-07,...,,,,,,,,,,
3,2CXO J000027.4-500421,11742,1,AGN,0.0001823781,0.8045687,0.06197661,3.05158e-10,0.1188399,0.01034259,...,,,,,,,,,,
4,2CXO J000027.4-500421,11997,1,Seyfert_1,0.04029002,0.004302557,0.8071425,1.109741e-05,0.0004020933,0.02280539,...,,,,,,,,,,
5,2CXO J000028.5+623053,2810,3,AGN,0.002199853,0.7341143,1.090844e-05,2.907129e-07,0.004578093,0.2590946,...,,,,,,,,,,
6,2CXO J000031.1-500914,11997,0,Orion_V*,5.149786e-58,7.325791e-25,2.212079e-33,0.0,2.709021e-09,6.852283e-11,...,,,,,,,,,,
7,2CXO J000031.8-245458,13394,5,QSO,0.4335578,0.006251523,0.001896583,8.076405e-06,0.3305473,0.05331714,...,,,,,,,,,,
8,2CXO J000047.6-551937,7061,2,QSO,0.7616869,0.05975887,0.178552,2.229388e-06,2.421506e-09,1.382007e-32,...,,,,,,,,,,
9,2CXO J000058.9-245449,13394,1,QSO,0.6777465,0.2986486,0.02052719,3.545975e-05,0.0002002315,7.983729e-06,...,,,,,,,,,,


In [10]:
# Persist the per-detection classification table
classified_df.to_csv(path_or_buf='./out_data/classified_cscs.csv')

In [11]:
# Number of distinct source names among the classified detections
unique_source_names = classified_df['name'].unique()
len(unique_source_names)

8756

#### Create classification summary table
---

In [4]:
# Reload the saved classification table so this section can run on its own;
# the first CSV column is restored as the row index.
classified_df = pd.read_csv(
    filepath_or_buffer='./out_data/classified_cscs.csv',
    index_col=0,
)

In [7]:
# Number of detections (rows) per source name
counts = classified_df['name'].value_counts()

# Half-open bins on the detection count: [1, 2), [2, 3), [3, 11), [11, inf)
bins = [1, 2, 3, 11, np.inf]
labels = ['1', '2', '3-10', '>10']

# Assign every source to a bin (right=False makes the left edge inclusive)
# and count the sources that fall in each bin
grouped_counts = pd.cut(counts, bins=bins, labels=labels, right=False).value_counts()

# Order the result by bin rather than by frequency
grouped_counts = grouped_counts.sort_index()

# Total number of sources across all bins
total_sources = grouped_counts.sum()

# Report each bin together with its share of the total.
# The loop variable is intentionally NOT called `range`: at notebook top level that
# would rebind the builtin `range` for every cell executed afterwards.
for detection_range, num_sources in grouped_counts.items():
    percentage = (num_sources / total_sources) * 100
    print(f"{num_sources} sources ({percentage:.2f}%) have {detection_range} detections.")

6802 sources (77.68%) have 1 detections.
1000 sources (11.42%) have 2 detections.
882 sources (10.07%) have 3-10 detections.
72 sources (0.82%) have >10 detections.


In [12]:
# Report how many distinct source names were classified
n_unique_sources = len(np.unique(classified_df.name))
print(f'There are {n_unique_sources} unique classified sources...')

There are 8756 unique classified sources...


In [None]:
# Histogram of the number of detections per source, written out as a PDF figure.
fig, ax = plt.subplots(1, figsize=(8, 8))
# Y-axis tick styling: longer outward major ticks, shorter inward minor ticks
ax.yaxis.set_tick_params(which='major', size=6, width=0.5, direction='out')
ax.yaxis.set_tick_params(which='minor', size=3, width=0.5, direction='in')
# NOTE(review): these fixed-step locators are installed before ax.set_yscale('log') below;
# switching the scale may reset the axis locators, so confirm they still apply to the saved figure.
ax.yaxis.set_major_locator(mpl.ticker.MultipleLocator(1000))
ax.yaxis.set_minor_locator(mpl.ticker.MultipleLocator(250))
#ax.grid(True)
#sns.set_style("whitegrid")
# value_counts() yields the detections per source name; reset_index(name="counts") exposes them
# as a `counts` column, which is histogrammed in 20 bins with log_scale=True.
sns.histplot(classified_df.name.value_counts().reset_index(name="counts").counts, color='black', bins=20, log_scale=True, ax=ax)
ax.set(xlabel='Count of detections', ylabel='Frequency')
# Frequencies are shown on a logarithmic y axis as well
ax.set_yscale('log')

# bbox_inches='tight' trims the surrounding whitespace in the saved file
plt.savefig('./figures/results_class_a_nofobs.pdf', bbox_inches='tight')  

In [13]:
def mean_std_round(x):
    """Format ``x`` as the string ``'<mean>±<std>'``, both rounded to 3 decimals.

    The spread is ``np.std(x)`` with its default settings.
    """
    mean_txt = str(round(np.mean(x), 3))
    std_txt = str(round(np.std(x), 3))
    return mean_txt + '±' + std_txt
# Per-source summary: one 'mean±std' string per class-probability column,
# plus the number of detections of that source.
detections_by_name = classified_df.groupby('name')
summ_table = detections_by_name[ltypes].agg(mean_std_round)
summ_table['detection_count'] = detections_by_name.size()

# Soft master class: the class whose mean probability over all detections
# of the source is the largest.
summ_table_prov = detections_by_name[ltypes].agg(['mean', 'std'])
class_mean_names = [[ltype, 'mean'] for ltype in ltypes]
mean_probabilities = summ_table_prov[class_mean_names]
names_comp = list(mean_probabilities.idxmax(axis=1))
# idxmax yields (class, 'mean') column labels; keep only the class part
master_names = [best_column[0] for best_column in names_comp]
summ_table['soft_master_class'] = master_names

# Most frequently detected sources first
summ_table = summ_table.sort_values('detection_count', ascending=False)

In [14]:
# Independent copy of the classified detections used for the hard (majority-vote) summary
class_df_grouped = classified_df.copy(deep=True)
# Threshold on detections: only sources with detection_count > n_obs are kept
n_obs = 0
def most_common(x):
    """Return the value that occurs most often in the iterable ``x``.

    Raises ``IndexError`` when ``x`` is empty.
    """
    # most_common(1) -> [(value, count)]; only the value is of interest
    return Counter(x).most_common(1)[0][0]

# Per-source detection counts, restricted to sources with more than `n_obs` detections.
# The explicit .copy() makes the filtered frame independent, so adding a column below
# does not trigger pandas' SettingWithCopyWarning once the filter actually removes rows.
detections_per_source = class_df_grouped.groupby(['name']).size().to_frame(name='detection_count')
summ_table_hard = detections_per_source.loc[detections_per_source.detection_count > n_obs].copy()

# Hard master class: the most frequent predicted `main_type` among a source's detections
# (assignment aligns on the source name, so filtered-out sources are simply ignored).
summ_table_hard['hard_master_class'] = class_df_grouped.groupby(['name'])['main_type'].agg(most_common)

# Most frequently detected sources first
summ_table_hard = summ_table_hard.sort_values('detection_count', ascending=False)

In [15]:
# Sources for which the soft (highest mean probability) and hard (majority vote) classes agree.
# A plain `==` between two Series only works when both carry identical labels in identical
# order, which here would depend on two separate sorts matching and on n_obs == 0.
# Aligning the hard classes to summ_table's rows first makes the comparison label-based;
# sources missing from summ_table_hard become NaN and therefore never count as agreeing.
hard_class_aligned = summ_table_hard.hard_master_class.reindex(summ_table.index)
agreeing_class_df = summ_table[summ_table.soft_master_class.eq(hard_class_aligned)].copy(deep=True)
agreeing_class_df = agreeing_class_df.rename(columns={'soft_master_class':'master_class'})
# Aggregated class: fine-grained classes collapsed through `grouped_replace`
agreeing_class_df['agg_master_class'] = agreeing_class_df.master_class.replace(grouped_replace)

In [17]:
# (number of sources with agreeing soft/hard classes, number of columns)
agreeing_class_df.shape

(8271, 13)

In [None]:
# Move the last three columns to the front, in reverse order, ahead of the remaining columns
cols = list(agreeing_class_df.columns)
first_cols = list(reversed(cols[-3:]))
cols = first_cols + cols[:-3]
agreeing_class_df_out = agreeing_class_df[cols]

In [None]:
# Persist the table of sources whose soft and hard classes agree
agreeing_class_df_out.to_csv(path_or_buf='./out_data/agreeing_classification.csv')

In [18]:
# Sources for which the soft (highest mean probability) and hard (majority vote) classes differ.
# A plain `!=` between two Series only works when both carry identical labels in identical
# order; align the hard classes to summ_table's rows so the comparison is label-based.
hard_class_aligned = summ_table_hard.hard_master_class.reindex(summ_table.index)
# Only sources that actually have a row in summ_table_hard can be "confused";
# with n_obs == 0 this mask is all True and the selection matches the direct comparison.
has_hard_class = summ_table.index.isin(summ_table_hard.index)
disagrees = summ_table.soft_master_class.ne(hard_class_aligned) & has_hard_class
confused_class_df = summ_table[disagrees].copy(deep=True)
# Keep the hard class next to the soft one (assignment aligns on the source name)
confused_class_df['hard_master_class'] = summ_table_hard.hard_master_class

In [20]:
# (number of sources with differing soft/hard classes, number of columns)
confused_class_df.shape

(485, 13)

In [None]:
# Move the last three columns to the front, in reverse order, ahead of the remaining columns
cols = list(confused_class_df.columns)
first_cols = list(reversed(cols[-3:]))
cols = first_cols + cols[:-3]
confused_class_df_out = confused_class_df[cols]

In [None]:
# Persist the table of sources whose soft and hard classes differ
confused_class_df_out.to_csv(path_or_buf='./out_data/confused_classification.csv')