### UNSUPERVISED MACHINE LEARNING FOR THE CLASSIFICATION OF ASTROPHYSICAL X-RAY SOURCES

In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean, cdist
from scipy.special import softmax
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [2]:
# Load the cross-matched catalogue (first CSV column becomes the index), drop the
# 'col1_2' column and rename the class / source-name columns to 'main_type' / 'name'.
sim_data = (
    pd.read_csv('./crossmatch_gwu.csv', index_col=0)
    .drop(columns='col1_2')
    .rename(columns={'Class': 'main_type', 'name_1': 'name'})
)

In [3]:
# Preview the first five rows of the loaded catalogue.
sim_data.head(n=5)

Unnamed: 0_level_0,name,obsid,region_id,theta,ra_1,dec_1,significance,likelihood,src_area_b,flux_aper_b,...,Signif.,G,J,W1,HR_ms,HR_hm,HR_h(ms),F_x/F_o,GroupID,GroupSize
col1_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8263,2CXO J045423.6+170953,11002,5,0.27971,73.5986,17.164924,21.386765,3273.721041,,2.763923e-13,...,21.39,13.504,10.693,9.488,-0.714,0.027,-0.738,0.004,,
34643,2CXO J190447.4+090241,3793,28,7.974032,286.197787,9.04484,27.204168,4686.180826,2.278425,1.626861e-12,...,27.2,19.782,15.177,,0.836,0.931,0.925,8.033,,
29576,2CXO J162514.2+154522,3229,20,1.241024,246.309531,15.756327,35.905013,7906.170286,0.070935,3.229546e-13,...,35.91,19.864,,15.62,-0.112,0.53,0.182,1.745,,
29583,2CXO J162557.5-243032,618,1,7.954022,246.489696,-24.508906,11.222434,694.4981,15.569604,3.143077e-13,...,11.22,14.364,10.207,7.952,0.458,0.448,0.313,0.009,,
15238,2CXO J071826.4-245453,4469,219,4.665557,109.61005,-24.915,13.465217,959.170947,0.320363,2.234897e-14,...,13.47,17.451,15.0,13.939,-0.203,0.386,-0.053,0.013,,


In [4]:
# List every distinct value of the 'main_type' column (missing labels show up as nan).
print('Classes in this dataset:', sim_data['main_type'].unique())

Classes in this dataset: ['YSO' 'HMXB' 'AGN' 'LMXB' 'LM-STAR' 'NS_BIN' 'CV' 'HM-STAR' 'NS' nan]


In [5]:
# Count distinct source names; the table holds one row per detection, so a source
# name can occur on several rows.
print('Number of sources in this dataset: ', sim_data['name'].nunique(dropna=False))

Number of sources in this dataset:  15248


In [6]:
# Replace missing class labels by the literal string 'NaN' so that unlabeled rows can be
# selected with a plain equality test. fillna returns a new frame, so sim_data is untouched.
sim_df_clean = sim_data.fillna({'main_type': 'NaN'})

In [7]:
# Tally the detections per class label (unlabeled rows are counted under 'NaN').
print('Number of source detections for each class:\n', sim_df_clean['main_type'].value_counts())

Number of source detections for each class:
 NaN        26526
YSO         1606
AGN         1095
NS           103
HM-STAR       92
LM-STAR       89
LMXB          78
CV            40
HMXB          21
NS_BIN         5
Name: main_type, dtype: int64


In [None]:
# Descriptor columns used when comparing sources.
features = [
    'hard_hm', 'hard_hs', 'hard_ms',
    'powlaw_gamma', 'bb_kt',
    'var_prob_b', 'var_ratio_b',
    'var_prob_h', 'var_ratio_h',
    'var_prob_s', 'var_ratio_s',
    'var_newq_b',
]

# Descriptors that are log-transformed before min-max scaling.
# NOTE(review): 'var_ratio_h' is not listed although 'var_ratio_b' and 'var_ratio_s' are —
# confirm whether that omission is intentional.
features_lognorm = ['bb_kt', 'var_ratio_b', 'var_ratio_s', 'var_newq_b']

# Descriptors that are min-max scaled without a log transform.
features_norm = ['powlaw_gamma']

# Independent array copy of the whole table; the scaled columns are written into it.
X = sim_df_clean.to_numpy(copy=True)

In [None]:
def lognorm(X_df, X, name_desc, log):
    """Min-max scale one descriptor to [0, 1], optionally after a log transform.

    When ``log`` is true, one tenth of the smallest non-zero value of the column
    is added to every entry before taking the natural logarithm, so that zero
    entries can be log-transformed as well.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Table holding the raw descriptor column ``name_desc``.
    X : numpy.ndarray
        2-D array whose columns follow the order of ``X_df.columns``. The column
        belonging to ``name_desc`` is overwritten in place with the scaled values.
    name_desc : str
        Name of the descriptor (column) to process.
    log : bool
        True to apply the log transform before scaling.

    Returns
    -------
    numpy.ndarray
        The same ``X`` object that was passed in, after modification.
    """
    series = X_df[name_desc]

    if not log:
        values = series.to_numpy()
    else:
        # Offset derived from the smallest non-zero entry of the column.
        offset = np.min(series[series != 0]) / 10
        values = np.log((series + offset).values)

    scaled = MinMaxScaler(feature_range=(0, 1)).fit_transform(values.reshape(-1, 1))

    # Write the result into the array column that matches the DataFrame column.
    X[:, X_df.columns.get_loc(name_desc)] = scaled.flatten()
    return X

In [None]:
# Rescale the selected descriptors inside X: first the ones that get a log transform,
# then the ones that are only min-max scaled.
scaling_plan = [(features_lognorm, True), (features_norm, False)]

for group, use_log in scaling_plan:
    for feature in group:
        X = lognorm(sim_df_clean, X, feature, use_log)

In [None]:
# Rebuild a labelled table from the scaled array. X mixes strings and numbers, so it is
# an object-dtype array and every column would come back as dtype `object`;
# infer_objects() restores proper numeric dtypes where all values are numeric.
s_df = pd.DataFrame(X, columns=sim_df_clean.columns).infer_objects()

In [None]:
def create_summary_tables(df):
    """Count the rows of ``df`` for every value of its 'main_type' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'main_type' column.

    Returns
    -------
    pandas.DataFrame
        One column named 'size' holding the row count, indexed by 'main_type'.
    """
    return df.groupby(['main_type']).size().to_frame(name='size')

def softmin(x):
    """Turn distances into weights that sum to one, favouring small distances.

    Computes ``exp(-|x|) / sum(exp(-|x|))`` along the first axis. The smallest
    magnitude is subtracted before exponentiating; this leaves the result
    mathematically unchanged but prevents every term from underflowing to zero
    (which would give 0/0 = NaN) when all distances are large.

    Parameters
    ----------
    x : array-like
        Distances; only their absolute values are used.

    Returns
    -------
    numpy.ndarray
        Weights with the same shape as ``x`` (a scalar is returned as a
        one-element array); an empty input gives an empty array.
    """
    magnitudes = np.abs(np.atleast_1d(np.asarray(x, dtype=float)))
    if magnitudes.size == 0:
        return magnitudes

    # Shift so that the best (smallest) distance maps to exp(0) = 1.
    weights = np.exp(-(magnitudes - magnitudes.min(axis=0)))
    return weights / weights.sum(axis=0)
    
def frequent_types(df, n, uks):
    """Return the ``n`` most populated class labels of ``df``.

    Rows whose 'main_type' is the string 'NaN', or one of the labels in ``uks``,
    are ignored when counting.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'main_type' column.
    n : int
        Number of labels to keep.
    uks : list-like or None
        Extra labels to exclude; an empty or None value excludes only 'NaN'.

    Returns
    -------
    numpy.ndarray
        The selected labels, de-duplicated and sorted by ``np.unique`` (not by count).
    """
    labelled = df.main_type != 'NaN'
    if uks:
        labelled = labelled & ~(df.main_type.isin(uks))

    counts = (
        df[labelled]
        .groupby(['main_type'])
        .size()
        .reset_index()
        .rename(columns={0: 'size'})
    )
    top = counts.sort_values(by='size', ascending=False).head(n)
    return np.unique(top.main_type)

def _inverse_covariance(samples):
    """Return the inverse covariance matrix of ``samples`` (one observation per row)."""
    cov = np.atleast_2d(np.cov(samples.T))
    try:
        return np.linalg.inv(cov)
    except np.linalg.LinAlgError:
        # Singular covariance: use the pseudo-inverse instead of aborting the run.
        return np.linalg.pinv(cov)


def gen_nan_probs(cl, features, uks=None, mean=True, distance='mahalanobis'):
    """Assign each unlabeled source a class by its distance to the labelled ones.

    For every unlabeled row, the distance to all members of each of the (up to)
    five most populated labelled classes is computed and summarised by its mean
    or median. The summaries are turned into per-class weights with ``softmin``,
    and the class with the largest weight is assigned.

    Parameters
    ----------
    cl : pandas.DataFrame
        Table with 'name', 'obsid', 'main_type' and the ``features`` columns.
        Rows whose 'main_type' is the string 'NaN' are treated as unlabeled.
    features : list of str
        Numeric descriptor columns used for the distance.
    uks : list-like, optional
        Additional 'main_type' values to treat as unlabeled.
    mean : bool
        True to summarise distances with the mean, False for the median.
    distance : {'mahalanobis', 'euclidean'}
        Metric passed to ``cdist``. For 'mahalanobis' the inverse covariance of
        each class is estimated from that class's members.

    Returns
    -------
    pandas.DataFrame
        One row per unlabeled source with 'name', 'obsid', the features, the
        assigned 'main_type', and one weight column per candidate class.

    Raises
    ------
    ValueError
        If ``distance`` is not supported, or if there are unlabeled rows but no
        labelled class to compare against.
    """
    if distance not in ('euclidean', 'mahalanobis'):
        raise ValueError(
            "distance must be 'euclidean' or 'mahalanobis', got {!r}".format(distance)
        )
    uks = list(uks) if uks else []
    features = list(features)

    is_unlabeled = cl.main_type == 'NaN'
    if uks:
        is_unlabeled = is_unlabeled | cl.main_type.isin(uks)
    cl_nan = cl[is_unlabeled]

    ltypes = frequent_types(cl, 5, uks=uks)

    # Explicit copy so that adding columns never touches (or warns about) `cl`.
    out_l = cl_nan[['name', 'obsid'] + features].copy()

    if len(cl_nan) == 0:
        # Nothing to classify: return an empty frame that still has every output column.
        out_l['main_type'] = pd.Series(dtype=object)
        for t in ltypes:
            out_l[t] = pd.Series(dtype=float)
        return out_l
    if len(ltypes) == 0:
        raise ValueError('no labelled class available to compare the unlabeled sources with')

    # Per-class reference data depends only on the class, so build it once
    # rather than once per unlabeled row.
    references = []
    for t in ltypes:
        samples = cl.loc[cl.main_type == t, features].to_numpy(dtype=float)
        metric_kwargs = {}
        if distance == 'mahalanobis':
            metric_kwargs['VI'] = _inverse_covariance(samples)
        references.append((samples, metric_kwargs))

    summarise = np.mean if mean else np.median
    rows = cl_nan[features].to_numpy(dtype=float)

    types_prob = []
    for row in rows:
        r_np = row.reshape(1, -1)
        mdists = np.array([
            summarise(cdist(r_np, samples, metric=distance, **metric_kwargs))
            for samples, metric_kwargs in references
        ])
        types_prob.append(softmin(mdists))

    probs = np.vstack(types_prob)  # shape: (unlabeled rows, candidate classes)
    out_l['main_type'] = ltypes[np.argmax(probs, axis=1)]
    for i, t in enumerate(ltypes):
        out_l[t] = probs[:, i]

    return out_l

In [None]:
# Cluster to analyse, and the extra labels that gen_nan_probs treats as unlabeled.
n = 4
uks = ['Star', 'X', 'Unknown']

# Rows of the scaled table that belong to cluster n.
# NOTE(review): no visible cell creates the 'cluster' column — presumably it comes from
# the input CSV; confirm.
cl_n = s_df.loc[s_df['cluster'] == n]

# NOTE(review): a commented-out alternative loaded f'mahalanobis_labeled/cl{n}_mean.csv'
# (index_col=0) instead of recomputing — presumably a previously saved result; confirm
# before relying on it.
cl_n_nans = gen_nan_probs(cl_n, features, uks=uks, mean=True, distance='mahalanobis')
cl_n_nans.head(10)

In [None]:
# Ten most populated classes of one cluster.
# NOTE(review): the cluster id 5 is hard-coded here while the neighbouring cells work on
# cluster `n` — confirm that inspecting a different cluster is intentional.
(
    s_df.loc[s_df['cluster'] == 5]
    .pipe(create_summary_tables)
    .sort_values(by='size', ascending=False)
    .head(10)
)

In [None]:
# Bar chart of how many of the classified sources were assigned to each class.
cl_n_nans['main_type'].value_counts().plot(kind='bar', color='black')

In [None]:
# Save the assignments of cluster n. The output folder is created first because
# DataFrame.to_csv does not create missing directories and would fail otherwise.
output_dir = Path('class_data')
output_dir.mkdir(parents=True, exist_ok=True)
cl_n_nans.to_csv(output_dir / 'cl{}.csv'.format(n))