In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve, auc, roc_auc_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.inspection import permutation_importance

from utils import votable_to_pandas
from astropy.table import Table

In [3]:
# Read the three candidate tables from out_data/ in one pass. The tuple order
# matches the unpacking order on the left-hand side.
# NOTE(review): presumably these hold the last-, second- and first-ranked N-way
# counterpart per Chandra source -- confirm against the script that wrote them.
def_last_prob_df, def_second_prob_df, def_first_prob_df = map(
    pd.read_csv,
    (
        'out_data/last_new.csv',
        'out_data/second_new.csv',
        'out_data/most_probable_new.csv',
    ),
)

In [42]:
# Read 'most_probable_new.csv' into def_first_prob_df again; this is the same
# file the initial load cell already assigns to this name.
def_first_prob_df = pd.read_csv(filepath_or_buffer='out_data/most_probable_new.csv')

In [35]:
# Check cell: reads 'out_data/def_most_prob.csv' for comparison.
# The table gets its own name so that running the notebook top to bottom does
# not replace def_first_prob_df (read from 'most_probable_new.csv'), which the
# filtering and plotting cells below depend on.
def_first_prob_check_df = pd.read_csv('out_data/def_most_prob.csv')

In [46]:
# Keep only the rows of the first table whose nway_separation is below 0.2.
# NOTE(review): the unit of nway_separation is not stated here -- confirm.
def_first_prob_df_filtered = def_first_prob_df.query('nway_separation < 0.2')

# Distinct chandra_source_id values that survive the separation cut.
chandra_ids_in_first = def_first_prob_df_filtered['chandra_source_id'].unique()

# Restrict the second and last tables to rows whose chandra_source_id also
# appears in the filtered first table.
filtered_second_prob_df = def_second_prob_df.loc[
    lambda table: table['chandra_source_id'].isin(chandra_ids_in_first)
]
filtered_last_prob_df = def_last_prob_df.loc[
    lambda table: table['chandra_source_id'].isin(chandra_ids_in_first)
]

In [44]:
# Number of rows in the separation-filtered first table with a missing `name`.
def_first_prob_df_filtered['name'].isna().sum()

12302

In [45]:
# Rows of the separation-filtered first table that have no `name`.
count_of_nan_chandra = def_first_prob_df_filtered['name'].isna().sum()

# NOTE(review): the denominator is the row count of the *unfiltered* first
# table, so this is the share of all first-table rows that both pass the
# separation cut and lack a name -- confirm that this is the intended ratio.
print(count_of_nan_chandra / len(def_first_prob_df))

0.10262356621480709


In [24]:
from matplotlib.backends.backend_pdf import PdfPages

# Columns whose distributions are compared between the three filtered tables.
features = [
    'phot_g_mean_mag', 'phot_bp_mean_mag', 'phot_rp_mean_mag', 'bp_rp', 'bp_g', 'g_rp', 'parallax', 'parallax_over_error',
    'hard_hs',
    'hard_hm',
    'hard_ms',
    'var_intra_prob_b',
    'var_inter_prob_b',
]

# Features histogrammed as log10(value). 'flux_aper_b' is not in `features`
# right now; it is kept so that it is log-scaled if it is added to the list.
log_features = {'parallax_over_error', 'flux_aper_b'}

# (legend label, table) pairs, drawn in this order on every page.
catalogs = [
    ('First', def_first_prob_df_filtered),
    ('Second', filtered_second_prob_df),
    ('Last', filtered_last_prob_df),
]


def _log10_positive(values, invert=False):
    """Return log10 of the strictly positive entries of ``values``.

    Zero, negative and NaN entries are dropped before the logarithm is taken.
    They have no finite logarithm: negatives would raise NumPy's "invalid
    value" RuntimeWarning and a zero would yield an infinite value that the
    histogram's automatic bin range cannot handle.

    Parameters
    ----------
    values : pandas.Series
        Column to transform.
    invert : bool, default False
        If True, return ``log10(1 / value)`` instead of ``log10(value)``.

    Returns
    -------
    pandas.Series
        Transformed values for the positive entries only.
    """
    positive = values[values > 0]
    return np.log10(1 / positive if invert else positive)


def _transform_for_plot(values, feature):
    """Apply the per-feature transform used before histogramming ``values``."""
    if feature == 'parallax':
        return _log10_positive(values, invert=True)
    if feature in log_features:
        return _log10_positive(values)
    return values


def _plot_title(feature):
    """Return the page title that describes the transform applied to ``feature``."""
    if feature == 'parallax':
        return f'log(1/{feature})'
    if feature in log_features:
        return f'log({feature})'
    return f'{feature}'


# One PDF page per feature, with the three tables overlaid as step histograms.
with PdfPages('figures/chandra_gaia_nway_exp/distributions_separation_g0.2_new.pdf') as pdf:
    for feature in features:
        fig, ax = plt.subplots(figsize=(15, 5))

        for label, table in catalogs:
            ax.hist(
                _transform_for_plot(table[feature], feature),
                bins=40, histtype='step', linewidth=3, label=label, density=True,
            )

        ax.set_title(_plot_title(feature))
        ax.legend()

        pdf.savefig(fig)  # Saves this figure as a new PDF page
        plt.close(fig)  # Close this specific figure to free up memory

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [25]:
from astropy.coordinates import SkyCoord
import astropy.units as u
# Imported here as well so the cell does not depend on the histogram cell
# having been run first.
from matplotlib.backends.backend_pdf import PdfPages

# Tables to draw and their labels; only the filtered first table is plotted.
dfs = [def_first_prob_df_filtered]
names = ['First']

with PdfPages('figures/chandra_gaia_nway_exp/sky_distribution_g0.2_scatter_new.pdf') as pdf:
    # A single Mollweide panel shared by every table in `dfs`.
    fig, ax = plt.subplots(1, 1, figsize=(15, 5), subplot_kw={'projection': 'mollweide'})

    for df, name in zip(dfs, names):
        # Transform ICRS (ra, dec) in degrees to Galactic coordinates. The
        # columns are converted to NumPy arrays first so that multiplying by
        # the unit always yields an astropy Quantity.
        skycoord = SkyCoord(
            ra=df['ra'].to_numpy() * u.degree,
            dec=df['dec'].to_numpy() * u.degree,
            frame='icrs',
        )
        galactic = skycoord.galactic
        # Longitude is wrapped at 180 degrees and both angles are passed in
        # radians to the Mollweide axes.
        l_rad = galactic.l.wrap_at(180 * u.degree).radian
        b_rad = galactic.b.radian

        # Scatter plot (rasterized to keep the PDF small)
        sns.scatterplot(x=l_rad, y=b_rad, ax=ax, rasterized=True, s=2)

        # Title goes on the axes created in this cell, not on an axes object
        # left over from an earlier cell.
        ax.set_title(name)

    # Super title for the entire figure
    fig.suptitle("Sky Distribution in Galactic Coordinates")

    # Save the figure into the PDF
    pdf.savefig(fig, dpi=150)

    # Close the figure to free up memory
    plt.close(fig)