# Import The Relevant Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.image as mpimg
from astropy.io import fits
from matplotlib import pyplot as plt
import cv2
import os

# Obtain and Process Relevant VAST Data

In [2]:
t_data = pd.read_csv("All_Transient_Data.csv")

In [3]:
t_data.columns

Index(['source_id', 'name', 'ra_str', 'dec_str', 'ra', 'dec', 'chi_square',
       'chi_square_log_sigma', 'chi_square_sigma', 'peak_map',
       'peak_map_log_sigma', 'peak_map_sigma', 'gaussian_map',
       'gaussian_map_sigma', 'std_map', 'md_deep', 'deep_sep_arcsec',
       'deep_num', 'bright_sep_arcmin', 'beam_sep_deg', 'beam_ra', 'beam_dec',
       'deep_name', 'deep_ra_deg', 'deep_dec_deg', 'deep_peak_flux',
       'deep_int_flux', 'priority', 'lightcurve', 'deepcutout', 'slices',
       'chisq_map2', 'peak_map2', 'beam', 'sbid', 'PSR_name', 'PSR_sep',
       'dyspec', 'KNOWN_name', 'KNOWN_sep', 'PSR_name_int', 'KNOWN_name_int',
       'PSR_Label'],
      dtype='object')

In [4]:
rel_data = t_data[["sbid","beam","name","peak_map_sigma","PSR_Label"]]
# Remove all null_values
rel_data = rel_data.dropna(how='any',axis=0)

# Obtain Corresponding fits paths
rel_data["sbid"] = rel_data["sbid"].astype(str)
rel_data["fits_path"] = "SB"+rel_data["sbid"]+"_"+rel_data["beam"]+"_slices_"+rel_data["name"]+".fits"

In [5]:
rel_data.head()

Unnamed: 0,sbid,beam,name,peak_map_sigma,PSR_Label,fits_path
0,49588,beam00,J163259.92-501507.22,3.226517,1,SB49588_beam00_slices_J163259.92-501507.22.fits
1,49588,beam00,J163048.20-491129.49,2.178119,0,SB49588_beam00_slices_J163048.20-491129.49.fits
2,49588,beam04,J162710.82-481537.04,2.259193,0,SB49588_beam04_slices_J162710.82-481537.04.fits
3,49588,beam05,J163250.42-482506.53,3.402407,0,SB49588_beam05_slices_J163250.42-482506.53.fits
4,49588,beam06,J164019.07-490047.32,3.30843,0,SB49588_beam06_slices_J164019.07-490047.32.fits


In [6]:
def obtain_avail_fits(rel_data, fits_folder_name = "VAST 10s fitscube", outlier_to_test = False, outlier_fname = None):
    """
    Obtains the fits files and labels which are in both the given dataframe
    and the fits folder. If the user wants to assign a particular
    outlier to the testing data later, they can set outlier_to_test to be True
    and give the outlier's filename.

    Parameters
    ----------
    rel_data : pandas dataframe
        The dataframe with all the possible fits paths.

    fits_folder_name : string, default "VAST 10s fitscube"
        The fits folder path.

    outlier_to_test : bool, default False
        Whether the user wants to push a particular outlier to the testing set.

    outlier_fname : string, default None
        The filename of the outlier.
    
    Returns
    -------
    avail_fits : list
        A list of strings of the selected fits file names.

    fits_labels : list
        A list of integers of the labels for the selected fits file names.

    outlier : dict
        A dictionary which maps the outlier's file name to its label.
    """
    # List of files in fits_folder and the actual data
    avail_fits = []
    # List of fits_labels
    fits_labels = []
    outlier = {}
    if outlier_to_test:
        
        for file in os.listdir(fits_folder_name):
            if file in rel_data["fits_path"].unique():
                if file == outlier_fname:
                    outlier[outlier_fname] = rel_data[rel_data["fits_path"]==outlier_fname]["PSR_Label"].item()
                else:
                    avail_fits.append(file)
                    fits_labels.append(rel_data[rel_data["fits_path"]==file]["PSR_Label"].item())
    else:
        for file in os.listdir(fits_folder_name):
            if file in rel_data["fits_path"].unique():
                avail_fits.append(file)
                fits_labels.append(rel_data[rel_data["fits_path"]==file]["PSR_Label"].item())

    return avail_fits, fits_labels, outlier
    

In [7]:
# Assign a particular outlier to the test data
outlier_to_test = False
outlier_fname = "SB62644_beam19_slices_J183248.46-091115.92.fits"

# Finding the available fits files
avail_fits, fits_labels, outlier = obtain_avail_fits(rel_data, outlier_to_test = outlier_to_test)

In [8]:
fits_labels = np.array(fits_labels)

In [9]:
np.unique(fits_labels, return_counts = True)

(array([0, 1]), array([2267,  252], dtype=int64))

# Obtain A Balanced Number of Fits Files and Labels

In [10]:
def obtain_rms(fits_path, lc_folder_name):
    """
    Obtains the local root mean squared data for a particular fits file.

    Parameters
    ----------
    fits_path : str
        The path for a fits file.

    lc_folder_name : str
        The path for a light curve folder.

    Returns
    -------
    rms : numpy array
        An array containing the local root mean squared values for a particular array.
    """
    # Need to return indices which have rms = 0 or nan
    sbid, beam, _, name = fits_path.split("_")
    name = name[:-5]
    rms_path = f"{sbid}_{beam}_lightcurve_local_rms.csv"

    # Need to ensure that a light curve exists 
    if rms_path not in os.listdir(lc_folder_name):
        return np.array([])
    else:
        local_rms_df = pd.read_csv(f"{lc_folder_name}//{rms_path}")
        rms = local_rms_df[name]
        rms = np.array(rms)
        # We need 3D rms since we will use cube / rms later.
        rms = rms[:, np.newaxis, np.newaxis]
        return rms

In [11]:
def obtain_cube(fits_path, fits_folder_name):
    """
    Obtains a 3D fits cube with values relating to images taken at different
    time points during an observation.

    Parameters
    ----------
    fits_path : str
        The path for a fits file.

    lc_folder_name : str
        The path for a light curve folder.

    Returns
    -------
    cube : numpy array
        A 3D fits cube corresponding to the given fits file.
    """
    # Only take slices which do not have a corresponding rms equal to 0
    hdu = fits.open(fits_folder_name+"/"+fits_path)
    cube = hdu[0].data

    return cube

In [12]:
def check_rms_nan(avail_fits):
    """
    Checks how many sources with fits files 
    have associated rms arrays that contain NaN values.

    Parameters
    ----------
    avail_fits : list
        The fits files we will check.

    Returns
    -------
    total : int
        The total number of sources which have rms arrays containing NaN values.

    loss_cand : int
        The number of transients lost from the data since their rms arrays contain NaN values.
    """
    # Check how many rms arrays contain nan values
    total = 0
    
    # Check how many detected transients have rms arrays that contain nan values
    loss_cand = 0
    for i, fits_path in enumerate(avail_fits):
        rms = obtain_rms(fits_path, lc_folder_name)
        if np.isnan(rms).any():
            total += 1
            if fits_labels[i] == 1:
                loss_cand += 1
                print(fits_path)

    return total, loss_cand

In [13]:
def find_zero_idx(avail_fits):
    """
    Find the indices of the zeros in the 
    rms arrays.

    Parameters
    ----------
    avail_fits : list
        The fits files we will examine.
    """
    for i, fits_path in enumerate(avail_fits):
            rms = obtain_rms(fits_path, lc_folder_name)
            zero_idx = np.where(rms == 0)[0]
            if zero_idx.size != 0:
                print(zero_idx)
                # All zeros occur at the start so just take the first non-zero value onwards
                print(np.argmax(rms != 0))

In [14]:
lc_folder_name = "VAST 10s lightcurve"
fits_folder_name = "VAST 10s fitscube"

In [15]:
import time

# Filter fits files and labels based on valid rms data
start = time.time()
adj_fits_labels = []
adj_avail_fits = []
for i, fits_path in enumerate(avail_fits):
        rms = obtain_rms(fits_path, lc_folder_name)
        # if there is no corresponding rms curve for a fits file, skip this iteration
        if rms.size == 0:
            continue

        # Only obtain the cube if the rms values do not contain nan values
        if not np.isnan(rms).any():
            adj_fits_labels.append(fits_labels[i])
            adj_avail_fits.append(avail_fits[i])

adj_fits_labels = np.array(adj_fits_labels)
adj_avail_fits = np.array(adj_avail_fits)

end = time.time()
print(end - start)

13.65091586112976


In [16]:
start = time.time()

# Filter fits files and labels based on valid cube data
map_size = (120,120)

# Invalid indices
invalid_idx = []

for i, fits_path in enumerate(adj_avail_fits):
        rms = obtain_rms(fits_path, lc_folder_name)
        # Take the first non-zero value onwards
        first_non_zero_idx = np.argmax(rms != 0)
        rms = rms[first_non_zero_idx:,:,:]
        cube = obtain_cube(fits_path, fits_folder_name)
        cube = cube[first_non_zero_idx:,:,:]
        if (cube.shape[1:] != map_size):
            invalid_idx.append(i)

end = time.time()
print(end-start)

31.65484046936035


In [17]:
adj_fits_labels = np.delete(adj_fits_labels, invalid_idx)
adj_avail_fits = np.delete(adj_avail_fits, invalid_idx)

In [18]:
# See the number of each label as a way to determine the degree of imbalance in the dataset
print(sum(adj_fits_labels == 1))
print(sum(adj_fits_labels == 0))

248
2224


### Balancing The Data

In [19]:
def obtain_bal_fpaths_labels(adj_avail_fits, adj_fits_labels, outlier = None, outlier_to_test = False):
    """
    Obtains a collection of fits paths and associated labels
    that are balanced so there are the same number of
    each of the unique labels.

    Parameters
    ----------
    adj_avail_fits : numpy array
        The fits paths that we need to balance.

    adj_avail_fits_labels : numpy array
        The labels that we need to balance.

    outlier : dict, default None
        A dictionary that has mapped an outlier's file name
        to its label.

    outlier_to_test : bool, False
        The user decides whether to send the outlier to the test data.

    Returns
    -------
    final_fits_paths : numpy array
        The final balanced collection of fits paths.

    final_labels : numpy array
        The final balanced collection of labels.
    """
    # Indices of transients/ positive candidates
    pos_idx_ls = np.where(adj_fits_labels == 1)[0]
    # Indices of non transients/ negative candidates
    neg_idx_ls = np.where(adj_fits_labels == 0)[0]
    num_rand_vals = len(pos_idx_ls)
    
    # Obtain num_rand_vals+1 to account for a particular outlier
    if outlier_to_test:
        reduced_neg_idx_ls = np.random.choice(neg_idx_ls, num_rand_vals+1, replace = False)
    else:
        reduced_neg_idx_ls = np.random.choice(neg_idx_ls, num_rand_vals, replace = False)

    # Use the indices of the fits labels and fits paths list to shuffle them
    idx_ls = np.concatenate([pos_idx_ls, reduced_neg_idx_ls])
    # Randomise the indices:
    shuffled_idx_ls = np.random.choice(idx_ls, len(idx_ls), replace = False)

    # Need to randomise these labels and fits paths:
    final_labels = adj_fits_labels[shuffled_idx_ls]
    final_fits_paths = adj_avail_fits[shuffled_idx_ls]
    
    # Ensures the outlier is in the test data
    if outlier_to_test:
        final_fits_paths = np.append(final_fits_paths, list(outlier.keys())[0])
        final_labels = np.append(final_labels, list(outlier.values())[0])
    
    return final_fits_paths, final_labels

In [20]:
final_fits_paths, final_labels = obtain_bal_fpaths_labels(adj_avail_fits, adj_fits_labels, outlier = outlier, outlier_to_test = outlier_to_test)

In [21]:
final_fits_paths.shape

(496,)

In [22]:
final_labels.shape

(496,)

# Obtain The Chi Square Maps

In [23]:
def normalise_arr(arr):
    """
    Applies min-max normalisation on an array.

    Parameters
    ----------
    arr : numpy array
        A list or array of integers or floats.

    Returns
    -------
    arr : numpy array
        A min-max normalised numpy array.
    """
    # Normalise the input image via min max normalisation
    arr = (arr - arr.min()) / (arr.max() - arr.min())
    return arr

In [24]:
def obtain_csm(cube, rms):
    """
    Obstains the chi square maps for
    a particular fits cube.

    Parameters
    ----------
    cube : numpy array
        A fits cube containing images of a particular observation
        at different time points.

    rms : numpy array
        The local rms of the associated cube.

    Returns
    -------
    chisq_array : numpy array
        The chi square map for the input fits cube.
    """
    # number of freedom
    nu = cube.shape[0] - 1
    mean = np.nanmean(cube, axis=0)
    
    data = (cube - mean) / rms
    chisq_array = np.sum(np.power(data, 2), axis=0)/nu

    # normalise the array
    chisq_array = normalise_arr(chisq_array)
    chisq_array = np.repeat(chisq_array[...,np.newaxis], 3, -1)
    
    return chisq_array

In [25]:
# Collect all the chi-square maps together
num_channels = 3
start = time.time()

# Instantiate a numpy array to contain the chi square maps 
all_csm = np.zeros((len(final_fits_paths), map_size[0], map_size[1], num_channels))
for i, fits_path in enumerate(final_fits_paths):
    rms = obtain_rms(fits_path, lc_folder_name)
    # Take the first non-zero value onwards
    first_non_zero_idx = np.argmax(rms != 0)
    rms = rms[first_non_zero_idx:,:,:]
    cube = obtain_cube(fits_path, fits_folder_name)
    cube = cube[first_non_zero_idx:,:,:]
    # Obtain the chi squared maps
    csm = obtain_csm(cube, rms)
    all_csm[i,:,:,:] = csm

end = time.time()
print(end-start)

29.834362983703613


In [26]:
# check if all_csm is normalised
print(all_csm.min())
print(all_csm.max())

0.0
1.0


### Splitting Chi Square Map Data into Training, Validation and Testing Sets

In [27]:
def train_test_split(map_array, labels, train_prop = 0.7):
    """
    Splits the maps and labels into training
    and testing data according to a particular proportion.

    Parameters
    ----------
    map_array : numpy array
        The maps that are going to be split.

    labels : numpy array
        The labels that are going to be split.

    train_prop : float, default 0.7
        The proportion of the maps and labels in
        the training set.

    Returns
    -------
    train : numpy array
        The maps in the training set.

    test : numpy array
        The maps in the test set.

    train_labels : numpy array
        The labels in the training set.

    test_labels : numpy array
        The labels in the test set.
    """
    # Obtain the number of training samples
    n_train = round(map_array.shape[0] * train_prop)

    # Obtaining train and test maps
    train = map_array[:n_train+1,:]
    test = map_array[n_train+1:,:]

    # Obtaining train and test labels
    train_labels = labels[:n_train+1]
    test_labels = labels[n_train+1:]

    return train, test, train_labels, test_labels

In [28]:
def save_maps(map_arrays, labels, data_class, data_fol):
    """
    Saves the map arrays and their labels.

    Parameters
    ----------
    map_arrays : numpy array
        The map arrays you wish to save.

    labels : numpy array
        The associated labels of the map arrays.

    data_class : str
        Whether you want to save the data as "Train", "Test" or "Validation" data.

    data_fol : str
        The folder path you are going to save the map arrays and labels to.
    """
    # Save maps data
    f_path = f"{data_fol}/{data_class}/{data_class}"
    np.save(f_path+"_maps", map_arrays)
    np.save(f_path+"_Labels", labels)

In [29]:
# Split the chi-square maps into training and test data
csm_train, csm_test, csm_train_labels, csm_test_labels = train_test_split(all_csm, final_labels)

In [30]:
# Split the training data into training and validation
csm_train, csm_val, csm_train_labels, csm_val_labels = train_test_split(csm_train, csm_train_labels)

In [31]:
# Save chi square map data
csm_data_fol = "chi_square_map"
save_maps(csm_train, csm_train_labels, "Train", csm_data_fol)
save_maps(csm_val, csm_val_labels, "Validation", csm_data_fol)
save_maps(csm_test, csm_test_labels, "Test", csm_data_fol)

# Obtain The Peak Maps

In [32]:
def obtain_pm(cube, rms):
    """
    Obstains the peak maps for
    a particular fits cube.

    Parameters
    ----------
    cube : numpy array
        A fits cube containing images of a particular observation
        at different time points.

    rms : numpy array
        The local rms of the associated cube.

    Returns
    -------
    peak_array : numpy array
        The peak map for the input fits cube.
    """
    snr = cube / rms 
    peak_array = np.nanmax(snr, axis=0) - np.median(snr, axis=0)
    
    # normalise the array
    peak_array = normalise_arr(peak_array)
    peak_array = np.repeat(peak_array[...,np.newaxis], 3, -1)

    return peak_array

In [33]:
# Collect all the peak maps together
start = time.time()

# Instantiate a numpy array to contain the peak maps 
all_pm = np.zeros((len(final_fits_paths), map_size[0], map_size[1], num_channels))
for i, fits_path in enumerate(final_fits_paths):
    rms = obtain_rms(fits_path, lc_folder_name)
    # Take the first non-zero value onwards
    first_non_zero_idx = np.argmax(rms != 0)
    rms = rms[first_non_zero_idx:,:,:]
    cube = obtain_cube(fits_path, fits_folder_name)
    cube = cube[first_non_zero_idx:,:,:]
    pm = obtain_pm(cube, rms)
    all_pm[i,:,:,:] = pm

end = time.time()
print(end-start)

12.595597505569458


In [34]:
# check if all_pm is normalised
print(all_pm.min())
print(all_pm.max())

0.0
1.0


### Splitting Peak Map Data into Training, Validation and Testing Sets

In [35]:
# Split the peak maps into training and test data
pm_train, pm_test, pm_train_labels, pm_test_labels = train_test_split(all_pm, final_labels)

In [36]:
# Split the training data into training and validation
pm_train, pm_val, pm_train_labels, pm_val_labels = train_test_split(pm_train, pm_train_labels)

In [37]:
# Save peak map data
pm_data_fol = "peak_map"
save_maps(pm_train, pm_train_labels, "Train", pm_data_fol)
save_maps(pm_val, pm_val_labels, "Validation", pm_data_fol)
save_maps(pm_test, pm_test_labels, "Test", pm_data_fol)

# Combine Chi-Square, Std and Peak Maps Together

In [38]:
def obtain_csm_std_pm(cube, rms):
    """
    Obstains an array which combines the chi square maps,
    standard deviation maps and peak maps for a particular fits cube.

    Parameters
    ----------
    cube : numpy array
        A fits cube containing images of a particular observation
        at different time points.

    rms : numpy array
        The local rms of the associated cube.

    Returns
    -------
    csm_std_pm : numpy array
        An array containing the chi square, standard deviation and peak maps for the input fits cube.
    """
    # Obtain chi square map
    
    # Number of freedom
    nu = cube.shape[0] - 1
    mean = np.nanmean(cube, axis=0)
    
    data = (cube - mean) / rms
    chisq_array = np.sum(np.power(data, 2), axis=0)/nu

    # Normalise the array
    chisq_array = normalise_arr(chisq_array)
    chisq_array = chisq_array[...,np.newaxis]

    # Obtain peak map
    snr = cube / rms 
    peak_array = np.nanmax(snr, axis=0) - np.median(snr, axis=0)
    
    # Normalise the array
    peak_array = normalise_arr(peak_array)
    peak_array = peak_array[...,np.newaxis]

    # Obtain std map
    std_array = np.nanstd(cube, axis=0)

    # Normalise the array
    std_array = normalise_arr(std_array)
    std_array = std_array[...,np.newaxis]

    # Concatenate the chi square maps and peak maps together
    csm_std_pm = np.concatenate([chisq_array, std_array, peak_array], axis=-1)
    
    return csm_std_pm

In [39]:
# Instantiate a numpy array to contain the chi-square, std and peak maps 
all_csm_std_pm = np.zeros((len(final_fits_paths), map_size[0], map_size[1], num_channels))
for i, fits_path in enumerate(final_fits_paths):
    rms = obtain_rms(fits_path, lc_folder_name)
    # Take the first non-zero value onwards
    first_non_zero_idx = np.argmax(rms != 0)
    rms = rms[first_non_zero_idx:,:,:]
    cube = obtain_cube(fits_path, fits_folder_name)
    cube = cube[first_non_zero_idx:,:,:]

    # Obtain the peak maps
    csm_std_pm = obtain_csm_std_pm(cube, rms)
    all_csm_std_pm[i,:,:,:] = csm_std_pm

### Splitting Chi Square, Std and Peak Map Data into Training, Validation and Testing Sets

In [40]:
# Split the maps into training and test data
csm_std_pm_train, csm_std_pm_test, csm_std_pm_train_labels, csm_std_pm_test_labels = train_test_split(all_csm_std_pm, final_labels)

In [41]:
# Split the training data into training and validation
csm_std_pm_train, csm_std_pm_val, csm_std_pm_train_labels, csm_std_pm_val_labels = train_test_split(csm_std_pm_train, csm_std_pm_train_labels)

In [42]:
# Save map data
csm_std_pm_data_fol = "chi_square_std_peak_maps"
save_maps(csm_std_pm_train, csm_std_pm_train_labels, "Train", csm_std_pm_data_fol)
save_maps(csm_std_pm_val, csm_std_pm_val_labels, "Validation", csm_std_pm_data_fol)
save_maps(csm_std_pm_test, csm_std_pm_test_labels, "Test", csm_std_pm_data_fol)

In [43]:
# check if all_csm_std_pm is normalised
print(all_csm_std_pm.max())
print(all_csm_std_pm.min())

1.0
0.0


In [44]:
np.unique(csm_train_labels, return_counts = True)

(array([0, 1]), array([126, 119], dtype=int64))

In [45]:
# Testing the presence of the outlier which should be the last map
if outlier_to_test:
    rms = obtain_rms(list(outlier.keys())[0], lc_folder_name)
    # Take the first non-zero value onwards
    first_non_zero_idx = np.argmax(rms != 0)
    rms = rms[first_non_zero_idx:,:,:]
    cube = obtain_cube(fits_path, fits_folder_name)
    cube = cube[first_non_zero_idx:,:,:]
    outlier_csm = obtain_csm(cube, rms)
    # Checking if the outlier is in the test set
    print(csm_test_labels[-1])
    print(sum(sum(sum(outlier_csm != csm_test[-1]))))