# Import Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
import os

# Obtain The Relevant Data

In [2]:
def get_rel_data(folder_name):
    """
    Combine every VAST csv file in a folder into one dataframe, label the rows, and save the results.

    Every ``*.csv`` file directly inside ``folder_name`` is read and the rows are stacked in
    sorted-filename order. A row is labelled 1 when either its 'PSR_name' or its 'KNOWN_name'
    is non-null, and 0 otherwise. Two files are written to the current working directory:
    "Transient_Data_Selected_Features.csv" (selected features plus the label) and
    "All_Transient_Data.csv" (every column plus the label columns).

    Parameters
    ----------
    folder_name : string
        Path of the folder with the VAST csv files.

    Returns
    -------
    t_rel_data : dataframe
        A dataframe built from all the csv files with the only columns being 'chi_square', 'chi_square_sigma',
        'peak_map', 'peak_map_sigma', 'std_map', 'md_deep', 'deep_sep_arcsec', 'deep_num', 'bright_sep_arcmin',
        'beam_sep_deg', 'deep_peak_flux', 'deep_int_flux', 'PSR_Label'.

    t_data : dataframe
        A dataframe built from all the csv files with every original column, plus the helper columns
        'PSR_name_int' and 'KNOWN_name_int' and the 0/1 'PSR_Label' column.

    Raises
    ------
    FileNotFoundError
        If ``folder_name`` contains no csv files.
    """
    # os.listdir returns entries in arbitrary order and includes non-data entries
    # (e.g. notebook checkpoints), so select the csv files explicitly instead of
    # skipping a fixed index, and sort them so the row order is reproducible.
    csv_names = sorted(
        name for name in os.listdir(folder_name)
        if name.lower().endswith(".csv") and os.path.isfile(os.path.join(folder_name, name))
    )
    if not csv_names:
        raise FileNotFoundError(f"No csv files found in '{folder_name}'")

    # Read every file first and concatenate once; concatenating inside the loop
    # re-copies the accumulated frame on every iteration.
    frames = [pd.read_csv(os.path.join(folder_name, name)) for name in csv_names]
    t_data = pd.concat(frames, ignore_index=True, axis=0)

    # Create column of labels
    has_psr_name = t_data['PSR_name'].notnull()
    has_known_name = t_data['KNOWN_name'].notnull()
    t_data['PSR_name_int'] = has_psr_name.astype(int)
    t_data['KNOWN_name_int'] = has_known_name.astype(int)
    # "PSR_name" and "KNOWN_name" are expected to be exclusive of each other, but a logical OR
    # keeps the label binary even if a row happens to have both (a sum would give 2).
    t_data['PSR_Label'] = (has_psr_name | has_known_name).astype(int)

    # Take a subset of features that may be useful for the decision tree (deep_sep_arcsec may be particularly important)
    rel_cols = ['chi_square', 'chi_square_sigma', 'peak_map', 'peak_map_sigma', 'std_map', 'md_deep', 'deep_sep_arcsec',
                'deep_num', 'bright_sep_arcmin', 'beam_sep_deg', 'deep_peak_flux', 'deep_int_flux', 'PSR_Label']
    # Explicit copy so the feature table is independent of t_data
    t_rel_data = t_data[rel_cols].copy()

    # Save the data
    filename = "Transient_Data_Selected_Features.csv"
    t_rel_data.to_csv(filename, encoding='utf-8', index=False)

    filename2 = "All_Transient_Data.csv"
    t_data.to_csv(filename2, encoding='utf-8', index=False)

    return t_rel_data, t_data

In [3]:
# Load the VAST csv folder: t_rel_data holds the selected feature columns plus the label,
# t_data holds every column.
t_rel_data, t_data = get_rel_data(folder_name="VAST_10s_CSV")