In [1]:
from astropy.io import ascii
import matplotlib.pyplot as plt
import lightkurve as lk
from pathlib import Path
import pandas as pd
import numpy as np
import re
import json
import os
%matplotlib inline



In [2]:
# def preprocess_lc(lc):
#     lc_cleaned = lc.remove_outliers()
#     lc_normalized = lc_cleaned / lc_cleaned.flux.max()
#     lc_normalized.time = (lc_normalized.time - lc_normalized.time[0]).value
#     return lc_normalized

In [3]:
# --- Configuration: input/output locations and processing parameters ---
TABLES_FOLDER = "./tbls_not_in_koi"
folder = Path(TABLES_FOLDER)
# Materialise the directory listing as a sorted list of regular files.
# A bare folder.iterdir() is a one-shot generator: once the processing loop has
# consumed it, re-running that loop would silently iterate over nothing.
# Sorting also makes the processing order (and hence chunk contents) reproducible.
tbl_files = sorted(p for p in folder.iterdir() if p.is_file())
num_timesteps = 72_000  # every light curve is padded/truncated to this length
koi = pd.read_csv('koi_last_cumulative.csv', comment='#')
SAVE_FOLDER = './init_df_not_in_koi/'

# Output file name stems; a chunk suffix and extension are appended when saving.
ID_DF_NAME = SAVE_FOLDER + 'df_version_02_id'
VALUE_DF_NAME = SAVE_FOLDER + 'df_version_02_value'
APPENDING = False  # NOTE(review): not referenced by any visible cell
SAVING_INTERVAL = 500  # number of collected curves per saved chunk
STARTING_INDEX = 0
# Count exactly the files that will be scanned, so progress messages add up.
number_of_tables = len(tbl_files)
TBL_CURVE_COLUMN = 'LC_INIT'  # table column holding the flux values

In [4]:
#TODO
# # read already processed files
# processed_set = set()
# if Path("already_processed_tbls.txt").exists():
#     with open("already_processed_tbls.txt") as f:
#         processed_set = set(line.strip() for line in f if line.strip())

# # build tbl_files as a list, excluding already processed
# tbl_files = [f for f in Path("your_ascii_folder").iterdir() 
#              if f.is_file() and str(f) not in processed_set]

In [5]:
# Rebuild the (kepid, planet number) -> file name lookup for TABLES_FOLDER
# and store it next to the folder as a CSV.
kepid_pattern = re.compile(r'kplr(\d+)_')
tce_pattern = re.compile(r'tce_(\d+)_')

data = []
for f in folder.iterdir():
    if not f.is_file():
        continue
    path_text = str(f)
    kepid_m = kepid_pattern.search(path_text)
    tce_m = tce_pattern.search(path_text)
    # only files whose name carries both identifiers can be keyed
    if not (kepid_m and tce_m):
        continue
    data.append((int(kepid_m.group(1)), int(tce_m.group(1)), f.name))

tbls_compound = pd.DataFrame(data, columns=['kepid','ascii_planet_num','file_name'])
tbls_compound = tbls_compound.set_index(['kepid','ascii_planet_num'])
tbls_compound.to_csv(f'{TABLES_FOLDER}_compound_key.csv')
tbls_compound.shape

(8496, 1)

In [6]:
# number of KOI rows whose planet number is not set
planet_num_missing = koi['koi_tce_plnt_num'].isna()
planet_num_missing.sum()

np.int64(346)

In [7]:
# The composite key is (kepid, TCE serial number), so every row needs a serial
# number. Collect the KOI rows that lack one, together with their kepids, so
# that stars with several unnumbered entries can be identified.
koi_no_plnt_num = koi.loc[koi['koi_tce_plnt_num'].isna()]
a = koi_no_plnt_num['kepid'].tolist()
def find_recurring_elements(lst):
    """Return the distinct values of ``lst`` that occur more than once.

    The values come back as a plain list, in the order produced by
    ``pandas.Series.value_counts``.
    """
    occurrences = pd.Series(lst).value_counts()
    repeated = occurrences.loc[occurrences.gt(1)]
    return repeated.index.tolist()
# kepids that occur more than once among the KOI rows without a planet number
nonmarked_stars_mltpl_planets = find_recurring_elements(a)

# Drop stars that have several unnumbered entries, because those cannot be
# mapped to curves. Keep a row if it has a planet number OR its kepid is not
# one of those stars.
keep_mask = koi['koi_tce_plnt_num'].notna() | ~koi['kepid'].isin(nonmarked_stars_mltpl_planets)
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the original (avoids SettingWithCopyWarning).
koi = koi[keep_mask].copy()
# Remaining unnumbered rows get planet number 1; cast the column to integers so
# it compares equal to the integers parsed from the table file names.
koi['koi_tce_plnt_num'] = koi['koi_tce_plnt_num'].fillna(1).astype('int64')

# Build the compound (kepid, planet number) index.
koi_compound = koi.set_index(['kepid', 'koi_tce_plnt_num'])
# Remove every row whose key is not unique. keep=False marks all members of a
# duplicated key, so none of them survives - the same result as dropping the
# duplicated labels by name.
# NOTE(review): keys removed here are absent from koi_compound.index, so any code
# testing "key not in koi_compound.index" treats them as not being in KOI - confirm
# this is intended.
koi_compound = koi_compound[~koi_compound.index.duplicated(keep=False)]

In [8]:
# Load the persisted label -> integer code mapping and the next unused code.
with open('label_mapping.json', 'r') as mapping_file:
    label_mapping = json.load(mapping_file)
label_map, next_code = label_mapping['label_map'], label_mapping['next_code']

In [9]:
# show the label -> integer code mapping read from label_mapping.json
label_map

{'CONFIRMED': 0, 'CANDIDATE': 1, 'FALSE POSITIVE': 2, 'TRUE NEGATIVE': 3}

In [10]:
#ITERATION OVER TABLES
# Scan every table file, keep the light curves whose (kepid, planet number) key
# is NOT in koi_compound, label them "TRUE NEGATIVE" and write them out in
# chunks of SAVING_INTERVAL curves.
#
# A chunk is written at the start of the iteration *after* it became full, so
# the most recent curves always remain in processed_lcs / kepids / planet_nums /
# labels when the loop ends; the final saving cell writes them as "_final".
#
# NOTE(review): i_saved starts at STARTING_INDEX, but the loop still walks
# tbl_files from the beginning - a non-zero STARTING_INDEX only shifts the chunk
# numbering, it does not skip already processed tables.


def _save_chunk(chunk_no, lcs, lc_labels, lc_kepids, lc_planet_nums):
    """Write one chunk to disk: identifiers as CSV, flux values as parquet.

    chunk_no        -- suffix used in both output file names
    lcs             -- list of flux sequences, each num_timesteps long
    lc_labels       -- integer label code per curve
    lc_kepids       -- kepid per curve
    lc_planet_nums  -- planet number per curve

    The parquet file stores one curve per column (column names '0', '1', ...)
    and one time step per row.
    """
    df_ids = pd.DataFrame({'LABEL': lc_labels, 'KEPID': lc_kepids, 'PLANET_NUM' : lc_planet_nums})
    df_ids.to_csv(ID_DF_NAME + f'_{chunk_no}.csv', index=False)

    lcs_df = pd.DataFrame(lcs)
    lcs_df.columns = lcs_df.columns.astype(str)
    df_values = lcs_df[[str(i) for i in range(0,num_timesteps)]].T
    df_values.reset_index(drop=True, inplace=True)
    df_values.columns = [str(i) for i in range(df_values.shape[1])]
    df_values.to_parquet(VALUE_DF_NAME + f'_{chunk_no}.parquet', engine="fastparquet", compression="snappy")


processed_lcs = []
kepids = []
planet_nums = []
labels = []
i_saved = STARTING_INDEX #APPENDED IN TOTAL
i_tbls = 0 #TBLS GONE OVER
just_saved_flag = True  # True until a curve is collected after the last save
for tbl_file in tbl_files:
    i_tbls += 1
    #progress
    if i_tbls % 100 == 0:
        print(f'scanned {i_tbls} tables out of {number_of_tables}')

    # saving: flush the buffers once a full chunk has been collected
    if i_saved % SAVING_INTERVAL == 0 and i_saved != 0 and not just_saved_flag:
        chunk_no = i_saved // SAVING_INTERVAL
        print(f'saving: {chunk_no}')
        _save_chunk(chunk_no, processed_lcs, labels, kepids, planet_nums)

        processed_lcs = []
        kepids = []
        planet_nums = []
        labels = []

        just_saved_flag = True

    #identification: both ids are parsed from the file path
    kepid_m = re.search(r'kplr(\d+)_', str(tbl_file))
    tce_m = re.search(r'tce_(\d+)_', str(tbl_file))
    if kepid_m is None or tce_m is None:
        # either id missing -> the file cannot be keyed; report and move on
        print(f'ERROR for file {str(tbl_file)}, continuing')
        continue
    kepid = int(kepid_m.group(1))
    ascii_planet_num = int(tce_m.group(1))

    # append only those NOT in koi
    if (kepid, ascii_planet_num) in koi_compound.index:
        continue

    if i_saved % 100 == 0:
        print(f'found {i_saved} entries')
    tbl = ascii.read(tbl_file)
    # WE SKIP LK PROCESSING AND TAKE THE RAW LC_INIT
    flux = tbl.to_pandas()[TBL_CURVE_COLUMN].to_list()

    # bring the curve to exactly num_timesteps values
    n_missing = num_timesteps - len(flux)
    if n_missing > 0:
        # too short: pad the tail with NaN
        flux = np.pad(flux, (0, n_missing), constant_values=np.nan)
    elif n_missing < 0:
        # too long: report how many samples are cut off, then cut
        print('truncating:', -n_missing)
        flux = flux[:num_timesteps]

    # everything collected here is absent from KOI, hence the fixed label
    label = "TRUE NEGATIVE"
    if label not in label_map:
        label_map[label] = next_code
        next_code += 1

    processed_lcs.append(flux)
    kepids.append(kepid)
    planet_nums.append(ascii_planet_num)
    labels.append(label_map[label])
    # NOTE(review): the path is logged when the curve is buffered, i.e. before
    # its chunk is written to disk - keep that in mind when implementing the TODO.
    with open(f"{SAVE_FOLDER}already_processed_tbls.txt", "a") as log_file:
        log_file.write(str(tbl_file) + "\n") # TODO: IMPLEMENT ITS USAGE
    i_saved += 1
    just_saved_flag = False

found 0 entries
scanned 100 tables out of 8497
found 100 entries
scanned 200 tables out of 8497
found 200 entries
scanned 300 tables out of 8497
found 300 entries
scanned 400 tables out of 8497
found 400 entries
scanned 500 tables out of 8497
saving: 1
found 500 entries
scanned 600 tables out of 8497
found 600 entries
scanned 700 tables out of 8497
found 700 entries
scanned 800 tables out of 8497
found 800 entries
scanned 900 tables out of 8497
found 900 entries
scanned 1000 tables out of 8497
saving: 2
found 1000 entries
scanned 1100 tables out of 8497
found 1100 entries
scanned 1200 tables out of 8497
found 1200 entries
scanned 1300 tables out of 8497
found 1300 entries
scanned 1400 tables out of 8497
found 1400 entries
scanned 1500 tables out of 8497
saving: 3
found 1500 entries
scanned 1600 tables out of 8497
found 1600 entries
scanned 1700 tables out of 8497
found 1700 entries
scanned 1800 tables out of 8497
found 1800 entries
scanned 1900 tables out of 8497
found 1900 entries
sca

In [9]:
# Positionally skip the first STARTING_INDEX rows of koi_compound.
# NOTE(review): koi_compound is rebound to a slice of itself, so re-running this
# cell with a non-zero STARTING_INDEX removes rows again each time.
koi_compound = koi_compound.iloc[STARTING_INDEX:]

In [None]:
# # GO OVER KOI
# processed_lcs = []
# kepids = []
# planet_nums = []
# labels = []
# i_koi = STARTING_INDEX
# i_saved = STARTING_INDEX
# just_saved_flag = True
# #iterating over koi csv
# for koi_idx, koi_data in koi_compound.iterrows():
#     i_koi += 1
#     if i_koi % 100 == 0:
#         print(f'went through {i_koi} entries in KOI table out of {koi_compound.shape[0]}')
#     # if we have this curve file
#     if koi_idx in tbls_compound.index:
#         # we read and process the curve
#         try:
#             tbl = ascii.read(TABLES_FOLDER +  tbls_compound.loc[koi_idx]['file_name'])
#             #print(f'read file {tbls_compound.loc[koi_idx]['file_name']}')
#         except:
#             print(f'couldnt read table {tbls_compound.loc[koi_idx]['file_name']}')
#         # df = tbl.to_pandas()[["TIME", TBL_CURVE_COLUMN]]
#         # lc = lk.LightCurve(time=df['TIME'], flux=df[TBL_CURVE_COLUMN])
#         # processed_lc = preprocess_lc(lc)
#         # flux = processed_lc.flux.value
#         flux = tbl.to_pandas()[TBL_CURVE_COLUMN].to_list() # WE SKIP LK PROCESSING AND TAKE THE RAW LC_INIT
#         if len(flux) < num_timesteps:
#             #print('padding:', num_timesteps - len(flux))
#             flux = np.pad(flux, (0, num_timesteps - len(flux)), constant_values=np.nan)
#         elif len(flux) > num_timesteps:
#             print('truncating:', num_timesteps - len(flux))
#             flux = flux[:num_timesteps]

#         # get and save identification
#         label = koi_data['koi_disposition']
#         kepid, ascii_planet_num = koi_idx

#         if label not in label_map:
#             label_map[label] = next_code
#             next_code += 1

#         processed_lcs.append(flux)
#         kepids.append(kepid)
#         planet_nums.append(ascii_planet_num)
#         labels.append(label_map[label])
#         i_saved += 1
#         if just_saved_flag: just_saved_flag = False

#     if i_saved % SAVING_INTERVAL == 0 and not just_saved_flag:
#         if i_saved % 100 == 0:
#             print(f'found {i_saved} entries')
#         print(f'saving: {i_saved}')
#         lcs_df = pd.DataFrame(processed_lcs)
#         identity_df = pd.DataFrame({'LABEL': labels, 'KEPID': kepids, 'PLANET_NUM' : planet_nums})
#         #lcs_df = pd.concat((identity_df, lcs_df))
        
#         identity_df.to_csv(ID_DF_NAME + f'_{i_saved // SAVING_INTERVAL}.csv', index=False)
#         lcs_df.columns = lcs_df.columns.astype(str)
#         df_values = lcs_df[[str(i) for i in range(0,num_timesteps)]].T
#         df_values.columns = df_values.columns.astype(str)
#         df_values.reset_index(drop=True, inplace=True)
#         df_values.columns = [str(i) for i in range(df_values.shape[1])]
#         df_values.to_parquet(VALUE_DF_NAME + f'_{i_saved // SAVING_INTERVAL}.parquet', engine="fastparquet", compression="snappy")

#         processed_lcs = []
#         kepids = []
#         planet_nums = []
#         labels = []

#         just_saved_flag = True


went through 100 entries in KOI table out of 9360
went through 200 entries in KOI table out of 9360
went through 300 entries in KOI table out of 9360
went through 400 entries in KOI table out of 9360
went through 500 entries in KOI table out of 9360
found 500 entries
saving: 500
went through 600 entries in KOI table out of 9360
went through 700 entries in KOI table out of 9360
went through 800 entries in KOI table out of 9360
went through 900 entries in KOI table out of 9360
went through 1000 entries in KOI table out of 9360
found 1000 entries
saving: 1000
went through 1100 entries in KOI table out of 9360
went through 1200 entries in KOI table out of 9360
went through 1300 entries in KOI table out of 9360
went through 1400 entries in KOI table out of 9360
went through 1500 entries in KOI table out of 9360
found 1500 entries
saving: 1500
went through 1600 entries in KOI table out of 9360
went through 1700 entries in KOI table out of 9360
went through 1800 entries in KOI table out of 93

In [None]:
#READING DF
# df_values_loaded = pd.read_parquet(VALUE_DF_NAME + f'_final.parquet', engine="fastparquet")
# df_ids_loaded = pd.read_csv(ID_DF_NAME + f'_final.csv')
# df_full = pd.concat([df_ids_loaded, df_values_loaded.T.reset_index(drop=True)], axis=1)
# df_full.set_index(['KEPID','PLANET_NUM'], inplace=True)
# df_full.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,LABEL,0,1,2,3,4,5,6,7,8,...,69990,69991,69992,69993,69994,69995,69996,69997,69998,69999
KEPID,PLANET_NUM,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
9579641,1,0,-0.312961,-0.095957,-0.319565,-0.261222,-0.074698,0.096192,-0.014046,-0.170279,0.002636,...,,,,,,,,,,
11304958,1,0,-0.021482,-0.13972,0.002269,0.045884,0.026515,0.082143,-0.297937,-0.324884,-0.022064,...,,,,,,,,,,
11391957,1,0,-0.083154,0.10908,-0.163022,0.291097,0.381881,-0.051684,0.220637,0.150476,-0.014663,...,,,,,,,,,,
11403044,1,0,-0.187118,-0.165793,-0.350305,0.107896,0.025164,-0.191581,0.024537,0.071433,0.19261,...,,,,,,,,,,


In [11]:
# SAVING DF
# Write whatever is still buffered in processed_lcs / labels / kepids /
# planet_nums as the "_final" chunk, then persist the label mapping.
if processed_lcs:
    identity_df = pd.DataFrame({'LABEL': labels, 'KEPID': kepids, 'PLANET_NUM' : planet_nums})
    identity_df.to_csv(ID_DF_NAME + '_final.csv', index=False)

    lcs_df = pd.DataFrame(processed_lcs)
    lcs_df.columns = lcs_df.columns.astype(str)
    # transpose: one curve per column, one time step per row
    df_values = lcs_df[[str(i) for i in range(0,num_timesteps)]].T
    df_values.reset_index(drop=True, inplace=True)
    df_values.columns = [str(i) for i in range(df_values.shape[1])]
    df_values.to_parquet(VALUE_DF_NAME + '_final.parquet', engine="fastparquet", compression="snappy")
else:
    # an empty buffer has no timestep columns, so selecting them would raise
    print('no unsaved light curves buffered, skipping the final chunk')

# use a context manager so the file is flushed and closed deterministically
with open("label_mapping.json", "w") as mapping_file:
    json.dump({'label_map': label_map, 'next_code': next_code}, mapping_file)