In [1]:
from astropy.io import ascii
from astropy.io import fits
import matplotlib.pyplot as plt
import lightkurve as lk
from pathlib import Path
import pandas as pd
import numpy as np
import re
import json
import os
%matplotlib inline



In [2]:
folder = Path("./k2_fits")
tbl_files = folder.iterdir()
num_timesteps = 155_554
koi = pd.read_csv('koi_last_cumulative.csv', comment='#')
SAVE_FOLDER = './cooked_df_k2/'
TABLES_FOLDER = "./k2_fits/"
TMP_FOLDER = "./lightcurve_csvs/"
DF_NAME = SAVE_FOLDER + 'df_version_01.csv'
ID_DF_NAME = SAVE_FOLDER + 'df_version_01_id'
VALUE_DF_NAME = SAVE_FOLDER + 'df_version_01_value'
APPENDING = False
SAVING_INTERVAL = 500
STARTING_INDEX = 0
number_of_tables = len(os.listdir('./k2_fits'))

In [3]:
def preprocess_lc(lc):
    lc_cleaned = lc.remove_outliers()
    lc_normalized = lc_cleaned / lc_cleaned.flux.max()
    lc_normalized.time = (lc_normalized.time - lc_normalized.time[0]).value
    return lc_normalized

In [4]:
filts_ids = [file.split(".")[0].split("_")[-1] for file in os.listdir(folder)]

In [5]:
len(filts_ids)

1464

In [6]:
filts_ids[0]

'00054b50-9c4b-11f0-8858-ac198eef3d13'

In [7]:
def read_dataset(folder):
    dfs = []

    for file in os.listdir(folder):
        id = file.split('.')[0]
        if id in filts_ids:
            df = pd.read_csv(f"k2_csvs/{file}")
            df['id'] = id
            dfs.append(df)

    combined_df = pd.concat(dfs, ignore_index=True)
    return combined_df

In [8]:
combined_df = read_dataset("k2_csvs")

In [8]:
combined_df.drop(labels="pl_name", axis=1, inplace=True)
combined_df.dropna(axis=0, how="all", inplace=True)

KeyError: "['pl_name'] not found in axis"

In [9]:
combined_df.head()

Unnamed: 0,tic_id,hostname,disposition,id
0,TIC 238607982,EPIC 212083455,FALSE POSITIVE,00054b50-9c4b-11f0-8858-ac198eef3d13
1,TIC 178948493,EPIC 212691727,CANDIDATE,000c12df-9c52-11f0-afe4-ac198eef3d13
2,TIC 203282135,EPIC 212075842,FALSE POSITIVE,00205bd6-9c4e-11f0-b6f6-ac198eef3d13
3,TIC 435904279,EPIC 210418253,CANDIDATE,0022c699-9c4f-11f0-b877-ac198eef3d13
4,TIC 242982776,EPIC 210559259,CANDIDATE,004b98c5-9c4a-11f0-8ff1-ac198eef3d13


In [10]:
combined_df.shape

(1464, 4)

In [29]:
combined_df.drop_duplicates(subset=['tic_id', 'hostname'], keep='last')

Unnamed: 0,tic_id,hostname,disposition,id
0,TIC 5882269,K2-284,CONFIRMED,00212052-9c62-11f0-b2b3-ac198eef3d13
18,TIC 422349881,K2-189,CONFIRMED,019892c4-9c66-11f0-9c98-ac198eef3d13
20,TIC 186814470,K2-338,CONFIRMED,01abb15b-9c68-11f0-b729-ac198eef3d13
24,TIC 293412804,EPIC 211621961,FALSE POSITIVE,02788e37-9c63-11f0-a85d-ac198eef3d13
26,TIC 2558796,EPIC 212757601,CANDIDATE,02a00da0-9c64-11f0-9759-ac198eef3d13
...,...,...,...,...
2925,TIC 55360702,EPIC 251281013,FALSE POSITIVE,ffaaaf3d-9c66-11f0-b5b1-ac198eef3d13
2926,TIC 83709114,EPIC 202071289,CANDIDATE,ffb30c42-9c68-11f0-9f10-ac198eef3d13
2927,TIC 4610830,K2-138,CONFIRMED,ffb41ce4-9c63-11f0-8cae-ac198eef3d13
2928,TIC 348667759,EPIC 210389383,CANDIDATE,ffbf05fb-9c68-11f0-a048-ac198eef3d13


In [12]:
combined_df["planet_rank"] = combined_df.groupby("hostname").cumcount() + 1

In [15]:
combined_df.shape

(2674, 4)

In [14]:
combined_df.head()

Unnamed: 0,tic_id,hostname,disposition,id,planet_rank
0,TIC 238607982,EPIC 212083455,FALSE POSITIVE,00054b50-9c4b-11f0-8858-ac198eef3d13,1
1,TIC 178948493,EPIC 212691727,CANDIDATE,000c12df-9c52-11f0-afe4-ac198eef3d13,1
2,TIC 203282135,EPIC 212075842,FALSE POSITIVE,00205bd6-9c4e-11f0-b6f6-ac198eef3d13,1
3,TIC 435904279,EPIC 210418253,CANDIDATE,0022c699-9c4f-11f0-b877-ac198eef3d13,1
4,TIC 242982776,EPIC 210559259,CANDIDATE,004b98c5-9c4a-11f0-8ff1-ac198eef3d13,1


In [13]:
k2_compound = combined_df.set_index(['hostname', 'planet_rank'])
k2_compound = k2_compound.drop(k2_compound[k2_compound.index.duplicated()].index, axis=0)

In [14]:
for idx in k2_compound[k2_compound.index.duplicated()].sort_index().index:
    print(idx)
    display(k2_compound.loc[idx])

In [33]:
duplicates = combined_df[combined_df.duplicated(subset=['tic_id', 'hostname'], keep=False)]
duplicates.sort_values(by=['tic_id']).head(20)

Unnamed: 0,tic_id,hostname,disposition,id
998,TIC 102264230,WASP-47,CONFIRMED,5f244b7f-9c6a-11f0-82c3-ac198eef3d13
917,TIC 102264230,WASP-47,CONFIRMED,5a53f1a6-9c6a-11f0-92cd-ac198eef3d13
944,TIC 102264230,WASP-47,CONFIRMED,5c79477e-9c6a-11f0-95bf-ac198eef3d13
943,TIC 102264230,WASP-47,CONFIRMED,5c5f48be-9c6a-11f0-b1eb-ac198eef3d13
942,TIC 102264230,WASP-47,CONFIRMED,5c455c82-9c6a-11f0-988b-ac198eef3d13
940,TIC 102264230,WASP-47,CONFIRMED,5c2cfd1b-9c6a-11f0-be4b-ac198eef3d13
939,TIC 102264230,WASP-47,CONFIRMED,5c1129a4-9c6a-11f0-8ad6-ac198eef3d13
938,TIC 102264230,WASP-47,CONFIRMED,5bf8104d-9c6a-11f0-858e-ac198eef3d13
905,TIC 102264230,WASP-47,CONFIRMED,59adf477-9c6a-11f0-9e36-ac198eef3d13
937,TIC 102264230,WASP-47,CONFIRMED,5bdc71f5-9c6a-11f0-80c9-ac198eef3d13


In [15]:
label_map = {"CONFIRMED": 0, "CANDIDATE": 1, "FALSE POSITIVE": 2}
next_code = 0
codes = []

In [16]:
processed_lcs = []
k2ids = []
planet_nums = []
labels = []
i_k2 = STARTING_INDEX
i_saved = STARTING_INDEX
just_saved_flag = True

In [17]:
longest = 0
for k2_idx, k2_data in k2_compound.iterrows():
    i_k2 += 1
    if i_k2 % 100 == 0:
        print(f'went through {i_k2} entries in K2 table out of {k2_compound.shape[0]}')
    # if we have this curve file
    try:
        with fits.open(f'{TABLES_FOLDER}k2_lightcurve_{k2_data["id"]}.fits') as hdul:
            data = hdul[1].data
            df = pd.DataFrame(data)
            df.to_csv(f'{TMP_FOLDER}{k2_data["id"]}.csv', index=False)
            df = pd.read_csv(f'{TMP_FOLDER}{k2_data["id"]}.csv')[["TIME","FLUX"]]
            df.dropna(axis=0, inplace=True)
            if df.shape[0] > longest:
                longest = df.shape[0]
            lc = lk.LightCurve(time=df['TIME'], flux=df['FLUX'])
            processed_lc = preprocess_lc(lc)
            flux = processed_lc.flux.value
            if len(flux) < num_timesteps:
                #print('padding:', num_timesteps - len(flux))
                flux = np.pad(flux, (0, num_timesteps - len(flux)), constant_values=np.nan)
            elif len(flux) > num_timesteps:
                print('truncating:', num_timesteps - len(flux))
                flux = flux[:num_timesteps]

            # get and save identification
            label = k2_data['disposition']
            k2id, ascii_planet_num = k2_idx

            if label not in label_map:
                label_map[label] = next_code
                next_code += 1

            processed_lcs.append(flux)
            k2ids.append(k2id)
            planet_nums.append(ascii_planet_num)
            labels.append(label_map[label])
            i_saved += 1
            if just_saved_flag: just_saved_flag = False

            if i_saved % SAVING_INTERVAL == 0 and not just_saved_flag:
                if i_saved % 100 == 0:
                    print(f'found {i_saved} entries')
                print(f'saving: {i_saved}')
                lcs_df = pd.DataFrame(processed_lcs)
                identity_df = pd.DataFrame({'LABEL': labels, 'K2ID': k2ids, 'PLANET_NUM': planet_nums})
                #lcs_df = pd.concat((identity_df, lcs_df))

                identity_df.to_csv(ID_DF_NAME + f'_{i_saved // SAVING_INTERVAL}.csv', index=False)
                lcs_df.columns = lcs_df.columns.astype(str)
                df_values = lcs_df[[str(i) for i in range(0,num_timesteps)]].T
                df_values.columns = df_values.columns.astype(str)
                df_values.reset_index(drop=True, inplace=True)
                df_values.columns = [str(i) for i in range(df_values.shape[1])]
                df_values.to_parquet(VALUE_DF_NAME + f'_{i_saved // SAVING_INTERVAL}.parquet', engine="fastparquet", compression="snappy")

                processed_lcs = []
                k2ids = []
                planet_nums = []
                labels = []

                just_saved_flag = True
    except:
        print(f'couldnt read table k2_lightcurve_{k2_data["id"]}.fits')
    print(df.shape)
    print(df.columns)


(15274, 2)
Index(['TIME', 'FLUX'], dtype='object')
(3277, 2)
Index(['TIME', 'FLUX'], dtype='object')
(3353, 2)
Index(['TIME', 'FLUX'], dtype='object')
(1143, 2)
Index(['TIME', 'FLUX'], dtype='object')
(13977, 2)
Index(['TIME', 'FLUX'], dtype='object')
(3269, 2)
Index(['TIME', 'FLUX'], dtype='object')
(15404, 2)
Index(['TIME', 'FLUX'], dtype='object')
(3631, 2)
Index(['TIME', 'FLUX'], dtype='object')
(3442, 2)
Index(['TIME', 'FLUX'], dtype='object')
(2468, 2)
Index(['TIME', 'FLUX'], dtype='object')
(12954, 2)
Index(['TIME', 'FLUX'], dtype='object')
(15480, 2)
Index(['TIME', 'FLUX'], dtype='object')
(14611, 2)
Index(['TIME', 'FLUX'], dtype='object')
(3620, 2)
Index(['TIME', 'FLUX'], dtype='object')
(2804, 2)
Index(['TIME', 'FLUX'], dtype='object')
(15289, 2)
Index(['TIME', 'FLUX'], dtype='object')
(2452, 2)
Index(['TIME', 'FLUX'], dtype='object')
(10157, 2)
Index(['TIME', 'FLUX'], dtype='object')
(2458, 2)
Index(['TIME', 'FLUX'], dtype='object')
(15108, 2)
Index(['TIME', 'FLUX'], dtype='

In [97]:
longest

115554

In [71]:
from astropy.io import fits
import pandas as pd

# Path to your FITS file
fits_file = "k2_fits/k2_lightcurve_0a8a64c1-9bb9-11f0-bdc6-ac198eef3d13.fits"
csv_file = "output.csv"

# Open the FITS file
with fits.open(fits_file) as hdul:
    # Usually the first extension contains the table
    data = hdul[1].data

    # Convert to DataFrame
    df = pd.DataFrame(data)  # no need for byteswap

    # Save to CSV
    df.to_csv(csv_file, index=False)


print(f"FITS data saved to {csv_file}")

FileNotFoundError: [Errno 2] No such file or directory: './lightcurve_csvs/001a064c-9bb7-11f0-b923-ac198eef3d13.csv'

In [53]:
pd.read_csv(csv_file)[["TIME","FLUX"]]

Unnamed: 0,TIME,FLUX
0,2384.463364,
1,2384.483797,114803.940
2,2384.504229,114828.930
3,2384.524661,114817.950
4,2384.545093,114825.760
...,...,...
3540,2463.289332,114976.234
3541,2463.309764,114984.625
3542,2463.330196,114977.640
3543,2463.350628,114961.516


In [18]:
# # SAVING DF
lcs_df = pd.DataFrame(processed_lcs)
identity_df = pd.DataFrame({'LABEL': labels, 'K2ID': k2ids, 'PLANET_NUM': planet_nums})
#lcs_df = pd.concat((identity_df, lcs_df))

identity_df.to_csv(ID_DF_NAME + f'_final.csv', index=False)
lcs_df.columns = lcs_df.columns.astype(str)
df_values = lcs_df[[str(i) for i in range(0,num_timesteps)]].T
df_values.columns = df_values.columns.astype(str)
df_values.reset_index(drop=True, inplace=True)
df_values.columns = [str(i) for i in range(df_values.shape[1])]
df_values.to_parquet(VALUE_DF_NAME + f'_final.parquet', engine="fastparquet", compression="snappy")


json.dump({'label_map': label_map, 'next_code': next_code}, open("label_mapping.json", "w"))