In [1]:
import pandas as pd
import re
from multiprocessing import Pool
import subprocess
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

In [2]:
# Build a (kepid, wget command) table from the downloaded wget batch script.
filename = 'Kepler_TCE_DV_DR25_wget.bat'
tbl_command_suffix = ".tbl' -a search_14666024.log"
kepid_pattern = re.compile(r'kplr(\d+)_')

rows = []
with open(filename, "r") as f:
    for line in f:
        command = line.strip()
        # Keep only wget commands that fetch a .tbl file.
        if not command.startswith('wget'):
            continue
        if not command.endswith(tbl_command_suffix):
            continue
        kepid_match = kepid_pattern.search(line)
        if kepid_match is None:
            continue
        rows.append([kepid_match.group(1), command])

kepid_wget = pd.DataFrame(rows, columns=['kepid', 'wget_script'])
# The regex captures the id as text; convert it to int.
kepid_wget['kepid'] = kepid_wget['kepid'].apply(int)
kepid_wget.to_csv('kepid_wget_scripts.csv')

In [3]:
# Sanity check: shape of the parsed command table as (rows, columns).
kepid_wget.shape

(34031, 2)

In [4]:
# Load the KOI table; '#' is passed to read_csv as the comment character.
koi_csv_path = 'koi_last_cumulative.csv'
koi = pd.read_csv(koi_csv_path, comment='#')
koi.shape

(9564, 141)

In [5]:
# Compare the kepids of the KOI table with those that have a wget command.
koi_kepids = set(koi['kepid'])
wget_kepids = set(kepid_wget['kepid'])

kepid_intersection = list(koi_kepids & wget_kepids)  # present in both tables
kepid_not_in_koi = list(wget_kepids - koi_kepids)    # have a wget command but no KOI row
len(kepid_not_in_koi)

10088

In [6]:
# Folder the table files are stored in.
TABLES_FOLDER = './tbls_not_in_koi'
# Kepids to process. Despite the name this is a plain list (a shallow copy,
# so later edits do not touch kepid_not_in_koi).
WORKING_DF = list(kepid_not_in_koi)

In [7]:
#drop already present
# Drop kepids that already have a table file in TABLES_FOLDER.
tables_dir = Path(TABLES_FOLDER)
# Create the folder on a first run: iterdir() raises FileNotFoundError when
# the folder is missing.
tables_dir.mkdir(parents=True, exist_ok=True)

present_kepids = set()
for tbl_file in tables_dir.iterdir():
    # Match on the file name only, so digits in the folder path can never be
    # mistaken for a kepid.
    r = re.search(r'kplr(\d+)_', tbl_file.name)
    if r:
        present_kepids.add(int(r.group(1)))

# Rebuild the list instead of calling .remove() per file: set membership keeps
# this linear, and re-running the cell gives the same result.
# NOTE(review): a kepid is dropped as soon as ONE of its files is present; if a
# kepid has several tables and only some were fetched, the rest are skipped --
# confirm this is intended.
WORKING_DF = [kepid for kepid in WORKING_DF if kepid not in present_kepids]

In [8]:
# Number of kepids left in WORKING_DF once the already-present ones are dropped.
len(WORKING_DF)

7318

In [9]:
def process_kepid_list_download(kepid_command_df, kepid_sublist, dest_folder, num_threads=4):
    """Run the stored shell command of every row whose kepid is in ``kepid_sublist``.

    Parameters
    ----------
    kepid_command_df : pandas.DataFrame
        Table with a 'kepid' column and a 'wget_script' column; the latter
        holds the shell command executed for that row.
    kepid_sublist : iterable
        kepids whose commands should be executed.
    dest_folder : str or path-like
        Working directory each command is run in.
    num_threads : int, default 4
        Maximum number of commands running at the same time.

    Returns
    -------
    None
        Failures are reported with ``print`` rather than raised, so one broken
        command does not abort the remaining ones.
    """
    submitted = []
    # One pool for the whole sublist: a pool per kepid would be joined after
    # each kepid, limiting concurrency to the rows of a single kepid.
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        for kepid in kepid_sublist:
            matching = kepid_command_df[kepid_command_df['kepid'] == kepid]
            for command in matching['wget_script']:
                # NOTE(review): shell=True runs the string verbatim -- only feed
                # commands that come from a trusted script.
                future = executor.submit(subprocess.run, command, shell=True, cwd=dest_folder)
                submitted.append((command, future))

    # Leaving the with-block waited for every command; now surface the
    # failures that would otherwise stay hidden inside the futures.
    for command, future in submitted:
        try:
            completed = future.result()
        except (OSError, subprocess.SubprocessError) as exc:
            # e.g. dest_folder does not exist
            print("download could not be started ({!r}): {}".format(exc, command))
            continue
        if completed.returncode != 0:
            print("download failed (exit code {}): {}".format(completed.returncode, command))

def run(process_num, num_threads, kepids_list, dest_folder, kepid_command_df=None):
    """Split ``kepids_list`` into ``process_num`` chunks and download them concurrently.

    Parameters
    ----------
    process_num : int
        Number of chunks, and of workers processing them in parallel.
    num_threads : int
        Passed to ``process_kepid_list_download`` as the number of concurrent
        commands inside each worker.
    kepids_list : list
        kepids to download; it is sliced, so it must support ``[i::n]``.
    dest_folder : str or path-like
        Working directory the download commands are run in.
    kepid_command_df : pandas.DataFrame, optional
        Table of kepids and commands. Defaults to the notebook-level
        ``kepid_wget`` so existing calls keep working.

    Notes
    -----
    Workers are threads rather than ``multiprocessing`` processes. The work
    consists of launching external commands, so threads give the same
    concurrency. A process pool cannot import a function defined in a notebook
    cell when the start method is "spawn", which makes the call fail or hang.
    """
    if kepid_command_df is None:
        kepid_command_df = kepid_wget

    # Round-robin split: chunk i takes elements i, i + process_num, ...
    chunks = [kepids_list[i::process_num] for i in range(process_num)]
    for chunk in chunks:
        print("chunk len", len(chunk))

    with ThreadPoolExecutor(max_workers=process_num) as pool:
        futures = [
            pool.submit(process_kepid_list_download, kepid_command_df, chunk, dest_folder, num_threads)
            for chunk in chunks
        ]
    # Re-raise any exception from a worker instead of dropping it.
    for future in futures:
        future.result()

In [None]:
# Concurrency settings for the download.
PROCESS_NUM = 4  # number of parallel workers (one chunk of kepids each)
THREADS_NUM = 8  # threads inside each of those workers
run(
    process_num=PROCESS_NUM,
    num_threads=THREADS_NUM,
    kepids_list=WORKING_DF,
    dest_folder=TABLES_FOLDER,
)

chunk len 1830
chunk len 1830
chunk len 1829
chunk len 1829
