In [3]:
# Clone or update the repo, then install only what Colab lacks
!rm -rf technosignature-pipeline-v2
!git clone https://github.com/rbaner21/technosignature-pipeline-v2.git
!pip install -q s3fs astroquery lightkurve papermill pyarrow pandas


Cloning into 'technosignature-pipeline-v2'...
remote: Enumerating objects: 81, done.[K
remote: Counting objects: 100% (81/81), done.[K
remote: Compressing objects: 100% (76/76), done.[K
remote: Total 81 (delta 28), reused 23 (delta 1), pack-reused 0 (from 0)[K
Receiving objects: 100% (81/81), 28.50 KiB | 2.04 MiB/s, done.
Resolving deltas: 100% (28/28), done.


In [None]:
import os

import pandas as pd

# This cell writes under inputs/ and runs before any cell that creates the
# directory; DataFrame.to_csv does not create missing parent directories, so
# make sure it exists to keep "Restart & Run All" working.
os.makedirs("inputs", exist_ok=True)

# Pull the confirmed-planets table (all columns of `pscomppars`) as CSV from
# the Exoplanet Archive TAP sync endpoint.
url = (
    "https://exoplanetarchive.ipac.caltech.edu/TAP/sync"
    "?query=select+*+from+pscomppars&format=csv"
)
pl = pd.read_csv(url)

# Persist a local copy; later cells re-read it from this path.
pl.to_csv("inputs/confirmed_planets.csv", index=False)
print(f"✅ Confirmed planets: {len(pl)} rows saved")


In [None]:
import os

import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.fs    as fs
import pyarrow.parquet as pq

# Prepare output: start from a clean file so re-running the cell does not
# mix results from a previous run.
os.makedirs("inputs", exist_ok=True)
neo_out = "inputs/neowise_filtered.parquet"
if os.path.exists(neo_out):
    os.remove(neo_out)

# Point at the public NEOWISE Parquet bucket (anonymous access).
# When an explicit filesystem object is supplied, the source is given as a
# "bucket/key" path rather than an "s3://" URI.
s3 = fs.S3FileSystem(region="us-west-2", anonymous=True)
dataset = ds.dataset(
    "nasa-irsa-wise/wise/neowiser",
    filesystem=s3,
    format="parquet",
    partitioning="hive",
)

# Scan in 100k-row batches, reading only the columns needed for the
# W2-W1 > 0.5 colour cut.
scanner = dataset.scanner(
    columns=["ra", "dec", "w1mpro", "w2mpro"],
    batch_size=100_000,
)

# A single ParquetWriter appends one row group per batch. (pandas'
# to_parquet has no append mode with the pyarrow engine.) The writer is
# created lazily from the first batch's schema; if the scan yields no
# batches, no output file is created.
writer = None
try:
    # to_batches() yields plain RecordBatch objects, which have to_pandas();
    # scan_batches() yields tagged (batch, fragment) pairs instead.
    for i, batch in enumerate(scanner.to_batches()):
        df = batch.to_pandas()
        # Rows with a missing magnitude compare as False and are dropped.
        df["ir_flag"] = (df["w2mpro"] - df["w1mpro"]) > 0.5
        flagged = df[df["ir_flag"]]

        table = pa.Table.from_pandas(flagged, preserve_index=False)
        if writer is None:
            writer = pq.ParquetWriter(neo_out, table.schema)
        writer.write_table(table)

        print(f"Batch {i:03d}: scanned {len(df)}, flagged {len(flagged)}")
finally:
    # Always close so the Parquet footer is written even if the scan fails.
    if writer is not None:
        writer.close()
print("✅ NEOWISE IR‐excess streaming complete")


In [None]:
import os, pandas as pd

# Download full BL hits CSV (one time)
url = "https://public.breakthroughlisten.org/dataset/hits.csv"
pd.read_csv(url, nrows=0).to_csv  # just test URL
!wget -qO inputs/bl_hits.csv {url}

# Stream & flag 1 420 MHz hits
bl_in  = "inputs/bl_hits.csv"
bl_out = "inputs/bl_filtered.csv"
if os.path.exists(bl_out): os.remove(bl_out)

for chunk in pd.read_csv(bl_in, chunksize=1_000_000):
    mask = chunk["frequency_mhz"].between(1419,1421)
    chunk[mask].to_csv(
        bl_out,
        mode="a",
        index=False,
        header=not os.path.exists(bl_out)
    )
    print(f"Processed {len(chunk)}, kept {mask.sum()}")
print("✅ BL radio‐hit streaming complete")


In [None]:
import numpy as np
import pandas as pd
from astroquery.mast import Observations
import lightkurve as lk
from astropy.timeseries import BoxLeastSquares

pl = pd.read_csv("inputs/confirmed_planets.csv")


def _tic_number(tic):
    """Return the integer TIC number for a catalogue value.

    Accepts both purely numeric values (e.g. ``123.0``) and strings that
    carry a ``"TIC"`` prefix (e.g. ``"TIC 123"``). Raises ``ValueError``
    when the remainder is not a number.
    """
    text = str(tic).strip()
    if text.upper().startswith("TIC"):
        text = text[3:].strip()
    return int(float(text))


# Trial periods are the same for every target, so build the grid once.
periods = np.linspace(0.5, 10, 5000)
flag_columns = ["tic_id", "tess_flag", "tess_period", "tess_snr"]

flags = []
for tic in pl["tic_id"].dropna().unique():
    try:
        # Download & flatten
        lc = lk.search_lightcurve(f"TIC {_tic_number(tic)}", mission="TESS").download()
        if lc is None:
            # Nothing was downloaded for this target; report and move on.
            print(f"TIC {tic} error:", "no TESS light curve found")
            continue
        lc_flat = lc.remove_nans().flatten(window_length=401)

        # Hand BLS plain float arrays: strip any unit/time wrappers and drop
        # non-finite samples, so the unitless period grid is consistent with
        # the time axis.
        time = np.asarray(getattr(lc_flat.time, "value", lc_flat.time), dtype=float)
        flux = np.asarray(getattr(lc_flat.flux, "value", lc_flat.flux), dtype=float)
        finite = np.isfinite(time) & np.isfinite(flux)
        time, flux = time[finite], flux[finite]

        # BoxLeastSquares power. Depth is a field of the object returned by
        # power(), not an attribute of the BoxLeastSquares instance.
        bls = BoxLeastSquares(time, flux)
        result = bls.power(periods, 0.1)
        power = np.asarray(result.power, dtype=float)
        best = int(np.argmax(power))
        # Use the depth at the strongest peak so that depth, period and SNR
        # all describe the same candidate signal.
        depth = float(np.asarray(result.depth, dtype=float)[best])
        snr = float(power[best] / np.std(flux))
        flags.append({
            "tic_id":    tic,
            "tess_flag": bool((depth > 0.05) & (snr > 20)),
            "tess_period": float(periods[best]),
            "tess_snr":    snr
        })
    except Exception as e:
        # Best effort: one bad target must not stop the whole sweep.
        print(f"TIC {tic} error:", e)

# Explicit columns keep the CSV header even when no target succeeded, so
# downstream read_csv calls do not hit an empty file.
pd.DataFrame(flags, columns=flag_columns).to_csv("inputs/tess_flags.csv", index=False)
print(f"✅ TESS flagging complete ({len(flags)} objects)")


In [None]:
from astroquery.gaia import Gaia
import pandas as pd

pl = pd.read_csv("inputs/confirmed_planets.csv")
gaia_columns = ["planet_id", "gaia_flag", "ruwe"]

gaia_flags = []
for _, row in pl.iterrows():
    ra, dec = row["ra"], row["dec"]
    if pd.isna(ra) or pd.isna(dec):
        # No position means no cone search; skip without a network round trip.
        print("Gaia error:", f"{row['pl_name']}: missing ra/dec")
        continue
    ra, dec = float(ra), float(dec)

    # 1-arcsec (0.00027778 deg) cone search around the catalogue position.
    # ADQL limits rows with "SELECT TOP n" (there is no LIMIT clause), and
    # ordering by separation makes the single returned row the nearest
    # source rather than an arbitrary one inside the cone.
    q = f"""
      SELECT TOP 1 ruwe,
             DISTANCE(
               POINT('ICRS',ra,dec),
               POINT('ICRS',{ra:.8f},{dec:.8f})
             ) AS sep
      FROM gaiaedr3.gaia_source
      WHERE CONTAINS(
        POINT('ICRS',ra,dec),
        CIRCLE('ICRS',{ra:.8f},{dec:.8f},0.00027778)
      )=1
      ORDER BY sep ASC
    """
    try:
        res = Gaia.launch_job(q).get_results().to_pandas()
        ruwe = res["ruwe"].iloc[0] if not res.empty else None
        # Treat "no match" and a missing/NaN RUWE the same way, instead of
        # relying on truthiness (which mishandles NaN and 0.0).
        has_ruwe = ruwe is not None and bool(pd.notna(ruwe))
        gaia_flags.append({
            "planet_id": row["pl_name"],
            "gaia_flag": bool(has_ruwe and ruwe > 1.4),
            "ruwe":       float(ruwe) if has_ruwe else None
        })
    except Exception as e:
        # Best effort: report which planet failed and keep going.
        print("Gaia error:", f"{row['pl_name']}:", e)

# Explicit columns keep the CSV header even when every query failed.
pd.DataFrame(gaia_flags, columns=gaia_columns).to_csv("inputs/gaia_flags.csv", index=False)
print(f"✅ Gaia RUWE flagging complete ({len(gaia_flags)} objects)")


In [None]:
from astroquery.eso import Eso
import pandas as pd, os, glob

# 1) Download one HARPS file (example)
harps_dir = "inputs/harps/"
os.makedirs(harps_dir, exist_ok=True)

eso = Eso()
# eso.login()  # if you have ESO creds

# Query the HARPS instrument table and fetch the first listed product by its
# dataset identifier; retrieve_data takes identifiers and a `destination`
# directory.
# NOTE(review): the identifier column name appears to differ between
# astroquery releases ("DP.ID" vs "dp_id"); both are tried here - confirm
# against the installed version.
prod = None
harps_rows = eso.query_instrument("harps")
if harps_rows is None or len(harps_rows) == 0:
    print("No HARPS rows returned by the ESO archive; nothing downloaded")
else:
    id_col = "DP.ID" if "DP.ID" in harps_rows.colnames else "dp_id"
    prod = str(harps_rows[id_col][0])
    eso.retrieve_data([prod], destination=harps_dir)

# 2) Stream all HARPS CSVs and flag spikes
# NOTE(review): only *.csv files under inputs/harps/ are read here; confirm
# that the download step actually produces CSV files with a "flux" column.
spec_columns = ["file", "spike_flag", "spike_count"]

spec_flags = []
# Sorted so the output row order is reproducible between runs.
for f in sorted(glob.glob("inputs/harps/*.csv")):
    df = pd.read_csv(f)
    if "flux" not in df.columns:
        # One malformed file should not abort the whole pass.
        print(f"Skipping {os.path.basename(f)}: no 'flux' column")
        continue
    # A "spike" is any sample more than 5x the file's median flux.
    spike_count = (df["flux"] > 5*df["flux"].median()).sum()
    spec_flags.append({
        "file":       os.path.basename(f),
        "spike_flag": bool(spike_count>0),
        "spike_count": int(spike_count)
    })

# Explicit columns keep the CSV header even when no file was processed, so
# downstream read_csv calls do not hit an empty file.
os.makedirs("inputs", exist_ok=True)
pd.DataFrame(spec_flags, columns=spec_columns).to_csv("inputs/spec_flags.csv", index=False)
print(f"✅ HARPS flagging complete ({len(spec_flags)} files)")
