In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import fastavro
from pathlib import Path
from fastavro import reader
from tqdm import tqdm
from astropy.time import Time
import numpy as np
import gzip
from astropy.io import fits
import io
from astropy import visualization
from matplotlib.colors import Normalize
from matplotlib.figure import Figure
from matplotlib.backends.backend_pdf import PdfPages
from nuclass import output_dir
from alerce.core import Alerce
import numpy as np

# Read in data

In [None]:
# Replace with your own path!
avro_dir = Path("/Users/robertstein/Downloads/ztf_public_20191001/")
# is_dir() rather than exists(): a regular file at this path would pass an
# exists() check, and the "*.avro" glob over it would then find nothing.
assert avro_dir.is_dir(), f"No avros found at {avro_dir}"

In [None]:
# Path.glob yields entries in arbitrary (filesystem-dependent) order; sorting
# makes the alerts load in the same order on every run.
avro_files = sorted(avro_dir.glob("*.avro"))

In [None]:
# Loop to read in avro files from a directory


def flatten_alert(record: dict) -> dict:
    """
    Flatten one nested ZTF alert record into a single-level dictionary.

    :param record: One alert record, as yielded by the fastavro reader
    :return: Dictionary holding the top-level fields, every field of
        ``record["candidate"]``, a ``prv_candidates`` DataFrame and the raw
        ``stampData`` of each cutout
    """
    # Copy the top-level fields, leaving out the nested entries handled below
    # (the substring test on "candidate" also matches "prv_candidates")
    flat = {
        key: value
        for key, value in record.items()
        if "candidate" not in key and "cutout" not in key
    }
    # Promote the nested candidate fields to top-level keys
    flat.update(record["candidate"])
    flat["prv_candidates"] = pd.DataFrame(record["prv_candidates"])
    # Keep only the image payload of each cutout entry
    for key, value in record.items():
        if "cutout" in key:
            flat[key] = value["stampData"]
    return flat


records = []
for avro_path in tqdm(avro_files):
    with open(avro_path, "rb") as avro_f:
        try:
            # Constructing the reader is inside the try as well, so a file
            # whose header cannot be read is skipped like any other bad file
            avro_reader = reader(avro_f)
            schema = avro_reader.writer_schema  # overwritten for every file
            for record in avro_reader:
                records.append(flatten_alert(record))
        except ValueError:
            # Records appended before the failure are kept
            print(f"problem with {avro_path}. Skipping!")

In [None]:
# Build one table from the flattened alert dictionaries collected in `records`
# (one row per record), then display it
df = pd.DataFrame(records)
df

The ZTF data contains roughly 100000 detections for that one night of data. Too many to classify spectroscopically!

We need to cut aggressively to get this down to a reasonable number.

# Cutting down the ZTF data

In [None]:
# Show every column name as a plain list
list(df.columns)

These are all the fields available to us. The names may not be obvious to you.

Look at the ZTF avro alert schema to work out what the field names mean, and which ones you need to use in this section: https://zwickytransientfacility.github.io/ztf-avro-alert/schema.html

In [None]:
# Look up the information for neutrino IceCube-191009A from the GCN Circular website (https://gcn.nasa.gov/circulars)

# Bounds of the rectangle used for the RA/DEC cut further down
# (presumably in the same units as the alert position columns - confirm against the schema)
ra_min = # Your answer goes here
ra_max = # Your answer goes here
dec_min = # Your answer goes here
dec_max = # Your answer goes here

# Neutrino arrival time as a Julian Date: it is drawn as a vertical line on the
# `jd` axis of the light-curve plots
neutrino_time_jd = # Your answer goes here

In [None]:
# Cut sources based on RA/DEC
# Sources should lie in the rectangle spanned by ra_min/ra_max and dec_min/dec_max
# `mask` is used to index `df`, so it should be a boolean Series with one entry per row (True = keep)

mask = # your answer here
df = df[mask]

In [None]:
# Remove moving objects
# Moving objects are not detected twice in the same place,
# so keep only sources that have been detected more than once at their position

mask = # your answer here
df = df[mask]

In [None]:
# Remove image artefact ("bogus") detections
# ZTF includes a score for this, and I recommend removing sources with scores < 0.6

mask = # your answer here
df = df[mask]

In [None]:
# Remove likely stars
# Nearby sources (detected in 'PS1') are classified automatically in the ZTF data
# I suggest you remove a ZTF source if there is a PS1 source within 2" of it and that PS1 source is classified as a likely star (>0.5)

mask = # your answer here
df = df[mask]

In [None]:
# Select transients that are brighter than in the reference image.
# These are sources which come from a 'positive subtraction'

mask = # your answer here
df = df[mask]

In [None]:
# deduplicate: keep only the latest detection (largest jd) of each objectId
n_before = len(df)

# After sorting by jd, keep="last" retains the most recent row of every object.
# Compared with looping over set(df["objectId"]) this gives the same row order
# on every run (set order of strings is not reproducible between kernels),
# preserves the column dtypes and avoids rescanning the table once per object.
# The stable sort makes ties in jd resolve deterministically.
df = df.sort_values(by="jd", kind="stable").drop_duplicates(
    subset="objectId", keep="last"
)

print(f"Removing {n_before - len(df)}, leaving {len(df)} sources")

In [None]:
# Display the candidates that survived all cuts
df

You should now have a more manageable number of sources left. Out of 100000, I had 8 left at this stage.

# Vetting Candidates

Now comes the critical stage. We will not be able to take spectra of all remaining sources. Instead, we need to do our best to work out what they are first.

We should check if any are classified already, and whether any are known variable sources, e.g. AGN/stars.

We can plot the source lightcurves to get an idea of what things might be, as well.

In [None]:
# Legend label for each value of the "fid" (filter) column
BAND_NAMES = {1: "g", 2: "r", 3: "i"}

# Matplotlib colour used to draw each value of the "fid" (filter) column
BAND_COLORS = {1: "g", 2: "r", 3: "orange"}

def decode_img(compressed_bytes: bytes) -> np.ndarray:
    """
    Decode a cutout, stored as a gzip-compressed FITS file, into an array.

    :param compressed_bytes: Raw bytes of the gzipped FITS file
    :return: Data of the first HDU in the file
    """
    compressed_stream = io.BytesIO(compressed_bytes)
    with gzip.GzipFile(fileobj=compressed_stream, mode="r") as unzipped:
        fits_stream = io.BytesIO(unzipped.read())
        with fits.open(fits_stream, ignore_missing_simple=True) as hdu_list:
            # Read the array while the FITS file is still open
            return hdu_list[0].data  # pylint: disable=no-member


def generate_single_page(
    row: pd.Series,
    ann_fields: list[str] | None = None,
    neutrino_jd: float | None = None,
) -> plt.Figure:
    """
    Generate a summary page for a given row of data.

    The top half shows one panel per cutout image. The bottom-left panel shows
    the light curve (previous detections plus this one, positive subtractions
    only) with the neutrino time marked, and the bottom-right panel lists the
    requested annotation fields.

    :param row: Single detection in the data
    :param ann_fields: Fields to annotate
    :param neutrino_jd: Julian Date at which to draw the neutrino marker.
        Defaults to the notebook-level ``neutrino_time_jd``
    :return: Figure
    """
    if ann_fields is None:
        ann_fields = ["candid", "sgscore1", "distpsnr1", "drb", "jdstarthist"]
    if neutrino_jd is None:
        # Fall back to the notebook-level value for existing callers
        neutrino_jd = neutrino_time_jd

    cutouts = [x for x in row.index if "cutout" in x]

    base_width = 5.0

    fig = plt.figure(figsize=(len(cutouts) * base_width, 2.0 * base_width))

    # Top row: one image panel per cutout
    for i, cutout in enumerate(cutouts):
        ax = fig.add_subplot(2, len(cutouts), i + 1)

        data = decode_img(row[cutout])

        # Rescale to [0, 1] before stretching; a flat (or all-NaN) image has no
        # usable range, so fall back to a unit span instead of dividing by zero
        vmin, vmax = np.nanpercentile(data, [0, 100])
        span = vmax - vmin
        if not span > 0:
            span = 1.0
        data_ = visualization.AsinhStretch()((data - vmin) / span)
        ax.imshow(
            data_,
            norm=Normalize(*np.nanpercentile(data_, [0.5, 99.5])),
            aspect="auto",
        )
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_title(cutout.split("cutout")[1], fontdict={"fontsize": "small"})

    ax_l = fig.add_subplot(2, 2, 3)

    # Combine history and latest detection
    hist = pd.concat([row["prv_candidates"], row.to_frame().T], axis=0)

    # Only positive subtractions are plotted; the flag is matched against the
    # several spellings listed here
    is_positive = hist["isdiffpos"].isin(["t", "true", "True", 1, "1", 1.0])

    # Plot lightcurve, colour-coded by filter (sorted for a stable legend order)
    for fid in sorted(set(hist["fid"].dropna())):
        band_hist = hist[(hist["fid"] == fid) & is_positive]
        ax_l.errorbar(
            band_hist["jd"],
            band_hist["magpsf"],
            abs(band_hist["sigmapsf"]),
            fmt=".",
            label=BAND_NAMES[fid],
            color=BAND_COLORS[fid],
            mec="black",
            mew=0.5,
        )

    # Draw the neutrino marker BEFORE building the legend: the legend only
    # lists artists that already exist when it is created
    ax_l.axvline(neutrino_jd, label=r"$\nu$", linestyle=":")
    ax_l.legend()

    # Astronomers plot magnitude upside down
    ax_l.set_xlabel("JD")
    ax_l.set_ylabel("mag")
    ax_l.invert_yaxis()

    # Add a bunch of annotations to the PDF
    # One row per entry in ann_fields

    ax_text = fig.add_subplot(2, 2, 4)
    ax_text.axis(False)

    plot_fields = []

    for field in ann_fields:
        val = row[field]
        if isinstance(val, float):
            plot_fields.append(f"{field}: {val:.3f}")
        else:
            plot_fields.append(f"{field}: {val}")

    ax_text.annotate(
        "\n".join(plot_fields), xy=(0.05, 0.98), xycoords="axes fraction", va="top"
    )
    fig.suptitle(f"{row['objectId']}")
    return fig

In [None]:
# Preview the page for the first remaining candidate
fig = generate_single_page(df.iloc[0])

In [None]:
outpath = output_dir / "candidates.pdf"
print(f"Saving PDF to {outpath}")

# Write one PDF page per remaining candidate
with PdfPages(outpath) as pdf:
    for row_label, row in tqdm(df.iterrows(), total=len(df)):
        page = generate_single_page(row)
        pdf.savefig(page)
        # Close each figure once saved so open figures do not accumulate
        plt.close(page)