In [1]:
import os
from pathlib import Path

import pandas as pd
from tqdm import tqdm

from snowshovel.io import read_single_alert, parse_alerts

%matplotlib inline

Set up the config here for running the notebook

In [2]:
# Directory containing the unzipped avro alerts.
# Set the ZTF_ALERT_DIR environment variable to point the notebook at your own
# copy of the data without editing this cell; the path below is only the
# fallback that is used when that variable is not set.
data_dir = Path(
    os.environ.get("ZTF_ALERT_DIR", "C:/Users/natha/Downloads/ztf_public_20230609/")
)

# Set the number of alerts to read
n_to_read = 1000

## What is an avro alert?

First, let's look at a single alert to see what it looks like:

In [3]:
# Pick one alert file from the data directory to inspect on its own
alert_path = data_dir / "2350249170015010010.avro"
print(f"We will look at the avro alert with path '{alert_path}'")

We will look at the avro alert with path 'C:\Users\natha\Downloads\ztf_public_20230609\2350249170015010010.avro'


In [4]:
# Load the single alert file chosen above; the result is displayed as a plain
# Python dictionary of field name -> value.
alert = read_single_alert(alert_path)
print("An avro alert is just a python dictionary, as you can see below:")
# Leaving the variable as the last expression makes the notebook render it
alert

An avro alert is just a python dictionary, as you can see below:


{'jd': 2460104.7491782,
 'fid': 1,
 'pid': 2350249170015,
 'diffmaglim': 20.196775436401367,
 'pdiffimfilename': 'ztf_20230609249155_000326_zg_c01_o_q1_scimrefdiffimg.fits',
 'programpi': 'Kulkarni',
 'programid': 1,
 'candid': 2350249170015010010,
 'isdiffpos': 'f',
 'tblid': 10,
 'nid': 2350,
 'rcid': 0,
 'field': 326,
 'xpos': 1293.8194580078125,
 'ypos': 841.447509765625,
 'ra': 230.9148832,
 'dec': -19.2385929,
 'magpsf': 17.376073837280273,
 'sigmapsf': 0.047354377806186676,
 'chipsf': 14.490418434143066,
 'magap': 17.40329933166504,
 'sigmagap': 0.03269999846816063,
 'distnr': 0.16106265783309937,
 'magnr': 15.559000015258789,
 'sigmagnr': 0.013000000268220901,
 'chinr': 0.6050000190734863,
 'sharpnr': -0.017999999225139618,
 'sky': 0.5778443217277527,
 'magdiff': 0.027225999161601067,
 'fwhm': 2.9700000286102295,
 'classtar': 0.9950000047683716,
 'mindtoedge': 841.447509765625,
 'magfromlim': 2.793475389480591,
 'seeratio': 0.9369040727615356,
 'aimage': 0.8059999942779541,
 'b

## Handling many avro alerts:

That was just one Avro alert, but our data will contain many sources, each with its own alert. Python dictionaries are not very convenient for a large number of entries. Instead, we can use a function to read in a number of the alerts and convert them to a pandas DataFrame.

In [5]:
# Read up to n_to_read alerts from data_dir into one pandas DataFrame
# (one row per alert). A progress bar is shown while the files are parsed.
alerts_df = parse_alerts(data_dir=data_dir, n_to_read=n_to_read)
print("Here you can see a pandas dataframe. The avro alert above is the first row in the table.")
# Leaving the variable as the last expression makes the notebook render the table
alerts_df

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:00<00:00, 16.44it/s]


Here you can see a pandas dataframe. The avro alert above is the first row in the table.


Unnamed: 0,jd,fid,pid,diffmaglim,pdiffimfilename,programpi,programid,candid,isdiffpos,tblid,...,zpmed,clrmed,clrrms,neargaia,neargaiabright,maggaia,maggaiabright,exptime,drb,drbversion
0,2.460105e+06,1,2350249170015,20.196775,ztf_20230609249155_000326_zg_c01_o_q1_scimrefd...,Kulkarni,1,2350249170015010010,f,10,...,26.086000,0.554,0.230397,0.228274,-999.000000,15.315500,-999.000000,30.0,0.999971,d6_m7
1,2.460105e+06,1,2350249170015,20.196775,ztf_20230609249155_000326_zg_c01_o_q1_scimrefd...,Kulkarni,1,2350249170015010011,f,11,...,26.086000,0.554,0.230397,0.066207,-999.000000,14.556261,-999.000000,30.0,0.999255,d6_m7
2,2.460105e+06,1,2350249170015,20.196775,ztf_20230609249155_000326_zg_c01_o_q1_scimrefd...,Kulkarni,1,2350249170015010013,f,13,...,26.086000,0.554,0.230397,0.175491,-999.000000,16.767410,-999.000000,30.0,1.000000,d6_m7
3,2.460105e+06,1,2350249170015,20.196775,ztf_20230609249155_000326_zg_c01_o_q1_scimrefd...,Kulkarni,1,2350249170015010023,f,23,...,26.086000,0.554,0.230397,0.398773,-999.000000,16.487196,-999.000000,30.0,1.000000,d6_m7
4,2.460105e+06,1,2350249170015,20.196775,ztf_20230609249155_000326_zg_c01_o_q1_scimrefd...,Kulkarni,1,2350249170015010024,f,24,...,26.086000,0.554,0.230397,0.105536,52.636383,18.771164,13.606915,30.0,0.999883,d6_m7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2.460105e+06,1,2350249650415,20.518072,ztf_20230609249641_000427_zg_c02_o_q1_scimrefd...,Kulkarni,1,2350249650415015015,t,15,...,26.273001,0.569,0.284820,38.405056,-999.000000,19.375727,-999.000000,30.0,0.999941,d6_m7
996,2.460105e+06,1,2350249650415,20.518072,ztf_20230609249641_000427_zg_c02_o_q1_scimrefd...,Kulkarni,1,2350249650415015017,t,17,...,26.273001,0.569,0.284820,0.110178,74.885857,19.273678,13.588749,30.0,0.999999,d6_m7
997,2.460105e+06,1,2350249650515,20.420591,ztf_20230609249641_000427_zg_c02_o_q2_scimrefd...,Kulkarni,1,2350249650515010002,f,2,...,26.290001,0.560,0.299331,0.184441,84.466194,15.270944,13.869880,30.0,0.999985,d6_m7
998,2.460105e+06,1,2350249650515,20.420591,ztf_20230609249641_000427_zg_c02_o_q2_scimrefd...,Kulkarni,1,2350249650515010003,f,3,...,26.290001,0.560,0.299331,0.112007,-999.000000,15.355320,-999.000000,30.0,1.000000,d6_m7


The table above contains many rows and columns, so it can be hard to spot the important information. Let's display just a few relevant columns, rather than all 103.

In [6]:
# The handful of alert fields we care about, each with the question it answers
useful_columns = [
    # Is the detection positive (more flux in science image than reference),
    # or negative (less flux)?
    "isdiffpos",
    # What is the sky position in Right Ascension?
    "ra",
    # What is the sky position in Declination?
    "dec",
    # What is the magnitude (brightness) of the detection?
    "magpsf",
    # What is the uncertainty on the magnitude?
    "sigmapsf",
    # How close is the detection to a known source in the reference image?
    "distnr",
    # How many times has this object been detected?
    "ndethist",
]

# Keep only those columns; the full table stays available as alerts_df
slim_alerts_df = alerts_df[useful_columns]
slim_alerts_df

Unnamed: 0,isdiffpos,ra,dec,magpsf,sigmapsf,distnr,ndethist
0,f,230.914883,-19.238593,17.376074,0.047354,0.161063,620
1,f,230.630090,-19.273569,17.239075,0.078380,0.173305,639
2,f,231.014147,-19.282204,18.312075,0.061234,0.134472,611
3,f,230.909926,-19.485533,18.983074,0.085728,0.424015,721
4,f,230.435069,-19.499763,20.108074,0.164924,0.145266,65
...,...,...,...,...,...,...,...
995,t,222.018785,-5.363314,20.120691,0.146097,12.357216,1
996,t,222.300244,-5.390949,19.616692,0.098350,0.192763,134
997,f,220.804638,-4.758334,19.417479,0.189056,0.228310,104
998,f,220.716001,-4.964255,18.074478,0.091027,0.163497,513


In [7]:
# Bad subtractions are often not repeated.
# Let's restrict ourselves to sources which are detected at least twice.
#
# The selection is rebuilt from alerts_df rather than from slim_alerts_df itself,
# so re-running this cell always produces the same table. Filtering the frame in
# place would stack an extra index column onto it on every re-run and eventually
# fail on a column-name clash.
mask = alerts_df["ndethist"] > 1

# reset_index() renumbers the surviving rows from 0 and keeps each alert's
# original row number in a column called "index".
slim_alerts_df = alerts_df.loc[mask, useful_columns].reset_index()
print(f"There are {len(slim_alerts_df)} alerts passing our cut")
slim_alerts_df

There are 409 alerts passing our cut


Unnamed: 0,index,isdiffpos,ra,dec,magpsf,sigmapsf,distnr,ndethist
0,0,f,230.914883,-19.238593,17.376074,0.047354,0.161063,620
1,1,f,230.630090,-19.273569,17.239075,0.078380,0.173305,639
2,2,f,231.014147,-19.282204,18.312075,0.061234,0.134472,611
3,3,f,230.909926,-19.485533,18.983074,0.085728,0.424015,721
4,4,f,230.435069,-19.499763,20.108074,0.164924,0.145266,65
...,...,...,...,...,...,...,...,...
404,993,t,222.257711,-5.459212,19.301691,0.095494,9.619986,2
405,996,t,222.300244,-5.390949,19.616692,0.098350,0.192763,134
406,997,f,220.804638,-4.758334,19.417479,0.189056,0.228310,104
407,998,f,220.716001,-4.964255,18.074478,0.091027,0.163497,513
