# Making Jet clusters and exporting them
## Need Aggregator files, SOL_Tc_stats (or T3) and subjects_Tc (or T3) made during the aggregation in BoxTheJets
This notebook takes the jets detected per subject and looks for clusters in space and time. If two jets of different clusters fall within the epsilon given by the user (set by eps and time_eps), they are clustered together to make a jet cluster; this can be repeated such that more jets are added to the cluster. Clusters can only contain one jet per subject, so that nearby jets are detected separately.
The second part of this notebook requires the database of the Zooniverse to make the conversion between pixels and solar coordinates. For now this can only be done on the foxsiadmins computer of the University of Minnesota.

In [None]:
import os
from aggregation import Aggregator, get_subject_image
from aggregation import SOL
from aggregation import json_export_list
from aggregation import get_box_edges, sigma_shape
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from matplotlib.dates import DateFormatter
import numpy as np
from scipy.cluster.hierarchy import dendrogram
plt.style.use('default')
%matplotlib inline

Read in the data made by the aggregation steps. This analysis can also be done using the SOL_T3_stats and subjects_T3 files 

In [None]:
# Suffix of the aggregation output to analyse: 'Tc' or 'T3'. It selects both the
# SOL_<tag>_stats.csv and the subjects_<tag>.csv file, so changing it here
# switches every input of this notebook consistently.
task_tag = 'Tc'
sol_stats_path = f'../SOL_{task_tag}_stats.csv'
subjects_csv_path = f'../subjects_{task_tag}.csv'

aggregator = Aggregator('reductions/point_reducer_hdbscan_box_the_jets.csv',
                        'reductions/shape_reducer_dbscan_box_the_jets.csv')
aggregator.load_extractor_data('extracts/point_extractor_by_frame_box_the_jets.csv',
                               'extracts/shape_extractor_rotateRectangle_box_the_jets.csv')
sol = SOL(sol_stats_path, aggregator)

# Every column is read as text first; the numeric and date columns are
# converted explicitly below.
SOL_small, SOL_subjects, filenames0, times, Num, start, end, notes = np.loadtxt(
    sol_stats_path, delimiter=',', unpack=True, dtype=str)
Num = Num.astype(float)

subjects, date, end_date, ans, agreement, subject_file, subject_sol = np.loadtxt(
    subjects_csv_path, delimiter=',', unpack=True, dtype=str)
date = np.array(date, dtype='datetime64')
end_date = np.array(end_date, dtype='datetime64')
subjects = subjects.astype(int)
agreement = agreement.astype(float)

### Start the Jet clustering, we picked the variables eps = 3.0 and time_eps = 2.0. This was done based on looking at the results.
The epsilon for space clustering is chosen to be 3; however, in some cases with a small viewpoint this can still be quite large. Since the smallest distance is chosen for clustering, this often does not interfere with good results.
The time epsilon means that a jet that was 'missing' in one subject can still be detected if it is present in the next subject. This can mean rapidly recurring jets are clustered together unintentionally.

In [None]:
# Collects the jet clusters of all SOL events that pass the filter below.
Jet_clusters=np.array([])
#Set the space and time epsilon
eps,time_eps=3.0,2.0

for s, SOL_event in enumerate(SOL_small):
    # Positions (within this event's cluster list) of clusters to discard
    del_index=np.array([],dtype=int)
    print(f'SOL_small[{s}] ', SOL_event)
    try:
        clusters, distance_met, point_met, box_met = sol.filter_jet_clusters(SOL_event, eps=eps, time_eps=time_eps)
    except Exception as err:
        # Best effort: an event that cannot be clustered is skipped. Only
        # Exception is caught so the loop can still be interrupted from the
        # keyboard, and the reason is shown so that a genuine error is not
        # mistaken for an event without jets.
        print(f"No jets in {SOL_event}")
        print(f"  reason: {type(err).__name__}: {err}")
        continue
    for j, cluster in enumerate(clusters):
        print(j, len(cluster.jets))
        cluster.adding_new_attr("SOL",SOL_event)
        if len(cluster.jets)==1 and ans[subjects==cluster.jets[0].subject][0]=='n':
            #jets that only last 1 subject and do not have 50% agreement yes are excluded
            del_index=np.append(del_index,j)
    if len(del_index)>0:
        print(f'Remove {len(del_index)} clusters from list due to too low agreement')
        clusters = np.delete(clusters, del_index)
    Jet_clusters=np.append(Jet_clusters,clusters)
        
        

### From here on the code will work on the foxsiadmins computer to have access to the visual files of the Zooniverse subjects. 

In [None]:
from aggregation.image_handler import solar_conversion

def get_solar_distance(subject_id,pair):
    '''
        Return the distance between two pixel positions of a subject after
        converting both of them with solar_conversion.
            Inputs:
            -------
            subject_id : int
                subject_id used in the Zooniverse subject
            pair : np.array
                x,y pixel coordinates of the two end points,
                format [[x1,y1],[x2,y2]]
            Returns:
            --------
            distance
                Euclidean distance between the two converted points, in the
                units returned by solar_conversion
    '''
    first, second = (
        solar_conversion(subject_id, point[0], point[1])
        for point in (pair[0], pair[1])
    )
    delta_x = first[0] - second[0]
    delta_y = first[1] - second[1]
    # Plain Euclidean norm of the separation in the converted coordinates
    return np.sqrt(delta_x**2 + delta_y**2)


### Go through the list of jet clusters and determine their properties in physical coordinates

In [None]:
# Running identifier handed out to the clusters, starting at 1.
ID=1

for C in Jet_clusters:
    print('Jet start')
    # Per-jet measurements of this cluster. Only jets whose solar conversion
    # succeeded contribute an entry, so these arrays can be shorter than C.jets.
    H=np.array([])
    W=np.array([])
    X=np.array([])
    Y=np.array([])
    sig=np.array([])
    # One row per jet in C.jets: [plus, minus] error on the height. Rows of
    # jets that were skipped stay zero.
    H_sig=np.zeros((len(C.jets),2))
    # H_rows[k] is the row of H_sig that belongs to H[k]. The two indices
    # diverge as soon as one jet is skipped, so H_sig must never be indexed
    # with a position in H directly.
    H_rows=[]
    obs_time=np.array([],dtype='datetime64')
    end_time=np.array([],dtype='datetime64')
    for j, jet in enumerate(C.jets):
        print(j, len(C.jets))
        width_pair,height_pair=jet.get_width_height_pairs()
        #Find sigma of maximum height by first getting the pixel height
        H_pix_box=np.sqrt((height_pair[1][0]-height_pair[0][0])**2 +(height_pair[1][1]-height_pair[0][1])**2 )
        # Locate the box height among the cluster values by its integer part
        index=list(map(int, jet.cluster_values)).index(int(H_pix_box))
        #Get the height of the box in pixels for the +-1 sigma
        plus_sigma, minus_sigma = sigma_shape(jet.cluster_values, jet.sigma)
        H_pix_minus= minus_sigma[index]
        H_pix_plus= plus_sigma[index]
        file=subject_file[subjects==jet.subject][0]
        #Get the solar locations on the jet
        try:
            Bx,By=solar_conversion(jet.subject,jet.start[0],jet.start[1])
        except Exception as err:
            # Best effort: a jet that cannot be converted is left without the
            # solar_* attributes and does not enter the cluster statistics.
            print('This one breaks', jet.subject, f'({type(err).__name__}: {err})')
            continue
        Ex,Ey=solar_conversion(jet.subject,jet.end[0],jet.end[1])
        print('Start base',Bx,By)
        print('sigma',jet.sigma)
        #Add as attributes and as a list
        jet.adding_new_attr("solar_start",[Bx,By])
        jet.adding_new_attr("solar_end",[Ex,Ey])
        sig=np.append(sig,jet.sigma)
        X=np.append(X,Bx)
        Y=np.append(Y,By)
        #Get the dates the subjects were observed
        O=date[subjects==jet.subject][0]
        obs_time=np.append(obs_time,O)
        E=end_date[subjects==jet.subject][0]
        end_time=np.append(end_time,E)
        #Calculate the height and width in arcsec
        height=get_solar_distance(jet.subject,height_pair)
        width=get_solar_distance(jet.subject,width_pair)
        #Add as attributes and list
        jet.adding_new_attr("solar_H",height)
        jet.adding_new_attr("solar_W",width)
        H=np.append(H,height)
        W=np.append(W,width)
        H_rows.append(j)
        #Get the error on the height by scaling the height with the (height_sigma/height -1)
        err_plus, err_minus = height*(H_pix_plus/H_pix_box-1) , height*(H_pix_minus/H_pix_box-1)
        H_sig[j]=np.array([err_plus,err_minus])
        jet.adding_new_attr("solar_H_sig",[err_plus,err_minus])

    # Duration in minutes, from the start of the first subject to the end of
    # the last subject of the cluster.
    duration=(end_time[-1]-obs_time[0])/np.timedelta64(1, 'm')
    # Velocity: maximum height divided by the time (in seconds) between the
    # first observation and the observation of the maximum height. It is
    # undefined when the maximum occurs in the first subject.
    peak=np.argmax(H)
    rise_time=(obs_time[peak]-obs_time[0])/np.timedelta64(1, 's')
    if rise_time==0:
        vel=np.nan
    else:
        vel=np.max(H)/rise_time

    C.adding_new_attr("ID",ID)
    C.adding_new_attr('Max_Height', np.max(H))
    # Look the error up through H_rows so it belongs to the jet with the
    # maximum height even when earlier jets of the cluster were skipped.
    C.adding_new_attr('std_maxH', H_sig[H_rows[peak]])
    C.adding_new_attr("Height",np.average(H))
    C.adding_new_attr("std_H",np.std(H))
    C.adding_new_attr("Width",np.average(W))
    C.adding_new_attr("std_W",np.std(W))
    C.adding_new_attr("Bx",np.average(X))
    C.adding_new_attr("std_Bx",np.std(X))
    C.adding_new_attr("By",np.average(Y))
    C.adding_new_attr("std_By",np.std(Y))
    C.adding_new_attr("obs_time",obs_time[0])
    C.adding_new_attr("sigma",np.average(sig))
    C.adding_new_attr("Duration",duration)
    C.adding_new_attr("Velocity",vel)

    ID+=1

### Add the longitude and latitude of the measured basepoints as properties to the Jet_cluster objects

In [None]:
import astropy.units as u
from astropy.coordinates import SkyCoord

import sunpy.map
from sunpy.coordinates import frames


In [None]:
for C in Jet_clusters:
    # Average base point of the cluster as a helioprojective coordinate seen
    # from Earth at the time of the first observation.
    X,Y=C.Bx,C.By
    sky_coord = SkyCoord(X*u.arcsec, Y*u.arcsec, frame=frames.Helioprojective(observer="earth",
                                                                               obstime=str(C.obs_time)))
    Coord=sky_coord.heliographic_stonyhurst
    if np.isnan(Coord.lat):
        # The direct transformation gives NaN for this point; redo it under
        # the spherical-screen assumption to obtain a usable lat/lon.
        print('Coordinates off limb')
        with frames.Helioprojective.assume_spherical_screen(sky_coord.observer):
            Coord=sky_coord.heliographic_stonyhurst
            print(Coord)

    # Store decimal degrees taken from the angle itself. Parsing str(angle)
    # up to the 'd' keeps only the whole-degree part of the sexagesimal text
    # and loses the sign for angles between -1 and 0 degrees.
    C.adding_new_attr("Lat",float(Coord.lat.to_value(u.deg)))
    C.adding_new_attr("Lon",float(Coord.lon.to_value(u.deg)))


## Export the results of the clustering
Export the JetCluster objects to a JSON file
or 
Export the results to a csv file 

In [None]:
# Report every jet that did not receive a solar_H attribute, i.e. a jet for
# which the conversion to solar coordinates was skipped.
Jet_clusters_trimmed=Jet_clusters
# Number of jets without solar_H
d=0
# Indices into Jet_clusters of the clusters containing at least one such jet
incomplete_clusters=[]
for i,C in enumerate(Jet_clusters):
    for jet in C.jets:
        # Only a missing attribute counts as a failure; any other error
        # should surface instead of being swallowed.
        if hasattr(jet,'solar_H'):
            continue
        print(jet.subject,i,'Did not work')
        d+=1
        if i not in incomplete_clusters:
            incomplete_clusters.append(i)

In [None]:
# Positions in Jet_clusters that are left out of the export.
# NOTE(review): presumably the clusters reported as 'Did not work' by the
# preceding check for one particular data set - confirm before reusing these
# indices with other input files.
excluded_cluster_indices = [409, 426]
Jet_clusters_trimmed = np.delete(Jet_clusters, excluded_cluster_indices)

In [None]:
# Write all remaining JetCluster objects to one JSON export whose name carries
# the clustering parameters.
json_export_target = f'exports/Jet_clusters_{eps}_{time_eps}_paper'
json_export_list(Jet_clusters_trimmed, json_export_target)
# A single JetCluster object can be exported on its own with
# Jet_clusters[0].json_export('output_single')

In [None]:
def _cluster_column(attribute):
    '''
        Gather one attribute of every cluster in Jet_clusters into an array of
        strings, in cluster order.
    '''
    return np.array([getattr(cluster, attribute) for cluster in Jet_clusters], dtype=str)


# NOTE(review): the columns are built from Jet_clusters, not from
# Jet_clusters_trimmed - confirm that this is intended.
Cluster_date = _cluster_column('obs_time')
Cluster_SOL = _cluster_column('SOL')
stat_Bx = _cluster_column('Bx')
stat_By = _cluster_column('By')
stat_Lon = _cluster_column('Lon')
stat_Lat = _cluster_column('Lat')
stat_H = _cluster_column('Max_Height')
stat_W = _cluster_column('Width')
stat_dur = _cluster_column('Duration')
stat_vel = _cluster_column('Velocity')
stat_sigma = _cluster_column('sigma')
std_H = _cluster_column('std_maxH')
std_W = _cluster_column('std_W')
std_Bx = _cluster_column('std_Bx')
std_By = _cluster_column('std_By')

In [None]:
# Write the header line and the table of cluster properties to ONE csv file.
# Header and data used to go to two different files (..._paper.csv and
# ..._test.csv), the first handle was never closed, and the data was appended
# on every run. Opening once in 'w' mode inside a with-block fixes all three.
csv_path = f'exports/Jet_clusters_{eps}_{time_eps}_paper.csv'
csv_header = '#date, SOL_event, duration, basepoint_X, std_X, basepoint_Y, std_Y, basepoint_X_longitude, basepoint_Y_latitude, max_height, upper_H, lower_H, avg_width, std_width, velocity, sigma'
# std_H contributes two columns (upper_H, lower_H); all others contribute one.
csv_table = np.column_stack((Cluster_date,Cluster_SOL,stat_dur,stat_Bx,std_Bx,stat_By,std_By,stat_Lon,stat_Lat,stat_H,std_H,stat_W,std_W,stat_vel,stat_sigma))
with open(csv_path,'w') as csvfile:
    csvfile.write(csv_header)
    csvfile.write('\n')
    np.savetxt(csvfile, csv_table, delimiter=",",newline='\n',fmt='%s')