In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from functions import angular_distance
import sys

In [31]:
def get_data_points_before_start_time(flare, matching_rows, before_start_num):
    if matching_rows.shape[0] == 0:
        shmarp_parameters = pd.DataFrame([0]*len(matching_rows.columns), index=matching_rows.columns).transpose()
        #print('SHMARPs:',shmarp_parameters)
    else:
        t_start = pd.to_datetime(flare['t_start'])
        t_start = t_start - pd.Timedelta(hours=before_start_num)
        shmarp_parameters = matching_rows[pd.to_datetime(matching_rows['T_OBS']) < t_start].iloc[-1:]
        if shmarp_parameters.empty: shmarp_parameters = pd.DataFrame([0]*len(matching_rows.columns), index=matching_rows.columns).transpose()
    return shmarp_parameters

def round_to_n(x,n):
    if np.isnan(x) or x == 0: return x
    else: return round(x, -int(np.floor(np.log10(abs(x)))) + (n - 1))

Load the SHMARPs dataset

In [32]:
SHMARPs = pd.read_csv("MERGED_mdi.smarp_cea_96m_1996.04.23_000000_TAI-hmi.sharp_cea_720s_2023.05.09_000000_TAI_v3b.csv") # Read in the data as a single column, skipping the first row (header)
unique_noaa_ar = SHMARPs['NOAA_AR'].unique()
print("Number of unique NOAA_AR in SHMARPs:", len(unique_noaa_ar))
print(SHMARPs.tail(1))

  SHMARPs = pd.read_csv("MERGED_mdi.smarp_cea_96m_1996.04.23_000000_TAI-hmi.sharp_cea_720s_2023.05.09_000000_TAI_v3b.csv") # Read in the data as a single column, skipping the first row (header)


Number of unique NOAA_AR in SHMARPs: 3870
                                                   DBINDEX  \
3354153  hmi.sharp_cea_720s[9520][2023.05.09_00:00:00_TAI]   

                       T_OBS     UNIX_TIME  ARPNUM  NOAA_AR NOAA_ARS  CAR_ROT  \
3354153  05-09-2023 00:00:05  19486.000058    9520        0      NaN     2270   

              USFLUXL  R_VALUE  MEANGBL_GMM  ...       USFLUXZ  CMASKL  \
3354153  6.748416e+19      0.0   120.952713  ...  8.574106e+19   421.0   

           LAT_FWT  CRLT_OBS    LON_FWT    CRLN_OBS  CDELT1      DSUN_OBS  \
3354153  28.806408 -3.392118 -21.547314  117.569489    0.03  1.509737e+11   

           RSUN_OBS  QUALITY  
3354153  950.899353        0  

[1 rows x 21 columns]


Load the Matched Flares-SEP file

In [33]:
flares = pd.read_csv('flares_matched_manual.dat', sep="\s+", header=0)
unique_ar_flares = flares['AR'].unique()
print("Number of unique NOAA_AR in matched flares file:", unique_ar_flares.shape[0])

Number of unique NOAA_AR in matched flares file: 1178


In [34]:
common_ar = np.isin(unique_ar_flares, unique_noaa_ar)
print("Number of Active Regions common to both datasets:", np.sum(common_ar))
print(len(unique_noaa_ar))

Number of Active Regions common to both datasets: 872
3870


Append the SHMARP values to the matched Flares-SEP list

In [35]:
# Initialize empty list to store each row
rows = []
before_start_time = 60 #hours

for index, flare in flares.iterrows():
    matching_rows = SHMARPs[SHMARPs['NOAA_AR'] == flare['AR']] # find the correct AR SHMARP
    shmarp_parameters = get_data_points_before_start_time(flare, matching_rows, before_start_time) # Get the SHMARP datapoint you want
    flare_ = flare.to_frame().T.reset_index(drop=True)  # Convert series to DataFrame, transpose, and reset index
    shmarp_parameters = shmarp_parameters.reset_index(drop=True)  # Reset index
    combined_row = pd.concat([flare_, shmarp_parameters], axis=1)
    combined_row["ANG_DIST_AR"] = angular_distance((combined_row["LAT_FWT"][0], combined_row["LON_FWT"][0]))
    print(combined_row["ANG_DIST_AR"],combined_row["LAT_FWT"][0],combined_row["LON_FWT"][0])
    rows.append(combined_row.values.tolist()[0]) # Convert the combined row to a list or numpy array and append it to rows # .values returns a numpy array, .tolist() converts it to a list 

0   -0.785398
Name: ANG_DIST_AR, dtype: float64 0 0
0   -1.077105
Name: ANG_DIST_AR, dtype: float64 -11.5 -16.08
0   -0.959178
Name: ANG_DIST_AR, dtype: float64 -11.08 -9.19
0   -0.743209
Name: ANG_DIST_AR, dtype: float64 -10.5 3.49
0   -0.609258
Name: ANG_DIST_AR, dtype: float64 -4.44 10.34
0   -0.333082
Name: ANG_DIST_AR, dtype: float64 -4.48 26.43
0   -1.584716
Name: ANG_DIST_AR, dtype: float64 -23.55 -45.87
0   -1.487507
Name: ANG_DIST_AR, dtype: float64 -23.63 -39.79
0   -1.447415
Name: ANG_DIST_AR, dtype: float64 -23.63 -37.28
0   -1.657638
Name: ANG_DIST_AR, dtype: float64 -28.24 -50.65
0   -0.785398
Name: ANG_DIST_AR, dtype: float64 0 0
0   -1.224149
Name: ANG_DIST_AR, dtype: float64 20.52 -23.73
0   -1.187709
Name: ANG_DIST_AR, dtype: float64 5.07 -22.96
0   -0.785398
Name: ANG_DIST_AR, dtype: float64 0 0
0   -0.786445
Name: ANG_DIST_AR, dtype: float64 24.59 5.97
0   -0.730402
Name: ANG_DIST_AR, dtype: float64 24.75 10.11
0   -1.246679
Name: ANG_DIST_AR, dtype: float64 -26.52 

Some Final Touches (machine learning ready file)

In [29]:
# After the loop, convert the master list or array to a DataFrame

combined_df = pd.DataFrame(rows, columns=combined_row.columns)
rows_before = len(combined_df); print('Total number of flares: {}'.format(rows_before))
combined_df = combined_df[combined_df['RSUN_OBS'] != 0] # get rid of the ones that no shmarp information was found
print('-')
print('Flares without SHARP info: {}'.format(rows_before-len(combined_df)))

combined_df['T_OBS'] = pd.to_datetime(combined_df['T_OBS'], format='%m-%d-%Y %H:%M:%S') # Convert 'T_OBS' column to datetime object
combined_df['T_OBS'] = combined_df['T_OBS'].dt.strftime('%Y-%m-%d %H:%M:%S') # Format 'T_OBS' column to match the other date columns
combined_df = combined_df.drop(columns=['DBINDEX','UNIX_TIME', 'NOAA_ARS', 'QUALITY']) #drop some key values (reading reasons)
combined_df['intensity'] = combined_df['intensity'].apply(lambda x: round_to_n(x, 7)) # fix intensity decimal problem (reading reasons)

rows_before = len(combined_df)
combined_df['time_diff'] = (pd.to_datetime(combined_df['T_OBS']) - pd.to_datetime(combined_df['t_start'])).dt.total_seconds() / 60
combined_df = combined_df[combined_df['time_diff'] < 0] # get rid of the ones that are positive
print('-')
print('Flares that the SHMARP information is after emergeance: {}'.format(rows_before-len(combined_df)))

combined_df.to_csv('Flares_SEPs_SHMARPs_t-{}.dat'.format(before_start_time), sep='\t', index=False) # SAVING THE MACHINE LEARNING READY FILE


Total number of flares: 4497
-
Flares without SHARP info: 1756
-
Flares that the SHMARP information is after emergeance: 0
