In [1]:
from hapiclient import hapi, hapitime2datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import calendar
from geopack import geopack
import datetime as datetime

Load IGRF coefficients ...


In [2]:
def fetch_omni_data(params,data, start, stop):
    """Download one dataset from the CDAWeb HAPI server.

    Parameters
    ----------
    params : str
        Comma-separated parameter names, forwarded to ``hapi`` unchanged.
    data : str
        Dataset identifier, forwarded to ``hapi`` unchanged.
    start, stop : str
        Bounds of the requested time range, forwarded to ``hapi`` unchanged.

    Returns
    -------
    The data object returned by ``hapi``. The accompanying metadata is
    discarded.
    """
    hapi_server = 'https://cdaweb.gsfc.nasa.gov/hapi'
    records, _meta = hapi(hapi_server, data, params, start, stop)
    return records

In [3]:
def extract_data(data, params):
    """Build a timestamp-indexed DataFrame from the records returned by ``hapi``.

    Parameters
    ----------
    data : iterable of records
        Each record must be indexable by position; position 0 is used as the
        timestamp and positions 1..N follow the order of ``params``.
    params : str
        Comma-separated parameter names; they become the column headers.

    Returns
    -------
    pandas.DataFrame
        One column per parameter, indexed by timezone-naive timestamps.
    """
    column_names = ['Timestamp'] + params.split(',')

    # One list of values per column: field number `position` of every record.
    columns = {}
    for position, name in enumerate(column_names):
        columns[name] = [record[position] for record in data]

    frame = pd.DataFrame(columns).set_index('Timestamp')

    # The index values are cast to str and parsed by hapitime2datetime.
    # NOTE(review): tz_convert only works on a tz-aware index, so this relies
    # on hapitime2datetime returning tz-aware datetimes - confirm.
    time_strings = frame.index.values.astype(str)
    frame.index = hapitime2datetime(time_strings)
    # Express in UTC, then drop the timezone to leave naive timestamps.
    frame.index = frame.index.tz_convert("UTC").tz_convert(None)

    return frame

In [4]:
def extract_omni_1min(data, params):
    """Tabulate 1-minute OMNI records and attach merge-key timestamp columns.

    Adds three columns derived from the record timestamp:
    ``Timestamp_1min_omni`` (the timestamp itself), ``Timestamp_2min_omni``
    (floored to 2 minutes) and ``Timestamp_1hr_omni`` (floored to 1 hour).
    The timestamp index is then replaced by a plain integer index.

    Parameters
    ----------
    data : records as accepted by ``extract_data``.
    params : str
        Comma-separated parameter names.

    Returns
    -------
    pandas.DataFrame
    """
    frame = extract_data(data, params)
    frame['Timestamp_1min_omni'] = frame.index

    # Parse once and reuse for both floored variants.
    minute_stamps = pd.to_datetime(frame['Timestamp_1min_omni'], errors='coerce')
    for column, resolution in (('Timestamp_2min_omni', '2min'),
                               ('Timestamp_1hr_omni', '1h')):
        frame[column] = minute_stamps.dt.floor(resolution)

    return frame.reset_index(drop=True)


In [5]:
def extract_omni_1hr(params, data):
    """Tabulate hourly OMNI records and attach the hourly merge key.

    Note the argument order is ``(params, data)``, the reverse of
    ``extract_omni_1min``.

    Parameters
    ----------
    params : str
        Comma-separated parameter names.
    data : records as accepted by ``extract_data``.

    Returns
    -------
    pandas.DataFrame
        Parameter columns plus ``Timestamp_1hr_omni`` (record timestamp
        floored to the hour), with a plain integer index.
    """
    frame = extract_data(data, params)

    frame['Timestamp_1hr_omni'] = frame.index
    hour_floor = pd.to_datetime(frame['Timestamp_1hr_omni'], errors='coerce').dt.floor('1h')
    frame['Timestamp_1hr_omni'] = hour_floor

    return frame.reset_index(drop=True)

In [6]:
# Dataset identifiers and the parameters requested from each one.
omni_1min = "OMNI_HRO_1MIN"
#alternative parameter set:
#omni_1min_params = 'percent_interp,BY_GSE,BZ_GSE,flow_speed,proton_density,T,Pressure,E,Mach_num,BSN_x,BSN_y,BSN_z,SYM_H'
omni_1min_params = 'BX_GSE,BY_GSE,BZ_GSE,flow_speed,proton_density,AL_INDEX,AU_INDEX,SYM_H,ASY_H'

omni_1hr = "OMNI2_H0_MRG1HR"
omni_1hr_params = 'F10_INDEX1800,KP1800'

#Pressure = flow pressure in nPa

# Download month by month, attach the hourly indices to the 1-minute rows,
# and collect one DataFrame per month.
months = []
years = [2017]
for year in years:
    for month in range(1,13):

        print('Processing: ',year, month)

        _, num_days = calendar.monthrange(year, month)

        # Time bounds for the request. The seconds field must be two digits
        # ("00:00:00"); the previous "00:00:000Z" was not a valid time string.
        start_time = f'{year}-{month:02d}-01T00:00:00Z'
        end_time = f'{year}-{month:02d}-{num_days:02d}T23:59:59Z'

        #download the data then clean it
        omni_1min_data = fetch_omni_data(omni_1min_params, omni_1min, start_time, end_time)
        omni_1min_df = extract_omni_1min(omni_1min_data, omni_1min_params)

        # extract_omni_1hr takes (params, data), the reverse of the 1-min helper
        omni_1hr_data = fetch_omni_data(omni_1hr_params,omni_1hr, start_time, end_time)
        omni_1hr_df = extract_omni_1hr(omni_1hr_params, omni_1hr_data)

        # Every 1-minute row picks up the hourly values of the hour it falls in.
        df = pd.merge(omni_1min_df, omni_1hr_df, on='Timestamp_1hr_omni', how='left')
        df = df.rename(columns={'F10_INDEX1800':'F10.7', 'KP1800':'Kp'})
        months.append(df)

months_df = pd.concat(months)

def calculate_tilt_angle(row):
    """Return ``geopack.recalc`` evaluated at the row's 1-minute timestamp.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'Timestamp_1min_omni'`` as a datetime-like value from
        which a ``datetime.datetime`` can be subtracted.

    Returns
    -------
    The value returned by ``geopack.recalc`` for the number of seconds
    elapsed since 1970-01-01 00:00:00. The original comment describes this
    as the dipole tilt angle; units are not stated here - TODO confirm.
    """
    epoch = datetime.datetime(1970, 1, 1)
    seconds_since_epoch = (row['Timestamp_1min_omni'] - epoch).total_seconds()
    return geopack.recalc(seconds_since_epoch)

# Tilt-angle feature is currently disabled; uncomment to add it row by row.
#months_df['tilt_angle'] = months_df.apply(calculate_tilt_angle, axis=1)

# The per-month frames each carried their own index; renumber from 0.
months_df = months_df.reset_index(drop=True)

# Output directory: exactly one of the following should be active.
#export_path = f'/Users/sr2/My Drive/Career/Employment/Current/JSPS/Research/Analysis/Apr-24/data/omni/' #macbook
#export_path = f'/home/ryuho/Documents/reddy/research/SMRAI/Data/OMNI/' #linux
export_path = '/home/sachin/Documents/NIPR/Research/Data/OMNI/' #server

df_name = 'omni_hro_1min_2017'
export_filename = f'{export_path}{df_name}.csv'
months_df.to_csv(export_filename, header=True, index=False)
months_df
        

Processing:  2017 1


  Time = pandas.to_datetime(Time, infer_datetime_format=True).tz_convert(tzinfo).to_pydatetime()
  Time = pandas.to_datetime(Time, infer_datetime_format=True).tz_convert(tzinfo).to_pydatetime()


Processing:  2017 2


  Time = pandas.to_datetime(Time, infer_datetime_format=True).tz_convert(tzinfo).to_pydatetime()
  Time = pandas.to_datetime(Time, infer_datetime_format=True).tz_convert(tzinfo).to_pydatetime()


Processing:  2017 3


  Time = pandas.to_datetime(Time, infer_datetime_format=True).tz_convert(tzinfo).to_pydatetime()
  Time = pandas.to_datetime(Time, infer_datetime_format=True).tz_convert(tzinfo).to_pydatetime()


Processing:  2017 4


  Time = pandas.to_datetime(Time, infer_datetime_format=True).tz_convert(tzinfo).to_pydatetime()
  Time = pandas.to_datetime(Time, infer_datetime_format=True).tz_convert(tzinfo).to_pydatetime()


Processing:  2017 5


  Time = pandas.to_datetime(Time, infer_datetime_format=True).tz_convert(tzinfo).to_pydatetime()
  Time = pandas.to_datetime(Time, infer_datetime_format=True).tz_convert(tzinfo).to_pydatetime()


Processing:  2017 6


  Time = pandas.to_datetime(Time, infer_datetime_format=True).tz_convert(tzinfo).to_pydatetime()
  Time = pandas.to_datetime(Time, infer_datetime_format=True).tz_convert(tzinfo).to_pydatetime()


Processing:  2017 7


  Time = pandas.to_datetime(Time, infer_datetime_format=True).tz_convert(tzinfo).to_pydatetime()
  Time = pandas.to_datetime(Time, infer_datetime_format=True).tz_convert(tzinfo).to_pydatetime()


Processing:  2017 8


  Time = pandas.to_datetime(Time, infer_datetime_format=True).tz_convert(tzinfo).to_pydatetime()
  Time = pandas.to_datetime(Time, infer_datetime_format=True).tz_convert(tzinfo).to_pydatetime()


Processing:  2017 9


  Time = pandas.to_datetime(Time, infer_datetime_format=True).tz_convert(tzinfo).to_pydatetime()
  Time = pandas.to_datetime(Time, infer_datetime_format=True).tz_convert(tzinfo).to_pydatetime()


Processing:  2017 10


  Time = pandas.to_datetime(Time, infer_datetime_format=True).tz_convert(tzinfo).to_pydatetime()
  Time = pandas.to_datetime(Time, infer_datetime_format=True).tz_convert(tzinfo).to_pydatetime()


Processing:  2017 11


  Time = pandas.to_datetime(Time, infer_datetime_format=True).tz_convert(tzinfo).to_pydatetime()
  Time = pandas.to_datetime(Time, infer_datetime_format=True).tz_convert(tzinfo).to_pydatetime()


Processing:  2017 12


  Time = pandas.to_datetime(Time, infer_datetime_format=True).tz_convert(tzinfo).to_pydatetime()
  Time = pandas.to_datetime(Time, infer_datetime_format=True).tz_convert(tzinfo).to_pydatetime()


Unnamed: 0,BX_GSE,BY_GSE,BZ_GSE,flow_speed,proton_density,AL_INDEX,AU_INDEX,SYM_H,ASY_H,Timestamp_1min_omni,Timestamp_2min_omni,Timestamp_1hr_omni,F10.7,Kp
0,-6.04,-0.11,-3.75,547.799988,6.88,-63,19,-18,9,2017-01-01 00:00:00,2017-01-01 00:00:00,2017-01-01 00:00:00,70.099998,33
1,-6.12,-0.45,-3.44,547.799988,6.88,-66,27,-18,9,2017-01-01 00:01:00,2017-01-01 00:00:00,2017-01-01 00:00:00,70.099998,33
2,-5.92,0.02,-4.01,99999.900000,999.99,-64,20,-18,7,2017-01-01 00:02:00,2017-01-01 00:02:00,2017-01-01 00:00:00,70.099998,33
3,-5.89,-0.11,-4.15,543.599976,7.99,-60,24,-17,8,2017-01-01 00:03:00,2017-01-01 00:02:00,2017-01-01 00:00:00,70.099998,33
4,-5.55,-3.45,-3.33,546.900024,7.95,-55,19,-17,8,2017-01-01 00:04:00,2017-01-01 00:04:00,2017-01-01 00:00:00,70.099998,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
525595,-3.95,4.47,-4.47,370.700012,14.09,-197,90,0,25,2017-12-31 23:55:00,2017-12-31 23:54:00,2017-12-31 23:00:00,68.300003,20
525596,-2.32,-1.42,-5.96,369.100006,13.16,-178,93,-1,26,2017-12-31 23:56:00,2017-12-31 23:56:00,2017-12-31 23:00:00,68.300003,20
525597,-0.38,-5.68,-6.27,366.899994,11.87,-167,91,-1,27,2017-12-31 23:57:00,2017-12-31 23:56:00,2017-12-31 23:00:00,68.300003,20
525598,-1.68,-3.49,-5.62,99999.900000,999.99,-170,93,-1,26,2017-12-31 23:58:00,2017-12-31 23:58:00,2017-12-31 23:00:00,68.300003,20


In [7]:
def clean_omni(df):
    """Clean the merged OMNI table and average it onto the 2-minute grid.

    Processing order:

    1. Drop rows that contain any missing value.
    2. Replace the fill values of ``flow_speed``, ``proton_density``,
       ``F10.7`` and ``BX/BY/BZ_GSE`` with NaN and interpolate those columns
       (pandas default method, at most 10 consecutive rows per gap).
    3. Keep only rows whose values lie inside the range limits below; rows
       still holding NaN after step 2 fail these comparisons and are removed.
    4. Divide ``Kp`` by 10 and add ``doy`` (day of year of
       ``Timestamp_1min_omni``).
    5. Average every column within each ``Timestamp_2min_omni`` group.
    6. Cast ``doy``, ``F10.7`` and ``flow_speed`` to int (truncating).

    The tilt-angle feature is not computed here.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns referenced above plus ``AL_INDEX``,
        ``AU_INDEX``, ``SYM_H`` and ``ASY_H``. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per 2-minute bin, ordered by ``Timestamp_2min_omni``.
    """
    # Work on an independent copy so later column assignments never touch
    # (or warn about touching) the caller's frame.
    df = df.dropna().copy()

    # Fill values that mark missing measurements, per column.
    fill_values = {
        'flow_speed': 99999.900000,
        'proton_density': 999.99,
        'F10.7': 999.9,
        'BX_GSE': 9999.99,
        'BY_GSE': 9999.99,
        'BZ_GSE': 9999.99,
    }
    for column, fill in fill_values.items():
        df[column] = df[column].replace(fill, np.nan)

    # Only these columns can contain NaN at this point (everything else was
    # covered by dropna above), so interpolating just them gives the same
    # result as interpolating the whole frame without touching the
    # datetime columns.
    gap_columns = list(fill_values)
    df[gap_columns] = df[gap_columns].interpolate(limit=10)

    # Range filters, combined into a single mask. NaN compares False, so any
    # gap longer than the interpolation limit is dropped here.
    keep = (
        (df['flow_speed'] < 1400)
        & (df['proton_density'] < 150)
        & (df['F10.7'] < 400)
        & df['BX_GSE'].between(-100, 100)
        & df['BY_GSE'].between(-100, 100)
        & df['BZ_GSE'].between(-100, 100)
        & df['SYM_H'].between(-1000, 100)
        & (df['ASY_H'] < 1000)
        & (df['AU_INDEX'] < 2000)
        & (df['AL_INDEX'] > -2000)
        & (df['Kp'] < 100)
    )
    df = df.loc[keep].copy()
    df['Kp'] = df['Kp'] / 10

    #new feature
    df['Timestamp_1min_omni'] = pd.to_datetime(df['Timestamp_1min_omni'])
    df['doy'] = df['Timestamp_1min_omni'].dt.dayofyear

    # Average onto the 2-minute grid; groupby returns the groups in sorted
    # key order, so no separate sort is needed.
    df = df.groupby('Timestamp_2min_omni', sort=True).mean().reset_index(drop=False)

    #format datatypes (astype(int) truncates towards zero)
    df['doy'] = df['doy'].astype(int)
    df['F10.7'] = df['F10.7'].astype(int)
    df['flow_speed'] = df['flow_speed'].astype(int)

    df = df.dropna()

    return df

# Clean the concatenated months and write the result to export_filename,
# i.e. the same path the raw table was written to earlier (it is overwritten).
omni_df = clean_omni(months_df)
omni_df.to_csv(export_filename, header=True, index=False)
omni_df

Unnamed: 0,Timestamp_2min_omni,BX_GSE,BY_GSE,BZ_GSE,flow_speed,proton_density,AL_INDEX,AU_INDEX,SYM_H,ASY_H,Timestamp_1min_omni,Timestamp_1hr_omni,F10.7,Kp,doy
0,2017-01-01 00:00:00,-6.080,-0.2800,-3.595,547,6.88000,-64.5,23.0,-18.0,9.0,2017-01-01 00:00:30,2017-01-01 00:00:00,70,3.3,1
1,2017-01-01 00:02:00,-5.905,-0.0450,-4.080,544,7.71250,-62.0,22.0,-17.5,7.5,2017-01-01 00:02:30,2017-01-01 00:00:00,70,3.3,1
2,2017-01-01 00:04:00,-5.680,-2.5550,-3.355,548,7.93500,-49.5,23.0,-16.5,8.0,2017-01-01 00:04:30,2017-01-01 00:00:00,70,3.3,1
3,2017-01-01 00:06:00,-5.845,-1.7875,-2.990,551,6.88000,-37.0,65.5,-16.5,7.5,2017-01-01 00:06:30,2017-01-01 00:00:00,70,3.3,1
4,2017-01-01 00:08:00,-6.005,-0.0400,-2.635,551,6.84625,-44.0,109.0,-17.0,7.5,2017-01-01 00:08:30,2017-01-01 00:00:00,70,3.3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250662,2017-12-31 23:50:00,-3.185,3.0950,-6.305,374,13.14500,-212.0,80.0,-1.5,26.0,2017-12-31 23:50:30,2017-12-31 23:00:00,68,2.0,365
250663,2017-12-31 23:52:00,-2.965,3.1800,-6.220,373,13.85000,-235.5,77.0,-0.5,25.0,2017-12-31 23:52:30,2017-12-31 23:00:00,68,2.0,365
250664,2017-12-31 23:54:00,-3.710,3.7550,-5.165,371,14.37000,-209.0,88.0,0.0,25.0,2017-12-31 23:54:30,2017-12-31 23:00:00,68,2.0,365
250665,2017-12-31 23:56:00,-1.350,-3.5500,-6.115,368,12.51500,-172.5,92.0,-1.0,26.5,2017-12-31 23:56:30,2017-12-31 23:00:00,68,2.0,365


In [None]:
def check_data_gaps(df, start=None, end=None, freq='2min'):
    """Plot a missing-data matrix for ``df`` on a regular time grid.

    ``df`` is left-joined onto a gap-free grid of ``Timestamp_2min_omni``
    values, so grid points that have no row in ``df`` appear as NaN rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Timestamp_2min_omni``. The input is not modified.
    start, end : datetime-like, optional
        Bounds of the reference grid. Default to the earliest and latest
        ``Timestamp_2min_omni`` found in ``df``.
    freq : str, default '2min'
        Spacing of the reference grid.

    Returns
    -------
    pandas.DataFrame
        The grid-aligned frame, indexed by ``Timestamp_2min_omni``, without
        the ``Timestamp_1min_omni`` / ``Timestamp_1hr_omni`` columns.

    Raises
    ------
    ValueError
        If ``df`` is empty and no explicit ``start``/``end`` is given.
    """
    # missingno is not imported at the top of the notebook; import it here so
    # the function works on a fresh kernel.
    import missingno as msno

    df = df.copy()

    if (start is None or end is None) and df.empty:
        raise ValueError('check_data_gaps needs start and end when df is empty')

    # Derive the grid from the data unless the caller pins it explicitly.
    stamps = pd.to_datetime(df['Timestamp_2min_omni'])
    grid_start = stamps.min() if start is None else pd.Timestamp(start)
    grid_end = stamps.max() if end is None else pd.Timestamp(end)

    blank_date_range = pd.date_range(start=grid_start, end=grid_end, freq=freq)
    blank_df = pd.DataFrame({'Timestamp_2min_omni': blank_date_range})

    merged = pd.merge(blank_df, df, on='Timestamp_2min_omni', how='left')
    # The other timestamp columns carry no information about data gaps.
    merged = merged.drop(columns=['Timestamp_1min_omni', 'Timestamp_1hr_omni'],
                         errors='ignore')
    merged = merged.set_index('Timestamp_2min_omni')

    fig, ax = plt.subplots(figsize=(10,5))
    msno.matrix(merged, freq='M', ax=ax)

    return merged

check_data_gaps(omni_df)

In [None]:
#Read MHD dates and expand-out the date ranges
mhd_data = pd.read_csv('mhd_dates.csv')
expanded_dt = pd.concat([pd.Series(pd.date_range(start, end))
        for start, end in zip(mhd_data['start'], mhd_data['end'])])

# 24 hours sampled every 5 minutes gives 288 offsets per day.
steps_per_day = 288
step_minutes = 5
step_offsets = pd.to_timedelta(np.arange(steps_per_day) * step_minutes, unit='min')

# Outer sum of days (rows) and intra-day offsets (columns), flattened row by
# row so all stamps of one day stay together in time order. Done without
# rebinding `dt`, which is the alias of the datetime module imported above.
mhd_stamps = (expanded_dt.to_numpy()[:, None] + step_offsets.to_numpy()[None, :]).ravel()
mhd_dates = pd.DataFrame({'Timestamp_1min_omni': mhd_stamps})
mhd_dates

In [None]:
# Attach to every MHD timestamp the 2-minute OMNI bin that contains it.
# An exact join on 'Timestamp_1min_omni' cannot work: after clean_omni that
# column holds the mean time of each bin (e.g. 00:00:30), which does not
# coincide with the 5-minute MHD stamps.
omni_bin_width = pd.Timedelta(minutes=2)
omni_mhd_dt_merged = pd.merge_asof(
    mhd_dates
        .assign(Timestamp_1min_omni=pd.to_datetime(mhd_dates['Timestamp_1min_omni']))
        .sort_values('Timestamp_1min_omni'),
    omni_df
        .drop(columns=['Timestamp_1min_omni'])
        .sort_values('Timestamp_2min_omni'),
    left_on='Timestamp_1min_omni',
    right_on='Timestamp_2min_omni',
    direction='backward',
    # bin start <= stamp < bin start + 2 min
    tolerance=omni_bin_width - pd.Timedelta(seconds=1),
)
omni_mhd_dt_merged = (
    omni_mhd_dt_merged
    .rename(columns={'Timestamp_1min_omni':'dt'})
    .sort_values(by='dt')
)
#omni_mhd_dt_merged =omni_mhd_dt_merged.interpolate()
#check for nan values
omni_mhd_dt_merged.isnull().sum()

In [None]:
# Merge the dataframes: attach to every MHD timestamp the 2-minute OMNI bin
# that contains it. An exact join on 'Timestamp_1min_omni' cannot work because
# after clean_omni that column holds the mean time of each bin (e.g. 00:00:30).
omni_bin_width = pd.Timedelta(minutes=2)
omni_mhd_dt_merged = pd.merge_asof(
    mhd_dates
        .assign(Timestamp_1min_omni=pd.to_datetime(mhd_dates['Timestamp_1min_omni']))
        .sort_values('Timestamp_1min_omni'),
    omni_df
        .drop(columns=['Timestamp_1min_omni'])
        .sort_values('Timestamp_2min_omni'),
    left_on='Timestamp_1min_omni',
    right_on='Timestamp_2min_omni',
    direction='backward',
    # bin start <= stamp < bin start + 2 min
    tolerance=omni_bin_width - pd.Timedelta(seconds=1),
)
omni_mhd_dt_merged = (
    omni_mhd_dt_merged
    .rename(columns={'Timestamp_1min_omni': 'dt'})
    .sort_values(by='dt')
)

# Interpolate missing values; flag the affected rows first so they can be
# highlighted in the plots.
omni_mhd_dt_merged['interpolated'] = omni_mhd_dt_merged.isnull().any(axis=1).astype(int)
omni_mhd_dt_merged = omni_mhd_dt_merged.interpolate()

# Plotting: one panel per column, skipping the first column ('dt') and the
# last one (the 'interpolated' flag).
plot_columns = omni_mhd_dt_merged.columns[1:-1]
was_interpolated = omni_mhd_dt_merged['interpolated'] == 1
# squeeze=False keeps `axes` two-dimensional even for a single panel.
fig, axes = plt.subplots(nrows=len(plot_columns), ncols=1,
                         figsize=(10, 2 * len(plot_columns)),
                         sharex=True, squeeze=False)
for ax, column in zip(axes[:, 0], plot_columns):
    ax.plot(omni_mhd_dt_merged['dt'], omni_mhd_dt_merged[column], label=column)
    ax.scatter(omni_mhd_dt_merged.loc[was_interpolated, 'dt'],
               omni_mhd_dt_merged.loc[was_interpolated, column],
               color='red', label='Interpolated')
    ax.set_ylabel(column)
    ax.legend()
    ax.grid(True)
axes[-1, 0].set_xlabel('Date')
plt.tight_layout()
plt.show()


In [None]:
# Write the merged OMNI/MHD table as CSV into the current working directory.
# Earlier absolute destinations, kept for reference:
#export_path = f'/Users/sr2/My Drive/Career/Employment/Current/JSPS/Research/Analysis/Apr-24/data/omni/'
#export_path = f'/home/sachin/Documents/NIPR/Research/VSCode/REPPU-ESN2/SR_ML/'
#export_filename = export_path + df_name +'.csv'
df_name = 'omni_add-feats_mhd_5min'
export_filename = f'{df_name}.csv'
omni_mhd_dt_merged.to_csv(export_filename, header=True, index=False)


In [None]:
#show nan values
nan_values = omni_mhd_dt_merged[omni_mhd_dt_merged.isna().any(axis=1)]
nan_values.sort_values(by='dt')

The cells below produce exploratory plots and are not part of the main processing pipeline.

In [None]:
# Histogram of every column of the cleaned OMNI table, drawn on one figure.
fig, ax = plt.subplots(figsize=(10, 10))
omni_df.hist(ax=ax)
fig.tight_layout()

In [None]:
# Joint scatter of dipole tilt angle against day of year, annotated with R^2.
y = omni_df['doy']
if 'tilt_angle' in omni_df.columns:
    x = omni_df['tilt_angle']
else:
    # The tilt-angle feature is disabled in the cleaning step, so the column
    # is normally absent; derive it here instead of failing with a KeyError.
    x = omni_df.apply(calculate_tilt_angle, axis=1).rename('tilt_angle')

# Series.corr returns Pearson's r; square it to get the R^2 shown in the label.
r2 = x.corr(y) ** 2
grid = sns.jointplot(x=x, y=y, kind='scatter', s=1, height=4.5)
RE = r'R$_E$'
pcc = r'cm$^{-3}$'
#plt.xlabel(f'Proton Density [{pcc}]')
#plt.ylabel(f'Bow Shock Nose Location (GSE-X) [{RE}]')
#plt.xlabel('Pressure [nPa]')

#plt.ylabel('Electric Field [mV/m]')
#plt.xlabel('Bz [nT]')

r2_lab = r'R$^2$'
annotation = f'{r2_lab} = {r2:.2f}'
# Annotate the central scatter panel explicitly rather than "the current axes".
grid.ax_joint.annotate(annotation, xy=(0.75, 0.7), xycoords='axes fraction', fontsize=11)
plt.tight_layout()
plt.savefig('/Users/sr2/OneDrive - University College London/PhD/Experiences/Postdocs/JSPS/Research/Analysis/Apr-24/plots/omni/R2_tilt-angle_doy.png', dpi=300)

In [None]:
# Pairwise correlations of the numeric columns only; the timestamp columns of
# omni_df are not meaningful here and are excluded explicitly.
corr = omni_df.select_dtypes(include='number').corr()

# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(7.5, 6.5))

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap='coolwarm', vmin=-1, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .7}, annot=True,
            fmt=".2f", ax=ax)

# Title reflects the period actually covered by omni_df (2-minute bins).
period_start = omni_df['Timestamp_2min_omni'].min()
period_end = omni_df['Timestamp_2min_omni'].max()
ax.set_title(f'OMNI 2min Data Correlation Matrix \n {period_start:%B %Y} - {period_end:%B %Y}', pad=-40)
plt.tight_layout()
#plt.savefig('/Users/sr2/OneDrive - University College London/PhD/Experiences/Postdocs/JSPS/Research/Analysis/Apr-24/plots/omni/omni_heatmap.png', dpi=400)

In [None]:
# Distribution and summary statistics of flow speed in the merged table.
flow_speed = omni_mhd_dt_merged['flow_speed']
sns.histplot(flow_speed, bins=50)
flow_speed.describe()

In [None]:
# Full (unmasked) correlation matrix of the numeric columns of omni_df; the
# timestamp columns are excluded because they are not numeric.
plt.figure(figsize=(10,10))
numeric_corr = omni_df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0, fmt=".2f")