# MMS Ion Composition Study - Compare dispersion list

#### Purpose
In an attempt to make sense of the automatically identified dispersion list and its bias, we compare the results of the manual (by-eye) dispersion identification with the program-identified ones.

#### Import libraries and set up variables

In [1]:
import datetime
import glob
from collections import Counter
from shutil import copyfile

import ipyplot
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from IPython.display import Image

from functions import data_preprocess_functions


In [2]:
# --- Analysis parameters ---
model = "t89"  # label appended to the field-line-length plot titles
goodness_of_fit_threshold = 0.6  # p_value must exceed this to count as a good fit
radius_earth = 6378.14  # km

# --- Input files ---
data_filename = "data/fulldata_20160101_to_20171231.csv"
dispersion_filename = "output/dispersion_list.csv"
dispersion_m_filename = "data/dispersion list - mms.csv"

In [3]:
# Load and preprocess the full data set, then keep only the rows that have a
# value in N_DISPERSION_PARA or N_DISPERSION_ANTI.
data = data_preprocess_functions.preprocess_data(pd.read_csv(data_filename))
# Take an explicit copy of the filtered rows: assigning a new column to a
# .loc selection otherwise triggers pandas' SettingWithCopyWarning.
data = data.loc[
    data['N_DISPERSION_PARA'].notnull() | data['N_DISPERSION_ANTI'].notnull(), :
].copy()
# NOTE(review): only the PARA sigma is used, also for rows that only have an
# ANTI dispersion -- confirm this is intended.
data['fit_error'] = data['DIS_FITTING_SIGMA_PARA'] * radius_earth

# Dispersion list extracted from the data, and the manual list read from CSV.
dispersion_list = data_preprocess_functions.extract_dispersions(data)
dispersion_list_m = pd.read_csv(dispersion_m_filename)

# Paths of the pre-rendered PNG plots, matched to events by date further down.
dispersion_tplot_list = glob.glob("idl_plots/dispersion_day/*.png")
dispersion_fitting_plot_list = glob.glob("idl_plots/dispersion/*.png")

#### Read in Marissa's dispersion list from MMS

In [4]:
# Derive per-event quantities for the manual list from its energy range and
# duration. Column order is kept: max_v, min_v, mean_E, estimated_distance_m.
to_velocity = data_preprocess_functions.calculate_velocity
for energy_column, velocity_column in (('Max E', 'max_v'), ('Min E', 'min_v')):
    dispersion_list_m[velocity_column] = dispersion_list_m[energy_column].apply(to_velocity)

energy_sum = dispersion_list_m['Max E'] + dispersion_list_m['Min E']
dispersion_list_m['mean_E'] = energy_sum / 2

# distance = duration / (1/v_min - 1/v_max), divided by radius_earth.
# NOTE(review): the factor 60 presumably converts Duration from minutes to
# seconds, with velocities in km/s -- confirm against calculate_velocity.
duration_scaled = dispersion_list_m['Duration'] * 60.
inverse_velocity_spread = 1. / dispersion_list_m['min_v'] - 1. / dispersion_list_m['max_v']
dispersion_list_m['estimated_distance_m'] = duration_scaled / inverse_velocity_spread / radius_earth

#### Compare the timing with the auto-identified dispersion list and link the same events

* 47 of the manually identified MMS dispersions have a match in the auto-identified list (not all of them are exact matches).
* Not all of them have a match: 7 of them do not.
* 5 of those are too short.
* The other 2:
    * 2017-04-06/19:08:00 (42 min) - gap in the dispersion
    * 2017-08-17/12:39:00 (56 min) - Too flat

In [5]:
#display(Image(url= "idl_plots/dispersion_day/o_beam20170406_180000_to_20170407_000000_dispersion.png", width = 450))
#display(Image(url= "idl_plots/dispersion_day/o_beam20170817_120000_to_20170817_180000_dispersion.png", width = 450))
# Day-overview plots for 2017-04-06 and 2017-08-17.
images_array = [
    "idl_plots/dispersion_day/o_beam20170406_180000_to_20170407_000000_dispersion.png",
    "idl_plots/dispersion_day/o_beam20170817_120000_to_20170817_180000_dispersion.png",
]

In [6]:
# Render every image listed in images_array with ipyplot (img_width=370).
ipyplot.plot_images(images_array, img_width=370)

NameError: name 'ipyplot' is not defined

In [None]:
# Left-join the dispersion list onto the manual list: 'Full list Index' in
# the manual list is matched against 'index' in the dispersion list, and every
# manual row is kept whether or not it finds a partner.
dispersion_list_m_merged = dispersion_list_m.merge(
    dispersion_list,
    how='left',
    left_on='Full list Index',
    right_on='index',
)
# Rows whose p_value is NaN compare as False, so they are never a good fit.
dispersion_list_m_merged['good_fit'] = dispersion_list_m_merged['p_value'].gt(
    goodness_of_fit_threshold
)

# Boolean mask of the rows that have a match in the auto list, detected as a
# p_value greater than 0.
index = dispersion_list_m_merged['p_value'].gt(0)

In [None]:
# Merged rows whose 'Full list Index' is filled in; these are the events whose
# plots are displayed in the loop below.
toplot = dispersion_list_m_merged.loc[dispersion_list_m_merged['Full list Index'].notna()]


In [None]:
# For every event in toplot, show the first day-overview plot and the first
# fitting plot whose file name contains the event date written as YYYYMMDD.
for i in toplot.index:
    event_date = toplot.loc[i, 'date']
    # A row that found no partner in the left merge has NaN here, which cannot
    # be sliced; skip it instead of raising a TypeError.
    if not isinstance(event_date, str):
        continue

    # Build the key once per event rather than once per candidate file name.
    # Slicing assumes a 'YYYY-MM-DD...' layout -- TODO confirm.
    date_key = event_date[0:4] + event_date[5:7] + event_date[8:10]

    display_array = []
    for plot_paths in (dispersion_tplot_list, dispersion_fitting_plot_list):
        # Single scan that stops at the first hit (replaces count() + index()).
        first_match = next((path for path in plot_paths if date_key in path), None)
        if first_match is not None:
            display_array.append(first_match)

    ipyplot.plot_images(display_array, img_width=370)

# display(Image(url = display_array[0]))

#### Reproduce the manual identification results

In [None]:
# Manual distance estimate against the "RE" column of the manual list, coloured
# by fit quality and sized by event duration, with an OLS trend line.
fig = px.scatter(
    dispersion_list_m_merged,
    x="RE",
    y="estimated_distance_m",
    color="good_fit",
    size="Duration",
    hover_name="Event Time",
    trendline="ols",
)
# Gray y = x reference line from (0, 0) to (35, 35).
reference_line = go.Scatter(
    x=[0, 35],
    y=[0, 35],
    mode="lines",
    line=go.scatter.Line(color="gray"),
    showlegend=False,
)
fig.add_trace(reference_line)
fig.update_layout(
    title='Replicate manual identification',
    font={"family": "Courier New, monospace", "size": 18},
    legend={"x": 0, "y": 1},
)

##### Discussion 

1. Many of the data points are not included in the "good" dispersion list. We need to investigate each plot to find out the reason. The next step is to plot the start and end points from the manual list on the tplot.
2. The duration of the manually identified dispersions varies widely (8 min to 68 min), in contrast to the auto-list (20 min to 50 min).
3. It seems that there are two groups of data.

#### Compare estimated distance auto to estimated distance manual

In [None]:
# Auto estimate (y) against manual estimate (x) for the rows selected by the
# `index` mask, with an OLS trend line per good_fit group.
fig = px.scatter(
    dispersion_list_m_merged.loc[index],
    x="estimated_distance_m",
    y="estimated_distance",
    color="good_fit",
    size="Duration",
    hover_name="Event Time",
    hover_data=["index", "BETA", "direction", "GSM_Z", "p_value", 'GSM_X', 'BX_GSM'],
    trendline="ols",
)
# Gray y = x reference line from (0, 0) to (35, 35).
reference_line = go.Scatter(
    x=[0, 35],
    y=[0, 35],
    mode="lines",
    line=go.scatter.Line(color="gray"),
    showlegend=False,
)
fig.add_trace(reference_line)
fig.update_layout(
    title='Comparison of estimated distance (auto vs manual)',
    font={"family": "Courier New, monospace", "size": 18},
    legend={"x": 1, "y": 1},
)

##### Discussion

The automatic process consistently estimates a larger distance than the manual process.

<img src="idl_plots/dispersion_day/o_beam20160506_000000_to_20160506_060000_dispersion.png"  style="display: inline; width:500px">
<img src="idl_plots/dispersion/o_beam20160506_051230_to_20160506_052730dispersion_fitting.png"  style="display: inline; width:400px">



#### Manual estimated distance vs model field line length

In [None]:
# Manual distance estimate (y) against the traced model field line length (x)
# for the rows selected by the `index` mask.
fig = px.scatter(
    dispersion_list_m_merged.loc[index, :],
    x="model_field_line_length_idl",
    y="estimated_distance_m",
    size="Duration",
    hover_name="Event Time",
    hover_data=["index", "BETA", "direction", "GSM_Z", "p_value", 'GSM_X', 'BX_GSM'],
    trendline="ols",
    color="good_fit",
)
# Gray y = x reference line from (0, 0) to (35, 35).
reference_line = go.Scatter(
    x=[0, 35],
    y=[0, 35],
    mode="lines",
    line=go.scatter.Line(color="gray"),
    showlegend=False,
)
fig.add_trace(reference_line)
# Title spelling corrected ("estiamted" -> "estimated").
fig.update_layout(
    title='Manual estimated distance VS traced field line length (' + model + ')',
    font=dict(family="Courier New, monospace", size=18),
    legend_x=0,
    legend_y=1,
)

#### Auto estimated distance vs model field line length

In [None]:
# Auto distance estimate (y) against the traced model field line length (x)
# for the rows selected by the `index` mask.
fig = px.scatter(
    dispersion_list_m_merged.loc[index, :],
    x="model_field_line_length_idl",
    y="estimated_distance",
    size="dispersion_time",
    hover_name="TIME",
    hover_data=["index", "BETA", "direction", "GSM_Z", "p_value", 'GSM_X', 'BX_GSM'],
    trendline="ols",
    color="good_fit",
)
# Gray y = x reference line from (0, 0) to (35, 35).
reference_line = go.Scatter(
    x=[0, 35],
    y=[0, 35],
    mode="lines",
    line=go.scatter.Line(color="gray"),
    showlegend=False,
)
fig.add_trace(reference_line)
# Title spelling corrected ("estiamted" -> "estimated").
fig.update_layout(
    title='Auto estimated distance VS traced field line length (' + model + ')',
    font=dict(family="Courier New, monospace", size=18),
    legend_x=0,
    legend_y=1,
)

In [None]:
# Manual distance estimate (y) against the traced model field line length (x)
# for the rows selected by the `index` mask, coloured by "region" and with the
# marker symbol taken from "direction_et".
fig = px.scatter(
    dispersion_list_m_merged.loc[index, :],
    x="model_field_line_length_idl",
    y="estimated_distance_m",
    size="dispersion_time",
    hover_name="datetime_str",
    hover_data=["index", "BETA", "direction", "GSM_Z", "p_value", 'GSM_X', 'BX_GSM', 'MLT_y'],
    trendline="ols",
    symbol="direction_et",
    # facet_col="direction_et",
    # facet_row="region",
    # Alternative colouring by energy:
    # color=np.log10(dispersion_list_m_merged.loc[index, "mean_E"]), range_color=[3, 5],
    color="region",
)

# Gray y = x reference line from (0, 0) to (35, 35).
reference_line = go.Scatter(
    x=[0, 35],
    y=[0, 35],
    mode="lines",
    line=go.scatter.Line(color="gray"),
    showlegend=False,
)

fig.add_trace(reference_line)
# fig.add_trace(reference_line, row=1, col=1)
# fig.add_trace(reference_line, row=1, col=2)

fig.update_layout(
    # title=' ' + ', goodness of fit > ' + str(goodness_of_fit_threshold),
    font=dict(family="Courier New, monospace", size=18),
    legend_x=1,
    legend_y=1,
)

# Colorbar labels for a log10(energy) colour scale. The label for 4 used to be
# a second "100k"; 10**4 is "10k".
# NOTE(review): presumably this only takes effect with the continuous energy
# colouring (commented out above), not with the "region" colouring -- confirm.
fig.update_layout(coloraxis_colorbar=dict(
    title="energy",
    tickvals=[1, 2, 3, 4, 5],
    ticktext=["10", "100", "1k", "10k", "100k"],
))

fig.show()


In [None]:
# Single-event view: inverse velocity against time with the fit error as
# vertical error bars, plus a gray line through the manual identification.
# NOTE(review): `my_data` and `my_plot` are not defined in any cell visible
# here; they must be set (a data subset and one row of the manual list,
# presumably) before this cell runs.
# Requires the top-of-file `import datetime`.
fig = px.scatter(my_data, x="datetime_str", y="INVERSE_V_PARA", error_y="fit_error")

# The manual line runs from (event start, 1/max_v) to (event end, 1/min_v).
# Duration / 60 is passed as hours, i.e. Duration is treated as minutes.
event_start = my_plot['Event Time']
event_end = (
    datetime.datetime.fromisoformat(event_start)
    + datetime.timedelta(hours=my_plot['Duration'] / 60.)
).strftime("%Y-%m-%d/%H:%M:%S")
manual_line = dict(
    x=[event_start, event_end],
    y=[1 / my_plot['max_v'], 1 / my_plot['min_v']],
    mode="lines",
    line=go.scatter.Line(color="gray"),
    showlegend=False,
)

reference_line1 = go.Scatter(**manual_line)
# NOTE(review): reference_line2 is identical to reference_line1 and is never
# added to the figure; it is kept only so the name stays defined.
reference_line2 = go.Scatter(**manual_line)

fig.add_trace(reference_line1)
fig.update_layout(
    title='Replicate manual identification',
    font=dict(family="Courier New, monospace", size=12),
    legend_x=0,
    legend_y=1,
)

fig.show()