# EDA

In [None]:
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import pandas as pd
from fitter import Fitter
import numpy as np
from functions import read_data, add_time_differences, add_energy, reorder_columns, scatter_plot, plot_histogram, show_best_distribution

In [None]:
# Load the raw export of each zone.
data_1 = read_data('data/out_1.csv')
data_2 = read_data('data/out_2.csv')

# Temporarily tag every row with the zone it came from, so the origin
# survives the merge below.
for zone_id, frame in ((1, data_1), (2, data_2)):
    frame['zone'] = zone_id

# Combined view of both zones, ordered chronologically by the 'date' column.
df = pd.concat([data_1, data_2]).sort_values(by=['date'])

# The per-zone frames do not need the tag; only the combined frame keeps it.
data_1, data_2 = (
    frame.drop(columns=['zone']) for frame in (data_1, data_2)
)


## Time Differences and Energy

In [None]:
# Run each frame (zone 1, zone 2, combined) through the same helper chain:
# add_time_differences -> add_energy -> reorder_columns.
data_1, data_2, df = (
    reorder_columns(add_energy(add_time_differences(frame)))
    for frame in (data_1, data_2, df)
)

# Store the zone label as a categorical so that it is treated as a group
# label rather than as a numerical variable.
df['zone'] = df['zone'].astype('category')


## Write data to disk

In [None]:
# Save the zones into separate files so they can be compared to the original
# files, and write the combined frame next to them. The row index is omitted.
for csv_path, frame in (
    ('data/data_1.csv', data_1),
    ('data/data_2.csv', data_2),
    ('data/data.csv', df),
):
    frame.to_csv(csv_path, index=False)


## Visualization
### Scatter

In [None]:
# One scatter plot per column of interest on the combined frame.
for column in ('m', 'v', 'e'):
    scatter_plot(df, column)

# Additional plot of 'e', colored by 'm', with a colorbar.
scatter_plot(df, 'e', c='m', colorbar=True)

plt.show()


### Histogram

In [None]:
# One histogram per (column, zone) pair; each plot only receives the rows
# belonging to the zone being drawn.
for column in ('m', 'v', 'e'):
    for zone_id in (1, 2):
        in_zone = df['zone'] == zone_id
        plot_histogram(df[in_zone], column, zone_id)


# Fit Distributions

In [None]:
# Fit input: zone 1 'time_differences' values, missing entries dropped.
show_best_distribution(
    df.loc[df['zone'] == 1, 'time_differences'].dropna().values
)


In [None]:
# Fit input: zone 2 'time_differences' values, missing entries dropped.
show_best_distribution(
    df.loc[df['zone'] == 2, 'time_differences'].dropna().values
)


In [None]:
# Fit input: zone 1 'm' values, missing entries dropped.
show_best_distribution(
    df.loc[df['zone'] == 1, 'm'].dropna().values
)


In [None]:
# Fit input: zone 2 'm' values, missing entries dropped.
show_best_distribution(
    df.loc[df['zone'] == 2, 'm'].dropna().values
)


In [None]:
# Fit input: zone 1 'v' values, missing entries dropped.
show_best_distribution(
    df.loc[df['zone'] == 1, 'v'].dropna().values
)


In [None]:
# Fit input: zone 2 'v' values, missing entries dropped.
show_best_distribution(
    df.loc[df['zone'] == 2, 'v'].dropna().values
)


In [None]:
# Fit input: zone 1 'e' values, missing entries dropped.
show_best_distribution(
    df.loc[df['zone'] == 1, 'e'].dropna().values
)


In [None]:
# Fit input: zone 2 'e' values, missing entries dropped.
show_best_distribution(
    df.loc[df['zone'] == 2, 'e'].dropna().values
)
