# EDA

In [None]:
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import pandas as pd
from fitter import Fitter
import numpy as np
from pprint import pprint
from functions import (
    read_data,
    add_time_differences,
    add_energy,
    reorder_columns,
    scatter_plot,
    plot_histogram,
    best_distributions,
    print_fit,
)

In [None]:
# Load each zone's raw export and tag its rows with the zone they came from.
data_1 = read_data("data/out_1.csv").assign(zone=1)
data_2 = read_data("data/out_2.csv").assign(zone=2)
# Stack both zones into one frame, then order all rows chronologically.
df = pd.concat((data_1, data_2))
df = df.sort_values("date")

In [None]:
# Descriptive statistics for the combined dataframe.
summary_stats = df.describe()
print(summary_stats)

## NAs, Zeros and Empty Strings

In [None]:
# Per-column counts of the three "missing-like" markers checked here:
# NA values, literal zeros and empty strings.
na_count = df.isna().sum()
zero_count = (df == 0).sum()
empty_string_count = (df == "").sum()

# Print every heading followed by its per-column counts, in a fixed order.
missing_report = (
    ("Number of NAs in each column:", na_count),
    ("\nNumber of zeros in each column:", zero_count),
    ("\nNumber of empty strings in each column:", empty_string_count),
)
for report_heading, report_counts in missing_report:
    print(report_heading)
    print(report_counts)

In [None]:
# Per-zone summary of column 'm' before the zeros are replaced.
print("\n", df.groupby("zone")["m"].describe())


def _replace_zeros_with_median(values):
    """Replace zeros in one zone's 'm' values with that zone's median.

    The median is computed over the non-zero entries only. Including the
    zeros that are about to be replaced would drag the fill value towards
    zero (and could even make the median itself zero).

    Args:
        values: The 'm' values of a single zone, as passed in by
            ``GroupBy.transform``.

    Returns:
        A Series of the same length and index with zeros replaced. If the
        group has no usable non-zero value, it is returned unchanged instead
        of turning its zeros into NaN.
    """
    nonzero_median = values[values != 0].median()
    if pd.isna(nonzero_median):
        # Nothing to derive a fill value from (all zeros / all NaN / empty).
        return values
    return values.replace(0, nonzero_median)


# replace zeros with the median of the non-zero values of the same zone
df["m"] = df.groupby("zone")["m"].transform(_replace_zeros_with_median)

# Per-zone summary of column 'm' after the replacement, for comparison.
print("\n", df.groupby("zone")["m"].describe())

## Time Differences and Energy

In [None]:
# Enrich the dataframe step by step: first the time differences, then the
# energy, and finally bring the columns into their intended order.
df = add_time_differences(df)
df = add_energy(df)
df = reorder_columns(df)
# Store the zone as a categorical so that it is treated as a label and not
# used as a numerical variable.
df["zone"] = df["zone"].astype("category")

## Write data to disk

In [None]:
# Write the combined dataframe (both zones, distinguished by the 'zone'
# column) to a single CSV file, leaving out the index column.
output_path = "data/data.csv"
df.to_csv(output_path, index=False)

## Visualization
### Scatter

In [None]:
# One scatter plot per entry: the column to plot plus any extra keyword
# arguments. The last entry plots 'e' again, this time coloured by 'm' and
# with a colorbar.
scatter_specs = (
    ("m", {}),
    ("v", {}),
    ("e", {}),
    ("e", {"c": "m", "colorbar": True}),
)
for plot_column, plot_options in scatter_specs:
    scatter_plot(df, plot_column, **plot_options)
plt.show()

### Histogram

In [None]:
# Every (column, zone) combination, columns in the outer position so that
# both zones of one column are plotted back to back.
histogram_pairs = [
    (column_name, zone_id)
    for column_name in ("m", "v", "e")
    for zone_id in (1, 2)
]
for column_name, zone_id in histogram_pairs:
    # Restrict the data to the rows of the current zone before plotting.
    in_zone = df["zone"] == zone_id
    plot_histogram(df[in_zone], column_name, zone_id)

# Fit Distributions

In [None]:
# Run the project helper `best_distributions` on the full dataframe and keep
# its result in `fit`; the cells below hand `fit` to `print_fit`.
# NOTE(review): presumably this fits candidate distributions per zone and
# column — confirm in functions.py.
fit = best_distributions(df)

# Mass Prediction

In [None]:
# Report the stored fit for column "m"; the 1 is presumably the zone number
# (confirm against print_fit in functions.py).
print_fit(fit, 1, "m")

In [None]:
# Report the stored fit for column "m"; the 2 is presumably the zone number
# (confirm against print_fit in functions.py).
print_fit(fit, 2, "m")

In [None]:
# Report the stored fit for column "v"; the 1 is presumably the zone number
# (confirm against print_fit in functions.py).
print_fit(fit, 1, "v")

In [None]:
# Report the stored fit for column "v"; the 2 is presumably the zone number
# (confirm against print_fit in functions.py).
print_fit(fit, 2, "v")

In [None]:
# Report the stored fit for column "e"; the 1 is presumably the zone number
# (confirm against print_fit in functions.py).
print_fit(fit, 1, "e")

In [None]:
# Report the stored fit for column "e"; the 2 is presumably the zone number
# (confirm against print_fit in functions.py).
print_fit(fit, 2, "e")

In [None]:
# Report the stored fit for column "time_differences"; the 1 is presumably
# the zone number (confirm against print_fit in functions.py).
print_fit(fit, 1, "time_differences")

In [None]:
# Report the stored fit for column "time_differences"; the 2 is presumably
# the zone number (confirm against print_fit in functions.py).
print_fit(fit, 2, "time_differences")