In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats as ss
import helpers
from scipy.stats import  expon, norm, gamma, beta, lognorm
from scipy.stats._continuous_distns import beta_gen, gamma_gen
import scipy.stats as ss
from datetime import datetime, timedelta

In [None]:
# Read the two per-zone CSV files into DataFrames.
zone1 = pd.read_csv(filepath_or_buffer='data/clean_1.csv')
zone2 = pd.read_csv(filepath_or_buffer='data/clean_2.csv')

In [None]:
def distribution_scores(data, distributions=None):
    """Rank candidate distributions by how well they fit ``data``.

    Each candidate is fitted with its ``fit`` method (maximum likelihood) and
    scored with the negative log-likelihood of ``data`` under the fitted
    parameters, so a lower score means a better fit.

    Parameters
    ----------
    data : array-like
        Sample of observations to fit.
    distributions : iterable of scipy.stats continuous distributions, optional
        Candidates to evaluate. Defaults to ``norm``, ``lognorm``, ``expon``
        and ``gamma``.

    Returns
    -------
    list of str
        One ``'<name>  score: <negative log-likelihood>'`` entry per
        candidate, ordered from lowest (best) to highest score.
    """
    if distributions is None:
        distributions = (norm, lognorm, expon, gamma)

    # Negative log-likelihood of the data for each fitted candidate,
    # keyed by the distribution object itself.
    nll_by_dist = {}
    for dist in distributions:
        params = dist.fit(data)
        nll_by_dist[dist] = -dist.logpdf(data, *params).sum()

    # Lowest negative log-likelihood first; ties keep their candidate order.
    ranked = sorted(nll_by_dist, key=nll_by_dist.get)

    return [f'{dist.name}  score: {nll_by_dist[dist]}' for dist in ranked]

def cdf_fit(data):
    """Plot the empirical CDF of ``data`` together with fitted candidate CDFs.

    The sample is drawn as a cumulative, density-normalised histogram. The
    normal, log-normal, exponential and gamma distributions are then each
    fitted to the sample and their CDFs overlaid. Every curve's legend entry
    shows the rounded negative log-likelihood of the fit (lower is better).

    Parameters
    ----------
    data : pandas.Series-like
        Sample to plot. Must provide ``min()``, ``max()`` and a ``name``
        attribute; the name is used as the x-axis label.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    distributions = [ss.norm, ss.lognorm, ss.expon, ss.gamma]

    # Draw on the current axes so any figure set up by the caller is respected.
    ax = plt.gca()

    # One bin per observation gives a fine-grained empirical CDF.
    ax.hist(data, bins=len(data), density=True, cumulative=True, alpha=0.5, label='Data')

    # Extend the evaluation grid 20% past the largest observation. Padding by
    # abs(max) keeps the extension to the right even when the maximum is
    # negative (max * 1.2 would move the limit left in that case).
    x_min, x_max = data.min(), data.max()
    x = np.linspace(x_min, x_max + 0.2 * abs(x_max), 100)

    for dist in distributions:
        params = dist.fit(data)
        ll = -dist.logpdf(data, *params).sum().round(0)
        ax.plot(x, dist(*params).cdf(x), label=f'{dist.name}, score: {ll}')

    # Axis labels and legend do not depend on the loop, so set them once.
    ax.set_xlabel(data.name)
    ax.set_ylabel('Cumulative probability')
    ax.legend()

    plt.show()

In [None]:
# Compare the empirical CDF of zone1's 'kg' column with the fitted candidate distributions.
cdf_fit(zone1['kg'])

In [None]:
# Compare the empirical CDF of zone2's 'kg' column with the fitted candidate distributions.
cdf_fit(zone2['kg'])

In [None]:
# Compare the empirical CDF of zone1's 'm/s' column with the fitted candidate distributions.
cdf_fit(zone1['m/s'])

In [None]:
# Compare the empirical CDF of zone2's 'm/s' column with the fitted candidate distributions.
cdf_fit(zone2['m/s'])

In [None]:
# Compare the empirical CDF of zone1's 'timediv h' column with the fitted candidate distributions.
cdf_fit(zone1['timediv h'])

In [None]:
# Compare the empirical CDF of zone2's 'timediv h' column with the fitted candidate distributions.
cdf_fit(zone2['timediv h'])

In [None]:

# Five sample points lying on the identity line
x = list(range(1, 6))
y = list(x)
plt.plot(x, y)

# Caption anchored at data coordinates (2, -1), i.e. below the plotted y-range
plt.text(x=2, y=-1, s="This is some text", fontsize=12, ha='center')

# Render the figure
plt.show()
