In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats as ss
from scipy.stats import  expon, norm, gamma, beta, lognorm
from scipy.stats._continuous_distns import beta_gen, gamma_gen
from scipy import stats
from datetime import datetime, timedelta

In [None]:
# Load the zone 1 measurements and derive kinetic energy and the time gap
# between consecutive events.
zone1 = (
    pd.read_csv('data/out_1.csv', usecols=["Datum", "Uhrzeit", "Masse [kg]", "Geschwindigkeit [m/s]"])
    # Rename by label rather than by position: read_csv keeps the file's own
    # column order regardless of the order listed in usecols, so a positional
    # rename would silently mislabel columns if the CSV layout differed.
    .rename(columns={
        "Datum": "date",
        "Uhrzeit": "time",
        "Masse [kg]": "kg",
        "Geschwindigkeit [m/s]": "m/s",
    })
    # Drop rows in which every selected column is empty.
    .dropna(how='all')
)

# Kinetic energy 0.5 * m * v^2 (joules), divided by 1000 to express it in kJ.
zone1['kj'] = 0.5 * zone1['kg'] * (zone1['m/s']**2) / 1000
zone1['datetime'] = pd.to_datetime(zone1['date'] + ' ' + zone1['time'])

# Gap to the previous row, rounded to whole hours. The first row has no
# predecessor, so its gap is set to 0 here and replaced below.
zone1['timediv h'] = (
    zone1['datetime'].diff()
    .fillna(pd.Timedelta(seconds=0))
    .dt.total_seconds()
    .div(3600)
    .round()
    .astype('int64')
)

# Give the first remaining row the median gap. Address it through the actual
# first index label: label 0 may have been removed by dropna, in which case
# `.loc[0, ...]` would append a new, otherwise empty row instead.
if not zone1.empty:
    zone1.loc[zone1.index[0], 'timediv h'] = zone1['timediv h'].median()


In [None]:
# Load the zone 2 measurements and derive kinetic energy and the time gap
# between consecutive events.
zone2 = (
    pd.read_csv('data/out_2.csv', usecols=["Date", "Uhrzeit", "m [kg]", "v [m/s]"])
    # Rename by label rather than by position: read_csv keeps the file's own
    # column order regardless of the order listed in usecols, so a positional
    # rename would silently mislabel columns if the CSV layout differed.
    .rename(columns={
        "Date": "date",
        "Uhrzeit": "time",
        "m [kg]": "kg",
        "v [m/s]": "m/s",
    })
    # Drop rows in which every selected column is empty.
    .dropna(how='all')
)

# Replace zero masses with the median mass.
# NOTE(review): the median is taken over all rows, including the zero entries
# being replaced — confirm that this is intended.
zone2.loc[zone2['kg'] == 0.0, 'kg'] = zone2['kg'].median()

# Kinetic energy 0.5 * m * v^2 (joules), divided by 1000 to express it in kJ.
zone2['kj'] = 0.5 * zone2['kg'] * (zone2['m/s']**2) / 1000
zone2['datetime'] = pd.to_datetime(zone2['date'] + ' ' + zone2['time'])

# Gap to the previous row, rounded to whole hours. The first row has no
# predecessor, so its gap is set to 0 here and replaced below.
zone2['timediv h'] = (
    zone2['datetime'].diff()
    .fillna(pd.Timedelta(seconds=0))
    .dt.total_seconds()
    .div(3600)
    .round()
    .astype('int64')
)

# Give the first remaining row the median gap. Address it through the actual
# first index label: label 0 may have been removed by dropna, in which case
# `.loc[0, ...]` would append a new, otherwise empty row instead.
if not zone2.empty:
    zone2.loc[zone2.index[0], 'timediv h'] = zone2['timediv h'].median()

In [None]:
# Candidate distribution families.
distributions = [norm, lognorm, expon, gamma, beta]

# Parameter bounds for the fit. The normal distribution has no shape
# parameters, so per the scipy.stats.fit docs the tuples apply to loc and
# scale, in that order.
# NOTE(review): the lower bound of -100 on scale is below the valid range of a
# scale parameter — confirm the intended bounds.
bounds = [(0, 10000), (-100, 100)]

# Fit a normal distribution to the zone 1 masses and show the fit.
res = ss.fit(norm, zone1['kg'], bounds=bounds)
res.plot()
plt.show()


In [None]:
# Evenly spaced grid spanning the observed zone 1 mass range.
x = np.linspace(zone1['kg'].min(), zone1['kg'].max(), num=1000)

# Cumulative, density-normalised histogram of the zone 1 masses.
plt.hist(x=zone1['kg'], bins=50, cumulative=True, density=True, alpha=0.5)
plt.gca().set(xlabel='kg', ylabel='Cumulative Probability')
plt.show()

In [None]:
def plot_fit(data):
    """Plot the cumulative histogram of ``data`` with several fitted CDFs.

    A normal, log-normal, exponential and gamma distribution are each fitted
    to ``data`` with their ``fit`` method and drawn over a cumulative,
    density-normalised histogram. Every curve is labelled with the name of
    its distribution so the legend can tell them apart.

    Parameters
    ----------
    data : pandas.Series or numpy.ndarray
        One-dimensional sample; it must provide ``min()`` and ``max()``.
        If it has a ``name`` attribute (as a Series does), that name is used
        as the x-axis label.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    distributions = [ss.norm, ss.lognorm, ss.expon, ss.gamma]

    # One bin per observation for the cumulative histogram of the sample.
    plt.hist(data, bins=len(data), density=True, cumulative=True, alpha=0.5, label='Data')
    x = np.linspace(data.min(), data.max(), 1000)

    for dist in distributions:
        params = dist.fit(data)
        # Label each curve with its own distribution name; a shared label
        # would make the legend entries indistinguishable.
        plt.plot(x, dist(*params).cdf(x), label=dist.name)

    # Axis labels and legend only need to be set once, after all curves exist.
    axis_name = getattr(data, 'name', None)
    plt.xlabel('' if axis_name is None else axis_name)
    plt.ylabel('Cumulative probability')
    plt.legend()

    plt.show()
    
# Draw the cumulative histogram of the zone 1 masses together with the fitted CDFs.
plot_fit(zone1['kg'])

In [None]:

def plot_fit(data, dists=[ss.norm, ss.expon, ss.lognorm, ss.gamma]):
    """Fit several distributions to ``data``, rank them and plot their CDFs.

    Each distribution in ``dists`` is fitted with its ``fit`` method and
    scored by the summed log-density of ``data`` under the fitted parameters.
    The fitted CDFs are drawn over a cumulative, density-normalised histogram
    of the sample.

    Parameters
    ----------
    data : pandas.Series or numpy.ndarray
        One-dimensional sample; it must provide ``min()`` and ``max()``.
        If it has a ``name`` attribute, that name labels the x-axis;
        otherwise the label falls back to ``'kg'``.
    dists : iterable of scipy.stats continuous distributions, optional
        Candidates to fit. The default list is only read, never mutated.

    Returns
    -------
    pandas.DataFrame
        One row per distribution with the columns ``'distribution'`` (name),
        ``'params'`` (tuple returned by ``fit``) and ``'log-likelihood'``,
        sorted by log-likelihood in descending order. The index holds each
        distribution's position in ``dists``.
    """
    candidates = list(dists)

    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame row by row is quadratic.
    records = []
    for dist in candidates:
        params = dist.fit(data)
        records.append({
            # scipy distribution instances expose `.name`, not `__name__`.
            'distribution': getattr(dist, 'name', None) or type(dist).__name__,
            'params': params,
            'log-likelihood': np.sum(dist.logpdf(data, *params)),
        })

    fit_results = pd.DataFrame(records, columns=['distribution', 'params', 'log-likelihood'])

    # Best fit (highest log-likelihood) first.
    fit_results = fit_results.sort_values('log-likelihood', ascending=False)

    # Plot the CDF of the data and the fitted distributions.
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.hist(data, bins=50, density=True, cumulative=True, alpha=0.5, label='Data')
    x = np.linspace(data.min(), data.max(), 1000)
    # The index labels are positions in `candidates`, so the distribution
    # object is reused directly rather than looked up by name in scipy.stats
    # (which would fail for distributions defined elsewhere).
    for position in fit_results.index:
        record = records[position]
        frozen = candidates[position](*record['params'])
        ax.plot(x, frozen.cdf(x), label=record['distribution'])

    axis_name = getattr(data, 'name', None)
    ax.set_xlabel('kg' if axis_name is None else axis_name)
    ax.set_ylabel('Cumulative probability')
    ax.legend()
    plt.show()

    return fit_results
