# spark-bestfit Quick Start

This notebook demonstrates how to use spark-bestfit to fit statistical distributions to your data using Apache Spark.

## Setup

In [None]:
import numpy as np
from pyspark.sql import SparkSession

from spark_bestfit import DistributionFitter

# Start (or reuse) the Spark session that the rest of the notebook relies on
spark = (
    SparkSession.builder
    .appName("DistFitDemo")
    .getOrCreate()
)

## Generate Sample Data

We'll create data from a known distribution (normal) to verify the fitting works correctly.

In [3]:
# Seed NumPy's global RNG so the synthetic sample is reproducible
np.random.seed(42)
data = np.random.normal(loc=50, scale=10, size=100_000)

# One single-field tuple per observation; .tolist() yields native Python floats
rows = [(value,) for value in data.tolist()]
df = spark.createDataFrame(rows, ["value"])

# Preview the first five rows of the Spark DataFrame
df.show(5)

+------------------+
|             value|
+------------------+
| 54.96714153011233|
| 48.61735698828815|
| 56.47688538100692|
| 65.23029856408026|
|47.658466252766644|
+------------------+
only showing top 5 rows


Exception ignored in: <_io.BufferedWriter name=5>
Traceback (most recent call last):
  File "/Users/dustin/venv/lib/python3.13/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 200, in manager
BrokenPipeError: [Errno 32] Broken pipe


## Basic Usage

Fit distributions with default settings (only `max_distributions` is set explicitly).

In [None]:
# Pass the shared Spark session explicitly, matching the other fitters in
# this notebook, so every example is constructed the same way.
fitter = DistributionFitter(spark)

# Only `max_distributions` is overridden here; every other fit option is
# left at its default.
results = fitter.fit(df, column="value", max_distributions=25)

# Get best fit: best(n=1) returns a sequence, so take its first element
best = results.best(n=1)[0]
print(f"Best distribution: {best.distribution}")
print(f"SSE: {best.sse:.6f}")
print(f"Parameters: {best.parameters}")

In [6]:
# List the five fits returned by results.best(n=5), numbered from 1
top_five = results.best(n=5)
for rank, fit in enumerate(top_five, start=1):
    print(f"{rank}. {fit.distribution:20s} SSE={fit.sse:.6f}")

1. fatiguelife          SSE=0.000011
2. f                    SSE=0.000012
3. erlang               SSE=0.000014
4. crystalball          SSE=0.000015
5. exponnorm            SSE=0.000015




## Custom Fitting Parameters

You can customize fitting behavior by passing parameters directly to the `fit()` method; a `random_seed` can also be supplied when constructing the `DistributionFitter`.

In [None]:
# Fitter bound to the shared Spark session, with its own random seed
fitter_custom = DistributionFitter(spark, random_seed=123)

# Gather the non-default fit options in one place, then unpack them into fit()
custom_fit_options = {
    "bins": 100,
    "use_rice_rule": False,
    "enable_sampling": True,
    "sample_fraction": 0.5,
    "max_sample_size": 500_000,
    "max_distributions": 35,
}
results_custom = fitter_custom.fit(df, column="value", **custom_fit_options)

# Report the single best fit from the customized run
top_custom = results_custom.best(n=1)
best_custom = top_custom[0]
print(f"Best: {best_custom.distribution} (SSE={best_custom.sse:.6f})")

## Plotting

Visualize the fitted distribution against the data histogram.

In [None]:
# Appearance options for the figure, histogram, PDF curve and grid
plot_style = dict(
    figsize=(14, 8),
    dpi=100,
    show_histogram=True,
    histogram_alpha=0.6,
    pdf_linewidth=3,
    title_fontsize=16,
    label_fontsize=12,
    grid_alpha=0.3,
)

# Text shown on the figure
plot_labels = dict(
    title="Best Fit Distribution",
    xlabel="Value",
    ylabel="Density",
)

# Draw the best fit against the "value" column of the original data
fitter.plot(best, df, "value", **plot_style, **plot_labels)

## Non-Negative Data Example

When your data is strictly non-negative (e.g., prices, durations), pass `support_at_zero=True` to `fit()` to only fit appropriate distributions.

In [None]:
# Draw exponential (non-negative) samples; reproducibility relies on the
# NumPy seed set in the data-generation cell earlier in the notebook
data_exp = np.random.exponential(scale=5, size=100_000)
rows_exp = [(value,) for value in data_exp.tolist()]
df_exp = spark.createDataFrame(rows_exp, ["value"])

# support_at_zero=True restricts the fit to non-negative distributions
fitter_exp = DistributionFitter(spark)
results_exp = fitter_exp.fit(
    df_exp,
    column="value",
    bins=75,
    support_at_zero=True,
    enable_sampling=True,
    max_distributions=10,
)

print("Top 5 non-negative distributions:")
top_five_exp = results_exp.best(n=5)
for rank, fit in enumerate(top_five_exp, start=1):
    print(f"{rank}. {fit.distribution:20s} SSE={fit.sse:.6f}")

## Using Fitted Distributions

Once you have a fitted distribution, you can use it to generate samples or evaluate its PDF and CDF.

In [11]:
# Draw 10,000 values from the best-fit distribution with a fixed random state
samples = best.sample(size=10000, random_state=42)

# Print mean and standard deviation for the source data and for the draws
for label, values in (("Original data", data), ("Fitted samples", samples)):
    print(f"{label} - mean: {values.mean():.2f}, std: {values.std():.2f}")

Original data - mean: 50.01, std: 10.01
Fitted samples - mean: 50.15, std: 10.16


In [12]:
# Evaluate the fitted PDF and CDF on a small grid of points
x = np.array([30, 40, 50, 60, 70])
pdf_values = best.pdf(x)
cdf_values = best.cdf(x)

# Build one formatted line per grid point, then print them under a heading
report_lines = [
    f"  x={point}: PDF={density:.6f}, CDF={cumulative:.4f}"
    for point, density, cumulative in zip(x, pdf_values, cdf_values)
]
print("PDF and CDF values:")
for line in report_lines:
    print(line)

PDF and CDF values:
  x=30: PDF=0.005296, CDF=0.0214
  x=40: PDF=0.024297, CDF=0.1575
  x=50: PDF=0.039399, CDF=0.4974
  x=60: PDF=0.024089, CDF=0.8341
  x=70: PDF=0.005893, CDF=0.9731




## Export Results

In [None]:
# Materialize every fit result as a pandas DataFrame for further analysis
results_df = results.to_pandas()

# Show the first ten rows as the cell's output
results_df.head(n=10)

## Cleanup

In [None]:
# Shut down the Spark session started in the Setup cell now that the demo is done
spark.stop()