<a href="https://colab.research.google.com/github/rmhyps1/statistics/blob/main/statistik1TDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import math

In [None]:
# Frequency-distribution table builder

def distribusi_frekuensi(x, k="sturges"):
    """Build a frequency-distribution table for the observations in ``x``.

    Parameters
    ----------
    x : sequence of numbers
        Observations to bin; must support ``len``.
    k : str, int or sequence, default "sturges"
        Forwarded to ``np.histogram_bin_edges`` as ``bins``: a rule name,
        a number of classes, or an explicit sequence of class edges.

    Returns
    -------
    tuple
        ``(rows, edges, mid, f, fr, fk, frk)`` where ``rows`` is a list of
        dicts (keys ``interval``, ``midpoint``, ``f``, ``fr``, ``fk``,
        ``frk``), ``edges`` are the class edges, ``mid`` is a list of class
        midpoints, ``f`` the class counts, ``fr`` the relative frequencies,
        ``fk`` the cumulative counts and ``frk`` the cumulative relative
        frequencies. Seven empty lists are returned when ``x`` is empty.
    """
    n = len(x)
    if n == 0:
        return [], [], [], [], [], [], []

    # Let numpy derive the class edges; from here on k is the class count.
    edges = np.histogram_bin_edges(x, bins=k)
    k = len(edges) - 1

    # Each midpoint comes from its own pair of edges, so explicit
    # (possibly non-uniform) edges give correct midpoints as well.
    mid = list((edges[:-1] + edges[1:]) / 2.0)
    f, _ = np.histogram(x, bins=edges)

    fr = f / n
    fk = np.cumsum(f)
    frk = fk / n

    rows = []
    for q in range(k):
        left = edges[q]
        right = edges[q + 1]
        # Only the last class is closed on the right, matching np.histogram.
        label = f"[{left}, {right})" if q < k - 1 else f"[{left}, {right}]"
        rows.append({
            "interval": label,
            "midpoint": mid[q],
            "f": int(f[q]),
            "fr": fr[q],
            "fk": int(fk[q]),
            "frk": frk[q]
        })

    return rows, edges, mid, f, fr, fk, frk

In [None]:
# Experiment using a body-weight dataset
data = [128,63,97,134,133,136,125,110,118,94,
        76,84,132,105,80,87,100,77,120,109,
        90,72,103,78,94,118,117,80,140,94]

n = len(data)

# Class edges and counts using Sturges' rule
edges = np.histogram_bin_edges(data, bins="sturges")
counts, edges = np.histogram(data, bins=edges)

# Midpoint of each class
mid = (edges[:-1] + edges[1:]) / 2.0

# Cumulative and relative frequencies
fk = np.cumsum(counts)
fr = counts / n
frk = fk / n

# Build the table rows
rows = []
last_idx = len(counts) - 1
for idx, (left, right, m, c, r, cum, rcum) in enumerate(
        zip(edges[:-1], edges[1:], mid, counts, fr, fk, frk)):
    left_str = f"{left:g}"
    right_str = f"{right:g}"
    # np.histogram closes the final class on the right (the maximum value is
    # counted there), so only that class gets a "]" in its label.
    closing = "]" if idx == last_idx else ")"
    label = f"[{left_str}, {right_str}{closing}"
    rows.append({
        "interval": label,
        "midpoint": m,
        "f": int(c),
        "fr": float(r),
        "fk": int(cum),
        "frk": float(rcum)
    })
df = pd.DataFrame(rows)

# Fixed-width string version of the table for tidy printing
df_print = pd.DataFrame({
    "Interval": df["interval"],
    "Midpoint": df["midpoint"].map(lambda x: f"{x:8.1f}"),
    "f": df["f"].map(lambda x: f"{x:4d}"),
    "fr": df["fr"].map(lambda x: f"{x:8.4f}"),
    "fk": df["fk"].map(lambda x: f"{x:4d}"),
    "frk": df["frk"].map(lambda x: f"{x:8.4f}")
})

# Print the summary
print(f"\nn = {n}, k = {len(counts)}")  # k is the number of classes from the histogram
interval_widths = np.diff(edges)        # width of every class
print(f"Interval width i ≈ {interval_widths[0]:.2f}\n")  # report the first width

print(df_print.to_string(index=False))


n = 30, k = 6
Interval width i ≈ 12.83

          Interval Midpoint    f       fr   fk      frk
     [63, 75.8333)     69.4    2   0.0667    2   0.0667
[75.8333, 88.6667)     82.2    7   0.2333    9   0.3000
  [88.6667, 101.5)     95.1    6   0.2000   15   0.5000
  [101.5, 114.333)    107.9    4   0.1333   19   0.6333
[114.333, 127.167)    120.8    5   0.1667   24   0.8000
    [127.167, 140)    133.6    6   0.2000   30   1.0000


In [None]:
# Load the CSV file (obtained from Kaggle) and discard the listed columns
drop_cols = ["gap", "npsn", "kab.kota", "ranking"]  # dropped only when present
data = (
    pd.read_csv("/content/drive/MyDrive/top2022.csv")
    .drop(columns=drop_cols, errors="ignore")
)

# Non-missing "Nilai.Total" entries as a plain Python list
values = data["Nilai.Total"].dropna().tolist()

# Narrow the frame to three columns and preview the first rows
data = data[["Nilai.Total", "Provinsi", "Jenis"]]
data.head()

Unnamed: 0,Nilai.Total,Provinsi,Jenis
0,666.494,BANTEN,MA
1,641.482,JAWA TIMUR,SMA
2,640.747,JAWA TENGAH,SMA
3,637.499,JAWA TENGAH,MA
4,635.347,DKI JAKARTA,SMA


In [None]:
n = len(values)

# Class edges follow Sturges' rule; counts are taken over those same edges
edges = np.histogram_bin_edges(values, bins="sturges")
counts, edges = np.histogram(values, bins=edges)

# Midpoints, then cumulative and relative frequencies
mid = (edges[:-1] + edges[1:]) / 2.0
fk = np.cumsum(counts)
fr = counts / n
frk = fk / n

# Assemble one record per class
rows = []
upper_limit = edges[-1]
for pos in range(len(counts)):
    lower, upper = edges[pos], edges[pos + 1]
    left_str = f"{lower:g}"
    right_str = f"{upper:g}"
    # A class whose right edge is below the overall maximum edge is
    # half-open; otherwise it is written as closed.
    if upper < upper_limit:
        label = f"[{left_str}, {right_str})"
    else:
        label = f"[{left_str}, {right_str}]"
    record = {
        "interval": label,
        "midpoint": mid[pos],
        "f": int(counts[pos]),
        "fr": float(fr[pos]),
        "fk": int(fk[pos]),
        "frk": float(frk[pos])
    }
    rows.append(record)

df = pd.DataFrame(rows)

# Fixed-width string rendering of every column for tidy printing
df_print = pd.DataFrame({
    "Interval": df["interval"],
    "Midpoint": df["midpoint"].map(lambda v: f"{v:8.1f}"),
    "f": df["f"].map(lambda v: f"{v:4d}"),
    "fr": df["fr"].map(lambda v: f"{v:8.4f}"),
    "fk": df["fk"].map(lambda v: f"{v:4d}"),
    "frk": df["frk"].map(lambda v: f"{v:8.4f}")
})

# Summary: report the width of the first class (1 if there are no classes)
interval_widths = np.diff(edges)
if len(interval_widths) > 0:
    i = interval_widths[0]
else:
    i = 1

print(f"\nn = {n}, k = {len(counts)}")
print(f"Interval width i ≈ {i:.2f}\n")
print(df_print.to_string(index=False))


n = 1000, k = 11
Interval width i ≈ 13.95

          Interval Midpoint    f       fr   fk      frk
[513.023, 526.975)    520.0  376   0.3760  376   0.3760
[526.975, 540.927)    534.0  257   0.2570  633   0.6330
[540.927, 554.879)    547.9  122   0.1220  755   0.7550
[554.879, 568.831)    561.9  110   0.1100  865   0.8650
[568.831, 582.783)    575.8   48   0.0480  913   0.9130
[582.783, 596.734)    589.8   41   0.0410  954   0.9540
[596.734, 610.686)    603.7   18   0.0180  972   0.9720
[610.686, 624.638)    617.7   13   0.0130  985   0.9850
 [624.638, 638.59)    631.6   12   0.0120  997   0.9970
 [638.59, 652.542)    645.6    2   0.0020  999   0.9990
[652.542, 666.494]    659.5    1   0.0010 1000   1.0000
