<a href="https://colab.research.google.com/github/rmhyps1/statistics/blob/main/PROGRES_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

In [55]:
# Rumus distribusi frekuensi

def distribusi_frekuensi(x, k="sturges"):
    """Build a frequency-distribution table for a one-dimensional sample.

    Parameters
    ----------
    x : sequence of numbers
        The sample. ``len(x)`` is used as the sample size ``n``.
    k : int, str or sequence of scalars, default "sturges"
        Forwarded to ``np.histogram_bin_edges`` as ``bins``: a number of
        classes, the name of a binning rule, or explicit class edges
        (which may be unevenly spaced).

    Returns
    -------
    tuple
        ``(rows, edges, mid, f, fr, fk, frk)`` where
        ``rows`` is a list of dicts (keys ``interval``, ``midpoint``, ``f``,
        ``fr``, ``fk``, ``frk``), ``edges`` the class edges, ``mid`` the list
        of class midpoints, ``f`` the class counts, ``fr`` the counts divided
        by ``n``, ``fk`` the cumulative counts and ``frk`` the cumulative
        counts divided by ``n``.
        For an empty ``x`` seven empty lists are returned.
    """
    n = len(x)
    if n == 0:
        return [], [], [], [], [], [], []

    # Let numpy derive the class edges from the requested rule / count / edges.
    edges = np.histogram_bin_edges(x, bins=k)
    k = len(edges) - 1

    # Each midpoint comes from its own pair of edges, so explicitly supplied
    # non-uniform edges are handled correctly (not only equal-width classes).
    mid = list((edges[:-1] + edges[1:]) / 2.0)
    f, _ = np.histogram(x, bins=edges)

    fr = f / n
    fk = np.cumsum(f)
    frk = fk / n

    rows = []
    for q in range(k):
        left = edges[q]
        right = edges[q + 1]
        # np.histogram bins are half-open except the last, which is closed.
        label = f"[{left}, {right})" if q < k - 1 else f"[{left}, {right}]"
        rows.append({
            "interval": label,
            "midpoint": mid[q],
            "f": int(f[q]),
            "fr": fr[q],
            "fk": int(fk[q]),
            "frk": frk[q]
        })

    return rows, edges, mid, f, fr, fk, frk

In [56]:
# Trial run on a body-weight data set
data = [128,63,97,134,133,136,125,110,118,94,
        76,84,132,105,80,87,100,77,120,109,
        90,72,103,78,94,118,117,80,140,94]

n = len(data)

# Class edges and counts using the Sturges rule
edges = np.histogram_bin_edges(data, bins="sturges")
counts, edges = np.histogram(data, bins=edges)

# Midpoint of every class
mid = (edges[:-1] + edges[1:]) / 2.0

# Relative and cumulative frequencies
fk = np.cumsum(counts)
fr = counts / n
frk = fk / n

# Build the table rows
rows = []
last_idx = len(counts) - 1
for idx, (left, right, m, c, r, cum, rcum) in enumerate(
        zip(edges[:-1], edges[1:], mid, counts, fr, fk, frk)):
    left_str = f"{left:g}"
    right_str = f"{right:g}"
    # np.histogram uses half-open bins [a, b) except for the last one, which
    # is closed [a, b] (the maximum is counted there), so label it that way.
    closing = "]" if idx == last_idx else ")"
    label = f"[{left_str}, {right_str}{closing}"
    rows.append({
        "interval": label,
        "midpoint": m,
        "f": int(c),
        "fr": float(r),
        "fk": int(cum),
        "frk": float(rcum)
    })
df = pd.DataFrame(rows)

# Fixed-width string formatting for a tidy printout
df_print = pd.DataFrame({
    "Interval": df["interval"],
    "Midpoint": df["midpoint"].map(lambda x: f"{x:8.1f}"),
    "f": df["f"].map(lambda x: f"{x:4d}"),
    "fr": df["fr"].map(lambda x: f"{x:8.4f}"),
    "fk": df["fk"].map(lambda x: f"{x:4d}"),
    "frk": df["frk"].map(lambda x: f"{x:8.4f}")
})

# Print a short summary
print(f"\nn = {n}, k = {len(counts)}")  # k as chosen by the histogram (Sturges rule)
interval_widths = np.diff(edges)        # width of every class
print(f"Interval width i ≈ {interval_widths[0]:.2f}\n")  # report the first width

print(df_print.to_string(index=False))


n = 30, k = 6
Interval width i ≈ 12.83

          Interval Midpoint    f       fr   fk      frk
     [63, 75.8333)     69.4    2   0.0667    2   0.0667
[75.8333, 88.6667)     82.2    7   0.2333    9   0.3000
  [88.6667, 101.5)     95.1    6   0.2000   15   0.5000
  [101.5, 114.333)    107.9    4   0.1333   19   0.6333
[114.333, 127.167)    120.8    5   0.1667   24   0.8000
    [127.167, 140)    133.6    6   0.2000   30   1.0000


In [57]:
import pandas as pd
import numpy as np

# 1. Load the CSV export
file_path = '/content/Most Streamed Spotify Songs 2024.csv'
df = pd.read_csv(file_path, encoding='latin1')

# 2. Column to analyse
kolom_numerik = 'Spotify Streams'

# 3. Clean-up: remove the commas so the values can be cast to float
df[kolom_numerik] = (
    df[kolom_numerik]
    .replace(",", "", regex=True)
    .astype(float)
)

# 4. Keep the numeric values only, without NaN
data = df[kolom_numerik].dropna().to_numpy()

# 5. Number of classes k from the Sturges rule (1 + 3.3 log10 n, truncated)
n = len(data)
k = int(1 + 3.3 * np.log10(n))

# 6. Minimum, maximum, range and class width (rounded up)
min_val = data.min()
max_val = data.max()
range_val = max_val - min_val
interval = np.ceil(range_val / k)

# 7. Class boundaries: k + 1 edges starting at the minimum
bins = [min_val + step * interval for step in range(k + 1)]

# 8. Count observations per class
freq, edges = np.histogram(data, bins=bins)

# 9. Class midpoints
midpoints = 0.5 * (edges[:-1] + edges[1:])

# 10. Relative and cumulative frequencies
rel_freq = freq / freq.sum()
cum_freq = freq.cumsum()

# 11. Assemble the result table
tabel = pd.DataFrame({
    'Interval': [f'{int(lo)} - {int(hi)}' for lo, hi in zip(edges[:-1], edges[1:])],
    'Midpoint': midpoints.round(2),
    'Frequency': freq,
    'Relative Frequency': rel_freq.round(4),
    'Cumulative Frequency': cum_freq
})

print("Distribusi Frekuensi Spotify Streams")
print(tabel)


Distribusi Frekuensi Spotify Streams
                   Interval      Midpoint  Frequency  Relative Frequency  \
0          1071 - 329344737  1.646729e+08       2578              0.5745   
1     329344737 - 658688403  4.940166e+08        845              0.1883   
2     658688403 - 988032069  8.233602e+08        451              0.1005   
3    988032069 - 1317375735  1.152704e+09        239              0.0533   
4   1317375735 - 1646719401  1.482048e+09        168              0.0374   
5   1646719401 - 1976063067  1.811391e+09        101              0.0225   
6   1976063067 - 2305406733  2.140735e+09         59              0.0131   
7   2305406733 - 2634750399  2.470079e+09         24              0.0053   
8   2634750399 - 2964094065  2.799422e+09          6              0.0013   
9   2964094065 - 3293437731  3.128766e+09          8              0.0018   
10  3293437731 - 3622781397  3.458110e+09          4              0.0009   
11  3622781397 - 3952125063  3.787453e+09          