<a href="https://colab.research.google.com/github/rmhyps1/statistics/blob/main/TUGAS8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import math
import glob
import numpy as np
import pandas as pd
from numpy.linalg import inv, pinv
from scipy import stats
import kagglehub

# Download the dataset with kagglehub; the returned location is searched
# recursively for CSV files below.
path = kagglehub.dataset_download("nelgiriyewithana/most-streamed-spotify-songs-2024")

# glob yields matches in arbitrary (filesystem-dependent) order. Sorting makes
# the choice of "first CSV" reproducible if the dataset holds several files.
csv_files = sorted(glob.glob(os.path.join(path, "**", "*.csv"), recursive=True))

if not csv_files:
    raise FileNotFoundError("No CSV file found in the downloaded dataset.")
CSV_PATH = csv_files[0]

def read_csv_fallback(path):
    """Read a CSV file, trying several text encodings in turn.

    The encodings are attempted in the order utf-8, cp1252, latin1 and the
    first one that decodes the file wins.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file.

    Raises
    ------
    UnicodeError
        The last decoding error, if every encoding fails.
    Exception
        Any non-encoding error from ``pd.read_csv`` (missing file, parser
        error, ...) propagates immediately; retrying with another encoding
        cannot fix those.
    """
    # latin1 maps every possible byte, so it can never fail and must be the
    # last resort: anything listed after it would be unreachable. cp1252 goes
    # first so that Windows punctuation (euro sign, curly quotes) is decoded
    # to the intended characters instead of C1 control codes.
    encs = ["utf-8", "cp1252", "latin1"]
    last_exc = None
    for e in encs:
        try:
            return pd.read_csv(path, encoding=e)
        except UnicodeError as ex:
            # Only a decoding failure justifies trying the next encoding.
            last_exc = ex
    raise last_exc

# Load the CSV and discard columns that contain no values at all.
df = read_csv_fallback(CSV_PATH).dropna(axis=1, how='all')

# Text columns may hold numbers written with thousands separators
# (e.g. "1,234"). Strip the commas, attempt a numeric parse, and adopt the
# parsed column only when more than half of its entries are valid numbers.
object_columns = [name for name in df.columns if df[name].dtype == 'object']
for col in object_columns:
    try:
        without_commas = df[col].astype(str).str.replace(',', '', regex=False)
        parsed = pd.to_numeric(without_commas, errors='coerce')
        share_numeric = parsed.notna().mean()
        if share_numeric > 0.5:
            df[col] = parsed
    except Exception:
        # Best effort: a column that cannot be processed stays as it is.
        continue

# Every numeric column takes part in the regression: the last one is the
# response, all preceding ones are predictors.
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if not num_cols:
    raise ValueError("No numeric columns found.")

*predictor_cols, response_col = num_cols

X_raw = df[predictor_cols].apply(pd.to_numeric, errors="coerce")
Y_raw = pd.to_numeric(df[response_col], errors="coerce")

# Keep only the rows where the response and every predictor are present.
predictors_present = X_raw.notna().all(axis=1)
response_present = Y_raw.notna()
valid_mask = predictors_present & response_present

X_clean = X_raw.loc[valid_mask].to_numpy(dtype=float)
Y_clean = Y_raw.loc[valid_mask].to_numpy(dtype=float).reshape(-1, 1)

# n_obs = number of complete rows, p = number of predictors.
n_obs, p = X_clean.shape

if n_obs == 0:
    raise ValueError("Observations is 0.")

# --- Ordinary least squares via the normal equations -----------------------
# Design matrix: a leading column of ones (intercept) followed by the
# predictors, so X has shape (n_obs, p + 1).
ones = np.ones((n_obs,1))
X = np.hstack([ones, X_clean])
Xt = X.T
XtX = Xt @ X

try:
    XtX_inv = inv(XtX)
except np.linalg.LinAlgError:
    # X'X cannot be inverted; fall back to the Moore-Penrose pseudo-inverse.
    # Only the linear-algebra failure is caught, so unrelated errors are no
    # longer hidden behind the fallback.
    # NOTE(review): the degrees of freedom below still use p + 1 parameters
    # and are not adjusted for a rank-deficient design.
    XtX_inv = pinv(XtX)

# beta_hat = (X'X)^-1 X'Y, shape (p + 1, 1).
XtY = Xt @ Y_clean
beta_hat = XtX_inv @ XtY

# --- ANOVA decomposition ---------------------------------------------------
Y_hat = X @ beta_hat
residuals = Y_clean - Y_hat
SSE = float((residuals**2).sum())            # error (residual) sum of squares
y_mean = float(Y_clean.mean())
SST = float(((Y_clean - y_mean)**2).sum())   # total sum of squares
SSR = float(SST - SSE)                       # regression sum of squares

df_model = p
df_error = n_obs - (p+1)
df_total = n_obs - 1

# Mean squares and the overall F test; every quantity degrades to NaN when
# its degrees of freedom are not positive (or MSE is zero).
MSTr = SSR / df_model if df_model>0 else float("nan")
MSE = SSE / df_error if df_error>0 else float("nan")
F_stat = MSTr / MSE if (not math.isnan(MSTr) and not math.isnan(MSE) and MSE!=0) else float("nan")
p_value = stats.f.sf(F_stat, df_model, df_error) if (not math.isnan(F_stat) and df_model>0 and df_error>0) else float("nan")

# --- Per-coefficient inference ---------------------------------------------
# Cov(beta_hat) = MSE * (X'X)^-1; standard errors are the square roots of its
# diagonal, and the p-values are two-sided t tests with df_error degrees of
# freedom.
cov_beta = MSE * XtX_inv if df_error>0 else np.full(XtX_inv.shape, np.nan)
se_beta = np.sqrt(np.diag(cov_beta)).reshape(-1,1)
t_stats = beta_hat / se_beta
p_vals_beta = 2 * stats.t.sf(np.abs(t_stats).reshape(-1), df_error) if df_error>0 else np.full((p+1,), np.nan)

# Coefficient table: one row per model term, intercept first.
col_names = ["Intercept", *predictor_cols]
coef_table = pd.DataFrame(dict(
    variable=col_names,
    beta=beta_hat.flatten(),
    std_error=se_beta.flatten(),
    t_stat=t_stats.flatten(),
    p_value=p_vals_beta,
))

# Text report: data source, sample size, ANOVA quantities, then coefficients.
print("File used:", CSV_PATH)
print("Observations:", n_obs, "Predictors:", p)
for label, value in (("SST:", SST), ("SSR:", SSR), ("SSE:", SSE)):
    print(label, value)
print("df_model:", df_model, "df_error:", df_error, "df_total:", df_total)
print("MSTr:", MSTr, "MSE:", MSE)
print("F-statistic:", F_stat, "p-value:", p_value)
print(coef_table.to_string(index=False))

Downloading from https://www.kaggle.com/api/v1/datasets/download/nelgiriyewithana/most-streamed-spotify-songs-2024?dataset_version_number=1...


100%|██████████| 496k/496k [00:00<00:00, 891kB/s]

Extracting files...





File used: /root/.cache/kagglehub/datasets/nelgiriyewithana/most-streamed-spotify-songs-2024/versions/1/Most Streamed Spotify Songs 2024.csv
Observations: 1675 Predictors: 21
SST: 398.865671641791
SSR: 84.32337473608061
SSE: 314.5422969057104
df_model: 21 df_error: 1653 df_total: 1674
MSTr: 4.0153987969562195 MSE: 0.19028572105608613
F-statistic: 21.101944878841923 p-value: 1.1006399819994437e-70
                  variable          beta    std_error    t_stat      p_value
                 Intercept  8.705451e-01 8.893552e-02  9.788498 4.940246e-22
             All Time Rank -3.404069e-05 1.111894e-05 -3.061506 2.237753e-03
               Track Score  2.283105e-03 3.938209e-04  5.797317 8.055211e-09
           Spotify Streams -1.620299e-10 5.275258e-11 -3.071506 2.164550e-03
    Spotify Playlist Count  3.487656e-06 2.931625e-07 11.896665 2.256258e-31
    Spotify Playlist Reach -2.586135e-09 6.622685e-10 -3.904964 9.803090e-05
        Spotify Popularity -7.217574e-03 1.256282e-03 -5.7451