<a href="https://colab.research.google.com/github/rmhyps1/statistics/blob/main/TUGAS8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os, math, glob
import numpy as np
import pandas as pd
from numpy.linalg import inv, pinv
from scipy import stats

# Locate the dataset: in each existing search directory, look for files whose
# name starts with BASE_NAME and carries an extension, then pick one to load.
BASE_NAME = "Most Streamed Spotify Songs 2024"
CSV_PATH = None

search_dirs = [os.getcwd(), "/content", "/mnt/data"]
candidates = []
for d in search_dirs:
    if not os.path.isdir(d):
        continue
    # Escape glob metacharacters so a directory such as "run[1]" is matched
    # literally instead of being read as a character class.
    prefix = os.path.join(glob.escape(d), glob.escape(BASE_NAME))
    # Exact-name matches ("<BASE_NAME>.<ext>") are listed before looser ones
    # ("<BASE_NAME> (1).<ext>"). Each result list is sorted because glob
    # returns names in arbitrary, filesystem-dependent order.
    candidates.extend(sorted(glob.glob(prefix + ".*")))
    candidates.extend(sorted(glob.glob(prefix + "*.*")))

# The looser pattern also matches everything the exact one does, and the
# working directory may coincide with another search directory, so drop
# repeated paths while keeping first-seen order.
candidates = list(dict.fromkeys(candidates))
if not candidates:
    raise FileNotFoundError(f"No file found with base name '{BASE_NAME}' in {search_dirs}")

# Prefer a CSV file, then an Excel workbook; only when neither exists fall
# back to the first match of any other extension.
csv_path = [c for c in candidates if c.lower().endswith(".csv")]
excel_path = [c for c in candidates if c.lower().endswith((".xls", ".xlsx"))]
CSV_PATH = (csv_path or excel_path or candidates)[0]

def read_csv_fallback(path):
    """Read a CSV file, trying several text encodings in turn.

    The encodings are attempted in the order UTF-8, Windows-1252, Latin-1 and
    the first one that decodes the file wins.

    Args:
        path: Location of the CSV file; handed to ``pd.read_csv`` unchanged.

    Returns:
        The ``DataFrame`` parsed with the first encoding that succeeds.

    Raises:
        UnicodeError: If none of the encodings can decode the file.
        Exception: Any non-encoding failure raised by ``pd.read_csv`` (for
            example a missing file) propagates immediately, since retrying
            with another encoding cannot fix it.
    """
    # Latin-1 assigns a character to every possible byte, so it never fails
    # to decode and must come last; placed earlier it would make any
    # encoding listed after it unreachable.
    encs = ("utf-8", "cp1252", "latin1")
    last_exc = None
    for e in encs:
        try:
            return pd.read_csv(path, encoding=e)
        except UnicodeError as ex:
            # Only a decoding failure justifies trying the next encoding.
            last_exc = ex
    raise last_exc

# Load the located file: Excel workbooks go through pandas' Excel reader,
# anything else is parsed as CSV with the encoding-fallback reader.
is_excel = str(CSV_PATH).lower().endswith((".xls", ".xlsx"))
df = pd.read_excel(CSV_PATH) if is_excel else read_csv_fallback(CSV_PATH)

# Choose the regression variables: the last numeric column is the response
# and every numeric column before it is a predictor.
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if not num_cols:
    # Nothing was parsed as numeric: coerce each column and keep those in
    # which at least one value converts to a number.
    coerced = []
    for c in df.columns:
        ser = pd.to_numeric(df[c], errors='coerce')
        if ser.notna().sum() > 0:
            df[c] = ser
            coerced.append(c)
    num_cols = coerced

# A numeric column that holds no values at all makes every row incomplete,
# so the complete-row filter applied to these columns would discard the
# whole dataset. Leave such columns out, as the coercion path already does.
num_cols = [c for c in num_cols if df[c].notna().any()]
if not num_cols:
    raise ValueError("No numeric columns found in the dataset.")

# With a single numeric column there are no predictors and the model
# reduces to an intercept only.
predictor_cols = num_cols[:-1]
response_col = num_cols[-1]

# Build the regression data: force every selected column to numeric and keep
# only the rows where all predictors and the response are present.
X_raw = df[predictor_cols].apply(pd.to_numeric, errors="coerce")
Y_raw = pd.to_numeric(df[response_col], errors="coerce")
valid_mask = X_raw.notna().all(axis=1) & Y_raw.notna()
X_clean = X_raw[valid_mask].to_numpy(dtype=float)
Y_clean = Y_raw[valid_mask].to_numpy(dtype=float).reshape(-1,1)  # column vector

n_obs = X_clean.shape[0]
p = X_clean.shape[1]

# With zero complete rows every downstream quantity is 0 or NaN, so stop
# here with a diagnostic instead of writing meaningless result files.
if n_obs == 0:
    non_missing = {c: int(v) for c, v in X_raw.notna().sum().items()}
    non_missing[response_col] = int(Y_raw.notna().sum())
    raise ValueError(
        "No complete observations left for the regression; "
        f"non-missing values per column: {non_missing}"
    )

# Ordinary least squares via the normal equations:
# beta_hat = (X'X)^-1 X'Y, with a leading column of ones for the intercept.
ones = np.ones((n_obs,1))
X = np.hstack([ones, X_clean])
Xt = X.T
XtX = Xt.dot(X)
try:
    XtX_inv = inv(XtX)
except np.linalg.LinAlgError:
    # X'X is singular (for example, collinear predictors): fall back to the
    # Moore-Penrose pseudo-inverse. Any other error is a genuine bug and is
    # allowed to propagate instead of being masked.
    XtX_inv = pinv(XtX)
XtY = Xt.dot(Y_clean)
beta_hat = XtX_inv.dot(XtY)

# Fitted values and residuals of the least-squares fit.
Y_hat = np.dot(X, beta_hat)
residuals = Y_clean - Y_hat

# Sums of squares: error (SSE), total about the mean (SST), and the
# regression part (SSR) obtained as their difference.
SSE = float(np.sum(np.square(residuals)))
y_mean = float(np.mean(Y_clean))
SST = float(np.sum(np.square(Y_clean - y_mean)))
SSR = float(SST - SSE)

# Degrees of freedom for the model, the error term and the total.
df_model = p
df_error = n_obs - (p + 1)
df_total = n_obs - 1

# Mean squares are only defined for positive degrees of freedom.
if df_model > 0:
    MSTr = SSR / df_model
else:
    MSTr = float("nan")

if df_error > 0:
    MSE = SSE / df_error
else:
    MSE = float("nan")

# Overall F test; left as NaN whenever one of its ingredients is undefined
# or the error mean square is exactly zero.
F_stat = float("nan")
if not (math.isnan(MSTr) or math.isnan(MSE)) and MSE != 0:
    F_stat = MSTr / MSE

p_value = float("nan")
if not math.isnan(F_stat) and df_model > 0 and df_error > 0:
    p_value = stats.f.sf(F_stat, df_model, df_error)

# Coefficient covariance MSE * (X'X)^-1; filled with NaN when there are no
# error degrees of freedom.
if df_error > 0:
    cov_beta = MSE * XtX_inv
else:
    cov_beta = np.full(XtX_inv.shape, np.nan)

# Standard errors come from the covariance diagonal, shaped as a column so
# they divide beta_hat element by element.
se_beta = np.sqrt(np.diag(cov_beta)).reshape(-1, 1)
t_stats = beta_hat / se_beta

# Two-sided p-values from the t distribution with df_error degrees of freedom.
if df_error > 0:
    p_vals_beta = 2 * stats.t.sf(np.abs(t_stats).reshape(-1), df_error)
else:
    p_vals_beta = np.full((p + 1,), np.nan)

# One row per coefficient, intercept first.
col_names = ["Intercept"] + predictor_cols
coef_columns = {
    "variable": col_names,
    "beta": beta_hat.flatten(),
    "std_error": se_beta.flatten(),
    "t_stat": t_stats.flatten(),
    "p_value": p_vals_beta,
}
coef_table = pd.DataFrame(coef_columns)

# Persist the results: one CSV with the coefficient table and one CSV with a
# single row of model-level summary statistics.
out_dir = "/mnt/data"
os.makedirs(out_dir, exist_ok=True)

coef_csv = os.path.join(out_dir, "ols_coefficients.csv")
coef_table.to_csv(coef_csv, index=False)

summary_row = {
    "n": n_obs,
    "p": p,
    "SST": SST,
    "SSR": SSR,
    "SSE": SSE,
    "df_model": df_model,
    "df_error": df_error,
    "df_total": df_total,
    "MSTr": MSTr,
    "MSE": MSE,
    "F": F_stat,
    "p_value": p_value,
}
summary_csv = os.path.join(out_dir, "ols_summary.csv")
pd.DataFrame([summary_row]).to_csv(summary_csv, index=False)

# Console report of the same quantities.
print("File used:", CSV_PATH)
print("Observations:", n_obs, "Predictors:", p)
for label, value in (("SST:", SST), ("SSR:", SSR), ("SSE:", SSE)):
    print(label, value)
print("df_model:", df_model, "df_error:", df_error, "df_total:", df_total)
print("MSTr:", MSTr, "MSE:", MSE)
print("F-statistic:", F_stat, "p-value:", p_value)
print("Coefficient table saved to /mnt/data/ols_coefficients.csv and summary to /mnt/data/ols_summary.csv")
print(coef_table.to_string(index=False))


File used: /content/Most Streamed Spotify Songs 2024.csv
Observations: 0 Predictors: 6
SST: 0.0
SSR: 0.0
SSE: 0.0
df_model: 6 df_error: -7 df_total: -1
MSTr: 0.0 MSE: nan
F-statistic: nan p-value: nan
Coefficient table saved to /mnt/data/ols_coefficients.csv and summary to /mnt/data/ols_summary.csv
                  variable  beta  std_error  t_stat  p_value
                 Intercept   0.0        NaN     NaN      NaN
               Track Score   0.0        NaN     NaN      NaN
        Spotify Popularity   0.0        NaN     NaN      NaN
Apple Music Playlist Count   0.0        NaN     NaN      NaN
     Deezer Playlist Count   0.0        NaN     NaN      NaN
     Amazon Playlist Count   0.0        NaN     NaN      NaN
          TIDAL Popularity   0.0        NaN     NaN      NaN


  y_mean = float(Y_clean.mean())
  ret = ret.dtype.type(ret / rcount)
