# ОИАД. Лабораторная работа №2

Датасет: `datasets/students_simple.csv`

Вверху задайте свой номер `N` для выбора столбцов: первый индекс `N % 5`, второй индекс `N**2 % 5 + 5`. Выполняются расчёты корреляций, визуализации, регрессии разных видов и проверка по критерию Фишера для лучшей и худшей моделей.



In [None]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import math

# Render every float in pandas output with five digits after the decimal point.
pd.set_option('display.float_format', '{:.5f}'.format)


In [None]:
# Variant number; it determines which two dataset columns are analysed.
N = 24

df = pd.read_csv('../datasets/students_simple.csv')
cols = list(df.columns)

# Column positions: N mod 5 for the first, (N squared mod 5) + 5 for the second.
idx1, idx2 = N % 5, N ** 2 % 5 + 5
col_x, col_y = cols[idx1], cols[idx2]

x = df[col_x].astype(float)
y = df[col_y].astype(float)

# Keep only the rows where both values are present, renumbered from zero.
mask = x.notna() & y.notna()
x, y = (s[mask].reset_index(drop=True) for s in (x, y))

print(col_x, col_y)
df[[col_x, col_y]].head()


In [None]:
def fechner_correlation(a, b):
    """Return the Fechner sign-correlation coefficient of two samples.

    Each observation is reduced to the sign of its deviation from the
    sample mean. The coefficient is ``2 * share - 1``, where ``share`` is
    the fraction of positions at which both deviations have the same
    non-zero sign. A position where either deviation is exactly zero is
    therefore counted as discordant.

    Args:
        a: Array-like with a ``mean()`` method and element-wise arithmetic
            (e.g. a NumPy array).
        b: Second sample, same length as ``a``.

    Returns:
        A value in ``[-1, 1]``.
    """
    dev_a = a - a.mean()
    dev_b = b - b.mean()
    same_sign = np.sign(dev_a) * np.sign(dev_b) > 0
    share = same_sign.sum() / len(a)
    return 2 * share - 1

# Pearson's r together with a 95% confidence interval built through the
# Fisher z-transform (arctanh), using the 1.96 normal quantile.
pearson_r, pearson_p = stats.pearsonr(x, y)
n = len(x)
z = np.arctanh(pearson_r)
# The standard error 1 / sqrt(n - 3) only exists for n > 3; otherwise the
# interval is left unbounded in z-space.
if n > 3:
    se = 1 / math.sqrt(n - 3)
else:
    se = np.inf
z_low, z_high = z - 1.96 * se, z + 1.96 * se
# Map the interval bounds back to the correlation scale.
ci_low, ci_high = np.tanh(z_low), np.tanh(z_high)

# Rank-based and sign-based coefficients.
spearman_rho, spearman_p = stats.spearmanr(x, y)
kendall_tau, kendall_p = stats.kendalltau(x, y)
fechner_r = fechner_correlation(x.values, y.values)

print('Фехнера:', f"{fechner_r:.5f}")
print('Пирсона:', f"{pearson_r:.5f}", 'p=', f"{pearson_p:.5f}", 'CI95%=', (f"{ci_low:.5f}", f"{ci_high:.5f}"))
print('Спирмена:', f"{spearman_rho:.5f}", 'p=', f"{spearman_p:.5f}")
print('Кенделла:', f"{kendall_tau:.5f}", 'p=', f"{kendall_p:.5f}")


In [None]:
# Side-by-side histograms of the two selected columns, one per axis.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
for ax, series, title in zip(axes, (x, y), (col_x, col_y)):
    ax.hist(series, bins=10, edgecolor='black')
    ax.set_title(title)
plt.tight_layout()
plt.show()

# Scatter plot of the second column against the first.
plt.figure(figsize=(5, 4))
plt.scatter(x, y)
plt.xlabel(col_x)
plt.ylabel(col_y)
plt.title('Scatter')
plt.tight_layout()
plt.show()


In [None]:
def linear_fit(xv, yv):
    """Fit ``y ≈ w1 * x + w0`` using the closed-form least-squares formulas.

    The slope is the sum of cross-deviations divided by the sum of squared
    x-deviations; the intercept makes the line pass through the point of
    means.

    Args:
        xv: Predictor values (array-like with ``mean()`` and element-wise
            arithmetic).
        yv: Response values, same length as ``xv``.

    Returns:
        Tuple ``(w1, w0)``: slope and intercept.
    """
    x_bar, y_bar = xv.mean(), yv.mean()
    dx = xv - x_bar
    slope = (dx * (yv - y_bar)).sum() / (dx ** 2).sum()
    intercept = y_bar - slope * x_bar
    return slope, intercept

def quadratic_fit(xv, yv):
    """Fit ``y ≈ w[0] * x**2 + w[1] * x + w[2]`` by linear least squares.

    Args:
        xv: Predictor values as a 1-D array.
        yv: Response values, same length as ``xv``.

    Returns:
        Coefficient array ``w`` ordered as (quadratic, linear, constant).
    """
    # Design matrix columns: x squared, x, and a column of ones for the constant.
    design = np.column_stack((xv ** 2, xv, np.ones_like(xv)))
    coeffs = np.linalg.lstsq(design, yv, rcond=None)[0]
    return coeffs

def hyperbolic_fit(xv, yv):
    """Fit ``y ≈ w[0] / x + w[1]`` by linear least squares on ``1 / x``.

    Args:
        xv: Predictor values as a 1-D array; the reciprocal is taken
            directly, so zeros must be removed by the caller.
        yv: Response values, same length as ``xv``.

    Returns:
        Coefficient array ``w`` ordered as (reciprocal term, constant).
    """
    inv_x = 1.0 / xv
    design = np.column_stack((inv_x, np.ones_like(inv_x)))
    coeffs = np.linalg.lstsq(design, yv, rcond=None)[0]
    return coeffs

def exponential_fit(xv, yv):
    """Fit ``y ≈ w0 * w1 ** x`` by least squares on ``log(y)``.

    Taking logarithms turns the model into the straight line
    ``log(y) = log(w1) * x + log(w0)``; the fitted line coefficients are
    exponentiated to get back ``w1`` and ``w0``.

    Args:
        xv: Predictor values as a 1-D array.
        yv: Response values, same length as ``xv``; the logarithm is taken
            directly, so non-positive values must be removed by the caller.

    Returns:
        Tuple ``(w1, w0)``: the base and the multiplicative constant.
    """
    log_y = np.log(yv)
    design = np.column_stack((xv, np.ones_like(xv)))
    log_base, log_scale = np.linalg.lstsq(design, log_y, rcond=None)[0]
    return np.exp(log_base), np.exp(log_scale)
# SST = SSR + SSE
# SST -- observation minus mean
# SSR -- model prediction minus mean
# SSE -- observation minus model prediction
def metrics(y_true, y_pred, p):
    """Compute sums of squares, R2 and the F statistic for a fitted model.

    Args:
        y_true: Observed values (array with ``mean()`` and element-wise
            arithmetic).
        y_pred: Model predictions, same length as ``y_true``.
        p: Number of regressors in the model (excluding the constant).

    Returns:
        Dict with keys ``'SSE'``, ``'SSR'``, ``'SST'``, ``'R2'`` and ``'F'``.
        ``'R2'`` is NaN when SST is zero. ``'F'`` is NaN unless
        ``n > p + 1``, ``SSE > 0`` and ``p > 0``.
    """
    n = len(y_true)
    centre = y_true.mean()
    residual = y_true - y_pred

    sse = (residual ** 2).sum()
    ssr = ((y_pred - centre) ** 2).sum()
    sst = ((y_true - centre) ** 2).sum()

    r2 = np.nan if sst == 0 else 1 - sse / sst

    # F = (explained variance per regressor) / (residual variance per degree
    # of freedom); only defined when both denominators are positive.
    if n > p + 1 and sse > 0 and p > 0:
        f_stat = (ssr / p) / (sse / (n - p - 1))
    else:
        f_stat = np.nan

    return {'SSE': sse, 'SSR': ssr, 'SST': sst, 'R2': r2, 'F': f_stat}

# Fit the four candidate regressions of y on x and collect, for each one,
# the predictions, the goodness-of-fit metrics and the fitted parameters.

# Linear: y ≈ w1 * x + w0.
w1_lin, w0_lin = linear_fit(x.values.astype(float), y.values.astype(float))
y_lin = w1_lin * x + w0_lin
m_lin = metrics(y.values, y_lin.values, p=1)

# Quadratic: y ≈ w[0] * x**2 + w[1] * x + w[2].
w_quad = quadratic_fit(x.values.astype(float), y.values.astype(float))
y_quad = w_quad[0] * x**2 + w_quad[1] * x + w_quad[2]
m_quad = metrics(y.values, y_quad.values, p=2)

# Hyperbolic: y ≈ w[0] / x + w[1]. The model is undefined at x == 0, so
# those rows are left out of the fit.
mask_h = x != 0
gx = x[mask_h]
gy = y[mask_h]
w_hyp = hyperbolic_fit(gx.values.astype(float), gy.values.astype(float))
# Predict only where 1 / x exists. Reindexing keeps y_hyp aligned with x and
# y (same length and index); rows with x == 0 hold NaN instead of +/-inf.
y_hyp = (w_hyp[0] / gx + w_hyp[1]).reindex(x.index)
# Score the model on the same rows it was fitted on; including x == 0 rows
# would make SSE infinite and R2 meaningless.
m_hyp = metrics(gy.values, y_hyp[mask_h].values, p=1)

# Exponential: y ≈ w[1] * w[0] ** x. The fit works on log(y), so only rows
# with y > 0 are used for fitting; the prediction itself is defined for
# every x and is scored against all observations.
mask_e = y > 0
gx2 = x[mask_e]
gy2 = y[mask_e]
w_exp = exponential_fit(gx2.values.astype(float), gy2.values.astype(float))
y_exp = pd.Series(w_exp[1] * (w_exp[0] ** x))
m_exp = metrics(y.values, y_exp.values, p=1)

models = {
    'linear': {'pred': y_lin, 'metrics': m_lin, 'params': (w1_lin, w0_lin)},
    'quadratic': {'pred': y_quad, 'metrics': m_quad, 'params': tuple(w_quad)},
    'hyperbolic': {'pred': y_hyp, 'metrics': m_hyp, 'params': tuple(w_hyp)},
    'exponential': {'pred': y_exp, 'metrics': m_exp, 'params': tuple(w_exp)},
}

for name, obj in models.items():
    r2 = obj['metrics']['R2']
    print(name, 'R2=', f"{r2:.5f}")


In [None]:
# Evaluate every fitted model on the x values in ascending order so that each
# curve is drawn left to right as a single line.
x_sorted_idx = np.argsort(x.values)
x_sorted = x.values[x_sorted_idx]

curves = {
    'linear': w1_lin * x_sorted + w0_lin,
    'quadratic': w_quad[0] * x_sorted**2 + w_quad[1] * x_sorted + w_quad[2],
    'hyperbolic': w_hyp[0] / x_sorted + w_hyp[1],
    'exponential': w_exp[1] * (w_exp[0] ** x_sorted),
}

# Raw observations plus one line per model on a shared figure.
plt.figure(figsize=(6, 4))
plt.scatter(x, y, label='data')
for curve_label, curve_y in curves.items():
    plt.plot(x_sorted, curve_y, label=curve_label)
plt.xlabel(col_x)
plt.ylabel(col_y)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Rank the fitted models by R2: the largest value marks the best fit and the
# smallest the worst.
best = max(models, key=lambda key: models[key]['metrics']['R2'])
worst = min(models, key=lambda key: models[key]['metrics']['R2'])
print('Лучшая модель:', best)
print('Худшая модель:', worst)

def fisher_test(y_true, y_pred, p):
    """Run the overall F-test of a regression model.

    Args:
        y_true: Observed values (array with ``mean()`` and element-wise
            arithmetic).
        y_pred: Model predictions, same length as ``y_true``.
        p: Number of regressors in the model (excluding the constant).

    Returns:
        Tuple ``(f, df1, df2, pval)`` where ``df1 = p`` and
        ``df2 = n - p - 1``. ``f`` is NaN unless ``n > p + 1``, the residual
        sum of squares is positive and ``p > 0``; ``pval`` is the upper-tail
        probability of the F distribution, or NaN when ``f`` is not finite.
    """
    n = len(y_true)
    df1, df2 = p, n - p - 1

    resid_ss = ((y_true - y_pred) ** 2).sum()
    expl_ss = ((y_pred - y_true.mean()) ** 2).sum()

    if n > p + 1 and resid_ss > 0 and p > 0:
        f_stat = (expl_ss / df1) / (resid_ss / df2)
    else:
        f_stat = np.nan

    p_value = stats.f.sf(f_stat, df1, df2) if np.isfinite(f_stat) else np.nan
    return f_stat, df1, df2, p_value

# Number of regressors per model family, used as the numerator degrees of
# freedom of the F-test.
p_map = dict(linear=1, quadratic=2, hyperbolic=1, exponential=1)

# Report the F-test for the best and the worst model by R2.
for name in (best, worst):
    p, preds = p_map[name], models[name]['pred'].values
    f, df1, df2, pval = fisher_test(y.values, preds, p)
    print(name, 'F=', f, 'df=', (df1, df2), 'p=', pval)
