# OLS - метод наименьших квадратов

In [1]:
import numpy as np # linear algebra 
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# stats models: regression fitting
import statsmodels.formula.api as smf
# data visualization
import seaborn as sns

In [2]:
# Load the sleep75 dataset from the working directory and show its
# dimensions as (rows, columns).
df = pd.read_csv('sleep75.csv')
df.shape

(706, 34)

In [4]:
# Fit a simple OLS regression line: sleep explained by totwrk.
fitted_line = smf.ols(
    formula='sleep~totwrk',
    data=df,
).fit()
# Show the estimated coefficients rounded to two decimal places.
fitted_line.params.round(decimals=2)

Intercept    3586.38
totwrk         -0.15
dtype: float64

# Суммы квадратов (TSS, ESS, RSS)

In [7]:
# Read the dataset used for the sums-of-squares section.
df_sleep = pd.read_csv('sleep75.csv')
# Fit the multiple regression of sleep on totwrk, age, male and smsa.
sleep_eq = smf.ols(
    formula='sleep~totwrk+age+male+smsa',
    data=df_sleep,
).fit()

In [8]:
# TSS: total sum of squares of the dependent variable, centered about its mean.
sleep_eq.centered_tss

139239835.76345608

In [9]:
# ESS: explained sum of squares, i.e. the part of TSS accounted for by the model.
sleep_eq.ess

17188382.233617336

In [10]:
# RSS: residual sum of squares. Note the naming: statsmodels exposes it as
# `ssr` (sum of squared residuals), not as `rss`.
sleep_eq.ssr

122051453.52983874

# Предсказанные значения и остатки

In [12]:
# Observations to inspect, given as 1-based numbers and shifted by one to
# obtain 0-based positions.
ind = np.array([number - 1 for number in (1, 4, 6, 10, 508, 700)])

In [16]:
# Fitted (predicted) values for the selected observations.
# `ind` holds 0-based positions, so select positionally with .iloc; plain []
# on a Series with an integer index is a label lookup and only happens to
# coincide with positions while the index is the default 0..n-1.
sleep_eq.fittedvalues.iloc[ind]

0      3094.453743
3      2943.579539
5      3494.142891
9      3111.983473
507    3476.913947
699    3295.398425
dtype: float64

In [14]:
# Residuals (actual minus fitted) for the selected observations.
# `ind` holds 0-based positions, so select positionally with .iloc; plain []
# on a Series with an integer index is a label lookup and only happens to
# coincide with positions while the index is the default 0..n-1.
sleep_eq.resid.iloc[ind]

0       18.546257
3      139.420461
5      568.857109
9      -93.983473
507    153.086053
699   -302.398425
dtype: float64

In [17]:
# Actual (observed) values of the dependent variable at the same positions.
df_sleep.loc[:, 'sleep'].iloc[ind]

0      3113
3      3083
5      4063
9      3018
507    3630
699    2993
Name: sleep, dtype: int64

## $R^2$ & adj. $R^2$

In [18]:
# R^2: coefficient of determination of the fitted model.
sleep_eq.rsquared

0.12344443053507592

In [19]:
# Adjusted R^2: R^2 corrected for the degrees of freedom used by the model.
sleep_eq.rsquared_adj

0.11844268691473403