## 1. Подготовка данных
1. проверить наличие пропусков и выбросов

In [1]:
import pandas as pd
import numpy as np

In [2]:
train_df = pd.read_csv('../datasets/insurance_test.csv')
test_df = pd.read_csv('../datasets/insurance_train.csv')

In [3]:
train_df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [4]:
test_df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [5]:
numerical_cols = ['age', 'bmi', 'children', 'charges']

In [6]:
def detect_outliers(df, col):
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
    return outliers

In [7]:
for col in numerical_cols:
    outliers = detect_outliers(train_df, col)
    print(f'{col}: {len(outliers)} outliers')

age: 0 outliers
bmi: 4 outliers
children: 0 outliers
charges: 112 outliers


In [8]:
for col in numerical_cols:
    outliers = detect_outliers(test_df, col)
    print(f'{col}: {len(outliers)} outliers')

age: 0 outliers
bmi: 5 outliers
children: 0 outliers
charges: 19 outliers


2. привести категориальные признаки к числовым

In [9]:
for df in [train_df, test_df]:
    df['sex'] = df['sex'].map({'male': 1, 'female': 0})
    df['smoker'] = df['smoker'].map({'yes': 1, 'no': 0})

In [10]:
train_df = pd.get_dummies(train_df, columns=['region'], drop_first=True)
test_df = pd.get_dummies(test_df, columns=['region'], drop_first=True)

In [11]:
train_df.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northwest,region_southeast,region_southwest
0,61,1,38.38,0,0,12950.0712,True,False,False
1,59,0,34.8,2,0,36910.60803,False,False,True
2,31,1,30.875,0,0,3857.75925,False,False,False
3,24,0,33.345,0,0,2855.43755,True,False,False
4,31,1,28.5,5,0,6799.458,False,False,False


In [12]:
test_df.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northwest,region_southeast,region_southwest
0,26,1,27.06,0,1,17043.3414,False,True,False
1,58,1,36.955,2,1,47496.49445,True,False,False
2,20,0,24.42,0,1,26125.67477,False,True,False
3,51,0,38.06,0,1,44400.4064,False,True,False
4,62,0,25.0,0,0,13451.122,False,False,True


3. вычислить парные корреляции признаков

In [13]:
train_df.corr()

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northwest,region_southeast,region_southwest
age,1.0,-0.012197,0.119652,0.03941,-0.03638,0.300982,0.021549,0.002511,-0.010583
sex,-0.012197,1.0,0.049637,0.024072,0.060458,0.055372,-0.002325,0.018193,0.006937
bmi,0.119652,0.049637,1.0,-0.005036,-0.013503,0.19045,-0.117061,0.241591,-0.003031
children,0.03941,0.024072,-0.005036,1.0,0.008991,0.066099,0.041736,-0.040869,0.023658
smoker,-0.03638,0.060458,-0.013503,0.008991,1.0,0.788283,-0.046974,0.095373,-0.027985
charges,0.300982,0.055372,0.19045,0.066099,0.788283,1.0,-0.042276,0.096425,-0.038272
region_northwest,0.021549,-0.002325,-0.117061,0.041736,-0.046974,-0.042276,1.0,-0.339416,-0.328012
region_southeast,0.002511,0.018193,0.241591,-0.040869,0.095373,0.096425,-0.339416,1.0,-0.343085
region_southwest,-0.010583,0.006937,-0.003031,0.023658,-0.027985,-0.038272,-0.328012,-0.343085,1.0


## 2. Многомерная линейная регрессия
Построить модель линейной регрессии и подобрать параметры:
1. аналитически (реализовать самому)

In [14]:
X_train = train_df.drop('charges', axis=1).astype(float).values
y_train = train_df['charges'].values
X_test = test_df.drop('charges', axis=1).astype(float).values
y_test = test_df['charges'].values

In [15]:
XTX = np.dot(X_train.T, X_train)
XTy = np.dot(X_train.T, y_train)
w_analytical = np.linalg.solve(XTX, XTy)

In [16]:
w_analytical

array([  205.34952397,  -444.82970889,    59.02314415,   261.98869035,
       23416.98493153, -1719.47187844, -1241.76569815, -1836.11232879])


2. численно, с помощью методов градиентного спуска (реализовать самому)

In [17]:
def gradient_descent(X, y, learning_rate=1e-7, max_iter=1000000, tol=1e-6):
    m, n = X.shape
    w = np.zeros(n)
    prev_w = w.copy()

    for i in range(max_iter):
        predictions = np.dot(X, w)
        errors = predictions - y
        gradient = 2 * np.dot(X.T, errors)

        if np.linalg.norm(gradient) < tol:
            break

        w = w - learning_rate * gradient

        if np.linalg.norm(w - prev_w) < tol:
            break

        prev_w = w.copy()

    return w

In [18]:
w_gd = gradient_descent(X_train, y_train, learning_rate=1e-7)

In [19]:
w_gd

array([  205.34943809,  -444.83152114,    59.02221018,   261.987945  ,
       23416.98110569, -1719.42725107, -1241.71722294, -1836.06648385])

In [20]:
def stochastic_gradient_descent(X, y, learning_rate=1e-5, max_iter=1000000, tol=1e-6, batch_size=32):
    m, n = X.shape
    w = np.zeros(n)
    prev_w = w.copy()

    for i in range(max_iter):
        indices = np.random.choice(m, batch_size)
        X_batch = X[indices]
        y_batch = y[indices]

        predictions = np.dot(X_batch, w)
        errors = predictions - y_batch
        gradient = 2 * np.dot(X_batch.T, errors)

        if np.linalg.norm(gradient) < tol:
            break

        w = w - learning_rate * gradient

        if np.linalg.norm(w - prev_w) < tol:
            break

        prev_w = w.copy()

    return w

In [21]:
w_sgd = stochastic_gradient_descent(X_train, y_train, learning_rate=1e-5, batch_size=32)

In [22]:
w_sgd

array([  219.98344937,  -477.03824618,    66.79485857,   285.69782731,
       23455.01577528, -1726.75502103, -1233.652041  , -1807.9993939 ])

## 3. Добавление регуляризации
Модифицировать линейную модель путем добавления регуляризационного слагаемого. Найти оптимальные веса:
1. аналитически

In [23]:
def ridge_analytical(X, y, alpha):
    n_features = X.shape[1]
    A = np.dot(X.T, X) + alpha * np.eye(n_features)
    b = np.dot(X.T, y)
    return np.linalg.solve(A, b)

In [24]:
w_ridge_analytical = ridge_analytical(X_train, y_train, 1)

In [25]:
w_ridge_analytical

array([  205.33105615,  -435.22895961,    59.02605674,   262.64328084,
       23267.63753809, -1693.06962898, -1203.37605555, -1807.42466729])

2. численно

In [26]:
def ridge_gradient_descent(X, y, alpha, learning_rate=1e-7, max_iter=1000000, tol=1e-6):
    m, n = X.shape
    w = np.zeros(n)
    prev_w = w.copy()

    for i in range(max_iter):
        predictions = np.dot(X, w)
        errors = predictions - y
        gradient = 2 * np.dot(X.T, errors) + 2 * alpha * w

        if np.linalg.norm(gradient) < tol:
            break

        w = w - learning_rate * gradient

        if np. linalg.norm(w - prev_w) < tol:
            break

        prev_w = w.copy()

    return w

In [27]:
w_ridge_gd = ridge_gradient_descent(X_train, y_train, 1, learning_rate=1e-7)

In [28]:
w_ridge_gd

array([  205.33097163,  -435.23074316,    59.02513756,   262.64254729,
       23267.63377281, -1693.02570819, -1203.32834784, -1807.37954819])

In [29]:
def lasso_gradient_descent(X, y, beta, learning_rate=1e-7, max_iter=1000000, tol=1e-6):
    m, n = X.shape
    w = np.zeros(n)
    prev_w = w.copy()

    for i in range(max_iter):
        predictions = np.dot(X, w)
        errors = predictions - y
        gradient = 2 * np.dot(X.T, errors) + beta * np.sign(w)

        if np.linalg.norm(gradient) < tol:
            break

        w = w - learning_rate * gradient

        if np.linalg.norm(w - prev_w) < tol:
            break

        prev_w = w.copy()

    return w

In [30]:
w_lasso_gd = lasso_gradient_descent(X_train, y_train, 1e-3, learning_rate=1e-7)

In [31]:
w_lasso_gd

array([  205.34943807,  -444.83151916,    59.02221001,   261.98794453,
       23416.98110196, -1719.42724321, -1241.71721434, -1836.06647581])

## 4. Оценка обобщающей способности
Сравнить между собой модели на тестовых данных по среднему квадрату ошибки:
1. константную - прогноз средним значением

In [32]:
def calculate_mse(y_true, y_pred):
    return np.mean((y_true - y_pred)**2)

In [33]:
constant_pred = np.full_like(y_test, np.mean(y_train))
mse_constant = calculate_mse(y_test, constant_pred)

In [34]:
mse_constant

np.float64(164374128.13344663)

2. из пункта 2

In [35]:
y_pred_analytical = np.dot(X_test, w_analytical)
mse_analytical = calculate_mse(y_test, y_pred_analytical)

In [36]:
mse_analytical

np.float64(48389895.97215576)

In [37]:
y_pred_gd = np.dot(X_test, w_gd)
mse_gd = calculate_mse(y_test, y_pred_gd)

In [38]:
mse_gd

np.float64(48389898.94365072)

In [39]:
y_pred_sgd = np.dot(X_test, w_sgd)
mse_sgd = calculate_mse(y_test, y_pred_sgd)

In [40]:
mse_sgd

np.float64(48541129.23346516)

In [41]:
y_pred_ridge_analytical = np.dot(X_test, w_ridge_analytical)
mse_ridge_analytical = calculate_mse(y_test, y_pred_ridge_analytical)

In [42]:
mse_ridge_analytical

np.float64(48389141.59636644)

In [43]:
y_pred_ridge_gd = np.dot(X_test, w_ridge_gd)
mse_ridge_gd = calculate_mse(y_test, y_pred_ridge_gd)

In [44]:
mse_ridge_gd

np.float64(48389145.42266351)

In [45]:
y_pred_lasso = np.dot(X_test, w_lasso_gd)
mse_lasso = calculate_mse(y_test, y_pred_lasso)

In [46]:
mse_lasso

np.float64(48389898.94506356)