In [1]:
import kagglehub
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.optimize import curve_fit
from sklearn.preprocessing import LabelEncoder
import os
from sklearn.model_selection import train_test_split

# Download latest version
path = kagglehub.dataset_download("mosapabdelghany/medical-insurance-cost-dataset")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\Dima\.cache\kagglehub\datasets\mosapabdelghany\medical-insurance-cost-dataset\versions\1


In [2]:
!ls C:\Users\Dima\.cache\kagglehub\datasets\mosapabdelghany\medical-insurance-cost-dataset\versions\1

insurance.csv


In [3]:
path = os.path.join(path, "insurance.csv")

In [4]:
df = pd.read_csv(path)
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [5]:
df = df.dropna()

def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

numerical_columns = ['age', 'bmi', 'children', 'charges']
for col in numerical_columns:
    df = remove_outliers(df, col)

df = pd.get_dummies(df, columns=['region'], dtype=int)

label_encoders = {}
categorical_columns = ['sex', 'smoker']
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

correlation_matrix = df.corr()

print("Матрица корреляций:")
print(correlation_matrix)


Матрица корреляций:
                       age       sex       bmi  children    smoker   charges  \
age               1.000000 -0.021440  0.123827  0.038179 -0.062095  0.448798   
sex              -0.021440  1.000000  0.016487  0.015564  0.014027 -0.022893   
bmi               0.123827  0.016487  1.000000  0.007546 -0.260657 -0.064483   
children          0.038179  0.015564  0.007546  1.000000 -0.001901  0.089083   
smoker           -0.062095  0.014027 -0.260657 -0.001901  1.000000  0.596213   
charges           0.448798 -0.022893 -0.064483  0.089083  0.596213  1.000000   
region_northeast  0.001492  0.004247 -0.132354 -0.023610  0.032499  0.067055   
region_northwest -0.002236  0.004669 -0.112282  0.011655  0.019187  0.037770   
region_southeast -0.016127 -0.003229  0.230793 -0.014751 -0.002254 -0.029093   
region_southwest  0.017103 -0.005751  0.013616  0.026831 -0.049917 -0.076394   

                  region_northeast  region_northwest  region_southeast  \
age                      

In [6]:
print(df)

      age  sex     bmi  children  smoker      charges  region_northeast  \
0      19    0  27.900         0       1  16884.92400                 0   
1      18    1  33.770         1       0   1725.55230                 0   
2      28    1  33.000         3       0   4449.46200                 0   
3      33    1  22.705         0       0  21984.47061                 0   
4      32    1  28.880         0       0   3866.85520                 0   
...   ...  ...     ...       ...     ...          ...               ...   
1333   50    1  30.970         3       0  10600.54830                 0   
1334   18    0  31.920         0       0   2205.98080                 1   
1335   18    0  36.850         0       0   1629.83350                 0   
1336   21    0  25.800         0       0   2007.94500                 0   
1337   61    0  29.070         0       1  29141.36030                 0   

      region_northwest  region_southeast  region_southwest  
0                    0                

In [19]:
X = df.drop('charges', axis=1).values
y = df['charges'].values.reshape(-1, 1)

X = np.hstack((np.ones((X.shape[0], 1)), X))

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
def analytical_solution(X, y):
    w = np.linalg.inv(X.T @ X) @ X.T @ y
    return w

w_analytical = analytical_solution(X_train, y_train)
print("Веса (аналитическое решение):\n", w_analytical)

def gradient_descent(X, y, alpha=0.0001, iterations=10000):
    m = X.shape[0]
    w = np.zeros((X.shape[1], 1))
    for _ in range(iterations):
        y_pred = X @ w
        gradient = (-2/m) * X.T @ (y - y_pred)
        w = w - alpha * gradient
    return w

w_gradient = gradient_descent(X_train, y_train)
print("Веса (градиентный спуск):\n", w_gradient)

y_pred_analytical = X @ w_analytical
y_pred_gradient = X @ w_gradient
def mse(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)


Веса (аналитическое решение):
 [[ 7.51185681e+04]
 [ 4.92770796e+01]
 [-3.19000631e+03]
 [-2.34777861e+03]
 [ 9.21136407e+00]
 [ 1.36865794e+04]
 [ 3.29386214e+04]
 [-1.07335009e+04]
 [ 4.88561589e+04]
 [ 4.06965792e+04]]
Веса (градиентный спуск):
 [[ 238.85901715]
 [ 241.22805815]
 [  47.04607913]
 [ -25.96645757]
 [ 432.28152473]
 [2521.7087242 ]
 [ 426.93360459]
 [ 165.38237712]
 [ -94.46142194]
 [-258.99554261]]


In [23]:
import numpy as np

lambda_ridge = 0.01

def ridge_analytical(X, y, lambda_):
    m = X.shape[1]
    w = np.linalg.inv(X.T @ X + lambda_ * np.eye(m)) @ X.T @ y
    return w

w_ridge_analytical = ridge_analytical(X_train, y_train, lambda_ridge)
print("Веса (Ridge, аналитическое):\n", w_ridge_analytical)

def ridge_gradient_descent(X, y, lambda_, alpha=0.0001, iterations=10000):
    m = X.shape[0]
    w = np.zeros((X.shape[1], 1))
    for _ in range(iterations):
        gradient = (1/m) * X.T @ (X @ w - y) + (lambda_/m) * w
        w = w - alpha * gradient
    return w

w_ridge_gradient = ridge_gradient_descent(X_train, y_train, lambda_ridge)
print("Веса (Ridge, градиентный спуск):\n", w_ridge_gradient)


Веса (Ridge, аналитическое):
 [[-2628.41256949]
 [  239.00002517]
 [ -158.59415466]
 [   60.53872088]
 [  401.06838292]
 [14850.22321607]
 [   23.10273521]
 [ -383.35364658]
 [-1041.25973149]
 [-1226.9019266 ]]
Веса (Ridge, градиентный спуск):
 [[ 137.19465677]
 [ 244.65205833]
 [  36.24633903]
 [ -20.2197116 ]
 [ 365.9315516 ]
 [1325.76132973]
 [ 245.92778089]
 [  97.10202404]
 [ -59.30153138]
 [-146.53361678]]


In [24]:
y_pred_analytical1 = X_test @ w_analytical
y_pred_gradient1 = X_test @ w_gradient
y_pred_analytical2 = X_test @ w_ridge_analytical
y_pred_gradient2 = X_test @ w_ridge_gradient
y_pred_average = np.full((y_test.shape[0]), np.mean(y_train))


In [25]:
def mse(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)

print(mse(y_test, y_pred_analytical1))
print(mse(y_test, y_pred_gradient1))
print(mse(y_test, y_pred_analytical2))
print(mse(y_test, y_pred_gradient2))
print(mse(y_test, y_pred_average))



1326939921.5979514
32724081.473612566
19877053.503119007
36184095.53481284
54710216.02854098
