In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

In [2]:
data = pd.read_csv(r'data\\train\\train.csv')
y = data['target']
X = data.drop(['target'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [3]:
scaler = StandardScaler()
scaler.fit(X_train)

X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_train.columns)

# МНК

In [4]:
W_MNK = np.array(np.linalg.inv(X_train.T @ X_train) @ X_train.T @ np.array(y_train).reshape((54729,1)))

In [5]:
def predict_by_weights(W, data):
    return data @ W

In [6]:
y_train_pred = predict_by_weights(W_MNK, X_train)
y_test_pred = predict_by_weights(W_MNK, X_test)

In [7]:
r2_score(y_train, y_train_pred)

0.8642112823707959

In [8]:
r2_score(y_test, y_test_pred)

0.8625037720302423

# Градиентный спуск

In [9]:
def get_grad(W_0, X_train, y_train):
    grad_list = []
    for j in np.arange(X_train.shape[1]):
        grad_list.append(np.sum(-2 * X_train.iloc[:, j] * (-X_train @ W_0 + y_train)))
    return np.array(grad_list)

In [10]:
W_0 = np.zeros(X_train.shape[1])
get_grad(W_0, X_train, y_train)

array([-1133.18989613,  2161.3346387 , -3260.83081164,  2712.6470718 ,
         708.32502658, -3631.60031107, -1649.07151547,  3909.52041962,
       -3221.61625614, -3226.85148522, -3820.99952602,  3151.38208049,
       -1748.58416789, -3311.73995026,  2905.86296158,  -330.03565972,
        3412.83334918,  5625.29995312, -3217.37294386,   199.14735469,
        1724.38629859,  3052.02598064,  2905.91950152,  4414.89220094,
        -359.38973427,  3165.67672897,   262.05887968,    71.38428711,
        2318.70331045, -4065.28256426,  -970.01262852,  4945.16684021,
       -2660.64819036, -2685.23027468,   418.99364758, -3141.13602046,
        1100.3312454 ,  1095.87177294, -1678.29618696,  3794.82853903,
       -2664.70707933, -3230.72393948,  -710.69129525, -6626.01794002,
        1986.78415281,   -55.46307484,  1480.97570028, -2022.8586836 ,
        4337.58945363,  1977.16664518,  1347.71802162, -2383.51630919,
       -2919.85543994, -2895.67279721, -2605.82558837, -2707.9190374 ])

In [11]:
lam = 0.000001
eps = 0.0000001
i = 0
max_iter = 10_000
W_0 = np.ones(X_train.shape[1])

In [None]:
%%time
while i < max_iter:
    W_1 = W_0 - lam * get_grad(W_0, X_train, y_train)
    norm = np.linalg.norm(W_1 - W_0)
    if norm < eps:
        break
    i += 1
    print(i, norm)
    W_0 = W_1

1 1.5988439314454375
2 1.1113252938172278
3 0.8171986906683022
4 0.622134366047553
5 0.4839743341491702
6 0.38185935750007777
7 0.3043125504986517
8 0.24441193075980652
9 0.19766131185011823
10 0.16096365246126496
11 0.13208553615644725
12 0.10935813002387194
13 0.09149955544941293
14 0.07750340816893725
15 0.06656582419799613
16 0.058036808996928715
17 0.05138779816276056
18 0.04619004017207241
19 0.042099482744402915
20 0.0388447773701666
21 0.036216300694983466
22 0.03405544909953929
23 0.032244399048990575
24 0.030696853681771514
25 0.029350187166929692
26 0.028159133288593083
27 0.02709093467917229
28 0.02612173931473852
29 0.02523399034319679
30 0.02441456960824731
31 0.023653493910254696
32 0.02294300699226819
33 0.02227695005475272
34 0.021650325970700483
35 0.021058997092577737
36 0.020499474691919338
37 0.019968771053653977
38 0.01946429436652034
39 0.01898377287608563
40 0.018525199113617274
41 0.018086787980625022
42 0.01766694448285477
43 0.017264238269141117
44 0.01687738

In [None]:
y_train_pred = predict_by_weights(W_0, X_train)
y_test_pred = predict_by_weights(W_0, X_test)

In [None]:
r2_score(y_train, y_train_pred)

In [None]:
r2_score(y_test, y_test_pred)

# Линейная регрессия в реализации sklearn

In [None]:
reg = LinearRegression().fit(X_train, y_train)

In [None]:
y_train_pred = reg.predict(X_train)
y_test_pred = reg.predict(X_test)

In [None]:
np.linalg.norm(reg.coef_ - W_MNK)

In [None]:
r2_score(y_train, y_train_pred)

In [None]:
r2_score(y_test, y_test_pred)