# Семинар 5. Модели нелинейной регрессии

In [40]:
from sklearn.datasets import load_boston
import pandas as pd
boston = load_boston()
X, y = pd.DataFrame(boston['data'], columns=boston['feature_names']), boston['target']
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)



# Бейслайн

In [41]:
from sklearn.linear_model import LinearRegression, ElasticNetCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.metrics import r2_score
import numpy as np

model = Pipeline([
    ('imputer', Imputer(missing_values=np.nan, strategy='mean')), # заполнит за вас пропущенные значения
    ('scaling', StandardScaler()),
    ('model', ElasticNetCV())])

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2_score(y_test, y_pred)

0.7336669633884503

# Добавление нелинейных признаков

In [42]:
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

model = Pipeline([('scaling', StandardScaler()),
                ('features', PolynomialFeatures(2)),
                ('model', ElasticNetCV())])

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2_score(y_test, y_pred)

0.8791673848122153

In [46]:
model = Pipeline([('scaling', StandardScaler()),
                ('features', PolynomialFeatures(3)),
                ('model', ElasticNetCV(max_iter=10))])

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2_score(y_test, y_pred)



0.8425062892769868

Алгоритм не сошёлся. Увеличим количество итераций

In [10]:
model = Pipeline([('scaling', StandardScaler()),
                ('features', PolynomialFeatures(3)),
                ('model', ElasticNetCV(max_iter=10000))])

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2_score(y_test, y_pred)

0.7537468306770078

На лицо недообучение

## Трансформация данных

In [47]:
import numpy as np
from sklearn.preprocessing import FunctionTransformer

model = Pipeline([('scaling', StandardScaler()),
                ('features', FunctionTransformer(np.tanh)),
                ('model', ElasticNetCV(max_iter=10000))])

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2_score(y_test, y_pred)

0.6824730944672667

# Добавление своих фичей

# SVR

In [48]:
from sklearn.svm import SVR


model = Pipeline([('scaling', StandardScaler()),
                ('features', PolynomialFeatures(2)),
                ('model', SVR())])

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2_score(y_test, y_pred)

0.45257275770760585

In [49]:
from sklearn.model_selection import GridSearchCV

params_grid = {
    'degree': range(3, 10, 2),
    'epsilon': [1e-1, 1e-2, 1e-3],
    'C': [0.1, 0.5] + [i for i in range(1, 10, 2)],
    'kernel': ['rbf', 'poly']
}

model = Pipeline([('scaling', StandardScaler()),
                ('features', PolynomialFeatures(2)),
                ('model', GridSearchCV(SVR(), params_grid))])

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2_score(y_test, y_pred)

0.7687532999082248

In [50]:
from sklearn.neighbors import KNeighborsRegressor


model = Pipeline([('scaling', StandardScaler()),
                ('features', PolynomialFeatures(2)),
                ('model', KNeighborsRegressor())])

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2_score(y_test, y_pred)

0.7724308664687143

# Формула Надарая-Ватсона

In [51]:
!pip install git+https://github.com/jmetzen/kernel_regression

Collecting git+https://github.com/jmetzen/kernel_regression
  Cloning https://github.com/jmetzen/kernel_regression to /tmp/pip-req-build-3nxkt0cu
Building wheels for collected packages: kernel-regression
  Running setup.py bdist_wheel for kernel-regression ... [?25ldone
[?25h  Stored in directory: /tmp/pip-ephem-wheel-cache-gxinpwxe/wheels/f9/1a/37/86090eaa433aef1011daae54862924d25f46b7fed1534a14ce
Successfully built kernel-regression


In [52]:
from kernel_regression import KernelRegression

params_grid = {
    'gamma': [None] + [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
}

model = Pipeline([('scaling', StandardScaler()),
                ('features', PolynomialFeatures(2)),
                ('model', GridSearchCV(KernelRegression(), params_grid))])

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2_score(y_test, y_pred)

0.7690984724588406

# Дополнительно

* [Пакет](https://pygam.readthedocs.io/en/latest/notebooks/tour_of_pygam.html) для backfitting'a: 