In [5]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
     |████████████████████████████████| 26.7 MB 1.9 MB/s            
[?25hCollecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.0.0-py3-none-any.whl (14 kB)
Collecting scipy>=1.1.0
  Downloading scipy-1.7.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (39.3 MB)
     |████████████████████████████████| 39.3 MB 1.4 MB/s            
Collecting joblib>=0.11
  Downloading joblib-1.1.0-py2.py3-none-any.whl (306 kB)
     |████████████████████████████████| 306 kB 2.0 MB/s            
[?25hInstalling collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.1.0 scikit-learn-1.0.2 scipy-1.7.3 threadpoolctl-3.0.0
Note: you may need to restart the kernel to use updated packages.


## Задание 1

In [80]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston

In [82]:
boston = load_boston()

In [83]:
boston.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename', 'data_module'])

In [84]:
data = boston.data

In [85]:
target = boston.target

In [86]:
feature_names = boston.feature_names

In [87]:
for line in boston.DESCR.split('\n'):
    print(line)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [88]:
X = pd.DataFrame(data, columns=feature_names)

In [89]:
y = pd.DataFrame(target, columns=["price"])

In [90]:
from sklearn.model_selection import train_test_split

In [91]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [92]:
from sklearn.linear_model import LinearRegression

In [93]:
lr = LinearRegression()

In [94]:
lr.fit(X_train, y_train)

LinearRegression()

In [95]:
y_pred = lr.predict(X_test)

y_pred.shape

(152, 1)

In [96]:
check_test = pd.DataFrame({"y_test": y_test["price"],
                           "y_pred": y_pred.flatten()},
                          columns=['y_test', 'y_pred'])
check_test.head(10)

Unnamed: 0,y_test,y_pred
173,23.6,28.64896
274,32.4,36.495014
491,13.6,15.411193
72,22.8,25.403213
452,16.1,18.85528
76,20.0,23.146689
316,17.8,17.392124
140,14.0,14.078599
471,19.6,23.036927
500,16.8,20.599433


In [97]:
from sklearn.metrics import r2_score

In [98]:
r2_score(y_test, y_pred)

0.7112260057484974

## Задание 2

In [99]:
from sklearn.ensemble import RandomForestRegressor 

In [101]:
from sklearn.model_selection import GridSearchCV 

In [102]:
model = RandomForestRegressor()

In [103]:
parameters = [{'n_estimators': [1000],
               'max_depth': [12]}]

In [104]:
clf = GridSearchCV(estimator=RandomForestRegressor(random_state=42),
                   param_grid=parameters,
                   scoring='accuracy',
                   cv=5)

In [105]:
model.fit(X_train, y_train.values[:, 0])

RandomForestRegressor()

In [106]:
y_pred = model.predict(X_test)

y_pred.shape

(152,)

In [107]:
check_test = pd.DataFrame({"y_test": y_test["price"],
                           "y_pred": y_pred.flatten()},
                          columns=['y_test', 'y_pred'])
check_test.head(10)

Unnamed: 0,y_test,y_pred
173,23.6,22.743
274,32.4,31.684
491,13.6,16.717
72,22.8,23.738
452,16.1,17.376
76,20.0,21.261
316,17.8,19.808
140,14.0,14.831
471,19.6,21.306
500,16.8,21.145


In [108]:
r2_score(y_test, y_pred)

0.8708210657071842

Величина r2_score в первом задании меньше величины r2_score во втором задании, а значит, вторая модель работает лучше

## Задание 3

In [112]:
 print(RandomForestRegressor.__doc__)


    A random forest regressor.

    A random forest is a meta estimator that fits a number of classifying
    decision trees on various sub-samples of the dataset and uses averaging
    to improve the predictive accuracy and control over-fitting.
    The sub-sample size is controlled with the `max_samples` parameter if
    `bootstrap=True` (default), otherwise the whole dataset is used to build
    each tree.

    Read more in the :ref:`User Guide <forest>`.

    Parameters
    ----------
    n_estimators : int, default=100
        The number of trees in the forest.

        .. versionchanged:: 0.22
           The default value of ``n_estimators`` changed from 10 to 100
           in 0.22.

    criterion : {"squared_error", "absolute_error", "poisson"},             default="squared_error"
        The function to measure the quality of a split. Supported criteria
        are "squared_error" for the mean squared error, which is equal to
        variance reduction as feature selection cr

In [114]:
importances = model.feature_importances_

In [116]:
importances.sum()

1.0