# Регрессия 
Какая метрика подходит для задачи регрессии? Средняя квадратичная ошибка!
Что будет «правильным ответом» в вашей задаче? Полное, вплоть до последнего рубля, совпадение цены квартиры? Если абсолютная точность в задаче не важна, метрика accuracy не подходит.
Наиболее распространённая метрика качества в задаче регрессии — средняя квадратичная ошибка. В прошлых уроках вы добивались наибольшего значения accuracy. Величина MSE, наоборот, должна быть как можно меньше.

In [72]:
import pandas as pd
from sklearn.metrics import mean_squared_error 
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

In [53]:
train_data = pd.read_csv("train_data.csv")

In [37]:
def mse(answers, prediction):
    mses = 0
    for i in range(len(answers)):
        mses += (prediction[i] - answers[i])**2
    return mses / len(answers)
    

In [38]:
answers = [623, 253, 150, 237]
predictions = [649, 253, 370, 148]

In [39]:
print(mse(answers,predictions))

14249.25


In [40]:
mse1 = mean_squared_error(answers,predictions)

In [41]:
mse1

14249.25

In [54]:
features = train_data.drop("last_price", axis=1)

In [55]:
target = train_data["last_price"] / 1000000

In [44]:
features

Unnamed: 0,total_area,rooms,ceiling_height,floors_total,living_area,floor,is_apartment,studio,open_plan,kitchen_area,balcony,airports_nearest,cityCenters_nearest
0,59.0,2,2.87,4.0,31.5,2,0.0,0.0,0.0,6.6,0.0,20485.0,8180.0
1,109.0,4,3.15,5.0,72.0,2,0.0,0.0,0.0,12.2,0.0,42683.0,8643.0
2,74.5,3,2.58,10.0,49.0,9,0.0,0.0,0.0,10.8,0.0,14078.0,16670.0
3,37.4,1,2.50,9.0,20.0,4,0.0,0.0,0.0,6.2,2.0,17792.0,17699.0
4,64.9,3,2.65,12.0,41.9,11,0.0,0.0,0.0,10.4,0.0,14767.0,10573.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6490,40.1,1,2.65,10.0,20.1,6,0.0,0.0,0.0,10.8,1.0,26495.0,14198.0
6491,158.5,7,2.95,7.0,57.4,7,0.0,0.0,0.0,12.4,1.0,23546.0,3088.0
6492,62.1,2,2.75,20.0,31.0,6,0.0,0.0,0.0,13.9,2.0,34506.0,14375.0
6493,66.0,3,2.50,12.0,42.2,6,0.0,0.0,0.0,10.5,0.0,12892.0,11225.0


In [45]:
target.mean()

8.05028371377983

In [13]:
predictions = pd.Series(target.mean(), index = target.index)

In [14]:
predictions

0       8.050284
1       8.050284
2       8.050284
3       8.050284
4       8.050284
          ...   
6490    8.050284
6491    8.050284
6492    8.050284
6493    8.050284
6494    8.050284
Length: 6495, dtype: float64

In [46]:
mse = mean_squared_error(target, predictions)

ValueError: Found input variables with inconsistent numbers of samples: [6495, 4]

In [16]:
mse

138.24439686023302

In [17]:
rmse = mse ** 0.5

In [18]:
rmse

11.757737744151

In [56]:
features_train, features_valid, target_train, target_valid = train_test_split(features,target,test_size = 0.25, random_state=12345)

In [48]:
features_train

Unnamed: 0,total_area,rooms,ceiling_height,floors_total,living_area,floor,is_apartment,studio,open_plan,kitchen_area,balcony,airports_nearest,cityCenters_nearest
2552,42.1,1,2.85,7.0,20.10,2,0.0,0.0,0.0,11.80,1.0,25530.0,4378.0
353,88.4,3,2.55,17.0,51.70,11,0.0,0.0,0.0,13.50,5.0,13172.0,11969.0
3043,62.2,2,2.65,12.0,31.00,11,0.0,0.0,0.0,11.00,0.0,46067.0,15475.0
1472,72.0,3,2.70,4.0,48.20,4,0.0,0.0,0.0,10.60,0.0,20713.0,1755.0
1410,102.3,3,3.40,6.0,65.00,6,0.0,0.0,0.0,14.80,0.0,23955.0,5089.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3497,74.0,3,3.20,5.0,48.00,4,0.0,0.0,0.0,7.50,0.0,16602.0,8133.0
3492,57.9,2,3.00,5.0,33.80,1,0.0,0.0,0.0,9.90,0.0,20516.0,15318.0
2177,68.8,3,2.70,5.0,54.90,3,0.0,0.0,0.0,7.00,1.0,30005.0,8852.0
3557,43.0,1,2.50,24.0,20.00,4,0.0,0.0,0.0,15.00,0.0,11827.0,11459.0


In [49]:
features_valid

Unnamed: 0,total_area,rooms,ceiling_height,floors_total,living_area,floor,is_apartment,studio,open_plan,kitchen_area,balcony,airports_nearest,cityCenters_nearest
3316,40.9,1,2.75,16.0,17.5,6,0.0,0.0,0.0,11.12,0.0,12199.0,12111.0
4890,44.0,1,3.00,15.0,20.1,8,0.0,0.0,0.0,10.30,2.0,33531.0,12163.0
1013,102.0,3,2.75,22.0,53.0,15,0.0,0.0,0.0,20.90,2.0,34463.0,11879.0
5507,58.8,3,2.50,9.0,37.6,8,0.0,0.0,0.0,7.00,1.0,49819.0,15377.0
6386,491.0,5,4.20,9.0,274.0,9,0.0,0.0,0.0,45.00,0.0,25525.0,5845.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3081,60.0,2,3.30,4.0,35.0,1,0.0,0.0,0.0,8.00,0.0,22983.0,2524.0
1499,118.2,3,2.95,5.0,83.6,5,0.0,0.0,0.0,12.60,0.0,22697.0,3962.0
5341,45.8,2,2.50,9.0,28.6,9,0.0,0.0,0.0,7.00,2.0,31476.0,12635.0
1684,35.2,1,2.47,14.0,20.0,7,0.0,0.0,0.0,7.80,1.0,7900.0,13142.0


In [50]:
target_train

2552     5.777000
353     11.200000
3043     7.980000
1472     6.500000
1410     8.900000
          ...    
3497     8.000000
3492     4.950000
2177     8.900000
3557     6.570000
4578     6.204472
Name: last_price, Length: 4871, dtype: float64

In [82]:
best_result = 1000
best_model = None
best_depth = 0
for i in range(1,10):
    model = DecisionTreeRegressor(random_state=12345, max_depth=i)
    model.fit(features_train,target_train)
    predictions_valid = model.predict(features_valid)
    result = mean_squared_error(target_valid, predictions_valid)**0.5
    if result < best_result:
        best_model = model
        best_result = result 
        best_depth = i
print(best_result, best_depth)

6.312754706805505 8


In [70]:
best_result = 1000
best_model = None
best_depth = 0
best_est = 0
for i in range(10,51,10):
    for j in range(1, 11):
        model = RandomForestRegressor(random_state=12345, max_depth=j, n_estimators = i)
        model.fit(features_train,target_train)
        predictions_valid = model.predict(features_valid)
        result = mean_squared_error(target_valid, predictions_valid)**0.5
        if result < best_result:
            best_model = model
            best_result = result 
            best_depth = j
            best_est = i
print("RMSE наилучшей модели на валидационной выборке:", best_result, "Количество деревьев:", best_est, "Максимальная глубина:", best_depth)

RMSE наилучшей модели на валидационной выборке: 7.052975801299085 Количество деревьев: 40 Максимальная глубина: 10


In [76]:
model = LinearRegression()

In [77]:
model.fit(features_train,target_train)

In [78]:
predictions1 = model.predict(features_valid)

In [80]:
result = mean_squared_error(target_valid, predictions1) ** 0.5

In [81]:
result

7.726006697008097