In [None]:
!pip install catboost ipywidgets
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go

from catboost import CatBoostRegressor, Pool, cv
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, BaggingRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import mean_squared_error


plt.style.use('seaborn-talk')

In [None]:
df_train = pd.read_csv('https://github.com/qununc/MLbase_2021_spring/raw/main/lecture05/keggle_competition/data/Train.csv.zip')
df_test = pd.read_csv('https://github.com/qununc/MLbase_2021_spring/raw/main/lecture05/keggle_competition/data/Test.csv.zip')

In [None]:
df_train.head()

Unnamed: 0,id,date,street_id,build_tech,floor,area,rooms,balcon,metro_dist,g_lift,n_photos,kw1,kw2,kw3,kw4,kw5,kw6,kw7,kw8,kw9,kw10,kw11,kw12,kw13,price
0,0,2011-1,385,,1,59,3,0,25.0,,2,0,1,0,0,0,0,0,0,0,0,0,0,0,2489000
1,1,2011-1,247,,5,53,2,1,15.0,1.0,2,0,1,0,0,0,0,0,0,0,0,0,0,0,1995000
2,2,2011-1,115,0.0,7,61,3,0,10.0,1.0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,3993000
3,3,2011-1,491,1.0,5,61,3,1,30.0,,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5709000
4,4,2011-1,623,,3,51,2,1,30.0,,1,0,1,0,0,0,0,0,0,0,0,0,0,0,4374000


In [None]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 25 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   id          100000 non-null  int64  
 1   date        100000 non-null  object 
 2   street_id   100000 non-null  int64  
 3   build_tech  70015 non-null   float64
 4   floor       100000 non-null  int64  
 5   area        100000 non-null  int64  
 6   rooms       100000 non-null  int64  
 7   balcon      100000 non-null  int64  
 8   metro_dist  94906 non-null   float64
 9   g_lift      69942 non-null   float64
 10  n_photos    100000 non-null  int64  
 11  kw1         100000 non-null  int64  
 12  kw2         100000 non-null  int64  
 13  kw3         100000 non-null  int64  
 14  kw4         100000 non-null  int64  
 15  kw5         100000 non-null  int64  
 16  kw6         100000 non-null  int64  
 17  kw7         100000 non-null  int64  
 18  kw8         100000 non-null  int64  
 19  kw9

In [None]:
df_train_new = df_train.drop(columns=['build_tech', 'g_lift'])
df_train_new = df_train_new.dropna()
df_train_new['date'] = OrdinalEncoder().fit_transform(df_train_new[['date']])
df_train_new.iloc[:, 1:] = StandardScaler().fit_transform(df_train_new.iloc[:, 1:])
df_train_new.reset_index(drop=True, inplace=True)
df_train_new.info()
df_train_new.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94906 entries, 0 to 94905
Data columns (total 23 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          94906 non-null  int64  
 1   date        94906 non-null  float64
 2   street_id   94906 non-null  float64
 3   floor       94906 non-null  float64
 4   area        94906 non-null  float64
 5   rooms       94906 non-null  float64
 6   balcon      94906 non-null  float64
 7   metro_dist  94906 non-null  float64
 8   n_photos    94906 non-null  float64
 9   kw1         94906 non-null  float64
 10  kw2         94906 non-null  float64
 11  kw3         94906 non-null  float64
 12  kw4         94906 non-null  float64
 13  kw5         94906 non-null  float64
 14  kw6         94906 non-null  float64
 15  kw7         94906 non-null  float64
 16  kw8         94906 non-null  float64
 17  kw9         94906 non-null  float64
 18  kw10        94906 non-null  float64
 19  kw11        94906 non-nul

Unnamed: 0,id,date,street_id,floor,area,rooms,balcon,metro_dist,n_photos,kw1,kw2,kw3,kw4,kw5,kw6,kw7,kw8,kw9,kw10,kw11,kw12,kw13,price
0,0,-1.641398,0.256726,-1.080115,0.378722,1.074008,-0.719259,0.399606,-0.241319,-0.244827,0.769924,-0.0862,-0.081877,-0.063488,-0.043226,-0.029227,-0.086076,-0.088588,-0.045141,-0.016869,-0.042361,-0.014877,-0.617556
1,1,-1.641398,-0.453367,-0.093772,0.039786,-0.133705,1.075834,-0.795372,-0.241319,-0.244827,0.769924,-0.0862,-0.081877,-0.063488,-0.043226,-0.029227,-0.086076,-0.088588,-0.045141,-0.016869,-0.042361,-0.014877,-0.749029
2,2,-1.641398,-1.132587,0.399399,0.491701,1.074008,-0.719259,-1.392862,-0.755812,-0.244827,0.769924,-0.0862,-0.081877,-0.063488,-0.043226,-0.029227,-0.086076,-0.088588,-0.045141,-0.016869,-0.042361,-0.014877,-0.217279
3,3,-1.641398,0.80216,-0.093772,0.491701,1.074008,1.075834,0.997096,-1.270305,-0.244827,0.769924,-0.0862,-0.081877,-0.063488,-0.043226,-0.029227,-0.086076,-0.088588,-0.045141,-0.016869,-0.042361,-0.014877,0.23942
4,4,-1.641398,1.48138,-0.586943,-0.073193,-0.133705,1.075834,0.997096,-0.755812,-0.244827,0.769924,-0.0862,-0.081877,-0.063488,-0.043226,-0.029227,-0.086076,-0.088588,-0.045141,-0.016869,-0.042361,-0.014877,-0.115879


In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_train_new.iloc[:, 1:-1], df_train_new['price'], test_size=0.3, random_state=47)
X_train.head()

Unnamed: 0,date,street_id,floor,area,rooms,balcon,metro_dist,n_photos,kw1,kw2,kw3,kw4,kw5,kw6,kw7,kw8,kw9,kw10,kw11,kw12,kw13
37692,-0.289936,0.67352,-0.833529,0.322233,1.074008,-0.719259,-0.197883,1.816653,-0.244827,0.769924,-0.0862,-0.081877,-0.063488,-0.043226,-0.029227,-0.086076,-0.088588,-0.045141,-0.016869,-0.042361,-0.014877
68565,0.675393,-1.132587,0.892571,-0.073193,-0.133705,1.075834,0.997096,-1.270305,-0.244827,0.769924,-0.0862,-0.081877,-0.063488,-0.043226,-0.029227,-0.086076,-0.088588,-0.045141,-0.016869,-0.042361,-0.014877
7147,-1.448332,0.699248,0.152814,0.491701,1.074008,-0.719259,0.997096,0.787667,-0.244827,0.769924,-0.0862,-0.081877,-0.063488,-0.043226,-0.029227,-0.086076,-0.088588,-0.045141,-0.016869,-0.042361,-0.014877
23151,-0.869134,-0.170359,0.645985,-0.016704,-0.133705,-0.719259,0.399606,-0.755812,-0.244827,0.769924,-0.0862,-0.081877,-0.063488,-0.043226,-0.029227,-0.086076,-0.088588,-0.045141,-0.016869,-0.042361,-0.014877
7672,-1.448332,-0.196087,-0.093772,0.435212,1.074008,1.075834,0.997096,-1.270305,-0.244827,-1.298829,-0.0862,-0.081877,-0.063488,-0.043226,-0.029227,-0.086076,-0.088588,-0.045141,-0.016869,-0.042361,-0.014877


In [None]:
bdt = GradientBoostingRegressor().fit(X_train, y_train)
# bdt_predict = bdt.predict(X_test)
# mean_squared_error(y_test, bdt_predict)
bdt.score(X_test, y_test)

0.6254086415308096

In [38]:
grid = {'learning_rate': [0.03, 0.1],
        'depth': [4, 6, 10],
        'l2_leaf_reg': [1, 3, 5, 7, 9]}

grid_search_result = CatBoostRegressor().grid_search(grid, X=X_train, y=y_train, verbose=False)
grid_search_result

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
16:	learn: 0.8129059	test: 0.8146469	best: 0.8146469 (16)	total: 475ms	remaining: 27.5s
17:	learn: 0.8054006	test: 0.8073790	best: 0.8073790 (17)	total: 501ms	remaining: 27.3s
18:	learn: 0.7979761	test: 0.8000343	best: 0.8000343 (18)	total: 523ms	remaining: 27s
19:	learn: 0.7910084	test: 0.7932295	best: 0.7932295 (19)	total: 543ms	remaining: 26.6s
20:	learn: 0.7846963	test: 0.7870310	best: 0.7870310 (20)	total: 564ms	remaining: 26.3s
21:	learn: 0.7787071	test: 0.7813513	best: 0.7813513 (21)	total: 585ms	remaining: 26s
22:	learn: 0.7723814	test: 0.7753205	best: 0.7753205 (22)	total: 606ms	remaining: 25.8s
23:	learn: 0.7661595	test: 0.7693258	best: 0.7693258 (23)	total: 630ms	remaining: 25.6s
24:	learn: 0.7604076	test: 0.7637918	best: 0.7637918 (24)	total: 669ms	remaining: 26.1s
25:	learn: 0.7545532	test: 0.7580931	best: 0.7580931 (25)	total: 691ms	remaining: 25.9s
26:	learn: 0.7495019	test: 0.7533066	best:

{'cv_results': defaultdict(list,
             {'iterations': [0,
               1,
               2,
               3,
               4,
               5,
               6,
               7,
               8,
               9,
               10,
               11,
               12,
               13,
               14,
               15,
               16,
               17,
               18,
               19,
               20,
               21,
               22,
               23,
               24,
               25,
               26,
               27,
               28,
               29,
               30,
               31,
               32,
               33,
               34,
               35,
               36,
               37,
               38,
               39,
               40,
               41,
               42,
               43,
               44,
               45,
               46,
               47,
               48,
               49,
             

In [None]:
cat = CatBoostRegressor()
cat.fit(X_train, y_train, verbose=False)
cat.score(X_test, y_test)

0.7275444055912177

In [None]:
params = {"iterations": 100,
          "depth": 2,
          "loss_function": "RMSE",
          "verbose": False}
cv_dataset = Pool(data=X_train,
                  label=y_train)
scores = cv(cv_dataset,
            params,
            fold_count=5, 
            plot="True")

scores

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Unnamed: 0,iterations,test-RMSE-mean,test-RMSE-std,train-RMSE-mean,train-RMSE-std
0,0,0.982411,0.028434,0.982673,0.007020
1,1,0.971444,0.028591,0.971710,0.006954
2,2,0.961125,0.028745,0.961395,0.006906
3,3,0.950710,0.028543,0.950976,0.006895
4,4,0.940988,0.028404,0.941240,0.006895
...,...,...,...,...,...
95,95,0.695929,0.027105,0.695570,0.006172
96,96,0.695326,0.027063,0.694969,0.006160
97,97,0.694817,0.027036,0.694402,0.006179
98,98,0.694272,0.026996,0.693848,0.006189
