In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from catboost import CatBoostRegressor
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

In [None]:
# Load the training split of the diamonds data.
df_train = pd.read_csv('/kaggle/input/linear-regression-apu/diamonds_train.csv')
# Preview the first rows only instead of rendering the whole frame.
df_train.head()

In [None]:
# Number of missing values in each column of the training frame.
df_train.isna().sum()

In [58]:
# Single seed shared by the train/validation split and by CatBoost.
RANDOM_SEED = 42

# Feature groups. Categorical columns are handed to CatBoost untouched.
categorical_features = ['cut', 'color', 'clarity']
# Columns that must never be model inputs: the target and the row identifier.
# The test file has an `id` column (it is used to build the submission); if the
# training file has one too it only numbers the rows, so it is excluded here.
# A name that is absent from the frame is simply ignored.
non_feature_columns = {'price', 'id'}
numeric_features = [
    col for col in df_train.columns
    if col not in categorical_features and col not in non_feature_columns
]

# Scale numeric columns in an outlier-robust way; pass categorical columns through.
# Columns not listed in either group are dropped by ColumnTransformer's default
# `remainder='drop'`.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', RobustScaler(), numeric_features),
        ('cat', 'passthrough', categorical_features)
    ]
)

# The transformed matrix follows the order of the transformer list above:
# numeric block first, categorical block last. CatBoost is therefore given the
# trailing positions as its categorical feature indices.
first_cat_index = len(numeric_features)
cat_feature_indices = list(range(first_cat_index, first_cat_index + len(categorical_features)))

# Preprocessing + model in one estimator.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('catboost', CatBoostRegressor(
        cat_features=cat_feature_indices,
        iterations=1500,
        learning_rate=0.05,
        depth=8,
        random_seed=RANDOM_SEED,
        verbose=100,  # log the training metric every 100 iterations
        loss_function='RMSE',
        eval_metric='RMSE'
    ))
])

# Target and raw inputs; column selection happens inside the preprocessor.
y = df_train['price']
X = df_train.drop('price', axis=1)

# Hold out 20% of the rows for validation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED
)

# NOTE: `fit` returns the pipeline itself, so `model` and `pipeline` are the same object.
model = pipeline.fit(X_train, y_train)

0:	learn: 3809.1669511	total: 35.7ms	remaining: 53.6s
100:	learn: 593.7905088	total: 2.03s	remaining: 28.1s
200:	learn: 544.6444386	total: 3.89s	remaining: 25.1s
300:	learn: 522.2552565	total: 5.77s	remaining: 23s
400:	learn: 500.7964251	total: 7.88s	remaining: 21.6s
500:	learn: 483.9253930	total: 10s	remaining: 20s
600:	learn: 468.9622633	total: 12.2s	remaining: 18.2s
700:	learn: 455.9872566	total: 14.3s	remaining: 16.3s
800:	learn: 444.1345568	total: 16.4s	remaining: 14.4s
900:	learn: 432.6382961	total: 18.6s	remaining: 12.3s
1000:	learn: 424.5548392	total: 20.8s	remaining: 10.3s
1100:	learn: 415.5965779	total: 22.9s	remaining: 8.31s
1200:	learn: 407.2239831	total: 25.1s	remaining: 6.24s
1300:	learn: 399.4735182	total: 27.2s	remaining: 4.16s
1400:	learn: 392.9658820	total: 29.4s	remaining: 2.08s
1499:	learn: 386.1962008	total: 31.5s	remaining: 0us


In [None]:
# Load the test split that the submission is predicted for.
X_ans = pd.read_csv('/kaggle/input/linear-regression-apu/diamonds_test.csv')
# Preview the first rows only instead of rendering the whole frame.
X_ans.head()

In [60]:
from sklearn.metrics import mean_squared_error

# Predict the held-out rows with the model fitted on the training split.
y_val_pred = model.predict(X_test)

# RMSE is the square root of the mean squared error on the hold-out set.
val_mse = mean_squared_error(y_test, y_val_pred)
rmse = np.sqrt(val_mse)
print(rmse, 'rmse')

544.3567621858805 rmse


In [None]:
# Refit on all labelled rows for the submission.
# `Pipeline.fit` returns the object it was called on, so calling `pipeline.fit`
# again would silently overwrite `model` (the estimator that was validated) with
# one trained on the validation rows as well. Fit an unfitted clone instead so
# the validated model stays intact and the validation cell can be re-run safely.
final_model = clone(pipeline).fit(X, y)
final_model

In [None]:
# Predicted prices for every row of the test split.
ans = final_model.predict(X_ans)
# Show a small sample of the predictions rather than the whole array.
ans[:5]

In [None]:
# Pair each test row's id with its predicted price.
submission = pd.DataFrame({'id': X_ans['id'], 'price': ans})
# Preview the first rows only instead of rendering the whole frame.
submission.head()

In [None]:
# Save the submission to the current working directory, without the index column.
submission_path = 'submission.csv'
submission.to_csv(submission_path, index=False)

In [None]:
# Load the sample answer file shipped with the dataset for comparison with `submission`.
example = pd.read_csv('/kaggle/input/linear-regression-apu/example_answer.csv')
# Preview the first rows only instead of rendering the whole frame.
example.head()