In [11]:
import numpy as np
import pandas as pd
import os

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


# Load the competition data. "-" is the dataset's missing-value placeholder;
# declaring it via na_values lets pandas parse affected columns as numeric
# right away instead of leaving them as object columns of numeric strings.
train = pd.read_csv("/kaggle/input/mai-ml-lab-1-biro/train.csv", na_values="-")
test = pd.read_csv("/kaggle/input/mai-ml-lab-1-biro/test.csv", na_values="-")
target_col = "source_attractiveness"

# Keep the real test identifiers so the submission rows are labelled with
# them rather than with a freshly generated 0..n-1 index.
test_ids = pd.Index(test["ID"], name="ID")

# Drop the columns that are not used as features.
train = train.drop(columns=["date_of_registration", "Unnamed: 0"])
test = test.drop(columns=["date_of_registration", "ID"])

X = train.drop(columns=[target_col])
y = train[target_col]

# Every column except 'category' is treated as numeric.
categorical_features = ["category"]
numeric_features = [c for c in X.columns if c not in categorical_features]

# Numeric columns: fill gaps with the median, then standardize.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill gaps with the mode, then one-hot encode.
# handle_unknown="ignore" keeps prediction from failing on categories that
# were not present when the encoder was fitted.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

# Hold out 20% of the training data to estimate generalization error.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", RandomForestRegressor(
        n_estimators=200, max_depth=10, random_state=42, n_jobs=-1
    )),
])

model.fit(X_train, y_train)
y_pred = model.predict(X_valid)
mse = mean_squared_error(y_valid, y_pred)
print("Validation MSE:", mse)

# The validation score is expected to be below 0.02 before submitting; this
# is checked by eye from the printed value. The refit on the full training
# set below runs unconditionally.
model.fit(X, y)
test_preds = model.predict(test)

# Write the submission, indexed by the original test IDs.
data = {"source_attractiveness": test_preds}
submit = pd.DataFrame(data, index=test_ids)
submit.to_csv('submission.csv', index_label="ID")

Validation MSE: 0.013824563315977995
