In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [2]:
# Load the three input tables in one pass:
#   df                - training rows; later cells rely on its "id", "target"
#                       and "KFold" columns
#   df_test           - rows to predict
#   sample_submission - template frame that is filled in and written out at the end
df, df_test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/30-days-folds/train_folds.csv",
        "../input/30-days-of-ml/test.csv",
        "../input/30-days-of-ml/sample_submission.csv",
    )
)

In [3]:
# --- Out-of-fold target encoding of the categorical columns -----------------
# For every "cat*" column a new numeric column "tar_enc_<col>" is added:
#   * training rows get the mean target of their category computed on the
#     OTHER folds only, so a row never sees its own target;
#   * test rows get the average of the per-fold encodings.

# Number of folds iterated below (fold ids 0 .. N_FOLDS-1 in df.KFold).
N_FOLDS = 5

# Features are every column except the row id, the label and the fold id;
# categorical features are recognised by their "cat" name prefix.
useful_features = [c for c in df.columns if c not in ("id", "target", "KFold")]
object_cols = [col for col in useful_features if col.startswith("cat")]
# .copy() gives an independent frame, so adding columns below does not write
# into a slice of the raw test data (avoids SettingWithCopyWarning).
df_test = df_test[useful_features].copy()

for col in object_cols:
    enc_col = f"tar_enc_{col}"
    encoded_valid_parts = []
    test_enc_sum = None
    for fold in range(N_FOLDS):
        xtrain = df[df.KFold != fold].reset_index(drop=True)
        xvalid = df[df.KFold == fold].reset_index(drop=True)

        # Mean target per category, learned from the training folds only.
        category_means = xtrain.groupby(col)["target"].agg("mean").to_dict()
        # A category absent from the training folds would map to NaN; fall
        # back to the overall training-fold mean so that a single missing
        # level cannot turn the averaged test feature into NaN.
        fallback_mean = xtrain["target"].mean()

        xvalid[enc_col] = xvalid[col].map(category_means).fillna(fallback_mean)
        encoded_valid_parts.append(xvalid)

        fold_test_enc = df_test[col].map(category_means).fillna(fallback_mean)
        if test_enc_sum is None:
            test_enc_sum = fold_test_enc
        else:
            test_enc_sum = test_enc_sum + fold_test_enc

    df_test[enc_col] = test_enc_sum / N_FOLDS
    # Re-assemble the training frame from the encoded validation parts; a fresh
    # RangeIndex avoids the duplicate labels left by the per-fold reset_index.
    df = pd.concat(encoded_valid_parts, ignore_index=True)

# Recompute the feature lists so they include the new tar_enc_* columns, and
# put the test columns in the same order as the training features.
useful_features = [c for c in df.columns if c not in ("id", "target", "KFold")]
object_cols = [col for col in useful_features if col.startswith("cat")]
df_test = df_test[useful_features]

In [4]:
# --- 5-fold XGBoost training -------------------------------------------------
# One model is trained per fold: it is fitted on the other folds, early-stopped
# on the held-out fold, scored with RMSE on that fold, and used to predict the
# test set. The per-fold test predictions are collected in `final_predictions`
# and the per-fold RMSEs in `scores`.

# Hyper-parameters are the same for every fold, so build the dict once.
params = {'learning_rate': 0.039875167641537965, 'reg_lambda': 0.0005376678984150986, 'reg_alpha': 0.00551122913583057, 'subsample': 0.8029978365835949, 'colsample_bytree': 0.15088780179864864, 'max_depth': 4}

final_predictions = []
scores = []
for fold in range(5):
    xtrain = df[df.KFold != fold].reset_index(drop=True)
    xvalid = df[df.KFold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target

    # .copy() so the in-place column replacement below operates on independent
    # frames instead of slices (avoids SettingWithCopyWarning).
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    # Integer-code the categorical columns; the encoder is fitted on the
    # training part only and then applied to the validation and test rows.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    # NOTE(review): newer XGBoost releases expect `early_stopping_rounds` on the
    # constructor and `device="cuda"` with tree_method="hist" instead of
    # gpu_hist / gpu_id / predictor - confirm against the installed version
    # before upgrading.
    model = XGBRegressor(
        random_state=0,
        tree_method='gpu_hist',
        gpu_id=0,
        predictor="gpu_predictor",
        n_estimators=5000,
        **params
    )
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)

    # RMSE as sqrt(MSE): same value as `squared=False`, but does not rely on a
    # keyword that newer scikit-learn versions deprecate and remove.
    rmse = float(np.sqrt(mean_squared_error(yvalid, preds_valid)))
    print(fold, rmse)
    scores.append(rmse)

# Mean and spread of the validation RMSE across folds.
print(np.mean(scores), np.std(scores))

[0]	validation_0-rmse:7.47264
[1000]	validation_0-rmse:0.72025
[2000]	validation_0-rmse:0.71828
[2788]	validation_0-rmse:0.71805
0 0.7180478311483501
[0]	validation_0-rmse:7.46937
[1000]	validation_0-rmse:0.71963
[2000]	validation_0-rmse:0.71794
[3000]	validation_0-rmse:0.71760
[3417]	validation_0-rmse:0.71763
1 0.7175580710337084
[0]	validation_0-rmse:7.46714
[1000]	validation_0-rmse:0.72134
[2000]	validation_0-rmse:0.71971
[2932]	validation_0-rmse:0.71955
2 0.7194835538799537
[0]	validation_0-rmse:7.46936
[1000]	validation_0-rmse:0.72245
[2000]	validation_0-rmse:0.72161
[2144]	validation_0-rmse:0.72164
3 0.7213647014312778
[0]	validation_0-rmse:7.47614
[976]	validation_0-rmse:0.73445
4 0.7302060421830056
0.7213320399352592 0.004630221419951144


In [5]:
# --- Build the submission file -------------------------------------------------
# Stack the per-fold test predictions as columns (one column per fold) and
# average across folds to get one prediction per test row.
preds = np.mean(np.column_stack(final_predictions), axis=1)
# Bracket assignment always writes a DataFrame column; attribute assignment
# (`sample_submission.target = ...`) would only set a Python attribute if the
# column were missing, and the CSV would then be written without predictions.
sample_submission["target"] = preds
sample_submission.to_csv("submission.csv", index=False)