In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [None]:
# Baseline experiment: ordinal-encode the categorical features and train one
# XGBoost regressor per fold, scoring each model on its held-out fold.
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Feature columns are everything except the row id, the label and the fold index.
useful_features = [
    name for name in df.columns
    if name not in ("id", "target", "kfold")
]
object_cols = [name for name in useful_features if 'cat' in name]
df_test = df_test[useful_features]

final_predictions, scores = [], []
for fold in range(5):
    # Rows assigned to `fold` are held out; all remaining rows are used for fitting.
    held_out = df.kfold == fold
    xtrain = df[~held_out].reset_index(drop=True)
    xvalid = df[held_out].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain, yvalid = xtrain.target, xvalid.target
    xtrain, xvalid = xtrain[useful_features], xvalid[useful_features]

    # The category -> integer mapping is learned on the training rows only.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    model = XGBRegressor(
        random_state=fold,
        tree_method='gpu_hist',
        gpu_id=0,
        predictor="gpu_predictor",
    )
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)

    # Per-fold RMSE on the held-out rows.
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    scores.append(rmse)
    print(fold, rmse)

# Mean and spread of the RMSE across the five folds.
print(np.mean(scores), np.std(scores))

* 0 0.7245705537554137
* 1 0.7242510333821858
* 2 0.7270667092065692
* 3 0.7268359229595335
* 4 0.7257178555909586
* 0.7256884149789322 0.0011430674400777338

In [None]:
# Standardization experiment: same pipeline as the baseline, plus z-scoring of
# the continuous ("cont*") columns with statistics taken from the training rows.
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

useful_features = [
    name for name in df.columns
    if name not in ("id", "target", "kfold")
]
object_cols = [name for name in useful_features if 'cat' in name]
numerical_cols = [name for name in useful_features if name.startswith("cont")]
df_test = df_test[useful_features]

final_predictions, scores = [], []
for fold in range(5):
    # Split by the precomputed fold column: `fold` is validation, the rest is training.
    held_out = df.kfold == fold
    xtrain = df[~held_out].reset_index(drop=True)
    xvalid = df[held_out].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain, yvalid = xtrain.target, xvalid.target
    xtrain, xvalid = xtrain[useful_features], xvalid[useful_features]

    # Categorical columns -> integer codes (mapping learned on the training rows).
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    # Continuous columns -> zero mean / unit variance (statistics from the training rows).
    scaler = preprocessing.StandardScaler()
    xtrain[numerical_cols] = scaler.fit_transform(xtrain[numerical_cols])
    xvalid[numerical_cols] = scaler.transform(xvalid[numerical_cols])
    xtest[numerical_cols] = scaler.transform(xtest[numerical_cols])

    model = XGBRegressor(
        random_state=fold,
        tree_method='gpu_hist',
        gpu_id=0,
        predictor="gpu_predictor",
    )
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)

    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    scores.append(rmse)
    print(fold, rmse)

# Mean and spread of the RMSE across the five folds.
print(np.mean(scores), np.std(scores))

* standardization
* 0 0.7241755479182882
* 1 0.7241138968948254
* 2 0.7267386816038165
* 3 0.7268357864120136
* 4 0.725667388462628
* 0.7255062602583143 0.001185068397378747

In [None]:
# log transformation
# Applies log1p to every continuous ("cont*") column, then runs the usual
# ordinal-encoding + XGBoost cross-validation loop.
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
numerical_cols = [col for col in useful_features if col.startswith("cont")]
# Take an explicit copy: the frame is modified below, and assigning into a
# column slice of another frame raises SettingWithCopyWarning.
df_test = df_test[useful_features].copy()

# log1p is a fixed element-wise transform (nothing is fitted), so it can be
# applied to all columns at once, before the fold split.
# NOTE(review): assumes every cont value is > -1, otherwise log1p yields NaN/-inf — confirm.
df[numerical_cols] = np.log1p(df[numerical_cols])
df_test[numerical_cols] = np.log1p(df_test[numerical_cols])

final_predictions = []
scores = []
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target

    # Copies, so the encoded columns are written into frames we own.
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    # Category -> integer mapping is learned on the training rows only.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor="gpu_predictor")
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)

# Mean and spread of the RMSE across the five folds.
print(np.mean(scores), np.std(scores))

* log transformation
* 0 0.7245867071148808
* 1 0.7242518770698644
* 2 0.7269464580617742
* 3 0.7267203050271116
* 4 0.7255892005274619
* 0.7256189095602186 0.001087249680887288

In [None]:
# polynomial features
# Adds interaction terms (up to degree 3) of the continuous columns as extra
# "poly_*" features, then runs the ordinal-encoding + XGBoost CV loop.
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
numerical_cols = [col for col in useful_features if col.startswith("cont")]
df_test = df_test[useful_features]

# Fit the transformer once on the training frame and only *apply* it to the
# test frame, so both are expanded by the same fitted object.
# Note: with include_bias=False the degree-1 terms are part of the output, so
# the poly_* block also contains copies of the original cont columns.
poly = preprocessing.PolynomialFeatures(degree=3, interaction_only=True, include_bias=False)
train_poly = poly.fit_transform(df[numerical_cols])
test_poly = poly.transform(df_test[numerical_cols])

# Reuse each source frame's index so the column-wise concat aligns row for row.
poly_names = [f"poly_{i}" for i in range(train_poly.shape[1])]
df_poly = pd.DataFrame(train_poly, columns=poly_names, index=df.index)
df_test_poly = pd.DataFrame(test_poly, columns=poly_names, index=df_test.index)

df = pd.concat([df, df_poly], axis=1)
df_test = pd.concat([df_test, df_test_poly], axis=1)

# Recompute the feature list so that it includes the new poly_* columns.
useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
df_test = df_test[useful_features]

final_predictions = []
scores = []
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target

    # Copies, so the encoded columns are written into frames we own
    # (assigning into a column slice raises SettingWithCopyWarning).
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    # Category -> integer mapping is learned on the training rows only.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor="gpu_predictor")
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)

# Mean and spread of the RMSE across the five folds.
print(np.mean(scores), np.std(scores))



* polynomial features
* 0 0.729073179900137
* 1 0.7286941123028183
* 2 0.7302315824391516
* 3 0.7304305210608322
* 4 0.7297044930462646
* 0.7296267777498406 0.0006624450323974544

In [None]:
# for col in numerical_cols:
#     bin_col = pd.cut(df[col], bins = 4,labels=False, right=False)
#     bin_col = bin_col.to_dict()
#     df.loc[:, f"bin_{col}"] = df.index.map(bin_col)

In [None]:
# Display the column labels of `df` as left by whichever cell assigned it last.
# NOTE(review): the result depends on execution order — confirm which experiment's frame is meant.
df.columns

In [None]:
# binning the numerical features (per fold)
# pd.cut
# Each continuous column gets a 4-bin categorical companion "cat_bin_<col>".
# The bin edges are learned from the training rows of each fold and then
# applied unchanged to that fold's validation rows and to the test set.
#
# The earlier version of this cell wrote the bins into a temporary `xvalid`
# copy that was thrown away, and then dropped the test bins when re-selecting
# `useful_features`, so no binned feature ever reached the model.
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
numerical_cols = [col for col in useful_features if col.startswith("cont")]
# Names of the binned columns that are created inside the CV loop.
bin_cols = [f"cat_bin_{col}" for col in numerical_cols]
# Columns to ordinal-encode: the original categoricals plus the new bins.
object_cols = [col for col in useful_features if col.startswith("cat")] + bin_cols
df_test = df_test[useful_features].copy()

final_predictions = []
scores = []
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target

    # Copies, because new columns are added to these frames below.
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    for col, bin_col in zip(numerical_cols, bin_cols):
        # Four equal-width bins over the training range; keep the edges.
        train_bins, edges = pd.cut(xtrain[col], bins=4, labels=False, right=False, retbins=True)
        # Open the outer bins so values outside the training range still get
        # a bin instead of NaN.
        edges = np.concatenate(([-np.inf], edges[1:-1], [np.inf]))
        xtrain[bin_col] = train_bins
        xvalid[bin_col] = pd.cut(xvalid[col], bins=edges, labels=False, right=False)
        xtest[bin_col] = pd.cut(xtest[col], bins=edges, labels=False, right=False)

    # Category -> integer mapping is learned on the training rows only.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor="gpu_predictor")
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)

# Mean and spread of the RMSE across the five folds.
print(np.mean(scores), np.std(scores))




In [None]:
# binning the numerical features + ordinal encoding
# pd.cut
# Each continuous column gets a 4-bin categorical companion "cat_bin_<col>",
# which is then ordinal-encoded together with the original categoricals.
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
numerical_cols = [col for col in useful_features if col.startswith("cont")]
# Explicit copy: new columns are added to the test frame below.
df_test = df_test[useful_features].copy()

# binning the variables
# The bin edges come from the training frame and are reused for the test frame,
# so a given bin code covers the same value range in both. Cutting train and
# test independently derives the edges from each frame's own min/max, which
# makes the codes incomparable.
# Recorded scores from earlier runs of this cell may shift slightly.
for col in numerical_cols:
    train_bins, edges = pd.cut(df[col], bins=4, labels=False, right=False, retbins=True)
    # Open the outer bins so test values outside the training range still get
    # a bin instead of NaN.
    edges = np.concatenate(([-np.inf], edges[1:-1], [np.inf]))
    df[f"cat_bin_{col}"] = train_bins
    df_test[f"cat_bin_{col}"] = pd.cut(df_test[col], bins=edges, labels=False, right=False)

# Recompute the feature lists so that they include the new cat_bin_* columns.
useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if col.startswith("cat")]
numerical_cols = [col for col in useful_features if col.startswith("cont")]
df_test = df_test[useful_features]

final_predictions = []
scores = []
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target

    # Copies, so the encoded columns are written into frames we own.
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    # Category -> integer mapping is learned on the training rows only.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor="gpu_predictor")
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)

# Mean and spread of the RMSE across the five folds.
print(np.mean(scores), np.std(scores))




* binning the numerical features + ordinal encoding
* 0 0.7244715259084568
* 1 0.7247714725802955
* 2 0.7263946453260414
* 3 0.726903777621419
* 4 0.7249883367633758
* 0.7255059516399177 0.0009613651944672863

In [None]:
# binning the numerical features + OHE
# pd.cut
# Each continuous column gets a 4-bin categorical companion "cat_bin_<col>";
# the original categoricals and the bins are then one-hot encoded.
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
numerical_cols = [col for col in useful_features if col.startswith("cont")]
# Explicit copy: new columns are added to the test frame below.
df_test = df_test[useful_features].copy()

# binning the variables
# The bin edges come from the training frame and are reused for the test frame,
# so a given bin code covers the same value range in both. Cutting train and
# test independently derives the edges from each frame's own min/max, which
# makes the codes incomparable.
# Recorded scores from earlier runs of this cell may shift slightly.
for col in numerical_cols:
    train_bins, edges = pd.cut(df[col], bins=4, labels=False, right=False, retbins=True)
    # Open the outer bins so test values outside the training range still get
    # a bin instead of NaN.
    edges = np.concatenate(([-np.inf], edges[1:-1], [np.inf]))
    df[f"cat_bin_{col}"] = train_bins
    df_test[f"cat_bin_{col}"] = pd.cut(df_test[col], bins=edges, labels=False, right=False)

# Recompute the feature lists so that they include the new cat_bin_* columns.
useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if col.startswith("cat")]
numerical_cols = [col for col in useful_features if col.startswith("cont")]
df_test = df_test[useful_features]

final_predictions = []
scores = []
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    # Categories are learned on the training rows; unseen ones encode to all zeros.
    ohe = preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore")
    xtrain_ohe = ohe.fit_transform(xtrain[object_cols])
    xvalid_ohe = ohe.transform(xvalid[object_cols])
    xtest_ohe = ohe.transform(xtest[object_cols])

    # All three matrices come from the same fitted encoder, so they share one
    # set of column names.
    ohe_names = [f"ohe_{i}" for i in range(xtrain_ohe.shape[1])]
    xtrain_ohe = pd.DataFrame(xtrain_ohe, columns=ohe_names)
    xvalid_ohe = pd.DataFrame(xvalid_ohe, columns=ohe_names)
    xtest_ohe = pd.DataFrame(xtest_ohe, columns=ohe_names)

    # Append the indicator columns, then drop the raw categoricals they replace.
    xtrain = pd.concat([xtrain, xtrain_ohe], axis=1).drop(object_cols, axis=1)
    xvalid = pd.concat([xvalid, xvalid_ohe], axis=1).drop(object_cols, axis=1)
    xtest = pd.concat([xtest, xtest_ohe], axis=1).drop(object_cols, axis=1)

    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor="gpu_predictor")
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)

# Mean and spread of the RMSE across the five folds.
print(np.mean(scores), np.std(scores))


* binning the numerical features + OHE
* 0 0.7246528139817886
* 1 0.7243188344102082
* 2 0.7268415459695178
* 3 0.7263439289970397
* 4 0.7256872231245289
* 0.7255688692966167 0.0009629304895589322


In [None]:
# One-hot encoding
# Replaces every categorical column by its indicator columns and trains one
# XGBoost regressor per fold on the result.
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

useful_features = [
    name for name in df.columns
    if name not in ("id", "target", "kfold")
]
object_cols = [name for name in useful_features if 'cat' in name]
df_test = df_test[useful_features]

final_predictions, scores = [], []
for fold in range(5):
    # Rows assigned to `fold` are held out; all remaining rows are used for fitting.
    held_out = df.kfold == fold
    xtrain = df[~held_out].reset_index(drop=True)
    xvalid = df[held_out].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain, yvalid = xtrain.target, xvalid.target
    xtrain, xvalid = xtrain[useful_features], xvalid[useful_features]

    # Categories are learned on the training rows; unseen ones encode to all zeros.
    ohe = preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore")
    xtrain_ohe = ohe.fit_transform(xtrain[object_cols])
    xvalid_ohe = ohe.transform(xvalid[object_cols])
    xtest_ohe = ohe.transform(xtest[object_cols])

    # Wrap the dense indicator matrices in frames with positional column names.
    xtrain_ohe = pd.DataFrame(
        xtrain_ohe, columns=[f"ohe_{k}" for k in range(xtrain_ohe.shape[1])]
    )
    xvalid_ohe = pd.DataFrame(
        xvalid_ohe, columns=[f"ohe_{k}" for k in range(xvalid_ohe.shape[1])]
    )
    xtest_ohe = pd.DataFrame(
        xtest_ohe, columns=[f"ohe_{k}" for k in range(xtest_ohe.shape[1])]
    )

    # Append the indicators, then remove the raw categorical columns they
    # replace (this drop step is the part that is missing in the video).
    xtrain = pd.concat([xtrain, xtrain_ohe], axis=1).drop(object_cols, axis=1)
    xvalid = pd.concat([xvalid, xvalid_ohe], axis=1).drop(object_cols, axis=1)
    xtest = pd.concat([xtest, xtest_ohe], axis=1).drop(object_cols, axis=1)

    model = XGBRegressor(
        random_state=fold,
        tree_method='gpu_hist',
        gpu_id=0,
        predictor="gpu_predictor",
    )
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)

    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    scores.append(rmse)
    print(fold, rmse)

# Mean and spread of the RMSE across the five folds.
print(np.mean(scores), np.std(scores))

* One-hot encoding
* 0 0.7244255014738967
* 1 0.7245139958781214
* 2 0.7264465446086561
* 3 0.7264028943362871
* 4 0.7257096926265366
* 0.7254997257846996 0.0008811227736574191

In [None]:
# one-hot encoding of categorical variables + standardization of OHE & numerical columns
# The indicator columns are named "cont_ohe_*" so that they are picked up,
# together with the original "cont*" columns, by the startswith("cont") filter
# that selects the columns to standardize.
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
df_test = df_test[useful_features]

final_predictions = []
scores = []
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    # Categories are learned on the training rows; unseen ones encode to all zeros.
    ohe = preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore")
    xtrain_ohe = ohe.fit_transform(xtrain[object_cols])
    xvalid_ohe = ohe.transform(xvalid[object_cols])
    xtest_ohe = ohe.transform(xtest[object_cols])

    # All three matrices come from the same fitted encoder, so they share one
    # set of column names.
    ohe_names = [f"cont_ohe_{i}" for i in range(xtrain_ohe.shape[1])]
    xtrain_ohe = pd.DataFrame(xtrain_ohe, columns=ohe_names)
    xvalid_ohe = pd.DataFrame(xvalid_ohe, columns=ohe_names)
    xtest_ohe = pd.DataFrame(xtest_ohe, columns=ohe_names)

    xtrain = pd.concat([xtrain, xtrain_ohe], axis=1)
    xvalid = pd.concat([xvalid, xvalid_ohe], axis=1)
    xtest = pd.concat([xtest, xtest_ohe], axis=1)

    # this part is missing in the video:
    xtrain = xtrain.drop(object_cols, axis=1)
    xvalid = xvalid.drop(object_cols, axis=1)
    xtest = xtest.drop(object_cols, axis=1)
    # missing part ends

    # Select the columns to scale from the *assembled* frame. Building this
    # list from `useful_features` misses the cont_ohe_* columns, so only the
    # original continuous columns would be standardized.
    numerical_cols = [col for col in xtrain.columns if col.startswith("cont")]
    scaler = preprocessing.StandardScaler()
    xtrain[numerical_cols] = scaler.fit_transform(xtrain[numerical_cols])
    xvalid[numerical_cols] = scaler.transform(xvalid[numerical_cols])
    xtest[numerical_cols] = scaler.transform(xtest[numerical_cols])

    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor="gpu_predictor")
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)

# Mean and spread of the RMSE across the five folds.
print(np.mean(scores), np.std(scores))

* one-hot encoding of categorical variables + standardization of OHE & numerical features
* 0 0.7244324833714519
* 1 0.7245329062537117
* 2 0.7262739942272469
* 3 0.7264029729278617
* 4 0.7260672928259383
* 0.7255419299212421 0.0008720458471300098

In [None]:
# # combine categorical columns
# # cat1_cat2
# # df[cat1] + "_" + df[cat2]
# df = pd.read_csv("../input/30days-folds/train_folds.csv")
# df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
# sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
# object_cols = [col for col in useful_features if 'cat' in col]
# df_test = df_test[useful_features]

# for i in range(0, len(object_cols)-2, 1):
#     for l in range(i+1, len(object_cols)-1, 1):
#         col = df[object_cols[i]] + "_" + df[object_cols[l]]
#         df.loc[:, f"cat{i}_cat{l}"] = df.index.map(col)



In [None]:
# combine categorical columns
# cat1_cat2
# df[cat1] + "_" + df[cat2]
# Builds one new categorical feature per unordered pair of categorical
# columns by joining the two values with "_", then one-hot encodes all
# categoricals and runs the XGBoost CV loop.
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
# Explicit copy: new columns are added to the test frame below.
df_test = df_test[useful_features].copy()

# range() excludes its stop value, so the bounds must be len(object_cols).
# With len-2 / len-1 as stops, every pair involving the last categorical
# column is skipped. Recorded scores from earlier runs used the reduced set.
n_cats = len(object_cols)
for i in range(n_cats):
    for j in range(i + 1, n_cats):
        pair_name = f"cat{i}_cat{j}"
        df[pair_name] = df[object_cols[i]] + "_" + df[object_cols[j]]
        df_test[pair_name] = df_test[object_cols[i]] + "_" + df_test[object_cols[j]]

# Recompute the feature lists so that they include the combined columns
# (their names contain "cat", so the filter below picks them up).
useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
df_test = df_test[useful_features]

final_predictions = []
scores = []
for fold in range(5):
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    # Categories are learned on the training rows; unseen ones encode to all zeros.
    ohe = preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore")
    xtrain_ohe = ohe.fit_transform(xtrain[object_cols])
    xvalid_ohe = ohe.transform(xvalid[object_cols])
    xtest_ohe = ohe.transform(xtest[object_cols])

    # All three matrices come from the same fitted encoder, so they share one
    # set of column names.
    ohe_names = [f"ohe_{k}" for k in range(xtrain_ohe.shape[1])]
    xtrain_ohe = pd.DataFrame(xtrain_ohe, columns=ohe_names)
    xvalid_ohe = pd.DataFrame(xvalid_ohe, columns=ohe_names)
    xtest_ohe = pd.DataFrame(xtest_ohe, columns=ohe_names)

    # Append the indicator columns, then drop the raw categoricals they replace.
    xtrain = pd.concat([xtrain, xtrain_ohe], axis=1).drop(object_cols, axis=1)
    xvalid = pd.concat([xvalid, xvalid_ohe], axis=1).drop(object_cols, axis=1)
    xtest = pd.concat([xtest, xtest_ohe], axis=1).drop(object_cols, axis=1)

    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor="gpu_predictor")
    model.fit(xtrain, ytrain)
    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)

# Mean and spread of the RMSE across the five folds.
print(np.mean(scores), np.std(scores))

* 0 0.724260954293362
* 1 0.7244463921778946
* 2 0.7264497617913407
* 3 0.7266519905763227
* 4 0.725542787749717
* 0.7254703773177275 0.0009871065524215002

* combine categorical columns
0.7254703773177275 0.0009871065524215002
* one-hot encoding of categorical variables + standardization of OHE & numerical features
0.7255419299212421 0.0008720458471300098
* one hot encoding
0.7254997257846996 0.0008811227736574191