In [None]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import optuna

In [None]:
# Load the pre-folded training data, the test set and the submission template.
df = pd.read_csv("../input/30days-kfolds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if col.startswith("cat")]
# .copy() gives df_test its own data, so the column assignments below do not
# trigger pandas' SettingWithCopyWarning on a slice.
df_test = df_test[useful_features].copy()

# Fold ids are taken from the data rather than assuming exactly five folds.
fold_ids = sorted(df["kfold"].unique())

# Out-of-fold target encoding: each validation fold is encoded with the
# per-category target mean computed on the remaining folds only.
for col in object_cols:
    temp_df = []
    test_fold_encodings = []
    for fold in fold_ids:
        # xtrain / xvalid intentionally stay notebook-level names: the next
        # cell displays the xvalid left over from the last iteration.
        xtrain = df[df.kfold != fold].reset_index(drop=True)
        xvalid = df[df.kfold == fold].reset_index(drop=True)
        feat = xtrain.groupby(col)["target"].mean().to_dict()
        xvalid.loc[:, f"tar_enc_{col}"] = xvalid[col].map(feat)
        temp_df.append(xvalid)
        test_fold_encodings.append(df_test[col].map(feat))

    # The test encoding is the average of the per-fold encodings.
    # DataFrame.mean(axis=1) skips NaN, so a category missing from one
    # training fold is averaged over the folds that did contain it instead of
    # turning the whole value into NaN.
    df_test.loc[:, f"tar_enc_{col}"] = pd.concat(test_fold_encodings, axis=1).mean(axis=1)
    # ignore_index=True gives the reassembled frame a unique row index.
    df = pd.concat(temp_df, ignore_index=True)

# Refresh the feature lists so they include the new tar_enc_* columns.
useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if col.startswith("cat")]
df_test = df_test[useful_features].copy()


In [None]:
# Shape of the validation frame left over from the last iteration of the
# target-encoding loop in the previous cell.
xvalid.shape

In [None]:
def run(trial):
    """Optuna objective: fit XGBoost on every fold except fold 0 and score fold 0.

    Reads the notebook-level ``df``, ``useful_features`` and ``object_cols``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Root-mean-squared error of the predictions on fold 0 (lower is better).
    """
    fold = 0
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
    # and matches how learning_rate is sampled above.
    reg_lambda = trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True)
    reg_alpha = trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 7)

    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)

    ytrain = xtrain.target
    yvalid = xvalid.target

    # Copy the column selections so the encoded values below are written to
    # independent frames rather than to slices (avoids SettingWithCopyWarning).
    xtrain = xtrain[useful_features].copy()
    xvalid = xvalid[useful_features].copy()

    # Raw categorical columns are ordinal-encoded; the encoder is fitted on
    # the training part only and reused for the validation part.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])

    # NOTE(review): this uses gpu_id=1 while the final-model cell uses
    # gpu_id=0 - confirm that a second GPU is actually available.
    # NOTE(review): newer XGBoost releases expect early_stopping_rounds on the
    # constructor rather than fit(); verify against the installed version.
    model = XGBRegressor(
        random_state=42,
        tree_method="gpu_hist",
        gpu_id=1,
        predictor="gpu_predictor",
        n_estimators=7000,
        learning_rate=learning_rate,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        max_depth=max_depth,
    )
    model.fit(xtrain, ytrain, early_stopping_rounds=300, eval_set=[(xvalid, yvalid)], verbose=1000)
    preds_valid = model.predict(xvalid)
    # sqrt(MSE) instead of mean_squared_error(..., squared=False): the
    # `squared` keyword is not accepted by recent scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    return rmse

In [None]:
# TPESampler is Optuna's default sampler; seeding it makes the sequence of
# sampled hyperparameters repeatable across kernel restarts.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(run, n_trials=5)

In [None]:
# Hyperparameters of the trial with the lowest objective value (fold-0 RMSE).
study.best_params

In [None]:
from sklearn.model_selection import train_test_split

# Random 80/20 hold-out split of the encoded training frame.
# NOTE(review): the encoding loop in the following cell reassigns xtrain and
# xvalid, so the feature frames produced here are not the ones used later.
y = df["target"]
X = df.drop(columns="target")
xtrain, xvalid, ytrain, yvalid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Start again from the files on disk and rebuild the target-encoded frames.
df = pd.read_csv("../input/30days-kfolds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if col.startswith("cat")]
# Work on an independent copy: assigning new columns to a slice raises
# pandas' SettingWithCopyWarning.
df_test = df_test[useful_features].copy()

# Use the fold labels present in the data instead of a hard-coded count of 5.
fold_ids = sorted(df["kfold"].unique())

for col in object_cols:
    temp_df = []
    per_fold_test_enc = []
    for fold in fold_ids:
        # xtrain / xvalid are kept as notebook-level names on purpose: the
        # cells below read the values left over from the final iteration.
        xtrain = df[df.kfold != fold].reset_index(drop=True)
        xvalid = df[df.kfold == fold].reset_index(drop=True)
        # Per-category target mean from the training folds only, so the
        # validation fold never sees its own targets.
        feat = xtrain.groupby(col)["target"].mean().to_dict()
        xvalid.loc[:, f"tar_enc_{col}"] = xvalid[col].map(feat)
        temp_df.append(xvalid)
        per_fold_test_enc.append(df_test[col].map(feat))

    # Average the per-fold test encodings. mean(axis=1) ignores NaN, so a
    # category absent from one training fold still gets the average of the
    # folds that contained it rather than becoming NaN.
    df_test.loc[:, f"tar_enc_{col}"] = pd.concat(per_fold_test_enc, axis=1).mean(axis=1)
    # Reassemble the full training frame with a fresh, unique row index.
    df = pd.concat(temp_df, ignore_index=True)

# Recompute the feature lists now that the tar_enc_* columns exist.
useful_features = [c for c in df.columns if c not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if col.startswith("cat")]
df_test = df_test[useful_features].copy()


In [None]:
# Shapes of the feature/target frames and the test frame, one per line.
for frame in (xtrain, ytrain, xvalid, yvalid, df_test):
    print(frame.shape)

In [None]:
# Remove the raw categorical columns from the test frame. tar_enc_cat9 is
# removed as well because the training features assembled in the next cell do
# not include it.
test_drop = [*object_cols, "tar_enc_cat9"]
# Rebinding (instead of inplace=True) plus errors="ignore" keeps this cell
# idempotent: re-running it no longer raises KeyError for already-dropped columns.
df_test = df_test.drop(columns=test_drop, errors="ignore")

In [None]:
# Build the train / validation matrices for the final model directly from `df`.
#
# Previously xtrain / xvalid were the frames left over from the target-encoding
# loop (a split on fold 4) while ytrain / yvalid came from an unrelated random
# train_test_split. The lengths happened to match, so the model was silently
# fitted on targets that did not belong to its feature rows. Taking features
# and targets from the same rows fixes that.
HOLDOUT_FOLD = 4  # same hold-out fold the leftover xvalid used to represent

# Identifier / target columns and raw categoricals are not model inputs;
# tar_enc_cat9 is excluded because the test frame no longer carries it.
excluded_cols = {"id", "kfold", "target", "tar_enc_cat9", *object_cols}
model_features = [c for c in df.columns if c not in excluded_cols]

train_rows = df[df.kfold != HOLDOUT_FOLD].reset_index(drop=True)
valid_rows = df[df.kfold == HOLDOUT_FOLD].reset_index(drop=True)

xtrain, ytrain = train_rows[model_features], train_rows["target"]
xvalid, yvalid = valid_rows[model_features], valid_rows["target"]

# The model is later asked to predict on df_test, so its columns must line up
# with the training features.
assert list(df_test.columns) == model_features, "train/test feature columns differ"

In [None]:
# Shapes after the column drops: train X/y, validation X/y, then the test frame.
print(
    *(part.shape for part in (xtrain, ytrain, xvalid, yvalid, df_test)),
    sep="\n",
)

In [None]:
# xvalid.columns

In [None]:
# xtrain.drop(object_cols , axis = 1 , inplace = True)
# xvalid.drop(object_cols , axis = 1 , inplace = True)

In [None]:
# xtest = df_test.copy()
# xtrain.drop(object_cols , axis = 1 , inplace = True)
# xvalid.drop(object_cols , axis = 1 , inplace = True)

# xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])
# Final model with fixed hyperparameters.
# NOTE(review): these values are hard-coded rather than read from
# study.best_params - presumably copied from an earlier tuning run; confirm.
model = XGBRegressor(
    random_state=42,
    tree_method="gpu_hist",
    gpu_id=0,
    predictor="gpu_predictor",
    n_estimators=7000,
    learning_rate=0.01411241489945379,
    reg_lambda=1.0583354165999715e-05,
    reg_alpha=36.30786517392835,
    subsample=0.2711337363341389,
    colsample_bytree=0.21261818541763844,
    max_depth=4,
)
model.fit(xtrain, ytrain)

# Hold-out RMSE. sqrt(MSE) is used instead of squared=False because recent
# scikit-learn releases no longer accept the `squared` keyword.
pred_val = model.predict(xvalid)
rmse = np.sqrt(mean_squared_error(yvalid, pred_val))
print(rmse)

# Predictions for the competition test set.
test_preds = model.predict(df_test)

In [None]:
# Number of predictions produced for the test set.
test_preds.shape

In [None]:
# Shape of the submission template the predictions are written into.
sample_submission.shape

In [None]:
# Item assignment always writes the column; attribute assignment
# (sample_submission.target = ...) would only set a Python attribute if the
# template had no "target" column.
sample_submission["target"] = test_preds
sample_submission.to_csv("submission.csv", index=False)

In [None]:
# Read the written file back to check what was saved.
pd.read_csv("submission.csv")