# *Feature Engineering + Hyperparameter tuning*

**In this notebook, I share techniques such as feature engineering and hyperparameter tuning with Optuna.**

## Step 1: Importing libraries

In [None]:
# Import the necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns 
pd.set_option('display.max_columns',500)
pd.set_option('display.max_rows',500)
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

## Step 2: Importing the datasets

**I use the datasets from 30 Days of ML (`30-days-of-ml`) and from `30days-folds` by Abhishek Thakur.**

In [None]:
import optuna
# Training data; later cells read its `id`, `target` and `kfold` columns.
df = pd.read_csv("../input/30days-folds/train_folds.csv")
# Test features, later restricted to the same feature columns as the training frame.
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
# Sample submission file (loaded here; not used by the cells in this notebook).
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

## Step 3: Feature engineering

I tried various techniques, such as ordinal encoding, one-hot encoding, and normalisation.

The best result came from applying `OneHotEncoder` to the categorical variables and then `StandardScaler` to the whole dataset after one-hot encoding.

In [None]:
# Feature columns: everything except the row id, the label and the fold marker.
imp_col = [c for c in df.columns if c not in ("id", "target", "kfold")]
categorical_col = [col for col in imp_col if 'cat' in col]
numerical_col = [col for col in imp_col if col.startswith("cont")]
df_test = df_test[imp_col]

# Fold held out for validation during tuning. The previous version of this cell
# looped over range(5) and overwrote the same variables on every pass, so only
# the last fold (4) was ever kept; the same fold is now prepared directly
# instead of encoding and scaling four other folds just to discard them.
TUNING_FOLD = 4


def prepare_fold(train_df, test_df, fold, feature_cols, cat_cols):
    """Split one fold and return one-hot encoded, standardised feature matrices.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame holding ``feature_cols`` plus the ``target`` and ``kfold`` columns.
    test_df : pandas.DataFrame
        Frame holding ``feature_cols``.
    fold : int
        Value of ``kfold`` to hold out as the validation set.
    feature_cols : list of str
        Columns used as model inputs.
    cat_cols : list of str
        Subset of ``feature_cols`` to one-hot encode.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_valid, y_valid, X_test)``; the ``X_*`` values are
        the arrays returned by ``StandardScaler``, the ``y_*`` values are the
        ``target`` Series of the corresponding split.

    Raises
    ------
    ValueError
        If no row of ``train_df`` has ``kfold == fold``.
    """
    # drop=True: the old index is not needed, only a clean 0..n-1 index so the
    # encoded columns line up row-for-row in the concat below.
    train_part = train_df[train_df.kfold != fold].reset_index(drop=True)
    valid_part = train_df[train_df.kfold == fold].reset_index(drop=True)
    if valid_part.empty:
        raise ValueError(f"no rows with kfold == {fold}")

    y_train = train_part.target
    y_valid = valid_part.target

    # The encoder is fitted on the training split only; categories unseen at
    # fit time are encoded as all zeros (handle_unknown="ignore").
    # NOTE(review): newer scikit-learn releases rename `sparse` to
    # `sparse_output`; adjust if this line raises a TypeError on your version.
    ohe = OneHotEncoder(sparse=False, handle_unknown="ignore")
    ohe.fit(train_part[cat_cols])

    def encode(frame):
        """Replace the categorical columns of ``frame`` by ``ohe_<i>`` columns."""
        encoded = ohe.transform(frame[cat_cols])
        encoded = pd.DataFrame(
            encoded,
            columns=[f"ohe_{i}" for i in range(encoded.shape[1])],
            index=frame.index,
        )
        remaining = frame[feature_cols].drop(cat_cols, axis=1)
        return pd.concat([remaining, encoded], axis=1)

    # The scaler is likewise fitted on the training split only and then applied
    # unchanged to the validation and test matrices.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(encode(train_part))
    X_valid = scaler.transform(encode(valid_part))
    X_test = scaler.transform(encode(test_df))
    return X_train, y_train, X_valid, y_valid, X_test


X_train, y_train, X_valid, y_valid, X_test = prepare_fold(
    df, df_test, TUNING_FOLD, imp_col, categorical_col
)

## Step 4: Hyperparameter tuning

Using Optuna:

Steps:

* Define the objective function to be optimized (here, `run`).
* Suggest hyperparameter values using the trial object.
* Create a study object and invoke its `optimize` method over 100 trials.

In [None]:
def run(trial):
    """Optuna objective: fit an XGBoost regressor and return its validation RMSE.

    The function reads the module-level ``X_train``, ``y_train``, ``X_valid``
    and ``y_valid`` prepared by the feature-engineering cell.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters below.

    Returns
    -------
    float
        Root mean squared error of the fitted model on ``(X_valid, y_valid)``.
    """
    # All log-scaled parameters use suggest_float(..., log=True); this replaces
    # the deprecated suggest_loguniform and matches how learning_rate is sampled.
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
    reg_lambda = trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True)
    reg_alpha = trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 7)

    xgb_model = XGBRegressor(random_state=42,
                tree_method="gpu_hist",
                gpu_id=1,
                predictor="gpu_predictor",
                n_estimators=7000,
                learning_rate=learning_rate,
                reg_lambda=reg_lambda,
                reg_alpha=reg_alpha,
                subsample=subsample,
                colsample_bytree=colsample_bytree,
                max_depth=max_depth,)
    # Early stopping monitors the same validation set that is scored below.
    xgb_model.fit(X_train, y_train, early_stopping_rounds=100, eval_set=[(X_valid, y_valid)], verbose=1000)
    preds_valid = xgb_model.predict(X_valid)
    # RMSE as sqrt(MSE): avoids the `squared=False` keyword, which newer
    # scikit-learn releases no longer accept.
    rmse = float(np.sqrt(mean_squared_error(y_valid, preds_valid)))
    return rmse

In [None]:
# Number of hyperparameter configurations to evaluate.
N_TRIALS = 100
# Seed for the sampler so that a fresh kernel proposes the same sequence of
# hyperparameters (given the same objective values). TPESampler is the sampler
# create_study uses by default; only the seed is new here.
SEARCH_SEED = 42

study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=SEARCH_SEED),
)
study.optimize(run, n_trials=N_TRIALS)

**Check the best parameters.**

In [None]:
# Display the hyperparameters of the study's best trial (the study minimises the value returned by `run`).
study.best_params

In [None]:
# Report how many trials the study holds, then the best trial's score and settings.
finished_count = len(study.trials)
print("Number of finished trials: {}".format(finished_count))

print("Best trial:")
trial = study.best_trial

best_score = trial.value
print("Value: {}".format(best_score))

print("Params: ")
for param_name, param_value in trial.params.items():
    print("    {}: {}".format(param_name, param_value))

**In the next notebook, I will share the optimized model.**

**If you found my work helpful, please upvote!**
Thank You!