<br>
<h1 style = "font-size:60px; font-family:Garamond ; font-weight : normal; background-color: #f6f5f5 ; color : #fe346e; text-align: center; border-radius: 100px 100px;">XGBoost + Optuna 🔥</h1>
<br>

![](https://storage.googleapis.com/kaggle-competitions/kaggle/25225/logos/header.png?t=2021-01-27-17-34-26)

<h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spcaing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Import Required Libraries 📚</h1>

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss

import optuna
from optuna import Trial, visualization

import warnings
warnings.filterwarnings("ignore")

<h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spcaing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Read the Data 📖</h1>

In [2]:
train = pd.read_csv('/kaggle/input/primal2-0/final_train.csv')
test = pd.read_csv('/kaggle/input/primal2-0/final_test.csv')
sample = pd.read_csv('/kaggle/input/sample/sample.csv')

In [3]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import re

# Load train and test data
train_data = pd.read_csv('/kaggle/input/primal2-0/final_train.csv')
test_data = pd.read_csv('/kaggle/input/primal2-0/final_test.csv')

# Concatenate train and test data for preprocessing
combined_data = pd.concat([train_data, test_data], ignore_index=True)

value_mapping = {
    'a': 6,  # Example: 10 lakhs
}
# Replace values in the 'ticket size' column using the mapping
combined_data['Previous_Organizations'] = test_data['Previous_Organizations'].replace(value_mapping)

# Drop specified columns
combined_data.drop(columns=['Designation', 'DOJ', 'R_District', 'B_District', 'Company_1', 'Company_2', 'Skill'], inplace=True)

# Replace Graduation and Qualification values
combined_data['Graduation'] = combined_data['Graduation'].fillna(0).map({'full time': 1, 'part time': 0, 0: 0})
combined_data['Qualification'] = combined_data['Qualification'].map({'graduate': 2, 'under graduate': 1, 'post graduate': 3, 'others': 0, 'diploma holders': 4}).fillna(-1)

# Set 'Previous_Organizations' as type int, treating NaN as 0
combined_data['Previous_Organizations'] = combined_data['Previous_Organizations'].fillna(0).astype(int)

# One-hot encoding for categorical variables
columns_to_encode = ['Industry', 'Source', 'Department', 'R_Region', 'B_Region']
combined_data_encoded = pd.get_dummies(combined_data, columns=columns_to_encode)

# Create indicator variables for states
states = list(combined_data['R_State']) + list(combined_data['B_State'])
for state in states:
    combined_data_encoded[state] = ((combined_data['R_State'] == state) | (combined_data['B_State'] == state)).astype(int)

# Drop 'R_State' and 'B_State'
combined_data_encoded.drop(columns=['R_State', 'B_State', 'Unnamed: 0'], inplace=True)
combined_data_encoded.columns = [re.sub(r'\W+', '_', col) for col in combined_data_encoded.columns]

# Separate back into train and test data
train_data_processed = combined_data_encoded.iloc[:len(train_data)]
test_data_processed = combined_data_encoded.iloc[len(train_data):].drop(columns='Performance')  # Assuming 'Target' is the target column

In [4]:
train = train_data_processed
test = test_data_processed

In [5]:
train.head()

Unnamed: 0,CandidateID,Graduation,Qualification,Experience,Previous_Organizations,Incentive,Ticket_Size,Family_Members,Earning_Members,Dependent_Members,...,rajasthan,himachal_pradesh,punjab,assam,haryana,chandigarh,west_bengal,jharkhand,jammu_and_kashmir,bihar
0,emp0669,1,2.0,5.0,6,12000,1000000,2,1.0,1,...,0,0,0,0,0,0,0,0,0,0
1,emp0273,1,2.0,1.5,4,8000,150000,4,2.0,4,...,0,0,0,0,0,0,0,0,0,0
2,emp0300,1,1.0,4.5,2,20000,2000000,4,1.0,2,...,0,0,0,0,0,0,0,0,0,0
3,emp0100,1,3.0,4.5,4,12000,2000000,4,2.0,3,...,0,0,0,0,0,0,0,0,0,0
4,emp0319,1,3.0,1.0,2,0,0,4,1.0,3,...,0,0,0,0,0,0,0,0,0,0


In [6]:
feature_cols = [col for col in train.columns.tolist() if col not in ['CandidateID', 'Performance']]
target_col = ['Performance']

In [7]:
class_dict = {
    0: 0,
    1: 1,
}

In [8]:
train['Performance'] = train['Performance'].map(class_dict)

<h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spcaing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Create Folds</h1>

In [9]:
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

for i, (trn, val) in enumerate(skf.split(train, train[target_col])):
    train.loc[val, 'kfold'] = i
train['kfold'] = train['kfold'].astype(int)

In [10]:

def fit_xgb(trial, xtr, ytr, xval, yval):
    params = {
        "n_estimators": trial.suggest_int("n_estimators",100,2000,100),
        "subsample": trial.suggest_discrete_uniform("subsample",0.2,1,0.1),
        "colsample_bytree": trial.suggest_discrete_uniform("colsample_bytree",0.2,1,0.1),
        "eta": trial.suggest_loguniform("eta",1e-3,0.1),
        "reg_alpha": trial.suggest_int("reg_alpha",1,100),
        "reg_lambda": trial.suggest_int("reg_lambda",2,200),
        "max_depth": trial.suggest_int("max_depth",3,20),
        "min_child_weight": trial.suggest_int("min_child_weight",2,20),
    }
    
    model = xgb.XGBClassifier(**params, tree_method='gpu_hist', random_state=42)
    model.fit(xtr, ytr.reshape(-1,), eval_metric='mlogloss')
    
    y_val_pred = model.predict_proba(xval)
    
    log = {
        "train logloss": log_loss(ytr, model.predict_proba(xtr)),
        "valid logloss": log_loss(yval, y_val_pred)
    }
    
    return model, log

<h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spcaing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Objective Function</h1>

In [11]:
def objective(trial):
    loss = 0
    for fold in range(5):
        trn_idx = train['kfold'] != fold
        val_idx = train['kfold'] == fold
        trn = train.loc[trn_idx, :]
        val = train.loc[val_idx, :]

        xtr, ytr = trn[feature_cols].values, trn[target_col].values
        xval, yval = val[feature_cols].values, val[target_col].values
        
        model, log = fit_xgb(trial, xtr, ytr, xval, yval)
        loss += log['valid logloss']/5
        
    return loss

<h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spcaing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Optimize 📈</h1>

In [12]:
study = optuna.create_study(direction="minimize", study_name='Xgboost optimization')
study.optimize(objective, n_trials=100)

[32m[I 2024-04-11 06:38:45,341][0m A new study created in memory with name: Xgboost optimization[0m
[32m[I 2024-04-11 06:39:15,391][0m Trial 0 finished with value: 0.6931471824645996 and parameters: {'n_estimators': 1200, 'subsample': 0.2, 'colsample_bytree': 1.0, 'eta': 0.0021765508017089186, 'reg_alpha': 66, 'reg_lambda': 169, 'max_depth': 20, 'min_child_weight': 6}. Best is trial 0 with value: 0.6931471824645996.[0m
[32m[I 2024-04-11 06:39:20,562][0m Trial 1 finished with value: 0.6931471824645996 and parameters: {'n_estimators': 1800, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.9000000000000001, 'eta': 0.04742868775737352, 'reg_alpha': 56, 'reg_lambda': 32, 'max_depth': 14, 'min_child_weight': 19}. Best is trial 0 with value: 0.6931471824645996.[0m
[32m[I 2024-04-11 06:39:29,464][0m Trial 2 finished with value: 0.6696970882031741 and parameters: {'n_estimators': 1100, 'subsample': 0.6000000000000001, 'colsample_bytree': 1.0, 'eta': 0.05837430959474339, 'reg_a

In [13]:
study.best_params

{'n_estimators': 900,
 'subsample': 0.9000000000000001,
 'colsample_bytree': 1.0,
 'eta': 0.014646079813621789,
 'reg_alpha': 1,
 'reg_lambda': 129,
 'max_depth': 4,
 'min_child_weight': 2}

<h1 style = "font-family: garamond; font-size: 40px; font-style: normal; letter-spcaing: 3px; background-color: #f6f5f5; color :#fe346e; border-radius: 100px 100px; text-align:center">Re-train on the Entire Data</h1>

In [14]:
clf = xgb.XGBClassifier(**study.best_params, tree_method='gpu_hist', random_state=42)
clf.fit(train[feature_cols], train[target_col], eval_metric='mlogloss')

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1.0,
              eta=0.014646079813621789, gamma=0, gpu_id=0,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.0146460794, max_delta_step=0, max_depth=4,
              min_child_weight=2, missing=nan, monotone_constraints='()',
              n_estimators=900, n_jobs=4, num_parallel_tree=1, random_state=42,
              reg_alpha=1, reg_lambda=129, scale_pos_weight=1,
              subsample=0.9000000000000001, tree_method='gpu_hist',
              validate_parameters=1, verbosity=None)

In [15]:
test_predictions = clf.predict_proba(test[feature_cols])

In [16]:
predictions_df = pd.DataFrame(test_predictions, columns = ["Class_1", "Class_2"])
predictions_df['CandidateID'] = sample['CandidateID']

In [17]:
predictions_df.to_csv("xgboost.csv", index = False)

![Upvote!](https://img.shields.io/badge/Upvote-If%20you%20like%20my%20work-07b3c8?style=for-the-badge&logo=kaggle)