In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss

import optuna
from optuna import Trial, visualization

import warnings
warnings.filterwarnings("ignore")

In [2]:
train = pd.read_csv('/kaggle/input/primal2-0/final_train.csv')
test = pd.read_csv('/kaggle/input/primal2-0/final_test.csv')
CandidateID = test['CandidateID']

In [3]:


import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import re

# Load train and test data
train_data = pd.read_csv('/kaggle/input/primal2-0/final_train.csv')
test_data = pd.read_csv('/kaggle/input/primal2-0/final_test.csv')

# Concatenate train and test data for preprocessing
combined_data = pd.concat([train_data, test_data], ignore_index=True)

value_mapping = {
    'a': 6,  # Example: 10 lakhs
}
# Replace values in the 'ticket size' column using the mapping
combined_data['Previous_Organizations'] = test_data['Previous_Organizations'].replace(value_mapping)

# Drop specified columns
combined_data.drop(columns=['Designation', 'DOJ', 'R_District', 'B_District', 'Company_1', 'Company_2', 'Skill'], inplace=True)

# Replace Graduation and Qualification values
combined_data['Graduation'] = combined_data['Graduation'].fillna(0).map({'full time': 1, 'part time': 0, 0: 0})
combined_data['Qualification'] = combined_data['Qualification'].map({'graduate': 2, 'under graduate': 1, 'post graduate': 3, 'others': 0, 'diploma holders': 4}).fillna(-1)

# Set 'Previous_Organizations' as type int, treating NaN as 0
combined_data['Previous_Organizations'] = combined_data['Previous_Organizations'].fillna(0).astype(int)

# One-hot encoding for categorical variables
columns_to_encode = ['Industry', 'Source', 'Department', 'R_Region', 'B_Region']
combined_data_encoded = pd.get_dummies(combined_data, columns=columns_to_encode)

# Create indicator variables for states
states = list(combined_data['R_State']) + list(combined_data['B_State'])
for state in states:
    combined_data_encoded[state] = ((combined_data['R_State'] == state) | (combined_data['B_State'] == state)).astype(int)

# Drop 'R_State' and 'B_State'
combined_data_encoded.drop(columns=['R_State', 'B_State', 'Unnamed: 0'], inplace=True)
combined_data_encoded.columns = [re.sub(r'\W+', '_', col) for col in combined_data_encoded.columns]

# Separate back into train and test data
train_data_processed = combined_data_encoded.iloc[:len(train_data)]
test_data_processed = combined_data_encoded.iloc[len(train_data):].drop(columns='Performance')  # Assuming 'Target' is the target column

In [4]:
train = train_data_processed
test = test_data_processed

In [5]:
feature_cols = [col for col in train.columns.tolist() if col not in ['CandidateID', 'Performance']]
target_col = ['Performance']

In [6]:
class_dict = {
    0: 0,
    1: 1,
}

In [7]:
train['Performance'] = train['Performance'].map(class_dict)

skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

for i, (trn, val) in enumerate(skf.split(train, train[target_col])):
    train.loc[val, 'kfold'] = i
train['kfold'] = train['kfold'].astype(int)

In [8]:
from sklearn.impute import SimpleImputer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import log_loss
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

def fit_adaboost(trial, xtr, ytr, xval, yval, categorical_features=None):
    """
    Fits an AdaBoost model with hyperparameter tuning using Optuna trial.

    Args:
        trial: Optuna trial object for hyperparameter suggestions.
        xtr: Training data (features).
        ytr: Training target labels.
        xval: Validation data (features).
        yval: Validation target labels.
        categorical_features: List of indices for categorical features (optional).

    Returns:
        A tuple containing the fitted AdaBoost model and a dictionary with training and validation metrics.
    """

    # Impute NaN values with the mean of the respective column
    imputer = SimpleImputer(strategy='mean')
    xtr_imputed = imputer.fit_transform(xtr)
    xval_imputed = imputer.transform(xval)

    param_space = {
    'n_estimators': trial.suggest_int('n_estimators', 50, 1000),  # Number of weak learners
    'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1.0),  # Controls step size
    'algorithm': trial.suggest_categorical('algorithm', ['SAMME.R', 'SAMME'])  # Algorithm type
    }

    model = AdaBoostClassifier(**param_space, random_state=0)
    model.fit(xtr_imputed, ytr)

    y_val_pred = model.predict_proba(xval_imputed)

    log = {
        "train logloss": log_loss(ytr, model.predict_proba(xtr_imputed)),
        "valid logloss": log_loss(yval, y_val_pred)
    }

    return model, log

In [9]:


def objective(trial):
    loss = 0
    for fold in range(5):
        trn_idx = train['kfold'] != fold
        val_idx = train['kfold'] == fold
        trn = train.loc[trn_idx, :]
        val = train.loc[val_idx, :]

        xtr, ytr = trn[feature_cols].values, trn[target_col].values
        xval, yval = val[feature_cols].values, val[target_col].values
        
        model, log = fit_adaboost(trial, xtr, ytr, xval, yval)
        loss += log['valid logloss']/5
        
    return loss

In [10]:
study = optuna.create_study(direction="minimize", study_name='Aboost optimization')
study.optimize(objective, n_trials=200)

[I 2024-04-11 12:49:07,252] A new study created in memory with name: Aboost optimization
[I 2024-04-11 12:49:23,591] Trial 0 finished with value: 0.6885835353259029 and parameters: {'n_estimators': 889, 'learning_rate': 0.08103361462935256, 'algorithm': 'SAMME.R'}. Best is trial 0 with value: 0.6885835353259029.
[I 2024-04-11 12:49:37,249] Trial 1 finished with value: 0.6895523491905164 and parameters: {'n_estimators': 765, 'learning_rate': 0.14952039702919387, 'algorithm': 'SAMME.R'}. Best is trial 0 with value: 0.6885835353259029.
[I 2024-04-11 12:49:41,544] Trial 2 finished with value: 0.6807892381581059 and parameters: {'n_estimators': 242, 'learning_rate': 0.001044432915062314, 'algorithm': 'SAMME'}. Best is trial 2 with value: 0.6807892381581059.
[I 2024-04-11 12:49:47,634] Trial 3 finished with value: 0.6596538965963791 and parameters: {'n_estimators': 327, 'learning_rate': 0.0023173003368772756, 'algorithm': 'SAMME.R'}. Best is trial 3 with value: 0.6596538965963791.
[I 2024-04

In [11]:
params = study.best_params
model = AdaBoostClassifier(**params, random_state=0)
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(train[feature_cols])
model.fit(X_imputed, train[target_col])
X_test_imputed = imputer.fit_transform(test[feature_cols])
test_predictions = model.predict_proba(X_test_imputed)

In [12]:
predictions_df = pd.DataFrame(test_predictions, columns = ["Class_1", "Class_2"])
predictions_df['CandidateID'] = CandidateID

predictions_df['CandidateID'] = list(CandidateID)
predictions_df
predictions_df.to_csv('Adaboost_D.csv')

In [13]:
print(test_predictions)

[[0.50901654 0.49098346]
 [0.50901654 0.49098346]
 [0.54052342 0.45947658]
 [0.55639551 0.44360449]
 [0.60281192 0.39718808]
 [0.50901654 0.49098346]
 [0.4951612  0.5048388 ]
 [0.5862169  0.4137831 ]
 [0.59334492 0.40665508]
 [0.54146136 0.45853864]
 [0.60281192 0.39718808]
 [0.49610542 0.50389458]
 [0.54669715 0.45330285]
 [0.54052342 0.45947658]
 [0.62896692 0.37103308]
 [0.58590012 0.41409988]
 [0.58379499 0.41620501]
 [0.59365996 0.40634004]
 [0.61511012 0.38488988]
 [0.55425451 0.44574549]
 [0.67222449 0.32777551]
 [0.62921437 0.37078563]
 [0.66598659 0.33401341]
 [0.54669715 0.45330285]
 [0.72496632 0.27503368]
 [0.58379499 0.41620501]
 [0.57026834 0.42973166]
 [0.59334492 0.40665508]
 [0.52869181 0.47130819]
 [0.54146136 0.45853864]
 [0.54909023 0.45090977]
 [0.60925615 0.39074385]
 [0.60069261 0.39930739]
 [0.49610542 0.50389458]
 [0.53635121 0.46364879]
 [0.52620533 0.47379467]
 [0.60069261 0.39930739]
 [0.58831842 0.41168158]
 [0.55425451 0.44574549]
 [0.50901654 0.49098346]
