In [1]:


import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import re
import optuna
from optuna import Trial, visualization
from sklearn.model_selection import StratifiedKFold, KFold,cross_val_predict,cross_val_score,cross_validate
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.metrics import f1_score


# Load train and test data
train_data = pd.read_csv('/kaggle/input/primal2-0/final_train.csv')
test_data = pd.read_csv('/kaggle/input/primal2-0/final_test.csv')

# Concatenate train and test data for preprocessing
combined_data = pd.concat([train_data, test_data], ignore_index=True)

value_mapping = {
    'a': 6,  # Example: 10 lakhs
}
# Replace values in the 'ticket size' column using the mapping
combined_data['Previous_Organizations'] = test_data['Previous_Organizations'].replace(value_mapping)

# Drop specified columns
combined_data.drop(columns=['Designation', 'DOJ', 'R_District', 'B_District', 'Company_1', 'Company_2', 'Skill'], inplace=True)

# Replace Graduation and Qualification values
combined_data['Graduation'] = combined_data['Graduation'].fillna(0).map({'full time': 1, 'part time': 0, 0: 0})
combined_data['Qualification'] = combined_data['Qualification'].map({'graduate': 2, 'under graduate': 1, 'post graduate': 3, 'others': 0, 'diploma holders': 4}).fillna(-1)

# Set 'Previous_Organizations' as type int, treating NaN as 0
combined_data['Previous_Organizations'] = combined_data['Previous_Organizations'].fillna(0).astype(int)

# One-hot encoding for categorical variables
columns_to_encode = ['Industry', 'Source', 'Department', 'R_Region', 'B_Region']
combined_data_encoded = pd.get_dummies(combined_data, columns=columns_to_encode)

# Create indicator variables for states
states = list(combined_data['R_State']) + list(combined_data['B_State'])
for state in states:
    combined_data_encoded[state] = ((combined_data['R_State'] == state) | (combined_data['B_State'] == state)).astype(int)

# Drop 'R_State' and 'B_State'
combined_data_encoded.drop(columns=['R_State', 'B_State', 'Unnamed: 0'], inplace=True)
combined_data_encoded.columns = [re.sub(r'\W+', '_', col) for col in combined_data_encoded.columns]

# Separate back into train and test data
train_data_processed = combined_data_encoded.iloc[:len(train_data)]
test_data_processed = combined_data_encoded.iloc[len(train_data):].drop(columns='Performance')  # Assuming 'Target' is the target column

In [2]:


df_all = train_data_processed
df_test = test_data_processed
#df_submission = pd.read_csv("/kaggle/input/sample/sample.csv")

In [3]:
X = df_all.drop(labels=['CandidateID','Performance'],axis=1,inplace=False).copy()
y = df_all['Performance'].values
X_test = df_test.drop(labels=['CandidateID'],axis=1,inplace=False).copy()
random_seed = 0

In [4]:


encoder = OneHotEncoder()
all_encoded = encoder.fit_transform(pd.concat([X,X_test]))
#X = all_encoded[0:len(X)]
#X_test = all_encoded[len(X):]
X = all_encoded.tocsr()[0:len(X)]
X_test = all_encoded [len(df_all):]



In [5]:
df_all['Performance'].value_counts()

Performance
0.0    450
1.0    295
Name: count, dtype: int64

In [6]:


X = df_all.drop(labels=['CandidateID','Performance'],axis=1,inplace=False).copy()
y = df_all['Performance'].values
X_test = df_test.drop(labels=['CandidateID'],axis=1,inplace=False).copy()
random_seed = 0



In [7]:
encoder = OneHotEncoder()
all_encoded = encoder.fit_transform(pd.concat([X,X_test]))
#X = all_encoded[0:len(X)]
#X_test = all_encoded[len(X):]
X = all_encoded.tocsr()[0:len(X)]
X_test = all_encoded [len(df_all):]


In [8]:
from sklearn.impute import SimpleImputer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import log_loss
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

def objective(trial, data=X, target=y):
    param_space = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),  # Number of weak learners
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1.0),  # Controls step size
        'algorithm': trial.suggest_categorical('algorithm', ['SAMME.R', 'SAMME'])  # Algorithm type
        }

    k = 5
    seed_list = [0]
    kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=random_seed)
    oof = np.zeros((len(df_all), 2))
    score_list = []
    fold = 1

    splits = list(kf.split(X, y))
    for train_idx, val_idx in splits:
        X_train, X_val = X[train_idx, :], X[val_idx, :]
        y_train, y_val = y[train_idx], y[val_idx]

        val_preds_list = []

        for seed in seed_list:
            # fit and run model
            imputer = SimpleImputer(strategy='mean')
            Xtrain_imputed = imputer.fit_transform(X_train)
            Xval_imputed = imputer.transform(X_val)
            model = AdaBoostClassifier(**param_space, random_state=0)
            model.fit(Xtrain_imputed,y_train)

            val_preds_list.append(model.predict_proba(Xval_imputed))

        oof[val_idx] = np.mean(val_preds_list, axis=0)
        score = f1_score(y_val, np.argmax(oof[val_idx], axis=1))
        score_list.append(score)
        fold += 1

    cv_f1 = np.mean(score_list)

    return cv_f1

In [9]:
study = optuna.create_study(direction='maximize')
study.optimize(objective,n_trials= 100)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2024-04-11 13:42:43,541] A new study created in memory with name: no-name-ad080fa1-e9df-4f23-9394-c1ece2e544c9
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1.0),  # Controls step size
[I 2024-04-11 13:43:17,821] Trial 0 finished with value: 0.03155737704918033 and parameters: {'n_estimators': 626, 'learning_rate': 0.006941259583685907, 'algorithm': 'SAMME'}. Best is trial 0 with value: 0.03155737704918033.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1.0),  # Controls step size
[I 2024-04-11 13:44:01,098] Trial 1 finished with value: 0.088 and parameters: {'n_estimators': 793, 'learning_rate': 0.0018428294458776035, 'algorithm': 'SAMME'}. Best is trial 1 with value: 0.088.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1.0),  # Controls step size
[I 2024-04-11 13:44:24,441] Trial 2 finished with value: 0.13411104715858943 and parameters: {'n_estimators': 426, 'learning_rate': 0.0958192768586831, 'algorithm': 'SAMME'}. B

Number of finished trials: 100
Best trial: {'n_estimators': 948, 'learning_rate': 0.628581538464075, 'algorithm': 'SAMME.R'}


In [10]:
param_space = study.best_trial.params
imputer = SimpleImputer(strategy='mean')
Xtrain_imputed = imputer.fit_transform(X)
Xtest_imputed = imputer.transform(X_test)
model = AdaBoostClassifier(**param_space, random_state=0)
model.fit(Xtrain_imputed,y)

In [11]:
df_test[['class1','class2']] = model.predict_proba(Xtest_imputed)
submission = df_test[['CandidateID','class1','class2']]
submission.to_csv('catboost_nd.csv')

In [12]:
submission

Unnamed: 0,CandidateID,class1,class2
745,emp0521,0.494564,0.505436
746,emp0613,0.499192,0.500808
747,emp0136,0.492114,0.507886
748,emp0351,0.507313,0.492687
749,emp0049,0.505372,0.494628
...,...,...,...
927,emp0401,0.499666,0.500334
928,emp0408,0.511882,0.488118
929,emp0248,0.502990,0.497010
930,emp0148,0.496242,0.503758
