In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import re
import optuna
from optuna import Trial, visualization
from sklearn.model_selection import StratifiedKFold, KFold,cross_val_predict,cross_val_score,cross_validate
from sklearn.ensemble import RandomForestClassifier

# Load train and test data
train_data = pd.read_csv('/kaggle/input/primal2-0/final_train.csv')
test_data = pd.read_csv('/kaggle/input/primal2-0/final_test.csv')

# Concatenate train and test data for preprocessing
combined_data = pd.concat([train_data, test_data], ignore_index=True)

value_mapping = {
    'a': 6,  # Example: 10 lakhs
}
# Replace values in the 'ticket size' column using the mapping
combined_data['Previous_Organizations'] = test_data['Previous_Organizations'].replace(value_mapping)

# Drop specified columns
combined_data.drop(columns=['Designation', 'DOJ', 'R_District', 'B_District', 'Company_1', 'Company_2', 'Skill'], inplace=True)

# Replace Graduation and Qualification values
combined_data['Graduation'] = combined_data['Graduation'].fillna(0).map({'full time': 1, 'part time': 0, 0: 0})
combined_data['Qualification'] = combined_data['Qualification'].map({'graduate': 2, 'under graduate': 1, 'post graduate': 3, 'others': 0, 'diploma holders': 4}).fillna(-1)

# Set 'Previous_Organizations' as type int, treating NaN as 0
combined_data['Previous_Organizations'] = combined_data['Previous_Organizations'].fillna(0).astype(int)

# One-hot encoding for categorical variables
columns_to_encode = ['Industry', 'Source', 'Department', 'R_Region', 'B_Region']
combined_data_encoded = pd.get_dummies(combined_data, columns=columns_to_encode)

# Create indicator variables for states
states = list(combined_data['R_State']) + list(combined_data['B_State'])
for state in states:
    combined_data_encoded[state] = ((combined_data['R_State'] == state) | (combined_data['B_State'] == state)).astype(int)

# Drop 'R_State' and 'B_State'
combined_data_encoded.drop(columns=['R_State', 'B_State', 'Unnamed: 0'], inplace=True)
combined_data_encoded.columns = [re.sub(r'\W+', '_', col) for col in combined_data_encoded.columns]

# Separate back into train and test data
train_data_processed = combined_data_encoded.iloc[:len(train_data)]
test_data_processed = combined_data_encoded.iloc[len(train_data):].drop(columns='Performance')  # Assuming 'Target' is the target column

In [2]:
df_all = train_data_processed
df_test = test_data_processed
df_submission = pd.read_csv("/kaggle/input/sample/sample.csv")

In [3]:
X = df_all.drop(labels=['CandidateID','Performance'],axis=1,inplace=False).copy()
y = df_all['Performance'].values
X_test = df_test.drop(labels=['CandidateID'],axis=1,inplace=False).copy()
random_seed = 0

In [4]:
encoder = OneHotEncoder()
all_encoded = encoder.fit_transform(pd.concat([X,X_test]))
#X = all_encoded[0:len(X)]
#X_test = all_encoded[len(X):]
X = all_encoded.tocsr()[0:len(X)]
X_test = all_encoded [len(df_all):]

In [5]:
df_all['Performance'].value_counts()

Performance
0.0    450
1.0    295
Name: count, dtype: int64

In [6]:
X = df_all.drop(labels=['CandidateID','Performance'],axis=1,inplace=False).copy()
y = df_all['Performance'].values
X_test = df_test.drop(labels=['CandidateID'],axis=1,inplace=False).copy()
random_seed = 0

In [7]:
encoder = OneHotEncoder()
all_encoded = encoder.fit_transform(pd.concat([X,X_test]))
#X = all_encoded[0:len(X)]
#X_test = all_encoded[len(X):]
X = all_encoded.tocsr()[0:len(X)]
X_test = all_encoded [len(df_all):]

In [8]:
from sklearn.metrics import f1_score

def objective(trial, data=X, target=y):
    param_space = {
        'max_depth': trial.suggest_int('max_depth', 2, 40),
        'n_estimators': trial.suggest_int('n_estimators', 20, 2000, step = 10),
        'max_features': trial.suggest_categorical('max_features', ['log2', 'sqrt']),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'criterion' : trial.suggest_categorical('criterion', ['gini', 'entropy'])
    }
    
    model = RandomForestClassifier(**param_space, random_state=random_seed, n_jobs=-1)
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_seed)
    
    f1_scores = []
    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        f1_scores.append(f1_score(y_test, y_pred))
        
    cv_score = np.mean(f1_scores)
    
    return cv_score

In [9]:
study = optuna.create_study(direction='maximize')
study.optimize(objective,n_trials= 200)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2024-04-11 04:35:48,183] A new study created in memory with name: no-name-b11f2354-ef7e-4f13-8172-d65f8cc30c0b
[I 2024-04-11 04:36:09,455] Trial 0 finished with value: 0.006557377049180328 and parameters: {'max_depth': 10, 'n_estimators': 1440, 'max_features': 'sqrt', 'min_samples_split': 5, 'bootstrap': False, 'min_samples_leaf': 11, 'criterion': 'gini'}. Best is trial 0 with value: 0.006557377049180328.
[I 2024-04-11 04:36:17,649] Trial 1 finished with value: 0.0 and parameters: {'max_depth': 8, 'n_estimators': 430, 'max_features': 'sqrt', 'min_samples_split': 4, 'bootstrap': True, 'min_samples_leaf': 8, 'criterion': 'gini'}. Best is trial 0 with value: 0.006557377049180328.
[I 2024-04-11 04:36:31,338] Trial 2 finished with value: 0.01946060285563194 and parameters: {'max_depth': 6, 'n_estimators': 960, 'max_features': 'sqrt', 'min_samples_split': 14, 'bootstrap': False, 'min_samples_leaf': 6, 'criterion': 'entropy'}. Best is trial 2 with value: 0.01946060285563194.
[I 2024-04-11 

Number of finished trials: 200
Best trial: {'max_depth': 40, 'n_estimators': 990, 'max_features': 'sqrt', 'min_samples_split': 2, 'bootstrap': False, 'min_samples_leaf': 1, 'criterion': 'entropy'}


In [10]:
params = study.best_params
model = RandomForestClassifier(**params, random_state=random_seed, n_jobs=-1)
model.fit(X, y=y)
pred = model.predict_proba(X_test)

In [11]:
df_test[['class1','class2']] = pred
submission = df_test[['CandidateID','class1','class2']]
submission.to_csv('randomforest.csv')

In [12]:
submission

Unnamed: 0,CandidateID,class1,class2
745,emp0521,0.517070,0.482930
746,emp0613,0.415717,0.584283
747,emp0136,0.386492,0.613508
748,emp0351,0.687199,0.312801
749,emp0049,0.799034,0.200966
...,...,...,...
927,emp0401,0.588374,0.411626
928,emp0408,0.825348,0.174652
929,emp0248,0.588078,0.411922
930,emp0148,0.356116,0.643884
