In [None]:
! pip install lazypredict

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read both splits from the Kaggle input directory
train = pd.read_csv("../input/hr-analytics-job-change-of-data-scientists/aug_train.csv")
test = pd.read_csv("../input/hr-analytics-job-change-of-data-scientists/aug_test.csv")

# Number of training rows: after stacking, rows [0, m) are train and rows [m, end) are test
m = len(train)

# Keep the test IDs before the ID column is dropped from the stacked frame
test_ids = test["enrollee_id"].to_numpy()

# Stack train on top of test and drop the two columns that are not used as features
data = pd.concat([train, test]).drop(columns=["enrollee_id", "city"])

# Per-column flag: does the column contain any missing value?
data.isna().any()

In [None]:
# Display the stacked train + test frame (enrollee_id and city already dropped)
data

In [None]:
# Preprocessing (categorical -> numerical)
import re

# Sentinel written wherever a value is missing or cannot be parsed
MISSING = -1

# Integer code for each label of every categorical column.
# `replace` leaves labels that are not listed (and NaN) untouched; NaN is
# turned into MISSING by the final fillna.
CATEGORY_CODES = {
    "gender": {"Male": 0, "Female": 1, "Other": 2},
    "company_type": {"Other": 0,
                     "NGO": 1,
                     "Public Sector": 2,
                     "Early Stage Startup": 3,
                     "Funded Startup": 4,
                     "Pvt Ltd": 5},
    "relevent_experience": {"Has relevent experience": 1,
                            "No relevent experience": 0},
    "enrolled_university": {"no_enrollment": 0,
                            "Part time course": 1,
                            "Full time course": 2},
    "education_level": {"Primary School": 0,
                        "High School": 1,
                        "Graduate": 2,
                        "Masters": 3,
                        "Phd": 4},
    "major_discipline": {"No Major": 0,
                         "Arts": 1,
                         "Humanities": 2,
                         "Business Degree": 3,
                         "STEM": 4,
                         "Other": 5},
    "last_new_job": {"never": 0,
                     "1": 1,
                     "2": 2,
                     "3": 3,
                     "4": 4,
                     ">4": 5},
}


def _leading_number(value):
    """Turn a range-like label into a single ordinal integer.

    The first run of digits in the label is used, e.g. "50-99" -> 50 and
    "10000+" -> 10000. Strict bounds are nudged past the bound so they do not
    collide with the neighbouring label: "<10" -> 9 (not 10, which is also
    what "10/49" yields) and ">20" -> 21 (not 20).

    Missing values and strings without any digit give MISSING. Values that are
    not strings are returned unchanged, so re-running the cell on already
    converted columns is a no-op instead of an error.
    """
    if pd.isna(value):
        return MISSING
    if not isinstance(value, str):
        return value
    digits = re.search(r"\d+", value)
    if digits is None:
        return MISSING
    number = int(digits.group())
    label = value.strip()
    if label.startswith("<"):
        return number - 1
    if label.startswith(">"):
        return number + 1
    return number


for column, codes in CATEGORY_CODES.items():
    data[column] = data[column].replace(codes)

for column in ("experience", "company_size"):
    data[column] = data[column].apply(_leading_number)

# Every remaining NaN (missing categories, and any other missing cell) becomes the sentinel
data = data.fillna(MISSING)

In [None]:
# Scaling and Splitting
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing   import StandardScaler

features = data.drop(columns=["target"]).values
targets  = data["target"].values

# Rows [0, m) of `data` are the training rows, rows [m, end) the test rows
train_target = targets[:m]
test_target  = targets[m:]

# Pick the validation rows BEFORE scaling. The split is driven by the labels
# and the seed, not by the feature values, so it is unaffected by this reordering.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=24)
train_indices, val_indices = next(sss.split(features[:m], train_target))

# Fit the scaler on the training fold only, so validation and test statistics
# do not leak into the features the models are trained on; then apply that
# same transform to every row.
scaler = StandardScaler()
scaler.fit(features[:m][train_indices])
numpy_data = scaler.transform(features)

train = numpy_data[:m, :]
test  = numpy_data[m:, :]

x_train, x_val = train[train_indices], train[val_indices]
y_train, y_val = train_target[train_indices], train_target[val_indices]

In [None]:
# Metrics
from sklearn.metrics import roc_auc_score, make_scorer

In [None]:
# Quick model evaluation
from lazypredict.Supervised import LazyClassifier

# Fit LazyClassifier on the training fold and score it on the validation fold,
# reporting ROC AUC as an additional custom metric
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=roc_auc_score)
models, _ = clf.fit(x_train, x_val, y_train, y_val)

# Bare last expression so the score table gets the notebook's rich display
models

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors       import NearestCentroid
from lightgbm                import LGBMClassifier

# Candidate estimators and the hyper-parameter grid searched for each of them
estimators = {'NearestCentroid': {'func'  : NearestCentroid(),
                                  'params': {"metric": ["euclidean", "manhattan"],
                                             "shrink_threshold": [None, 0.1, 0.3, 0.5, 0.8, 1]}},
             "LGBMClassifier"  : {'func'  : LGBMClassifier(),
                                  # "rf" is not searched: LightGBM's random-forest mode needs
                                  # bagging / feature sub-sampling enabled, which this grid never
                                  # sets, so every such candidate would only fail to fit.
                                  'params': {"boosting_type": ["gbdt", "dart", "goss"],
                                             # explicit values instead of np.arange, whose length
                                             # depends on float rounding
                                             "learning_rate": [0.05, 0.1],
                                             # key must be exactly "n_estimators" (no trailing
                                             # space), otherwise the tree count is never tuned
                                             "n_estimators" : [50, 100, 150],
                                             "reg_alpha"    : [0, 0.5, 1],
                                             "reg_lambda"   : [0, 0.5, 1],
                                             "min_child_weight" :[1e-4, 1e-3, 1e-1],
                                             "min_child_samples":[10, 20, 30]}}
             }

# One scorer shared by all searches: ROC AUC computed on the hard class predictions
auc_scorer = make_scorer(roc_auc_score, greater_is_better=True)

searches   = {}   # name -> fitted GridSearchCV
val_scores = {}   # name -> ROC AUC on the validation fold

for name, estimator in estimators.items():
    search = GridSearchCV(estimator=estimator["func"],
                          param_grid=estimator["params"],
                          scoring=auc_scorer,
                          n_jobs=-1)
    search.fit(x_train, y_train)
    preds = search.predict(x_val)
    searches[name]   = search
    val_scores[name] = roc_auc_score(y_val, preds)
    print("{}, \n ROC AUC: {:.3f} \n Best parameters: {}".format(name,
                                                                 val_scores[name],
                                                                 search.best_params_))

# Expose the search with the best validation ROC AUC as `model`, rather than
# whichever estimator happened to be iterated last.
best_name = max(val_scores, key=val_scores.get)
model = searches[best_name]
print("Selected model: {}".format(best_name))

In [None]:
import inspect

# Predict the test rows with the fitted search object bound to `model`
final_preds = model.predict(test)
submission = pd.DataFrame({"enrollee_id": test_ids, "target": final_preds})

# pandas 1.5 renamed `line_terminator` to `lineterminator` and pandas 2.0
# removed the old spelling, so use whichever keyword this version accepts.
_to_csv_params = inspect.signature(pd.DataFrame.to_csv).parameters
_newline_kw = "lineterminator" if "lineterminator" in _to_csv_params else "line_terminator"

# `index_label` is omitted: it has no effect when the index is not written
submission.to_csv('submission.csv', index=False, **{_newline_kw: '\r\n'})