In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve, log_loss, classification_report, confusion_matrix
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE

# **BUSINESS PROBLEM**

A company that is active in Big Data and Data Science wants to hire data scientists from among the people who successfully pass courses conducted by the company. Many people sign up for the training. The company wants to know which of these candidates really want to work for the company after training and which are looking for new employment, because this helps to reduce cost and time, to improve the quality of training, and to plan the courses and categorize the candidates. Information on demographics, education, and experience is available from the candidates' signup and enrollment data.

The data is divided into "train" and "test" sets. The target is not included in the test set.

# **PROJECT GOAL**

* Predict the probability that a candidate will work for the company.

In [None]:
# Load the training split and the test split (the test split has no target column) from the Kaggle input directory.
train = pd.read_csv("/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_train.csv")
test = pd.read_csv("/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_test.csv")

In [None]:
def check_df(dataframe):
    """Print a quick structural overview of a DataFrame.

    Prints, in order: the shape, the dtype of every column, the count and
    ratio of missing values per column, and selected quantiles
    (0, 5, 50, 95, 99 and 100 %) of the numeric columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to summarise. It is not modified.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### NA #####################")
    # Count the missing values once and reuse the result for both columns.
    na_count = dataframe.isnull().sum()
    print(pd.DataFrame({"NA_COUNT": na_count,
                        "NA_RATIO": na_count / len(dataframe)}))
    print("##################### Quantiles #####################")
    # Quantiles only make sense for numeric data. Without numeric_only=True,
    # pandas >= 2.0 raises TypeError as soon as the frame has an object column.
    print(dataframe.quantile([0, 0.05, 0.50, 0.95, 0.99, 1], numeric_only=True).T)

# **FEATURES**

* **enrollee_id:** Unique ID for candidate

* **city:** City code

* **city_development_index:** Development index of the city (scaled)

* **gender:** Gender of candidate

* **relevent_experience:** Relevant experience of candidate

* **enrolled_university:** Type of University course enrolled if any

* **education_level:** Education level of candidate

* **major_discipline:** Education major discipline of candidate

* **experience:** Candidate total experience in years

* **company_size:** Number of employees in current employer's company

* **company_type:** Type of current employer

* **last_new_job:** Difference in years between previous job and current job

* **training_hours:** Training hours completed

* **target:** 0 – Not looking for job change, 1 – Looking for a job change

In [None]:
# Shape, dtypes, missing-value counts and quantiles of the training split.
check_df(train)

In [None]:
# Same overview for the test split.
check_df(test)

# **APPROACH**

* First, split the training dataset into two parts, "test_X" (a hold-out set used for validation) and "train", as shown below.

* Then merge the two parts I created, "train" and "test_X", with the existing "test" set.

* Apply the feature transformations.

* Fill the missing values using the KNN imputer.

* Use the LightGBM algorithm for prediction.

* Use the log-loss metric to check for overfitting and the ROC-AUC score to assess classification quality.

* Finally, predict the probability for each candidate in the test set; the result is stored in "submission_df".

In [None]:
# Hold out the first 1000 training rows for validation.
# `results` keeps the true labels keyed by enrollee_id; `test_X` is the same rows without the label.
test_df = train.iloc[:1000]
results = test_df.loc[:, ["enrollee_id", "target"]]

test_X = test_df.drop(columns="target")
test_X

In [None]:
# Stack the unlabelled hold-out, the remaining training rows and the test split
# so that every feature transformation is applied to all of them at once.
frames_to_stack = [test_X, train.iloc[1000:, :], test]
df = pd.concat(frames_to_stack, axis=0, ignore_index=True)
check_df(df)

In [None]:
# I noticed that there is an imbalanced dataset problem.
# Only part of `df` carries a target (the hold-out and test rows were stacked in without one),
# so dividing by len(df) understates both classes. normalize=True divides by the number of
# non-null targets, which makes the shares sum to 1.
df["target"].value_counts(normalize=True)

In [None]:
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """Group the columns of ``dataframe`` by how they should be treated.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are classified.
    cat_th : int, default 10
        A non-object column with fewer unique values than this is treated as categorical.
    car_th : int, default 20
        An object column with more unique values than this is treated as cardinal.

    Returns
    -------
    cat_cols : list
        Object columns plus ``num_but_cat``, without the cardinal ones.
    cat_but_car : list
        Object columns with more than ``car_th`` unique values.
    num_cols : list
        Non-object columns that are not in ``num_but_cat``.
    num_but_cat : list
        Non-object columns with fewer than ``cat_th`` unique values, plus every
        column whose name contains "id".
    """
    object_cols, non_object_cols = [], []
    num_but_cat, cat_but_car = [], []

    for name in dataframe.columns:
        is_object = dataframe[name].dtypes == "O"
        unique_count = dataframe[name].nunique()

        if is_object:
            object_cols.append(name)
            if unique_count > car_th:
                cat_but_car.append(name)
        else:
            non_object_cols.append(name)

        # `and` binds tighter than `or`: any column with "id" in its name
        # lands here regardless of dtype or cardinality.
        if (unique_count < cat_th and not is_object) or ("id" in name):
            num_but_cat.append(name)

    cat_cols = [name for name in object_cols + num_but_cat if name not in cat_but_car]
    num_cols = [name for name in non_object_cols if name not in num_but_cat]

    print (f"Observations: {dataframe.shape[0]}")
    print (f"Variables: {dataframe.shape[1]}")
    print (f'cat_cols: {len (cat_cols)}')
    print (f'num_cols: {len (num_cols)}')
    print (f'cat_but_car: {len (cat_but_car)}')
    print (f'num_but_cat: {len (num_but_cat)}')

    return cat_cols, cat_but_car, num_cols, num_but_cat

In [None]:
# Classify the columns of the combined frame; the unpacking order matches the function's return order
# (categorical, categorical-but-cardinal, numeric, numeric-but-categorical).
categorical_cols, categorical_but_cardinal, numeric_cols, numeric_but_categorical = grab_col_names(df)

In [None]:
# Show the members of each column group, separated by a ruler line.
print("Categorical columns : {}".format(categorical_cols),
      "="*80,
      "Cat_But_Car columns : {}".format(categorical_but_cardinal),
      "="*80,
      "Numeric columns : {}".format(numeric_cols),
      "="*80,
      "Num_But_Cat columns : {}".format(numeric_but_categorical),
      sep="\n")


In [None]:
def cat_summary(dataframe, col_name, plot=False):
    """Print the count and percentage of every class of a categorical column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source frame.
    col_name : str
        Column to summarise.
    plot : bool, default False
        When truthy, also draw the percentages as a donut chart and print a ruler line.

    Returns
    -------
    None
    """
    counts = dataframe[col_name].value_counts()
    # Percentages are relative to all rows, so missing values lower the total below 100.
    summary = pd.DataFrame({col_name: counts,
                            "Ratio": 100 * counts / len(dataframe)})
    print(summary)

    if not plot:
        return

    plt.figure(figsize=(7,7))
    plt.pie(summary["Ratio"],
            labels=summary.index,
            labeldistance=1.15,
            wedgeprops={'linewidth': 1, 'edgecolor': 'white'},
            autopct="%1.1f%%",
            pctdistance=0.85,
            textprops={'fontsize': 10})

    # A white disc over the centre turns the pie into a donut.
    hole = plt.Circle((0,0), 0.70, fc='white')
    plt.gcf().gca().add_artist(hole)
    plt.show()
    print("="*50)

In [None]:
# Number of distinct (non-missing) values in each column of the combined frame.
df.nunique()

In [None]:
# Chart only the categorical columns with fewer than 40 classes; anything wider is unreadable as a pie.
graph_cols = list(filter(lambda name: df[name].nunique() < 40, categorical_cols))
for name in graph_cols:
    cat_summary(df, name, plot=True)

In [None]:
def target_summary_with_cat(dataframe, target, categorical_col, plot=False):
    """Print mean, median and count of ``target`` for every class of a categorical column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source frame.
    target : str
        Name of the target column.
    categorical_col : str
        Column to group by.
    plot : bool, default False
        When exactly ``True``, also draw the per-class target mean as a bar plot.

    Returns
    -------
    None
    """
    # One groupby with named aggregations yields the three statistics side by side.
    summary = dataframe.groupby(categorical_col)[target].agg(
        TARGET_MEAN="mean",
        TARGET_MEDIAN="median",
        COUNT="count",
    )
    print(summary)

    if plot == True:
        sns.barplot(x=summary.index, y=summary["TARGET_MEAN"])
        plt.xticks(rotation=45)
        plt.xlabel(summary.index.name.upper())
        plt.show()
        print("="*50)

In [None]:
# Categorical columns (including the cardinal ones) with fewer than 20 classes,
# leaving out the identifier and the target itself.
candidate_cols = categorical_cols + categorical_but_cardinal
skipped_cols = ["enrollee_id", "target"]
cats = [
    name for name in df.columns
    if name in candidate_cols
    and name not in skipped_cols
    and df[name].nunique() < 20
]
for name in cats:
    target_summary_with_cat(df, "target", name, plot=True)

In [None]:
def target_summary_with_num(dataframe, target, numerical_col):
    """Print the mean of a numerical column for every value of ``target``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source frame.
    target : str
        Column to group by.
    numerical_col : str
        Column whose per-group mean is shown.

    Returns
    -------
    None
    """
    per_class_mean = dataframe.groupby(target)[[numerical_col]].mean()
    print(per_class_mean)
    print("="*50)


In [None]:
# Per-target-class mean of every numeric column.
for col in numeric_cols:
    target_summary_with_num (df, "target", col)

In [None]:
# Examining the missing values: every feature (target excluded) that has at least one NaN.
missing_cols = [name for name in df.columns
                if name != "target" and df[name].isnull().any()]
msno.matrix(df[missing_cols]);

In [None]:
# missingno heatmap over the same columns, to see which features tend to be missing together.
msno.heatmap(df[missing_cols]);

In [None]:

def data_prep(dataframe):
    """Derive the model features from the raw candidate columns.

    The input frame is left untouched; all work happens on a copy.

    New columns (appended in this order):

    * ``NEW_IS_RELEVANT_EXP`` - 1.0 / 0.0 for "Has relevent experience" /
      "No relevent experience" (spelling as in the data), NaN otherwise.
    * ``NEW_GENDER`` - 0.0 Male, 1.0 Female, 2.0 Other, NaN otherwise.
    * ``NEW_EXPERIENCE`` - ``experience`` as float, with ">20" mapped to 20 and "<1" to 0.
    * ``NEW_LAST_NEW_JOB`` - "1-4 years", "5+ years" or "never", NaN otherwise.
    * ``NEW_MAJOR_is_STEM`` - 1.0 for "STEM", 0.0 for any other non-missing major, NaN when missing.

    The source columns ``gender``, ``relevent_experience``, ``experience``,
    ``last_new_job`` and ``major_discipline`` are dropped from the result.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the five source columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the engineered columns and without the source columns.

    Raises
    ------
    ValueError
        If ``experience`` holds a value other than ">20" / "<1" that cannot be converted to float.
    """
    # Copy first: the original version wrote the new columns (and the recoded
    # "experience") into the caller's frame before copying it.
    prepared = dataframe.copy()

    # Labeling "relevent_experience" feature. astype(float) keeps the dtype
    # float64 even when every row is matched.
    prepared["NEW_IS_RELEVANT_EXP"] = prepared["relevent_experience"].map(
        {"Has relevent experience": 1, "No relevent experience": 0}).astype(float)

    # Labeling "gender" feature.
    prepared["NEW_GENDER"] = prepared["gender"].map(
        {"Male": 0, "Female": 1, "Other": 2}).astype(float)

    # Assigning the ones greater than 20 as 20, and the ones less than 1 as 0 and converting them to float.
    prepared["NEW_EXPERIENCE"] = (
        prepared["experience"].replace({">20": "20", "<1": "0"}).astype(float)
    )

    # Collecting the "last_new_job" under three groups.
    job_gap_groups = {
        "1": "1-4 years", "2": "1-4 years", "3": "1-4 years", "4": "1-4 years",
        ">4": "5+ years",
        "never": "never",
    }
    prepared["NEW_LAST_NEW_JOB"] = prepared["last_new_job"].map(job_gap_groups)

    # Dividing "major_discipline" into two groups as being STEM or not.
    # Missing majors are masked explicitly so they stay NaN instead of depending
    # on an `np.nan in list` membership test.
    major = prepared["major_discipline"]
    prepared["NEW_MAJOR_is_STEM"] = major.eq("STEM").astype(float).where(major.notna())

    del_cols = ["gender", "relevent_experience", "experience", "last_new_job", "major_discipline"]
    return prepared.drop(columns=del_cols)

In [None]:
# Build the engineered frame and inspect its shape, dtypes, missing values and quantiles.
df_2 = data_prep (df)
check_df(df_2)

In [None]:
# Object-typed columns with more than two distinct values: these get label-encoded next.
multiclass_cat_cols = [name for name in df_2.columns
                       if df_2[name].dtype == "O" and df_2[name].nunique() > 2]
multiclass_cat_cols

In [None]:
# Using Label Encoding for all multiclass categorical columns before filling the missing values.
# Only the non-null entries are encoded; rows absent from the returned Series stay NaN
# after alignment, so they can still be imputed later.
df_2[multiclass_cat_cols] = df_2[multiclass_cat_cols].apply(
    lambda column: pd.Series(LabelEncoder().fit_transform(column.dropna()),
                             index=column.dropna().index)
)

In [None]:
# First rows of the frame after label encoding.
df_2.head()

In [None]:
def knn_imputer_test(data, n_neighbors_list, model_name):
    """Find the KNNImputer ``n_neighbors`` that gives the best hold-out ROC-AUC.

    For every candidate ``n`` the feature columns containing NaN are imputed
    (and rounded) on a copy of ``data``, the rows without any remaining NaN
    are split 80/20, the training part is oversampled with SMOTE, the model
    is fitted and the ROC-AUC on the untouched 20 % is printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with "enrollee_id", "target" and the feature columns. Not modified.
    n_neighbors_list : iterable of int
        Candidate values for ``KNNImputer(n_neighbors=...)``.
    model_name : estimator
        Classifier instance exposing ``fit`` and ``predict_proba``. It is
        refitted for every candidate (despite the parameter name, this is an
        object, not a string).

    Returns
    -------
    best_roc_auc : float
        Highest ROC-AUC observed (0 when no candidate was evaluated).
    best_n_neighbor : int
        The ``n`` that produced it (0 when no candidate was evaluated).
    """
    best_roc_auc = 0
    best_n_neighbor = 0

    # The set of columns to impute does not depend on n, so compute it once.
    imputed_cols = [col for col in data.columns
                    if col not in ["target", "enrollee_id"] and data[col].isnull().any()]

    for n in n_neighbors_list:
        dataframe = data.copy ()
        knn_imputer = KNNImputer (n_neighbors=n)
        dataframe[imputed_cols] = np.round (knn_imputer.fit_transform (dataframe[imputed_cols]))

        # Keep only fully populated rows. After imputation the features are
        # filled, so this drops the rows that still lack a target.
        train_data = dataframe[dataframe.notnull ().all (axis=1)]

        # MODEL
        X = train_data.drop (["enrollee_id", "target"], axis=1)
        y = train_data['target']

        # Split BEFORE oversampling: SMOTE interpolates between neighbouring
        # rows, so resampling first would put synthetic copies of test rows
        # into the training set and inflate the score.
        X_train, X_test, y_train, y_test = train_test_split (X, y, random_state=17, test_size=0.2)

        # Data upscaling on the training part only. The seed keeps the
        # comparison between candidates reproducible.
        smote = SMOTE (random_state=17)
        X_train, y_train = smote.fit_resample (X_train, y_train)

        fit_model = model_name.fit (X_train, y_train)
        y_probs = fit_model.predict_proba (X_test)[:, 1]

        roc_auc = roc_auc_score (y_test, y_probs)

        print ("n_neighbors : {}   //   roc_auc_score : {}".format (n, roc_auc))
        if roc_auc > best_roc_auc:
            best_roc_auc = roc_auc
            best_n_neighbor = n
    return best_roc_auc, best_n_neighbor

In [None]:
# Work on a copy so df_2 keeps its missing values.
new_df = df_2.copy()

In [None]:
# Try n_neighbors = 2..10 with a seeded LightGBM classifier and keep the best score and its n.
lgbm_roc_auc, lgbm_n_neighbor = knn_imputer_test (new_df, range (2, 11), LGBMClassifier (random_state=17))

In [None]:
# Report the winning ROC-AUC and the n_neighbors value that produced it.
print("LGBM best roc_auc : {}  // LGBM n_neighbor : {}".format(lgbm_roc_auc, lgbm_n_neighbor))

In [None]:
# Columns to be filled: every feature (identifier and target excluded) with at least one NaN.
not_features = ["target", "enrollee_id"]
imputed_cols = [name for name in new_df.columns
                if name not in not_features and new_df[name].isnull().any()]
imputed_cols

# **FILLING THE MISSING VALUES USING KNN IMPUTER**

In [None]:
# Impute with the n_neighbors chosen above. The imputed values are rounded, and new_df is overwritten in place.
knn_imputer = KNNImputer (n_neighbors=lgbm_n_neighbor)
new_df[imputed_cols] = np.round(knn_imputer.fit_transform (new_df[imputed_cols]))

In [None]:
# Snapshot of the imputed frame; check_df shows where missing values remain.
prep_df = new_df.copy ()
check_df(prep_df)

In [None]:
# Dividing dataset into three parts as test_data, subb_data and train_data.
# test_data / subb_data are picked by enrollee_id; train_data is every row without any NaN.
enrollee_ids = prep_df["enrollee_id"]
in_holdout = enrollee_ids.isin(results["enrollee_id"])
in_submission = enrollee_ids.isin(test["enrollee_id"])

test_data = prep_df.loc[in_holdout].reset_index(drop=True)
subb_data = prep_df.loc[in_submission].reset_index(drop=True)
train_data = prep_df.dropna(axis=0, how="any").reset_index(drop=True)

In [None]:
# First rows of the prepared submission rows.
subb_data.head()

In [None]:
# First rows of the raw test split, for comparison with subb_data above.
test.head()

# **LGBM MODEL**

In [None]:
# Using train_data for setting up lgbm model: features are everything except the identifier and the label.
y = train_data["target"]
X = train_data.drop(columns=["enrollee_id", "target"])

In [None]:
# Imbalanced dataset: share of each class among the training labels.
y.value_counts () / len(y)

In [None]:
def plot_learning_curve(model, X, Y):
    """Plot train and test log-loss as the training set grows.

    The data is split 80/20 (random_state=17). The model is then refitted on
    the first 200, 400, ... rows of the training part, and the log-loss on
    those rows and on the full test part is recorded after each fit.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit`` and ``predict_proba``; it is refitted repeatedly.
    X : pandas.DataFrame
        Feature matrix (sliced with ``.iloc``).
    Y : array-like
        Labels aligned with ``X``.

    Returns
    -------
    None
    """
    x_train, x_test, y_train, y_test = train_test_split (X, Y, test_size=0.2, random_state=17)

    train_loss = []
    test_loss = []

    for n_rows in range (200, len (x_train), 200):
        x_part = x_train.iloc[:n_rows, :]
        y_part = y_train[:n_rows]

        model.fit (x_part, y_part)
        train_loss.append (log_loss (y_part, model.predict_proba (x_part)))
        test_loss.append (log_loss (y_test, model.predict_proba (x_test)))

    plt.figure (figsize=(15, 8))
    curves = ((train_loss, 'r-+', 'Training Loss'),
              (test_loss, 'b-', 'Test Loss'))
    for losses, line_format, legend_label in curves:
        plt.plot (losses, line_format, label=legend_label)
    plt.xlabel ('Number Of Batches')
    plt.ylabel ('Log-Loss')
    plt.legend (loc='best')

    plt.show ()

In [None]:
# Learning curve of a default (seeded) LightGBM model on the original, un-resampled data.
plot_learning_curve(LGBMClassifier(random_state=17), X, y)

In [None]:
# Data Upscaling for decreasing log_loss values.
# SMOTE is seeded so the resampled set is reproducible.
smote = SMOTE (random_state=17)
X_smote, y_smote = smote.fit_resample (X, y)

In [None]:
# SMOTE 2
# NOTE(review): same sampler (random_state=17) on the same X, y as the first run,
# so this presumably reproduces X_smote / y_smote exactly - confirm before relying on it as new data.
X_smote1, y_smote1 = smote.fit_resample(X,y)

In [None]:
# SMOTE 3
# NOTE(review): same sampler (random_state=17) on the same X, y again;
# presumably another exact copy of the first resample - confirm.
X_smote2, y_smote2 = smote.fit_resample(X,y)

In [None]:
# The three SMOTE runs above use the same sampler (random_state=17) on the same X, y,
# so they return the same resampled data. Concatenating them only triplicated every row;
# a later random split then put exact duplicates on both sides, which lowers the measured
# loss without improving the model. One resample is used instead.
X_final = X_smote.reset_index(drop=True)
y_final = y_smote.reset_index(drop=True)

In [None]:
# Class counts after resampling.
y_final.value_counts()

In [None]:
# Learning curve of the same default LightGBM model on the resampled data.
plot_learning_curve(LGBMClassifier(random_state=17), X_final, y_final)

In [None]:
# Split the original data first, then oversample only the training part.
# Splitting an already-resampled set lets synthetic points derived from validation rows
# (and any duplicated rows) leak into training, which makes the validation scores
# over-optimistic. stratify=y keeps the class ratio equal in both parts.
X_train_raw, X_val, y_train_raw, y_val = train_test_split (X, y, random_state=17, test_size=0.2, stratify=y)
X_train, y_train = SMOTE (random_state=17).fit_resample (X_train_raw, y_train_raw)

In [None]:
# Baseline: LightGBM with default hyper-parameters (seeded) fitted on the training part.
lgbm = LGBMClassifier (random_state=17)
lgbm_model = lgbm.fit (X_train, y_train)

In [None]:
# Train Set Score
train_preds = lgbm_model.predict(X_train)
# Column 1 of predict_proba is the probability of the positive class.
train_probs = lgbm_model.predict_proba(X_train)[:, 1]

print(classification_report(y_train, train_preds))
train_auc = roc_auc_score(y_train, train_probs)
print("Roc_Auc Score : {}".format(train_auc))

In [None]:
# Validation Set Score
y_pred = lgbm_model.predict(X_val)
# Column 1 of predict_proba is the probability of the positive class.
probs_model = lgbm_model.predict_proba(X_val)[:, 1]

print(classification_report(y_val, y_pred))
validation_auc = roc_auc_score(y_val, probs_model)
print("Roc_Auc Score : {}".format(validation_auc))

In [None]:
# MODEL TUNING
# Exhaustive 5-fold search over 2 * 2 * 3 * 3 = 36 parameter combinations.
lgbm_params = dict(
    num_leaves=[20, 50],
    max_depth=[5, 8],
    learning_rate=[0.005, 0.01, 0.02],
    n_estimators=[100, 500, 1000],
)

lgbm_cv = GridSearchCV(lgbm, lgbm_params, cv=5, n_jobs=-1, verbose=2)
lgbm_cv.fit(X_train, y_train)

lgbm_cv.best_params_

In [None]:
# FINAL MODEL
# Refit with the best grid-search parameters, then score the validation part.
lgbm_tuned = LGBMClassifier(random_state=17, **lgbm_cv.best_params_)
lgbm_tuned.fit(X_train, y_train)

y_tuned = lgbm_tuned.predict(X_val)
probs_final = lgbm_tuned.predict_proba(X_val)[:, 1]

In [None]:
# Validation Set Score with Final Model: per-class precision/recall/F1 and ROC-AUC of the tuned model.
print (classification_report (y_val, y_tuned))
print ("Roc_Auc Score : {}".format(roc_auc_score (y_val, probs_final)))

In [None]:
def roc_auc_plot(model_name, testX, ytrue):
    """Draw the ROC curve of a fitted classifier and return its ROC-AUC.

    Parameters
    ----------
    model_name : estimator
        Fitted classifier exposing ``predict_proba`` (an object, not a string).
    testX : array-like
        Features to score.
    ytrue : array-like
        True binary labels aligned with ``testX``.

    Returns
    -------
    float
        ROC-AUC computed from the positive-class probabilities.
    """
    # Column 1 of predict_proba is the probability of the positive class.
    positive_probs = model_name.predict_proba(testX)[:, 1]
    fpr, tpr, _ = roc_curve(ytrue, positive_probs)
    roc_auc = roc_auc_score(ytrue, positive_probs)

    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    # Diagonal reference line.
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

    return roc_auc


In [None]:
# ROC curve of the tuned model on the validation part; the returned AUC is shown as the cell output.
roc_auc_plot(lgbm_tuned, X_val, y_val)

In [None]:
# The tuned model is evaluated next on test_data, the hold-out rows taken from the training set at the beginning.
test_data

In [None]:
# True labels of the hold-out rows, keyed by enrollee_id.
results

In [None]:
# Ground-truth target of the hold-out rows, compared with the predictions below.
test_labels = results["target"]

In [None]:
# Score the hold-out rows with the tuned model: hard labels and positive-class probabilities.
test_data_X = test_data.drop(columns=["enrollee_id", "target"])

lgbm_final_pred = lgbm_tuned.predict(test_data_X)
lgbm_final_probs = lgbm_tuned.predict_proba(test_data_X)[:, 1]


In [None]:
# Hold-out performance: per-class precision/recall/F1 and ROC-AUC against the true labels.
print (classification_report (test_labels, lgbm_final_pred))
print ("Roc_Auc Score : {}".format(roc_auc_score (test_labels, lgbm_final_probs)))

In [None]:
def con_matrix(true_labels, pred_labels):
    """Plot the confusion matrix of the given labels as an annotated heatmap.

    Parameters
    ----------
    true_labels : array-like
        Ground-truth labels (shown on the y axis).
    pred_labels : array-like
        Predicted labels (shown on the x axis).

    Returns
    -------
    None
    """
    sns.heatmap(confusion_matrix(true_labels, pred_labels), annot=True, fmt="d")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

In [None]:
# ROC curve of the tuned model on the hold-out rows.
roc_auc_plot (lgbm_tuned, test_data_X, test_labels)

In [None]:
# Confusion matrix of the hold-out predictions.
con_matrix (test_labels, lgbm_final_pred)

# **SUBMISSION**

In [None]:
# Prepared rows of the test split that the submission is built from.
subb_data.head()

In [None]:
# Drop the identifier and the target column so the columns match what the model was trained on.
subb_X = subb_data.drop(["enrollee_id","target"], axis=1)

In [None]:
# Class probabilities for the submission rows; the first five rows are displayed.
prediction = lgbm_tuned.predict_proba(subb_X)
prediction[:5]

In [None]:
# Keep only the second column (index 1) of the predict_proba output.
predict = prediction[:,1]
predict[:5]

In [None]:
# Pair every enrollee_id with its predicted probability.
submission_df = subb_data[["enrollee_id"]].assign(target=predict)

submission_df.head()

In [None]:
# Write the submission file to the current working directory, without the index column.
submission_df.to_csv('submission.csv',index=False)