In [None]:
import pandas as pd

# You need scikit-learn 0.24 to run this notebook. You can install it with the command below if your account has been verified
# and the "Internet" toggle is switched on in the right-hand panel of Kaggle.

# pip install scikit-learn==0.24

In [None]:
# Input CSV files read by this notebook.
train_csv = "../input/hr-analytics-job-change-of-data-scientists/aug_train.csv"
test_csv = "../input/hr-analytics-job-change-of-data-scientists/aug_test.csv"
sample_csv = "../input/hr-analytics-job-change-of-data-scientists/sample_submission.csv"

# Each file is loaded into its own DataFrame.
df = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)
sample = pd.read_csv(sample_csv)

### Let's get some information about this dataset; I have a quick helper function for that

In [None]:
def get_df_infos(df_):
    """Print a quick overview of a DataFrame.

    The report contains, in order: the names of the int64, float64 and
    object columns, the number of columns detected that way, the shape of
    the frame, its shape once every row holding a NaN is dropped, and a
    per-column table of NaN counts and percentages sorted by NaN count
    (largest first).

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame to describe. It is not modified.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    n_rows = len(df_)

    # Build every row of the summary first and create the frame once:
    # growing a DataFrame with pd.concat inside a loop is quadratic, and
    # Series.iteritems() no longer exists in pandas 2.x (items() works on
    # every version).
    records = []
    for name, count in df_.isna().sum().items():
        count = int(count)
        records.append({
            "column": name,
            "NaN": count,
            # An empty frame has no NaNs; avoid dividing by zero.
            "NaN %": round(count * 100 / n_rows, 2) if n_rows else 0.0,
        })

    # A list (not a set) keeps the column order deterministic.
    df_info = pd.DataFrame(records, columns=["column", "NaN", "NaN %"])
    # mergesort is stable, so columns with equal NaN counts keep their
    # original relative order.
    df_info = (
        df_info
        .sort_values(by="NaN", ascending=False, kind="mergesort")
        .reset_index(drop=True)
    )

    int_ = df_.select_dtypes(include=['int64']).columns.to_list()
    float_ = df_.select_dtypes(include=['float64']).columns.to_list()
    object_ = df_.select_dtypes(include=['object']).columns.to_list()

    # map(str, ...) so that non-string column labels do not break join().
    print(f"Int64 : {', '.join(map(str, int_))}")
    print(f"\nFloat64 : {', '.join(map(str, float_))}")
    print(f"\nObject : {', '.join(map(str, object_))}\n")

    print("Total detected columns =", len(int_) + len(float_) + len(object_))
    print("\nshape =", df_.shape)
    print("\nshape without NaNs =", df_.dropna().shape)

    print("\n\n", df_info)


In [None]:
# Print the dtype / shape / missing-value report for the training frame.
get_df_infos(df)

### Too many NaNs, they need to be cleaned up
### I will drop the rows that contain NaNs in the columns with less than 3% of NaNs

In [None]:
def df_cleaner(df_, threshold=3):
    """Drop the rows holding NaNs in columns that are only sparsely missing.

    A column is selected when its share of NaN values is strictly between
    0 and ``threshold`` percent; every row with a NaN in one of the
    selected columns is then removed.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame to clean. It is modified in place.
    threshold : float, default 3
        Exclusive upper bound, in percent, on a column's NaN share for its
        NaN rows to be dropped.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df_``, after the rows were dropped.
    """
    # Vectorised NaN percentage per column. This replaces a loop over
    # Series.iteritems(), which was removed in pandas 2.0.
    nan_pct = df_.isna().sum() * 100 / len(df_)
    sparse_cols = nan_pct[(nan_pct > 0) & (nan_pct < threshold)].index.to_list()

    # Nothing to drop when no column falls inside the (0, threshold) range.
    if sparse_cols:
        df_.dropna(subset=sparse_cols, axis=0, inplace=True)
    return df_

# Apply the cleaning step to the training frame (df_cleaner also mutates its argument in place).
df = df_cleaner(df)

# Display the (rows, columns) shape that remains after the drop.
df.shape

### The dataset isn't that big: I can't afford to drop all the NaNs, or I'll end up with too little data to play with. I'll fill the rest of the NaNs with the mode (most frequent value) of each column

In [None]:
# Columns whose remaining NaNs are replaced by the column's most frequent value.
columns_to_fill = ['gender','company_size','major_discipline','company_type','relevent_experience']

for col in columns_to_fill:
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on df[col]: that form is a chained assignment, which warns on
    # pandas 2.x and silently leaves df unchanged under Copy-on-Write.
    df[col] = df[col].fillna(df[col].mode()[0])

# Remaining NaN count per column.
df.isna().sum()

### Very well, we no longer have any NaNs, so let's move on to the next problem.
### As stated in the description, the "target" is imbalanced; resampling could be a way to deal with it


In [None]:
# Share of each "target" value, in percent, rounded to 3 decimals.
(df["target"].value_counts(normalize=True) * 100).round(3)

In [None]:
from sklearn.utils import resample

# Rows with target == 0 are treated as the majority class, target == 1 as the minority.
is_majority = df["target"] == 0
is_minority = df["target"] == 1
df_majority = df.loc[is_majority]
df_minority = df.loc[is_minority]

# Draw minority rows with replacement until both classes have the same size.
# NOTE(review): the result contains duplicated minority rows, so any
# train/test split made from it can put copies of one row on both sides.
df_minority_upsampled = resample(
    df_minority,
    n_samples=len(df_majority),
    replace=True,
    random_state=123,
)

df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Class counts after upsampling.
df_upsampled["target"].value_counts()

### We now have equally populated "target" classes

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# The style has to be set before the axes are created; setting it
# afterwards does not restyle axes that already exist.
sns.set_style("darkgrid")

fig, axs = plt.subplots(1, 2, figsize=(17, 6))
fig.suptitle('Before | After')

# Pass the column explicitly as x=. A positional vector is deprecated in
# seaborn 0.11 and is read as `data` (not `x`) by later releases.
sns.countplot(x=df['target'], ax=axs[0]).set_title("1. Original")
sns.countplot(x=df_upsampled['target'], ax=axs[1]).set_title("2. Upsampled")

# plt.show() works with the inline notebook backend, whereas fig.show()
# warns that a non-GUI backend cannot show the figure.
plt.show()

### Let's initiate a Pipeline with no specific parameters yet
### ColumnTransformer will transform our data according to our needs (categorical, ordinal)


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

# !!! "handle_unknown" for OrdinalEncoder ONLY works with sklearn 0.24 and above

# One seed for the split, the resampling and the estimators, so the cell
# gives the same result on every run.
RANDOM_STATE = 123

# Start from the cleaned frame, NOT from the upsampled one: splitting after
# upsampling puts copies of the same minority row in both the train and the
# test partition, which inflates the test score.
X = df.drop(["target", "enrollee_id"], axis=1)
y = df["target"]

categorical_features = X.select_dtypes(include=['object']).columns.to_list()

ordinal_features = ["training_hours", "city_development_index"]

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        # OrdinalEncoder has no 'ignore' mode: the valid options are 'error'
        # and 'use_encoded_value' (which needs an explicit unknown_value).
        ('ord',
         OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
         ordinal_features),
    ],
    remainder="drop"
)

classifier_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('SVD', TruncatedSVD(random_state=RANDOM_STATE)),
        ('classifier', ExtraTreesClassifier(random_state=RANDOM_STATE))
    ]
)

# Stratify so both partitions keep the original class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.25, stratify=y, random_state=RANDOM_STATE
)

# Balance the classes inside the training partition only (target == 1 is
# the minority class); the test partition keeps its natural distribution.
train_is_majority = (y_train == 0).to_numpy()
train_is_minority = (y_train == 1).to_numpy()
X_minority_up, y_minority_up = resample(
    X_train[train_is_minority],
    y_train[train_is_minority],
    replace=True,
    n_samples=int(train_is_majority.sum()),
    random_state=RANDOM_STATE,
)
X_train = pd.concat([X_train[train_is_majority], X_minority_up])
y_train = pd.concat([y_train[train_is_majority], y_minority_up])

classifier_pipeline.fit(X_train, y_train)

###  Let's list all available parameters for my pipeline

In [None]:
# Alphabetical list of every tunable parameter name exposed by the pipeline
# (iterating the get_params() dict yields its keys).
sorted(classifier_pipeline.get_params())

### There are a lot of them! I'll choose only a few, because a grid search takes a long time to run

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: number of SVD components plus three ExtraTrees settings.
search_space = {
    'SVD__n_components': range(5, 9),
    'classifier__max_depth': range(25, 40, 2),
    'classifier__min_samples_leaf': range(3, 10),
    'classifier__criterion': ["gini", "entropy"],
}
parameters = [search_space]

# Exhaustive search over the grid, ranked by accuracy.
grid_search = GridSearchCV(
    estimator=classifier_pipeline,
    param_grid=parameters,
    scoring="accuracy",
)
grid_search.fit(X_train, y_train)

print(f"Best parameters : \n\n{grid_search.best_params_}")

### Let's apply those parameters to our pipeline

In [None]:
from sklearn.utils import resample

# One seed for the split, the resampling and the estimators, so the cell
# gives the same result (and the same hold-out rows) on every run.
RANDOM_STATE = 123

# Start from the cleaned frame, NOT from the upsampled one: splitting after
# upsampling puts copies of the same minority row in both the train and the
# test partition, which inflates the test score.
X = df.drop(["target", "enrollee_id"], axis=1)
y = df["target"]

categorical_features = X.select_dtypes(include=['object']).columns.to_list()

ordinal_features = ["training_hours", "city_development_index"]

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        # OrdinalEncoder has no 'ignore' mode: the valid options are 'error'
        # and 'use_encoded_value' (which needs an explicit unknown_value).
        ('ord',
         OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
         ordinal_features),
    ],
    remainder="drop"
)

# Hyper-parameters hard-coded from the grid search above.
classifier_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('SVD', TruncatedSVD(n_components=5, random_state=RANDOM_STATE)),
        ('classifier', ExtraTreesClassifier(criterion='entropy',
                                            max_depth=37,
                                            min_samples_leaf=3,
                                            random_state=RANDOM_STATE))
    ]
)

# Stratify so both partitions keep the original class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.25, stratify=y, random_state=RANDOM_STATE
)

# Balance the classes inside the training partition only (target == 1 is
# the minority class); the test partition keeps its natural distribution.
train_is_majority = (y_train == 0).to_numpy()
train_is_minority = (y_train == 1).to_numpy()
X_minority_up, y_minority_up = resample(
    X_train[train_is_minority],
    y_train[train_is_minority],
    replace=True,
    n_samples=int(train_is_majority.sum()),
    random_state=RANDOM_STATE,
)
X_train = pd.concat([X_train[train_is_majority], X_minority_up])
y_train = pd.concat([y_train[train_is_majority], y_minority_up])

classifier_pipeline.fit(X_train, y_train)

In [None]:
# Mean accuracy of the fitted pipeline on the training partition...
train_accuracy = round(classifier_pipeline.score(X_train, y_train), 4)
print(f"Train score = {train_accuracy}")

# ...and on the held-out partition.
test_accuracy = round(classifier_pipeline.score(X_test, y_test), 4)
print(f"Test score = {test_accuracy}")

In [None]:
from sklearn import metrics

try:
    # plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2;
    # a hard import would stop the cell before anything is drawn.
    from sklearn.metrics import plot_roc_curve
except ImportError:
    pass

sns.set(rc={'figure.figsize':(10, 5)})
sns.set_style("darkgrid")

# Use the current API when the installed scikit-learn provides it (1.0+),
# and fall back to the legacy helper on 0.24.
if hasattr(metrics.RocCurveDisplay, "from_estimator"):
    metrics.RocCurveDisplay.from_estimator(classifier_pipeline, X_test, y_test)
else:
    metrics.plot_roc_curve(classifier_pipeline, X_test, y_test)

plt.show()