# Import and first overview

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Alternative matplotlib style, currently disabled:
# plt.style.use('ggplot')
# Apply seaborn's "whitegrid" theme to the plots created below.
sns.set_theme(style="whitegrid")

In [None]:
# Load the labelled training set and the unlabelled test set; both are
# indexed by 'enrollee_id' so the id never enters the feature matrix.
train = pd.read_csv(
    '../input/hr-analytics-job-change-of-data-scientists/aug_train.csv',
    index_col='enrollee_id',
)
X_test = pd.read_csv(
    '../input/hr-analytics-job-change-of-data-scientists/aug_test.csv',
    index_col='enrollee_id',
)
(train.shape, X_test.shape)

In [None]:
# Peek at the first five training rows.
train.head(n=5)

# Preprocessing

I will first look at the distribution of each column, and at the end create a ColumnTransformer that handles all the transformations.

In [None]:
# Overall missingness. `isna()` flags the NaN cells; `count()` would return the
# number of NON-missing cells and invert the statistic.
missing_values = train.isna().sum().sum()
# DataFrame.size is rows * columns.
total_values = train.size

# Per-column table with columns 'Column', 'Missing' and 'Percentage'.
percentages = (
    train.isna().sum()
    .reset_index()
    .rename(columns={'index': 'Column', 0: 'Missing'})
)
percentages['Percentage'] = percentages['Missing'] / train.shape[0] * 100

print(f'Amount of total missing data in train set: {missing_values}\nRelative amount of missing data: {missing_values/total_values*100:.3f}%')
print('-'*20)
print('Missing data per column:\n\n', percentages)

In [None]:
# Number of distinct values in the 'city' column.
train['city'].nunique()

* way too many unique values for One Hot Encoding, and the data is probably rather useless anyway. Let's drop the column

In [None]:
# Summary statistics of the numeric 'city_development_index' column.
train['city_development_index'].describe()

* since this is a numerical variable that is already between 0 and 1, we shouldn't worry about it

In [None]:
def plot(data, column):
    """Show the distribution of one column of ``data``.

    Draws a count plot (left) and a pie chart of the value shares (right) and
    reports the share of missing values of the column in the figure title.

    Args:
        data: DataFrame that contains ``column``.
        column: Name of the column to visualise.
    """
    # Derive everything from `data` itself so the function does not silently
    # read the global `train` frame or the global `percentages` table.
    counts = data[column].value_counts()
    missing_pct = data[column].isna().mean() * 100

    fig, ax = plt.subplots(1, 2, figsize=(12, 6))
    sns.countplot(data=data, x=column, ax=ax[0])
    ax[1].pie(counts.values, labels=counts.index, autopct='%1.1f%%')
    fig.tight_layout()
    fig.suptitle(f'Column: \'{column}\'   -   Missing data: {missing_pct:.2f}%', fontsize=30)
    # Leave room for the title that was added after tight_layout().
    fig.subplots_adjust(top=0.88)
    plt.show()


plot(train, 'gender')

* a lot of missing values, but the data is also very male dominated, so most frequent imputation should not add too much bias
* One Hot Encoding should be the way to go

In [None]:
# Distribution of 'relevent_experience' (spelling follows the dataset column).
plot(data=train, column='relevent_experience')

* Ordinal (binary) encoding seems reasonable here, since there is a natural ordering.<br> 'Has relevant experience' (-> 1) is better to have in a job than not (-> 0)

In [None]:
# Distribution of 'enrolled_university'.
plot(data=train, column='enrolled_university')

* let's just use most frequent imputation so we don't have to delete the data; I don't think the values in this column matter too much
* One Hot Encoding

In [None]:
# Distribution of 'education_level'.
plot(data=train, column='education_level')
# Optional cross-check against enrollment status (disabled):
# pd.crosstab(index=train.education_level, columns=train.enrolled_university)

* same as before/above for missing values
* Encoding from Primary School (-> 0) to Phd (-> 4)

In [None]:
# Distribution of 'major_discipline'.
plot(data=train, column='major_discipline')

* a lot of data is missing, but STEM is already by far the most prominent value, so most frequent imputation won't add a lot of bias to the data
* One Hot Encoding

In [None]:
# Distribution of 'experience'.
plot(data=train, column='experience')

* very balanced column counts and very little missing data; I also have a feeling this column might be important. Dropping the rows with missing data is probably best, as imputation would be very biased (it's also only 65 data points)
* convert <1 to 0 and >20 to 21, and treat it as a numeric variable

In [None]:
# Distribution of 'company_size'.
plot(data=train, column='company_size')

* a lot of data is missing, and imputing would introduce a lot of bias. Let's drop the whole column, as it probably doesn't correlate much with our target value either (company size seems to be more of a personal preference)

In [None]:
# Distribution of 'company_type'.
plot(data=train, column='company_type')

* same as above; although Pvt Ltd is dominant, let's still drop it

In [None]:
# Distribution of 'last_new_job'.
plot(data=train, column='last_new_job')

* Most Frequent Imputation is not the best choice here, but let's give it a try
* convert 'never' to 0 and '>4' to 5 and treat it as a numerical variable

In [None]:
# First ten values of the 'training_hours' column.
train['training_hours'].head(10)

* continuous value with no missing data, leave it as is

## Create the ColumnTransformer

I will also add another feature that might be of high importance for prediction:<br>'Experience per job' (experience divided by last_new_job + 1) - this approximates how long an employee stays at a job on average<br>(this assumes that Experience = years of having a job)

In [None]:
class MapTransformer(BaseEstimator, TransformerMixin):
    """Encode the ordinal columns as floats and derive ``experience_per_job``.

    Expects exactly these four columns, in this order (spelling follows the
    dataset):

        relevent_experience
        education_level
        experience
        last_new_job

    Returns:
        DataFrame with the four encoded columns (all float) plus the derived
        column ``experience_per_job = experience / (last_new_job + 1)``.
    """

    def __init__(self):
        # Complete category -> rank mappings.
        self.rel_ex_mapping = {'Has relevent experience': 1, 'No relevent experience': 0}
        self.ed_level_mapping = {'Primary School': 0, 'High School': 1, 'Graduate': 2, 'Masters': 3, 'Phd': 4}
        # Partial mappings: only the non-numeric labels are listed here; every
        # other value is already a number (possibly stored as a string).
        self.ex_mapping = {'<1': 0, '>20': 21}
        self.last_n_job_mapping = {'never': 0, '>4': 5}

    def fit(self, x, y=None):
        """Stateless transformer: nothing is learned from the data."""
        return self

    def transform(self, x, y=None):
        """Return the encoded frame.

        Args:
            x: Array-like or DataFrame holding the four input columns.
            y: Ignored.

        Returns:
            DataFrame with float columns. Categories missing from the complete
            mappings become NaN instead of staying in the output as strings.
        """
        df = pd.DataFrame(x, columns=['relevent_experience', 'education_level', 'experience', 'last_new_job'])
        # `map` + `astype(float)` yields numeric columns; `replace` would keep
        # object dtype on these fully-mapped columns.
        df['relevent_experience'] = df['relevent_experience'].map(self.rel_ex_mapping).astype(float)
        df['education_level'] = df['education_level'].map(self.ed_level_mapping).astype(float)
        # Swap only the special labels, then parse the remaining numeric strings.
        df['experience'] = df['experience'].replace(self.ex_mapping).astype(float)
        df['last_new_job'] = df['last_new_job'].replace(self.last_n_job_mapping).astype(float)
        # +1 keeps the denominator positive when last_new_job is 0 ('never').
        df['experience_per_job'] = df['experience'] / (df['last_new_job'] + 1)

        return df

    def get_feature_names(self):
        """Return the output column names (legacy scikit-learn API)."""
        return ['relevent_experience', 'education_level', 'experience', 'last_new_job', 'experience_per_job']

    def get_feature_names_out(self, input_features=None):
        """Return the output column names (current scikit-learn API).

        ``input_features`` is accepted for API compatibility and ignored,
        because the output columns are fixed.
        """
        return np.asarray(self.get_feature_names(), dtype=object)

In [None]:
# DROPPING
# New frames are built instead of mutating `train` in place, so this cell can
# be re-run without a KeyError on already-dropped columns.
DROPPED_COLUMNS = ['city', 'company_size', 'company_type']

# Rows without 'experience' are removed from the TRAINING data only.
train_clean = (
    train
    .drop(columns=DROPPED_COLUMNS)
    .dropna(axis=0, subset=['experience'])
)
# Every test row is kept so each enrollee gets a prediction; a missing
# 'experience' is filled by the most-frequent imputer in `pipeline_imp_map`.
# errors='ignore' keeps the rebinding of X_test safe on a re-run.
X_test = X_test.drop(columns=DROPPED_COLUMNS, errors='ignore')

X_train = train_clean.drop(columns='target')
y_train = train_clean['target']

# PIPELINE/COLUMNTRANSFORMER
# Nominal columns: impute with the most frequent value, then one-hot encode.
pipeline_imp_ohe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# Ordinal columns: impute with the most frequent value, then map to numbers.
pipeline_imp_map = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('mapper', MapTransformer())
])

# Columns not listed in either pipeline are passed through unchanged.
preprocessor = ColumnTransformer(transformers=[
    ('imp_ohe', pipeline_imp_ohe, ['gender', 'enrolled_university', 'major_discipline']),
    ('imp_map', pipeline_imp_map, ['relevent_experience', 'education_level', 'experience', 'last_new_job']),
], remainder='passthrough')


# DEMO OF WHAT DATA LOOKS LIKE AFTER PREPROCESSING
demo = preprocessor.fit_transform(X_train)
# ColumnTransformer may return a scipy sparse matrix; densify it for display.
demo_dense = demo.toarray() if hasattr(demo, 'toarray') else demo
pd.DataFrame(demo_dense)

# Model 1: Random Forest Classifier

In [None]:
# HYPERPARAMETER TUNING
# (kept for reference; the grid search is disabled because it is slow)
# params = {'model__n_estimators': [100, 300, 500, 800],
#           'model__max_depth': [5, 8, 15, 25, None], 
#           'model__min_samples_split':[2, 5, 10, 15, 100],
#           'model__min_samples_leaf': [1, 2, 5, 10] } 

# randomforest_pipeline = Pipeline([
#     ('preprocessor', preprocessor),
#     ('model', RandomForestClassifier())
# ])
# randomforest_pipeline.fit(X_train, y_train)

# clf = GridSearchCV(randomforest_pipeline, params, cv=3, verbose=1, n_jobs=-1, scoring='accuracy')
# scores_m1 = clf.fit(X_train, y_train)

# print(f'Best accuracy: {scores_m1.best_score_:.3f}%')
# print(f'Best Config: {scores_m1.best_params_}')


#OUTCOME
# Hyperparameters reported by the disabled search above. random_state is fixed
# so the cross-validation scores are identical on every run of the notebook.
randomforest_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(
        n_estimators=500,
        max_depth=8,
        min_samples_leaf=1,
        min_samples_split=2,
        random_state=42,
    ))
])
# 3-fold cross-validation; returns one score per fold.
scores_m1 = cross_val_score(randomforest_pipeline, X_train, y_train, cv=3)

# Model 2: Support Vector Classifier

In [None]:
# # HYPERPARAMETER TUNING
# (kept for reference; the grid search is disabled because it is slow)
# params = {'model__C':[0.1,1,10],
#           'model__kernel':['sigmoid'],
#           'model__degree':[2,3,5],
#           'model__gamma': [1, 0.1, 0.01] }

# svmpipeline = Pipeline([
#     ('preprocessor', preprocessor),
#     ('model', SVC())
# ])
# svmpipeline.fit(X_train, y_train)

# clt = GridSearchCV(svmpipeline, params, cv=3, verbose=1, n_jobs=-1, scoring='accuracy')
# scores_m2 = clt.fit(X_train, y_train)

# print(f'Best accuracy: {scores_m2.best_score_:.3f}%')
# print(f'Best Config: {scores_m2.best_params_}')

# #OUTCOME
# `degree` is only used by the 'poly' kernel, so the former degree=5 had no
# effect with kernel='rbf' and is omitted here.
# NOTE(review): the disabled grid above only searches kernel='sigmoid' while the
# final model uses 'rbf' - confirm where these hyperparameters came from.
# NOTE(review): the features are not scaled before the SVC; consider adding a
# scaler step and re-tuning C/gamma.
svmpipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', SVC(C=0.1, kernel='rbf', gamma=0.1))
])
# 3-fold cross-validation; the mean score is shown as the cell output.
scores_m2 = cross_val_score(svmpipeline, X_train, y_train, cv=3)
scores_m2.mean()





# Model 3: XGBoost Classifier

In [None]:
# HYPERPARAMETER TUNING
# (kept for reference; the grid search is disabled)
# params = {'model__learning_rate':[0.5,0.1,0.05, 0.01],
#           'model__n_estimators':[100, 300, 500, 800],
#           'model__max_depth':[3,5,8] }
# fit_params = {'early_stopping_rounds': 5}

# xgbpipeline = Pipeline([
#     ('preprocessor', preprocessor),
#     ('model', XGBClassifier())
# ])
# xgbpipeline.fit(X_train, y_train)

# clt = GridSearchCV(xgbpipeline, params, cv=3) #fit_params=fit_params)
# scores_m3 = clt.fit(X_train, y_train)

# print(f'Best accuracy: {scores_m3.best_score_:.3f}%')
# print(f'Best Config: {scores_m3.best_params_}')

#OUTCOME
# Final XGBoost configuration, evaluated with 3-fold cross-validation.
xgb_model = XGBClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=3,
    use_label_encoder=False,
)
xgbpipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', xgb_model),
])
scores_m3 = cross_val_score(xgbpipeline, X_train, y_train, cv=3)

In [None]:
# The cross-validation scores are fractions in [0, 1]. The '%' format spec
# multiplies by 100 before appending the percent sign, so 0.781 is printed as
# '78.1%' rather than '0.781%'.
print('RESULTS:')
for model_name, fold_scores in [
    ('Random Forest Classifier', scores_m1),
    ('Support Vector Classifier', scores_m2),
    ('XGBoost Classifier', scores_m3),
]:
    print(f'{model_name}: {fold_scores.mean():.1%} accuracy')

# Submission

In [None]:
# NOTE(review): `sample_submission` is loaded but not used below; presumably it
# shows the expected submission format - confirm and compare against it.
sample_submission = pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/sample_submission.csv')

# Refit the chosen model on the full training data.
xgbpipeline.fit(X_train, y_train)

# Predict on the feature columns only. Dropping a possible 'target' column
# keeps this cell re-runnable even if an earlier run stored predictions in
# X_test; the frame itself is no longer modified here.
features_test = X_test.drop(columns='target', errors='ignore')
solutions = xgbpipeline.predict(features_test)

# The test frame is indexed by 'enrollee_id', so the ids come from the index.
# NOTE(review): ids of rows removed from X_test upstream will not appear in
# the submission file.
submission = pd.DataFrame({'enrollee_id': features_test.index, 'target': solutions})

submission.to_csv('submission.csv', index=False)

<h2 style='text-align: center'>Thank you for reading this notebook to the end!<br>Feel free to upvote and leave a comment.</h2><h4 style='text-align: center'>Also, please tell me what I could have done better...</h4>