In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found,
# so the available dataset files are visible in the cell output.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



# Load the train and test splits of the HR-analytics dataset into DataFrames.
train_csv = '/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_train.csv'
test_csv = '/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_test.csv'
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))

In [None]:
# Display the training frame as loaded, before any cleaning is applied.
df_train

### Data Cleaning

#### Train set

In [None]:
# Replace string with float/int
# 'experience' encodes its open-ended ranges as text; map them to representative
# numbers ('>20' -> 25, '<1' -> 0.5) so the whole column can be cast to float.
df_train['experience'] = (
    df_train['experience']
    .replace({'>20': '25', '<1': '0.5'})
    .astype('float')
)

# 'last_new_job' encodes '>4' and 'never' as text; map them to 5 and 0 and make
# the column numeric right away. The median used for imputation below must be
# computed on numbers: calling .median() on a string column raises TypeError on
# pandas 2.x and only worked on older versions through implicit coercion.
last_new_job_train = pd.to_numeric(
    df_train['last_new_job'].replace({'>4': '5', 'never': '0'})
)

# Impute/fill NaN
# Categorical columns: missing values become their own 'unknown' category.
for column in ['gender', 'enrolled_university', 'education_level',
               'major_discipline', 'company_type']:
    df_train[column] = df_train[column].fillna('unknown')

# Numeric columns: fill with the column median.
df_train['experience'] = df_train['experience'].fillna(df_train['experience'].median())
df_train['last_new_job'] = last_new_job_train.fillna(last_new_job_train.median()).astype('int')

# 'company_size' is filled with its most frequent value (first entry of value_counts()).
df_train['company_size'] = df_train['company_size'].fillna(
    df_train['company_size'].value_counts().index[0]
)

df_train['target'] = df_train['target'].astype('int')

#### Repeat with test set

In [None]:
# Replace string with float/int
# 'experience' encodes its open-ended ranges as text; map them to representative
# numbers ('>20' -> 25, '<1' -> 0.5) so the whole column can be cast to float.
df_test['experience'] = (
    df_test['experience']
    .replace({'>20': '25', '<1': '0.5'})
    .astype('float')
)

# 'last_new_job' encodes '>4' and 'never' as text; map them to 5 and 0 and make
# the column numeric right away. The median used for imputation below must be
# computed on numbers: calling .median() on a string column raises TypeError on
# pandas 2.x and only worked on older versions through implicit coercion.
last_new_job_test = pd.to_numeric(
    df_test['last_new_job'].replace({'>4': '5', 'never': '0'})
)

# Impute/fill NaN
# NOTE(review): the fill values below are computed from the test set itself;
# consider reusing the training-set statistics so both splits are imputed identically.
# Categorical columns: missing values become their own 'unknown' category.
for column in ['gender', 'enrolled_university', 'education_level',
               'major_discipline', 'company_type']:
    df_test[column] = df_test[column].fillna('unknown')

# Numeric columns: fill with the column median.
df_test['experience'] = df_test['experience'].fillna(df_test['experience'].median())
df_test['last_new_job'] = last_new_job_test.fillna(last_new_job_test.median()).astype('int')

# 'company_size' is filled with its most frequent value (first entry of value_counts()).
df_test['company_size'] = df_test['company_size'].fillna(
    df_test['company_size'].value_counts().index[0]
)

# No 'target' cast here: the test split is presumably unlabeled (TODO confirm).

### Treat continuous and categorical variables separately and then combine

In [None]:
# Display the training frame again to inspect the result of the cleaning cells above.
df_train

In [None]:
# Continuous variables
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Numeric columns that are standardised before modelling.
features = ['city_development_index', 'training_hours', 'experience', 'last_new_job']
X_train_con = df_train[features]
cont_columns = X_train_con.columns

# Fit the scaler on the training rows and standardise them in one step; the result
# is wrapped back into a DataFrame so the column names and row index are kept.
scaler = StandardScaler()
X_train_con_scaled = scaler.fit_transform(X_train_con)
X_train_con_df = pd.DataFrame(
    X_train_con_scaled,
    columns=cont_columns,
    index=X_train_con.index,
)

# Training labels.
y_train = df_train['target']

In [None]:
# Categorical variables
cat_variables = ['relevent_experience','enrolled_university','education_level','major_discipline','company_size','company_type']
X_train_cat = df_train[cat_variables]

# Dense one-hot encoding with the first level of every variable dropped.
# The keyword requesting dense output was renamed from `sparse` to `sparse_output`
# in scikit-learn 1.2 (and `sparse` was later removed), so try the new spelling
# first and fall back to the old one on older installations.
try:
    ohe = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    ohe = OneHotEncoder(sparse=False, drop='first')
cats = ohe.fit_transform(X_train_cat)

# `get_feature_names` was replaced by `get_feature_names_out` in newer
# scikit-learn releases; use whichever this installation provides.
if hasattr(ohe, 'get_feature_names_out'):
    columns = ohe.get_feature_names_out(cat_variables)
else:
    columns = ohe.get_feature_names(cat_variables)
X_train_cat_df = pd.DataFrame(cats, columns=columns, index=X_train_cat.index)

# Combine Con and Cat
X_train = pd.concat([X_train_con_df, X_train_cat_df], axis='columns')

In [None]:
# Repeat for test set
#
# The test features must go through the *same* fitted transformers as the training
# features. Re-fitting a scaler/encoder on the test set would standardise with
# different means/variances and could yield a different set (or order) of one-hot
# columns than the model was trained on. `scaler` and `ohe` are therefore the
# objects already fitted on the training data in the earlier cells; they are only
# applied here, never re-fitted.

# Continuous variables
features = ['city_development_index', 'training_hours', 'experience', 'last_new_job']
X_test_con = df_test[features]

# Standardise with the statistics learned from the training set.
X_test_con_scaled = scaler.transform(X_test_con)
cont_columns = X_test_con.columns
X_test_con_df = pd.DataFrame(X_test_con_scaled, columns=cont_columns, index=X_test_con.index)

# Categorical variables
cat_variables = ['relevent_experience','enrolled_university','education_level','major_discipline','company_size','company_type']
X_test_cat = df_test[cat_variables]

# Encode with the categories learned from the training set. With the encoder's
# default settings a category never seen during fitting raises an error here,
# which is preferable to silently producing mismatched columns.
cats = ohe.transform(X_test_cat)

# `get_feature_names` was replaced by `get_feature_names_out` in newer
# scikit-learn releases; use whichever this installation provides.
if hasattr(ohe, 'get_feature_names_out'):
    columns = ohe.get_feature_names_out(cat_variables)
else:
    columns = ohe.get_feature_names(cat_variables)
X_test_cat_df = pd.DataFrame(cats, columns=columns, index=X_test_cat.index)

# Combine Con and Cat
X_test = pd.concat([X_test_con_df, X_test_cat_df], axis='columns')

### Handle imbalanced classes

In [None]:
from imblearn.over_sampling import ADASYN

# Oversample the training data with ADASYN (seeded for reproducibility).
# `fit_resample` is the supported method name; the older `fit_sample` alias was
# removed from imbalanced-learn.
# NOTE(review): this cell overwrites X_train / y_train, so running it twice
# resamples already-resampled data; restart and run all cells in order.
# NOTE(review): the grid search below cross-validates on this resampled data, so
# synthetic samples can end up in validation folds; consider resampling inside
# an imblearn Pipeline instead.
X_train, y_train = ADASYN(random_state=42).fit_resample(X_train, y_train)

### Model: Decision Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

# Base estimator for the grid search. The grid below tries max_features < 1, which
# makes the tree pick a random subset of features at each split, so a fixed
# random_state is needed for the search and the final model to be reproducible.
decisiontree_2 = DecisionTreeClassifier(random_state=42)
# Show the default hyper-parameters that the grid search can tune.
decisiontree_2.get_params()

In [None]:
# Candidate values explored by the grid search.
max_depth = list(range(3, 7))                 # tree depths 3 through 6
min_samples_leaf = [0.04, 0.06, 0.08]         # floats, i.e. given as fractions rather than counts
max_features = [0.2, 0.4, 0.6, 0.8]           # floats, i.e. given as fractions rather than counts

In [None]:
# Define grid search
grid = dict(max_depth=max_depth, min_samples_leaf=min_samples_leaf, max_features=max_features)

# Repeated, seeded, stratified 10-fold CV. This splitter was previously built but
# never used (GridSearchCV received cv=5 instead); it is now passed to the search.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(
    estimator=decisiontree_2,
    param_grid=grid,
    n_jobs=-1,
    cv=cv,
    scoring='recall',
    # NOTE(review): a candidate whose fit fails is scored 0 instead of raising.
    error_score=0,
)
grid_result = grid_search.fit(X_train, y_train)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

# Show the ten best candidates (mean and spread of recall across the CV splits).
cv_summary = (
    pd.DataFrame({'params': params, 'mean_recall': means, 'std_recall': stds})
    .sort_values('mean_recall', ascending=False)
)
cv_summary.head(10)

In [None]:
# Take the winning configuration from the grid search, fit it on the full
# training data and predict the class of every test row.
# NOTE(review): GridSearchCV refits the best estimator by default, so this
# explicit fit is likely redundant — confirm before removing.
decisiontree_3 = grid_search.best_estimator_
pred = decisiontree_3.fit(X_train, y_train).predict(X_test)

### Prepare for submission

In [None]:
# Build the submission table: one row per test enrollee with the predicted class
# in a 'predict' column. `assign` returns a new frame, so df_test is left untouched.
my_submission = df_test[['enrollee_id']].assign(predict=pred)
my_submission

In [None]:
# Write the submission table to 'submission.csv' in the current working directory;
# index=False leaves the DataFrame index out of the file.
my_submission.to_csv('submission.csv', index=False)