In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import category_encoders as ce

from sklearn import metrics
from sklearn.model_selection import cross_val_score, cross_validate, validation_curve, learning_curve
from sklearn.model_selection import train_test_split  # used by the helper functions defined below

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%matplotlib inline

# Helpers

In [None]:
def get_text_columns(d):
    """Return the labels of the columns of ``d`` whose dtype is ``object``.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame whose column dtypes are inspected.

    Returns
    -------
    list
        Column labels with ``object`` dtype, in the frame's column order.
    """
    is_object = d.dtypes == object
    # Boolean-mask the dtype series with itself to keep only the True entries.
    return is_object[is_object].index.tolist()

def confusion_matrix(y_test, predicted):
    """Plot the confusion matrix of ``predicted`` against ``y_test`` as a heatmap.

    Parameters
    ----------
    y_test : array-like
        True labels.
    predicted : array-like
        Labels predicted by a model, same length as ``y_test``.

    Returns
    -------
    None
        The figure is drawn on a newly created matplotlib figure.
    """
    # Derive the class labels from the data instead of hard-coding [0, 1], and
    # pass them explicitly so the matrix rows/columns and the tick labels agree.
    class_names = np.unique(np.concatenate([np.asarray(y_test), np.asarray(predicted)]))
    matrix = metrics.confusion_matrix(y_test, predicted, labels=class_names)

    fig, ax = plt.subplots()
    # Tick labels come from the DataFrame index/columns; seaborn sets the ticks
    # itself, so setting them manually beforehand had no effect.
    sns.heatmap(
        pd.DataFrame(matrix, index=class_names, columns=class_names),
        annot=True,
        cmap='Blues_r',
        fmt='g',
        ax=ax,
    )
    ax.xaxis.set_label_position('top')
    fig.tight_layout()
    # y=1.1 lifts the title above the x-axis label, which sits on top.
    ax.set_title('Confusion Matrix', y=1.1)
    ax.set_ylabel('Current Label')
    ax.set_xlabel('Predicted Label')

def test_model(model, X, y):
    """Fit ``model`` on a train split of ``(X, y)`` and evaluate it on the rest.

    The data is split with ``train_test_split(random_state=1)``. The model is
    fitted on the training part, scored and used to predict on the test part,
    and the confusion matrix of those predictions is plotted.

    Returns
    -------
    tuple
        ``(model, score, y_predicted)``: the fitted model, the value of
        ``model.score`` on the test part, and the test-part predictions.
    """
    split = train_test_split(X, y, random_state=1)
    X_train, X_test, y_train, y_test = split

    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    y_predicted = model.predict(X_test)

    # Side effect: draws the confusion-matrix heatmap.
    confusion_matrix(y_test, y_predicted)

    return model, score, y_predicted

def test_model_cross_val(model, X, y, cv=5):
    """Return the mean ``cross_val_score`` of ``model`` on the training split.

    ``(X, y)`` is split with ``train_test_split(random_state=1)``; only the
    training part is cross-validated, using ``cv`` folds.
    """
    # The test part of the split is deliberately unused here.
    X_train, _, y_train, _ = train_test_split(X, y, random_state=1)
    return np.mean(cross_val_score(model, X_train, y_train, cv=cv))

def test_model_cross_validate(model, X, y, cv=5):
    """Print the mean test and train scores from ``cross_validate``.

    ``(X, y)`` is split with ``train_test_split(random_state=1)``; only the
    training part is cross-validated, using ``cv`` folds. Nothing is returned.
    """
    X_train, _, y_train, _ = train_test_split(X, y, random_state=1)
    result = cross_validate(model, X_train, y_train, return_train_score=True, cv=cv)

    # Report the test score first, then the train score.
    for key in ('test_score', 'train_score'):
        print(key, np.mean(result[key]))
    
def test_model_validation_curve(model, param_name, param_range, cv=5, X=None, y=None):
    """Plot mean train/test scores of ``model`` over a range of one hyper-parameter.

    Parameters
    ----------
    model : estimator
        Estimator passed to ``validation_curve``.
    param_name : str
        Name of the hyper-parameter to vary.
    param_range : sequence
        Values of ``param_name`` to evaluate; also used as x tick labels.
    cv : int, default 5
        Number of cross-validation folds.
    X, y : optional
        Features and target. When omitted, the notebook-level ``X`` and ``y``
        are used, which is what this helper did implicitly before these
        parameters existed.

    Returns
    -------
    None
        The curves are drawn on the current matplotlib figure.
    """
    # Keep the old call signature working: fall back to the notebook globals.
    module_globals = globals()
    if X is None:
        X = module_globals['X']
    if y is None:
        y = module_globals['y']

    X_train, _, y_train, _ = train_test_split(X, y, random_state=1)
    train_scores, test_scores = validation_curve(
        model,
        X_train,
        y_train,
        param_name=param_name,
        param_range=param_range,
        cv=cv)

    # One point per parameter value, averaged over the CV folds (axis 1).
    positions = np.arange(len(param_range))
    plt.plot(positions, np.mean(train_scores, axis=1), label='train')
    plt.plot(positions, np.mean(test_scores, axis=1), label='test')
    plt.xticks(positions, param_range)
    plt.xlabel(param_name)
    plt.ylabel('score')
    plt.legend()
    
    
def test_learning_curve(model, X, y, cv=5):
    """Plot mean train and test scores of ``model`` against training-set size.

    Calls ``learning_curve`` on the full ``(X, y)`` with ``cv`` folds and draws
    the train curve first, then the test curve, on the current figure.
    """
    samples, train, test = learning_curve(model, X, y, cv=cv)[:3]
    # The first (smallest) training size is left out of both curves.
    for fold_scores in (train, test):
        plt.plot(samples[1:], np.mean(fold_scores, axis=1)[1:])

# Datasets and information

In [None]:
# Both CSV files live in the same Kaggle input folder.
DATA_DIR = '../input/hr-analytics-job-change-of-data-scientists'

train = pd.read_csv(f'{DATA_DIR}/aug_train.csv')
test = pd.read_csv(f'{DATA_DIR}/aug_test.csv')

### Train information

In [None]:
# Column names, non-null counts and dtypes of the training set.
train.info()

### Train size

In [None]:
# (number of rows, number of columns) of the training set.
train.shape

### Test information

In [None]:
# Column names, non-null counts and dtypes of the test set.
test.info()

### Test size

In [None]:
# (number of rows, number of columns) of the test set.
test.shape

- **enrollee_id:** Unique ID for candidate
- **city:** City code
- **city_development_index:** Development index of the city (scaled)
- **gender:** Gender of candidate
- **relevent_experience:** Relevant experience of candidate
- **enrolled_university:** Type of University course enrolled if any
- **education_level:** Education level of candidate
- **major_discipline:** Education major discipline of candidate
- **experience:** Candidate total experience in years
- **company_size:** Number of employees in current employer's company
- **company_type:** Type of current employer
- **last_new_job:** Difference in years between previous job and current job
- **training_hours:** training hours completed
- **target:** 0 – Not looking for job change, 1 – Looking for a job change

# Show data

In [None]:
# First five rows of the raw training data.
train.head(5)

In [None]:
# First five rows of the raw test data.
test.head(5)

# Feature engineering

In [None]:
# Object-dtype (text) columns of the training set; these are the columns
# handed to the categorical encoder.
cat_cols = get_text_columns(train)
cat_cols

### Encode data

In [None]:
# Fit the encoder on the training features only, then apply the SAME fitted
# mapping to the test set. Re-fitting on the test set would assign different
# binary codes (and possibly a different number of columns) to the same
# categories, so a model trained on the encoded train data could not be applied.
# Assumes `test` has the same columns as `train` minus 'target' — TODO confirm.
encoder = ce.BinaryEncoder(cols=cat_cols)
train_binenc = encoder.fit_transform(train.drop(columns=['target']))
# Re-attach the target so later cells can still read train_binenc['target'].
train_binenc['target'] = train['target']
test_binenc = encoder.transform(test)

In [None]:
# Preview of the binary-encoded training frame.
train_binenc.head()

In [None]:
# Preview of the binary-encoded test frame.
test_binenc.head()

# Train and select model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import ensemble, tree

In [None]:
# Feature matrix: everything except the target and the row identifier.
X = train_binenc.drop(columns=['target', 'enrollee_id'])
# Target vector.
y = train_binenc['target']

In [None]:
# Baseline: a decision tree. random_state is fixed so the reported score is
# reproducible across "Restart & Run All"; without it the tree can differ
# between runs when candidate splits tie.
(model, score, y_predicted) = test_model(tree.DecisionTreeClassifier(random_state=10), X, y)

In [None]:
# Value of model.score on the held-out split for the decision tree.
score

In [None]:
# Gradient boosting with default hyper-parameters. random_state is fixed so
# the reported score is reproducible across runs.
(model, score, y_predicted) = test_model(ensemble.GradientBoostingClassifier(random_state=10), X, y)

In [None]:
# Value of model.score on the held-out split for gradient boosting.
score

# Validation

In [None]:
# Compare mean train vs. test CV scores to check for over-fitting.
# random_state is fixed so the printed scores are reproducible.
test_model_cross_validate(ensemble.GradientBoostingClassifier(random_state=10), X, y)

In [None]:
# Learning curve of the default gradient-boosting model.
# random_state is fixed so the plotted curve is reproducible.
test_learning_curve(ensemble.GradientBoostingClassifier(random_state=10), X, y)

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate numbers of boosting stages: 20, 40, ..., 500.
param_test1 = {'n_estimators': np.arange(20, 501, 20)}

# Hold-out split, produced with the same random_state=1 the helper functions use.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Hyper-parameters held fixed while n_estimators is searched.
fixed_gbc_params = dict(
    learning_rate=0.1,
    min_samples_split=500,
    min_samples_leaf=50,
    max_depth=8,
    max_features='sqrt',
    subsample=0.8,
    random_state=10,
)
model_test = ensemble.GradientBoostingClassifier(**fixed_gbc_params)

In [None]:
# 'r2' is a regression metric; the estimator here is a classifier, so the
# search is scored with accuracy, which is also what model.score and the
# default cross_validate scorer report for classifiers in this notebook.
gridSearch = GridSearchCV(model_test, param_grid=param_test1, scoring='accuracy', cv=5)
gridSearch.fit(X_train, y_train)

In [None]:

# Parameter setting selected by the grid search. The matching mean CV score
# and the refitted model are available as gridSearch.best_score_ and
# gridSearch.best_estimator_.

gridSearch.best_params_ 

In [None]:
# Cross-validate the estimator selected by the grid search on the training
# split and report the mean test and train scores.
r = cross_validate(gridSearch.best_estimator_, X_train, y_train, return_train_score=True)
test_score, train_score = r['test_score'], r['train_score']

print('test_score', test_score.mean())
print('train_score', train_score.mean())

# Final model

In [None]:
# Hyper-parameters of the final model.
# NOTE(review): n_estimators=80 is hard-coded — presumably the value found by
# the grid search; confirm against gridSearch.best_params_.
final_params = {
    'learning_rate': 0.1,
    'min_samples_split': 500,
    'min_samples_leaf': 50,
    'max_depth': 8,
    'max_features': 'sqrt',
    'subsample': 0.8,
    'random_state': 10,
    'n_estimators': 80,
}
model = ensemble.GradientBoostingClassifier(**final_params)

# fit() returns the estimator itself, so fitting and scoring can be chained.
score = model.fit(X_train, y_train).score(X_test, y_test)

In [None]:
# Value of model.score for the final model on the held-out X_test / y_test split.
score

In [None]:
# Drop the row identifier from the encoded test set (the model was trained
# without it), then predict with the final model.
X_test2 = test_binenc.drop(columns=['enrollee_id'])
model.predict(X_test2)