In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input directory,
# then print one path per line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
%matplotlib inline

**Note:**

- The dataset is imbalanced.
- Most features are categorical (Nominal, Ordinal, Binary), some with high cardinality.
- Missing-value imputation can be a part of your pipeline as well.

**Features:**

- enrollee_id : Unique ID for candidate
- city: City code
- city_development_index: Development index of the city (scaled)
- gender: Gender of candidate
- relevent_experience: Relevant experience of candidate
- enrolled_university: Type of University course enrolled if any
- education_level: Education level of candidate
- major_discipline :Education major discipline of candidate
- experience: Candidate total experience in years
- company_size: No of employees in current employer's company
- company_type : Type of current employer
- last_new_job: Difference in years between previous job and current job
- training_hours: training hours completed
- target: 0 – Not looking for job change, 1 – Looking for a job change

**Inspiration:**

- Predict the probability that a candidate will work for the company
- Interpret the model(s) in a way that illustrates which features affect a candidate's decision

# Load and check data.

In [None]:
# All three competition CSVs live in the same Kaggle input folder.
DATA_DIR = '/kaggle/input/hr-analytics-job-change-of-data-scientists'

train_data = pd.read_csv(f'{DATA_DIR}/aug_train.csv')            # labelled training rows
test_data = pd.read_csv(f'{DATA_DIR}/aug_test.csv')              # rows to predict
sub_data = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')      # submission template

In [None]:
# Preview the first rows of the training set.
train_data.head()

In [None]:
# Preview the first rows of the test set.
test_data.head()

In [None]:
# Preview the first rows of the sample submission file.
sub_data.head()

In [None]:
# Report (rows, columns) for each loaded frame.
shape_report = (
    ("train data shape =", train_data),
    ("test data shape =", test_data),
    ("submission data shape =", sub_data),
)
for label, frame in shape_report:
    print(label, frame.shape)

# Let's see how much missing data we have...

In [None]:
# Number of missing values in each training column.
train_data.isna().sum()

Wow! 

In [None]:
# Number of missing values in each test column.
test_data.isna().sum()

In [None]:
# Number of missing values in each sample-submission column.
sub_data.isna().sum()

In [None]:
# Print the distinct values found in each of the first 14 training columns.
for column_name in train_data.columns[:14]:
    print(train_data[column_name].unique())

# Some methods to find duplicates.

In [None]:
# Compare the enrollee_id columns of the three frames pairwise.
train_ids = train_data['enrollee_id']
test_ids = test_data['enrollee_id']
sub_ids = sub_data['enrollee_id']

print(train_ids.equals(test_ids))
print(train_ids.equals(sub_ids))
print(test_ids.equals(sub_ids))
# Same test-vs-submission check, done element-wise instead of with .equals().
print((test_ids == sub_ids).all())

In [None]:
# IDs present in both train and test; an empty set means no candidate is shared.
set(train_data['enrollee_id']).intersection(test_data['enrollee_id'])

**Yes, we have no duplicates. Thx lord!**

# Replace nan values

In [None]:
# Replace missing values in the categorical columns with an explicit
# placeholder level so "no answer" becomes its own category.
#
# The first two columns are skipped, as in the original positional slice.
# Only non-numeric columns are touched: a purely positional slice would also
# cover numeric columns (and the target), where a NaN would be turned into the
# string "No_Answer" and silently convert the whole column to object dtype.
MISSING_LABEL = "No_Answer"

for frame in (train_data, test_data):
    categorical_cols = frame.iloc[:, 2:].select_dtypes(exclude="number").columns
    frame[categorical_cols] = frame[categorical_cols].fillna(MISSING_LABEL)

In [None]:
# Re-inspect the distinct values of training columns 2..13 after filling.
for column_name in train_data.columns[2:14]:
    print(train_data[column_name].unique())

In [None]:
# Re-inspect the distinct values of test columns 2..12 after filling.
for column_name in test_data.columns[2:13]:
    print(test_data[column_name].unique())

In [None]:
# Candidate counts per gender, split by target class.
sns.set(rc={'figure.figsize': (12, 8)})
sns.countplot(data=train_data, x='gender', hue='target')

**We see that there are many more male candidates than female candidates.**

**The class imbalance is not very large.**

In [None]:
# Candidate counts per major discipline, split by target class.
sns.countplot(data=train_data, x='major_discipline', hue='target')

In [None]:
# Candidate counts by relevant-experience flag, split by target class.
sns.countplot(data=train_data, x='relevent_experience', hue='target')

In [None]:
# Distribution of the city development index, split by target class.
sns.histplot(data=train_data, x='city_development_index', hue='target')

In [None]:
# Wider canvas: one bar per city code, split by target class.
sns.set(rc={'figure.figsize': (20, 8)})
sns.histplot(data=train_data, x='city', hue='target')

**Feature 'City' doesn't really matter, we can remove it.**

In [None]:
# Candidate counts per company-size bucket, split by target class.
sns.set(rc={'figure.figsize': (20, 8)})
sns.histplot(data=train_data, x='company_size', hue='target')

In [None]:
# Candidate counts per company type, split by target class.
sns.set(rc={'figure.figsize': (20, 8)})
sns.histplot(data=train_data, x='company_type', hue='target')

In [None]:
# Company type broken down by company size (no target split here).
sns.set(rc={'figure.figsize': (20, 8)})
sns.histplot(data=train_data, x='company_type', hue='company_size')

**Features 'company_size' and 'company_type' are not informative.
We can drop them too.**

In [None]:
# Remove the identifier and the features judged uninformative above,
# identically for train and test.
unused_columns = ['enrollee_id', 'city', 'company_size', 'company_type']
train_data = train_data.drop(columns=unused_columns)
test_data = test_data.drop(columns=unused_columns)

In [None]:
# Preview the training set after dropping columns.
train_data.head()

In [None]:
# Preview the test set after dropping columns.
test_data.head()

In [None]:
# Candidate counts per last_new_job level (still raw labels), split by target.
sns.set(rc={'figure.figsize': (12, 8)})
sns.countplot(data=train_data, x='last_new_job', hue='target')

In [None]:
# Candidate counts per experience level (still raw labels), split by target.
sns.set(rc={'figure.figsize': (20, 8)})
sns.countplot(data=train_data, x='experience', hue='target')

**We need to convert qualitative data into quantitative data.**

In [None]:
# Map the non-numeric labels of the two ordinal features to numbers, then cast
# the whole column to float. The missing-value placeholder becomes -1.
last_new_job_codes = {'never': 0, '>4': 5, 'No_Answer': -1}
experience_codes = {'<1': 0, '>20': 21, 'No_Answer': -1}

for frame in (train_data, test_data):
    frame['last_new_job'] = frame['last_new_job'].replace(last_new_job_codes).astype('float')
    frame['experience'] = frame['experience'].replace(experience_codes).astype('float')

In [None]:
# Same last_new_job plot, now on the numeric encoding.
sns.set(rc={'figure.figsize': (12, 8)})
sns.countplot(data=train_data, x='last_new_job', hue='target')

In [None]:
# Same experience plot, now on the numeric encoding.
sns.set(rc={'figure.figsize': (20, 8)})
sns.countplot(data=train_data, x='experience', hue='target')

In [None]:
# One-hot encode the nominal features of train and test.
#
# Encoding the two frames independently yields different columns whenever a
# level appears in only one of them (and drop_first may then drop a different
# reference level), which breaks or misaligns predict() on the test set.
# Giving both frames the same sorted category list guarantees identical dummy
# columns in identical order; when both frames already share the same levels
# the result is the same as encoding them separately.
CATEGORICAL_FEATURES = ['gender',
                        'enrolled_university',
                        'relevent_experience',
                        'education_level',
                        'major_discipline']

for col in CATEGORICAL_FEATURES:
    # Union of the levels seen in either frame, in the sorted order that
    # get_dummies would use for plain string columns.
    levels = sorted(pd.concat([train_data[col], test_data[col]]).dropna().unique())
    shared_dtype = pd.CategoricalDtype(categories=levels)
    train_data[col] = train_data[col].astype(shared_dtype)
    test_data[col] = test_data[col].astype(shared_dtype)

train_data = pd.get_dummies(train_data, columns=CATEGORICAL_FEATURES, drop_first=True)
test_data = pd.get_dummies(test_data, columns=CATEGORICAL_FEATURES, drop_first=True)

In [None]:
# Preview the training set after one-hot encoding.
train_data.head()

In [None]:
# Separate the label column from the feature matrix.
y = train_data["target"]
X = train_data.drop(columns=["target"])

In [None]:
# Hold out 30% of the labelled rows for evaluation; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=100,
)

In [None]:
# Baseline random forest: 50 trees limited to depth 10, using all cores,
# with out-of-bag scoring enabled and per-tree progress logging.
baseline_forest_params = dict(
    n_estimators=50,
    max_depth=10,
    oob_score=True,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)
RF_clf = RandomForestClassifier(**baseline_forest_params)

In [None]:
# Train the baseline forest on the training split.
RF_clf = RF_clf.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out split with the baseline forest.
pred = RF_clf.predict(X_test)

In [None]:
# Fraction of held-out rows the baseline forest classified correctly.
baseline_accuracy = accuracy_score(y_test, pred)
print(baseline_accuracy)

In [None]:
# Confusion matrix of true vs. predicted labels for the baseline forest.
baseline_confusion = confusion_matrix(y_test, pred)
print(baseline_confusion)

In [None]:
# Per-class precision / recall / F1 for the baseline forest.
baseline_report = classification_report(y_test, pred)
print(baseline_report)

In [None]:
# Predict class labels for the competition test set with the baseline forest.
pred = RF_clf.predict(test_data)
# Reuse the sample submission already loaded into `sub_data` instead of
# re-reading the same CSV through a second, relative path. Copy it so the
# template frame itself is never modified.
result = sub_data.copy()

In [None]:
# Store the predictions in the submission frame. Bracket assignment always
# writes a column; attribute-style assignment would only set a Python
# attribute if the 'target' column were ever missing.
result['target'] = pred
# Number of test rows predicted in each class.
result['target'].value_counts()

In [None]:
# Search space for the randomized hyper-parameter search.

# 50 tree counts evenly spaced from 10 to 200 (truncated to integers).
n_estimators = np.linspace(start=10, stop=200, num=50).astype(int).tolist()

# 'auto' is intentionally omitted: for classifiers it meant the same as 'sqrt',
# and scikit-learn 1.3 removed it, so keeping it would make candidates fail.
max_features = ['sqrt', 'log2']

# 11 depths evenly spaced from 10 to 100, plus None for unlimited depth.
max_depth = np.linspace(10, 100, num=11).astype(int).tolist()
max_depth.append(None)

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth}

print(random_grid)

In [None]:
# Randomized hyper-parameter search over `random_grid` with 3-fold CV.
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

# Score with a classification metric. The previous 'neg_mean_absolute_error'
# is a regression metric; on 0/1 labels it equals accuracy - 1, so candidates
# are ranked the same way, but 'accuracy' makes cv_results_ directly readable
# and matches the accuracy_score used for the hold-out evaluation.
# NOTE(review): the dataset is described as imbalanced, so 'f1' or 'roc_auc'
# may be a better selection criterion - confirm against the project goal.
rf_random = RandomizedSearchCV(estimator=rf,
                               param_distributions=random_grid,
                               n_iter=100,
                               scoring='accuracy',
                               cv=3,
                               verbose=2,
                               random_state=42,
                               n_jobs=-1,
                               return_train_score=True)

# fit() returns the search object itself, so both names refer to the fitted search.
RF_clf_rand = rf_random.fit(X_train, y_train)

best_prms = rf_random.best_params_

In [None]:
# Show the hyper-parameter combination selected by the randomized search.
print(best_prms)

In [None]:
# Predict class labels for the held-out split via the fitted search object.
preds = RF_clf_rand.predict(X_test)

In [None]:
# Fraction of held-out rows the tuned model classified correctly.
tuned_accuracy = accuracy_score(y_test, preds)
print(tuned_accuracy)

In [None]:
# Confusion matrix of true vs. predicted labels for the tuned model.
tuned_confusion = confusion_matrix(y_test, preds)
print(tuned_confusion)

In [None]:
# Per-class precision / recall / F1 for the tuned model.
tuned_report = classification_report(y_test, preds)
print(tuned_report)

In [None]:
# Predict class labels for the competition test set with the tuned model.
preds = RF_clf_rand.predict(test_data)
# Reuse the sample submission already loaded into `sub_data` instead of
# re-reading the same CSV through a second, relative path. Copy it so the
# template frame itself is never modified.
result = sub_data.copy()

In [None]:
# Store the predictions in the submission frame. Bracket assignment always
# writes a column; attribute-style assignment would only set a Python
# attribute if the 'target' column were ever missing.
result['target'] = preds
# Number of test rows predicted in each class.
result['target'].value_counts()