In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
# The unused sub-directory list is bound to `_subdirs` rather than `_`, because
# assigning to `_` in IPython overrides the built-in "last output" variable.
for dirname, _subdirs, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:left;"> Table of Contents </h1>

#### 1) Load Required Libraries

#### 2) iris

>       2.1) Finding Important Features

>       2.2) Generating the Model on Selected Features

#### 3) HR Analytics

>       3.1) Read Data

>       3.2) EDA (Exploratory Data Analysis)

>            3.2.1) Drop Unwanted Features

>       3.3) Model building and Evaluation 

>            3.3.1) Random Forest

>            3.3.2) XGBoost

>            3.3.3) LGBM

#### 4) Boston

>       a) Xgboost Built-in Feature Importance

>       b) Permutation Based Feature Importance

>       c) Feature Importance Computed with SHAP Values

<h1 style="background-color:orange; font-family:newtimeroman; font-size:170%; text-align:left;"> 1) Load Required Libraries </h1>

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

plt.style.use("fivethirtyeight")
sns.set_style("darkgrid")

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier

import xgboost
from xgboost import XGBClassifier

import lightgbm
from lightgbm import LGBMClassifier

# Import scikit-learn metrics module for accuracy calculation
from sklearn.metrics import accuracy_score

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;"> 2) iris </h1>

In [None]:
iris = pd.read_csv("/kaggle/input/iris/Iris.csv")
iris.head()

In [None]:
iris.shape

In [None]:
# Import scikit-learn dataset library
from sklearn import datasets

# Load dataset
iris = datasets.load_iris()

In [None]:
# print the label species(setosa, versicolor,virginica)
print(iris.target_names)

# print the names of the four features
print(iris.feature_names)

In [None]:
# print the iris data (top 5 records)
print(iris.data[0:5])

# print the iris labels (0:setosa, 1:versicolor, 2:virginica)
print(iris.target)

In [None]:
# Build a DataFrame from the scikit-learn iris bunch: the four measurement
# columns of iris.data get short names, and the integer class label from
# iris.target is appended as 'species'.
iris_columns = ['sepal length', 'sepal width', 'petal length', 'petal width']
data = pd.DataFrame(iris.data, columns=iris_columns)
data['species'] = iris.target

data.head()

In [None]:
X = data[['sepal length', 'sepal width', 'petal length', 'petal width']]  # Features
y = data['species']                                                       # Labels

In [None]:
# Split into training and test sets: 70% training, 30% test.
# random_state pins the shuffle so that "Restart & Run All" reproduces the same
# split, and therefore the same accuracy and feature importances further down.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5)

In [None]:
# Create a random forest classifier with 100 trees.
# random_state makes the fitted forest (and its feature importances) repeatable.
clf = RandomForestClassifier(n_estimators=100, random_state=5)

# Fit the forest on the training split; predictions are made in the next cell.
clf.fit(X_train, y_train)

In [None]:
y_pred = clf.predict(X_test)

In [None]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:", accuracy_score(y_test, y_pred))

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:left;"> 2.1) Finding Important Features </h1>

 - Random forest uses **gini** importance or **mean decrease in impurity (MDI)** to calculate the importance of each feature.

In [None]:
feature_imp = pd.Series(clf.feature_importances_, index=iris.feature_names).sort_values(ascending=False)

In [None]:
# Horizontal bar chart of the random-forest feature importances
# (feature_imp is already sorted in descending order).
fig, ax = plt.subplots()
sns.barplot(x=feature_imp, y=feature_imp.index, ax=ax)

# Label the axes so the figure can be read on its own.
ax.set_xlabel('Feature Importance Score')
ax.set_ylabel('Features')
ax.set_title("Visualizing Important Features")
# No legend is drawn: none of the bars carries a label, so a legend would be
# empty and matplotlib would only complain that it found no labelled artists.
plt.show()

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:left;"> 2.2) Generating the Model on Selected Features </h1>

- Here, we can remove the **"sepal width"** feature because it has **very low importance,** and select the 3 remaining features.

In [None]:
# Split dataset into features and labels.
# "sepal width" is left out; the three remaining measurements are kept.
X = data[['petal length', 'petal width', 'sepal length']]
y = data['species']

# Split into training and test sets: 70% training, 30% test.
# test_size is the fraction held out for *testing*, so 0.30 gives the intended
# 70/30 split; random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=5)

In [None]:
# Create a random forest classifier with 100 trees (seeded for repeatability).
clf = RandomForestClassifier(n_estimators=100, random_state=5)

# Fit on the training split of the reduced feature set.
clf.fit(X_train, y_train)

# Predict the species for the held-out test split.
y_pred = clf.predict(X_test)

# Model accuracy: the fraction of test rows classified correctly.
# accuracy_score is already imported with the other libraries at the top of the
# notebook, so no additional mid-notebook import is needed here.
print("Accuracy:", accuracy_score(y_test, y_pred))

- We can see that after **removing** the least important feature (sepal width), the accuracy **increased.** This is because removing it eliminated misleading data and noise, resulting in an increased accuracy. Fewer features also reduce the training time.

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;"> 3) HR Analytics </h1>

|Feature                   |Description                                                |
|--------------------------|-----------------------------------------------------------|
|enrollee_id               |Unique ID for candidate                                    |
|city                      |City code                                                  |
|city_development_index    |Development index of the city (scaled)                     |
|gender                    |Gender of candidate                                        |
|relevent_experience       |Relevant experience of candidate                           |
|enrolled_university       |Type of University course enrolled if any                  |
|education_level           |Education level of candidate                               |
|major_discipline          |Education major discipline of candidate                    |
|experience                |Candidate total experience in years                        |             
|company_size              |Number of employees in current employer's company          |
|company_type              |Type of current employer                                   |
|lastnewjob                |Difference in years between previous job and current job   |
|training_hours            |Training hours completed                                   |
|target                    |0 – Not looking for job change                             |
|                          |1 – Looking for a job change                               |

<h1 style="background-color:magenta; font-family:newtimeroman; font-size:170%; text-align:left;"> 3.1) Read Data </h1>

In [None]:
# Load the three HR Analytics CSV files from the Kaggle input mount:
# the training data, the test data and the sample submission file.
train = pd.read_csv("/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_train.csv")
test = pd.read_csv("/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_test.csv")
sub = pd.read_csv("/kaggle/input/hr-analytics-job-change-of-data-scientists/sample_submission.csv")

In [None]:
display(train.head(3))
display(test.head(3))
display(sub.head(3))

In [None]:
display(train.shape)
display(test.shape)
display(sub.shape)

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would only add a stray "None" to the output.
train.info()
test.info()

In [None]:
trainoriginal = train.copy()
testoriginal = test.copy()

<h1 style="background-color:skyblue; font-family:newtimeroman; font-size:170%; text-align:left;"> 3.1.1) target </h1>

In [None]:
train['target'].value_counts()

In [None]:
# Bar chart of the target distribution.
# value_counts() orders its result by frequency, not by class value, so the tick
# labels are looked up from the class value of each bar instead of being written
# in a fixed order that may not match the bars.
# Class meaning (see the feature table above): 0 = not looking, 1 = looking.
target_labels = {0: 'Not looking for job change', 1: 'Looking for job change'}
y = train['target'].value_counts()
x = [target_labels[int(value)] for value in y.index]
plt.bar(x, y, color='orangered')
plt.title('target Distribution', fontweight='bold', fontsize=20)
plt.xlabel('target', fontweight='bold', fontsize=15)
plt.ylabel('Frequency', fontweight='bold', fontsize=15)
plt.show();

<h1 style="background-color:skyblue; font-family:newtimeroman; font-size:170%; text-align:left;"> 3.1.2) gender </h1>

In [None]:
train['gender'].value_counts()

In [None]:
# Bar chart of the gender distribution.
# Tick labels are taken from the value_counts() index, so every label is
# guaranteed to sit under its own count even if the category order or the set of
# categories in the data changes.
y = train['gender'].value_counts()
x = y.index.tolist()
plt.bar(x, y, color='orangered')
plt.title('gender Distribution', fontweight='bold', fontsize=20)
plt.xlabel('gender', fontweight='bold', fontsize=15)
plt.ylabel('Frequency', fontweight='bold', fontsize=15)
plt.show();

<h1 style="background-color:skyblue; font-family:newtimeroman; font-size:170%; text-align:left;"> 3.1.3) education_level </h1>

In [None]:
train['education_level'].value_counts()

In [None]:
# Bar chart of the education_level distribution.
# Labels come from the value_counts() index rather than a hand-written list, so
# they cannot drift out of step with the counts they describe.
plt.figure(figsize=(8,6))
y = train['education_level'].value_counts()
x = y.index.tolist()
plt.bar(x, y, color='orangered')
plt.title('education_level Distribution', fontweight='bold', fontsize=20)
plt.xlabel('education_level', fontweight='bold', fontsize=15)
plt.ylabel('Frequency', fontweight='bold', fontsize=15)
plt.show();

<h1 style="background-color:skyblue; font-family:newtimeroman; font-size:170%; text-align:left;"> 3.1.4) enrolled_university </h1>

In [None]:
train['enrolled_university'].value_counts()

In [None]:
# Bar chart of the enrolled_university distribution.
# Labels come from the value_counts() index rather than a hand-written list, so
# they cannot drift out of step with the counts they describe.
plt.figure(figsize=(8,6))
y = train['enrolled_university'].value_counts()
x = y.index.tolist()
plt.bar(x, y, color='orangered')
plt.title('enrolled_university Distribution', fontweight='bold', fontsize=20)
plt.xlabel('enrolled_university', fontweight='bold', fontsize=15)
plt.ylabel('Frequency', fontweight='bold', fontsize=15)
plt.show();

<h1 style="background-color:skyblue; font-family:newtimeroman; font-size:170%; text-align:left;"> 3.1.5) relevent_experience </h1>

In [None]:
train['relevent_experience'].value_counts()

In [None]:
# Bar chart of the relevent_experience distribution.
# Labels come from the value_counts() index rather than a hand-written list, so
# they cannot drift out of step with the counts they describe.
plt.figure(figsize=(8,6))
y = train['relevent_experience'].value_counts()
x = y.index.tolist()
plt.bar(x, y, color='orangered')
plt.title('relevent_experience Distribution', fontweight='bold', fontsize=20)
plt.xlabel('relevent_experience', fontweight='bold', fontsize=15)
plt.ylabel('Frequency', fontweight='bold', fontsize=15)
plt.show();

<h1 style="background-color:skyblue; font-family:newtimeroman; font-size:170%; text-align:left;"> 3.1.6) experience </h1>

In [None]:
train['experience'].value_counts()

<h1 style="background-color:skyblue; font-family:newtimeroman; font-size:170%; text-align:left;"> 3.1.7) company_size </h1>

In [None]:
train['company_size'].value_counts()

<h1 style="background-color:magenta; font-family:newtimeroman; font-size:170%; text-align:left;"> 3.2) EDA (Exploratory Data Analysis) </h1>

<h1 style="background-color:skyblue; font-family:newtimeroman; font-size:170%; text-align:left;"> 3.2.1) Drop Unwanted Features </h1>

In [None]:
# Drop the identifier and city-code columns before modelling.
# The frames are rebuilt from the untouched copies (trainoriginal / testoriginal)
# so this cell is idempotent: re-running it does not raise a KeyError for
# columns that an earlier run already removed.
train = trainoriginal.drop(['enrollee_id', 'city'], axis=1)
test = testoriginal.drop(['enrollee_id', 'city'], axis=1)

- Random forests can also handle missing values. 

<h1 style="background-color:skyblue; font-family:newtimeroman; font-size:170%; text-align:left;"> 3.2.2) Missing Values </h1>

#### Visualize missing values (NaN) values using Missingno Library

a) Visualize missing values as a matrix

b) Visualize missing values as a barplot

c) Visualize missing values as a heatmap

d) Visualize missing values as a dendrogram

In [None]:
import missingno as msno

In [None]:
display(train.isnull().sum())
print('-'*40)
display(test.isnull().sum())

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:left;"> 3.2.2.1) Visualize missing values as a matrix </h1>

In [None]:
# Visualize missing values as a matrix.
# NOTE(review): `color` looks like an RGB tuple with components in [0, 1] —
# confirm against the missingno documentation before changing it.
msno.matrix(train,figsize=(11,7), fontsize=12, color=(1, 0.38, 0.27));

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:left;"> 3.2.2.2) Visualize missing values as a barplot </h1>

In [None]:
# Visualize the number of missing values as a bar chart
# color="dodgerblue" "orangered"
msno.bar(train, color="dodgerblue", sort="ascending", figsize=(13,7), fontsize=12);

In [None]:
fig = plt.figure(figsize=(15,7))

ax1 = fig.add_subplot(1,2,1)
msno.bar(train, color="tomato", fontsize=12, ax=ax1);

ax2 = fig.add_subplot(1,2,2)
msno.bar(train, log=True, color="tab:green", fontsize=12, ax=ax2);

plt.tight_layout()

In [None]:
# Count missing values per column, keep only the columns that have any, and
# order them from most to fewest missing.
# DataFrame.sum() is called directly instead of np.sum(DataFrame): the numpy
# form goes through an axis=None reduction whose column-wise result pandas has
# deprecated, whereas .sum() is explicitly per column.
NaN = train.isnull().sum()
NaN_Col = NaN.loc[NaN != 0].sort_values(ascending=False)

plt.figure(figsize=(8,6))
sns.barplot(x=NaN_Col.index, y=NaN_Col)
plt.ylabel("Missing Value Count", size=20)
plt.xlabel("Feature Name", size=20)
plt.xticks(rotation=90)
plt.show()

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:left;"> 3.2.2.3) Visualize missing values as a heatmap </h1>

In [None]:
# Visualize the correlation between the number of missing values in different columns as a heatmap
msno.heatmap(train, cmap="RdYlGn", figsize=(10,5), fontsize=12)

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:left;"> 3.2.2.4) Visualize missing values as a dendrogram </h1>

In [None]:
msno.dendrogram(train, figsize=(12,7), fontsize=12)

In [None]:
fig = plt.figure(figsize=(15,7))

ax1 = fig.add_subplot(1,2,1)
msno.dendrogram(train, orientation="right", method="centroid", fontsize=12, ax=ax1);

ax2 = fig.add_subplot(1,2,2)
msno.dendrogram(train, orientation="top", method="ward", fontsize=12, ax=ax2);

plt.tight_layout()

In [None]:
# Correlation heatmap of the numeric features of the training frame.
# Object (string) columns are excluded explicitly: recent pandas versions no
# longer drop them silently inside DataFrame.corr() and raise on string values.
f, ax = plt.subplots(figsize=(10, 8))
numeric_corr = train.select_dtypes(include='number').corr()
# Draw on the axes created above rather than on the implicit current axes.
sns.heatmap(numeric_corr, vmax=.8, square=True, annot=True, cmap='Blues', ax=ax);

<h1 style="background-color:magenta; font-family:newtimeroman; font-size:170%; text-align:left;"> 3.3) Model building and Evaluation </h1>

In [None]:
# One-hot encode the categorical columns of the training data into indicator columns.
train_OHE = pd.get_dummies(train)

# Encode the competition data the same way, then align it to the training
# feature columns (everything except 'target'). Encoding the two files
# independently would otherwise produce a different set or order of columns
# whenever a category occurs in only one of them, and the fitted models would
# reject or misread the test frame at predict time.
# Indicator columns absent from the test file are added as all zeros.
ohe_feature_columns = train_OHE.columns.drop('target')
test_OHE = pd.get_dummies(test).reindex(columns=ohe_feature_columns, fill_value=0)

In [None]:
X1 = train_OHE.drop('target', axis=1)  # Features
y1 = train_OHE['target']               # Labels

In [None]:
# Split into training and test sets: 80% training, 20% test.
# random_state pins the shuffle so the accuracies and feature importances
# reported for the models below are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.2, random_state=5)

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:left;"> 3.3.1) Random Forest </h1>

In [None]:
# Create a random forest classifier with 100 trees.
# random_state makes the fitted forest (and its feature importances) repeatable.
rf = RandomForestClassifier(n_estimators=100, random_state=5)

# Fit the forest on the training split of the one-hot encoded HR data.
rf.fit(X_train, y_train)

In [None]:
y_pred = rf.predict(X_test)
y_pred_test_rf = rf.predict(test_OHE)

In [None]:
print("Accuracy:",accuracy_score(y_test, y_pred))

In [None]:
# Random-forest feature importances for the HR model, indexed by feature name
# and sorted from most to least important.
feature_imp1 = pd.Series(rf.feature_importances_, index=X1.columns).sort_values(ascending=False)

# Tall figure so that every one-hot encoded feature gets a readable bar.
fig, ax = plt.subplots(figsize=(15, 20))
sns.barplot(x=feature_imp1, y=feature_imp1.index, ax=ax)

# Label the axes so the figure can be read on its own.
ax.set_xlabel('Feature Importance Score')
ax.set_ylabel('Features')
ax.set_title("Visualizing Important Features")
# No legend is drawn: none of the bars carries a label, so a legend would be
# empty and matplotlib would only complain that it found no labelled artists.
plt.show()

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:left;"> 3.3.2) XGBoost </h1>

xgb1 = XGBClassifier()
xgb1.fit(X_train, y_train)

# make predictions for test set
y_pred_xgb1 = xgb1.predict(X_test)
predictions = [round(value) for value in y_pred_xgb1]

accuracy = accuracy_score(y_test, predictions)
accuracy

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:170%; text-align:left;"> 3.3.3) LGBM </h1>

In [None]:
# Hyper-parameters passed to LGBMClassifier further down (regularisation,
# column/row sampling, learning rate, tree shape and categorical handling).
# NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0,
# which is not set here — confirm whether row subsampling was intended.
lgbm_parameters = dict(
    reg_alpha=0.00388218567052311,
    reg_lambda=8.972335390951376e-05,
    colsample_bytree=0.18375780999902297,
    subsample=0.013352256062576087,
    learning_rate=0.002597839272059483,
    max_depth=44,
    num_leaves=15,
    min_child_samples=89,
    cat_smooth=56,
    cat_l2=22.375773634793603,
    max_bin=33,
    min_data_per_group=89,
)

In [None]:
lgbm_parameters['metric'] = 'binary_logloss'
lgbm_parameters['objective'] = 'binary'
lgbm_parameters['n_estimators'] = 15000

In [None]:
lgbm_model = LGBMClassifier(**lgbm_parameters)

In [None]:
lgbm_model.fit(X_train, y_train)

In [None]:
# Predict with the LightGBM model on the hold-out split (y_pred) and on the
# one-hot encoded competition test set.
y_pred = lgbm_model.predict(X_test)
# NOTE(review): despite the "_rf" suffix this now holds LightGBM predictions; it
# overwrites the random-forest predictions stored under the same name earlier,
# and it is the array written to the submission file at the end of the notebook.
y_pred_test_rf = lgbm_model.predict(test_OHE)

In [None]:
print("Accuracy:",accuracy_score(y_test, y_pred))

In [None]:
# Plot the LightGBM feature importances (at most 60 features, thick bars).
# The figure size is passed to plot_importance directly instead of being written
# into plt.rcParams, so it does not leak into every later figure in the notebook.
lightgbm.plot_importance(lgbm_model, max_num_features=60, height=.9, figsize=(12, 22))

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;"> 4) Boston </h1>

In [None]:
import shap

In [None]:
from sklearn.datasets import load_boston

from sklearn.inspection import permutation_importance

from xgboost import XGBRegressor

In [None]:
# Load the Boston housing data into a feature DataFrame (X, one column per
# boston.feature_names entry) and a target array (y), then hold out 25% of the
# rows for testing with a fixed seed.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs on an older scikit-learn — confirm the version.
boston = load_boston()
X = pd.DataFrame(boston.data, columns=boston.feature_names)
y = boston.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=12)

In [None]:
X.head(3)

In [None]:
xgb = XGBRegressor(n_estimators=100)
xgb.fit(X_train, y_train)

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:180%; text-align:left;"> 4.1) Xgboost Feature Importance Computed in 3 Ways with Python </h1>

#### a) Feature Importance built-in the Xgboost algorithm,

#### b) Feature Importance computed with Permutation method,

#### c) Feature Importance computed with SHAP values.

### About Xgboost Built-in Feature Importance

- There are several types of feature importance in Xgboost, and it can be computed in several different ways. The **default** type is **gain** if you construct the model with the scikit-learn-like API (see the docs). When you access the Booster object and get the importance with the **get_score** method, the default is **weight**. You can check the type of importance with **xgb.importance_type.**

- The **gain** type shows the average gain across all splits where feature was used.

- The **weight** type shows the number of times the feature is used to split the data. This type of feature importance can favor **numerical and high-cardinality features**.

- There are also **cover, total_gain, total_cover** types of importance.

<h1 style="background-color:orange; font-family:newtimeroman; font-size:180%; text-align:left;"> a) Xgboost Built-in Feature Importance </h1>

In [None]:
# Random Forest we would do the same to get importances
print(xgb.feature_importances_)

In [None]:
# plot
plt.figure(figsize=(9,8))
plt.bar(range(len(xgb.feature_importances_)), xgb.feature_importances_)
plt.show()

In [None]:
plt.figure(figsize=(9,8))
plt.barh(boston.feature_names, xgb.feature_importances_);

In [None]:
# To have even better plot, let’s sort the features based on importance value:

plt.figure(figsize=(9,8))
sorted_idx = xgb.feature_importances_.argsort()
plt.barh(boston.feature_names[sorted_idx], xgb.feature_importances_[sorted_idx]);
plt.xlabel("Xgboost Feature Importance")
plt.show()

<h1 style="background-color:orange; font-family:newtimeroman; font-size:180%; text-align:left;"> b) Permutation Based Feature Importance </h1>

In [None]:
# Permutation importance: shuffle one feature at a time on the test set and
# measure how much the model's score changes; the features whose shuffling
# hurts performance the most are the most important ones.
# random_state fixes the shuffles so the ranking, and the conclusions drawn from
# it below, are reproducible across runs (same seed as the Boston split above).
perm_importance = permutation_importance(xgb, X_test, y_test, random_state=12)

In [None]:
plt.figure(figsize=(9,10))
sorted_idx = perm_importance.importances_mean.argsort()
plt.barh(boston.feature_names[sorted_idx], perm_importance.importances_mean[sorted_idx])
plt.xlabel("Permutation Importance")

#### The permutation-based importance is computationally expensive (for each feature there are several repeats of shuffling). The permutation-based method can also have problems with highly correlated features. Let's check the correlation in our dataset:

In [None]:
def correlation_heatmap(train):
    """Draw an annotated correlation heatmap for the numeric columns of a frame.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame whose pairwise column correlations are plotted. Non-numeric
        columns are ignored.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # Restrict to numeric columns so the helper also works on frames that still
    # contain string/object columns, which DataFrame.corr() can raise on.
    correlations = train.select_dtypes(include='number').corr()

    fig, ax = plt.subplots(figsize=(13,13))
    # Draw on the axes created above rather than on the implicit current axes.
    sns.heatmap(correlations, vmax=1.0, center=0, fmt='.2f', cmap="YlGnBu",
                square=True, linewidths=.5, annot=True, cbar_kws={"shrink": .70},
                ax=ax)
    plt.show()


# Correlations between the Boston training features, with the columns ordered by
# sorted_idx (the permutation-importance ordering computed in the previous cell).
correlation_heatmap(X_train[boston.feature_names[sorted_idx]])

- Based on above results, I would say that it is **safe to remove: ZN, CHAS, AGE, INDUS.**

- Their importance based on permutation is very low and they are **not highly correlated with other features (abs(corr) < 0.8).**

<h1 style="background-color:orange; font-family:newtimeroman; font-size:180%; text-align:left;"> c) Feature Importance Computed with SHAP Values </h1>

- The third method to compute feature importance in Xgboost is to use the SHAP package. It is model-agnostic and uses the Shapley values from game theory to estimate how each feature contributes to the prediction.

In [None]:
explainer = shap.TreeExplainer(xgb)
shap_values = explainer.shap_values(X_test)

In [None]:
# To visualize the feature importance we need to use summary_plot method:
shap.summary_plot(shap_values, X_test, plot_type="bar")

In [None]:
shap.summary_plot(shap_values, X_test)

In [None]:
shap.dependence_plot("LSTAT", shap_values, X_test)

<h1 style="background-color:LimeGreen; font-family:newtimeroman; font-size:200%; text-align:center; border-radius: 15px 50px;"> Submission </h1>

In [None]:
# Build the submission file: enrollee_id is taken from the untouched test copy
# (the working `test` frame had that column dropped) and paired with predictions.
# NOTE(review): y_pred_test_rf was last assigned from lgbm_model.predict(...), so
# despite its name this submits the LightGBM predictions — confirm that is intended.
submission = pd.DataFrame({"enrollee_id": testoriginal["enrollee_id"], "target": y_pred_test_rf})
# Written to the current working directory without the DataFrame index column.
submission.to_csv('submission.csv', index=False)