In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings


# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides any deprecation/future warnings emitted by the
# libraries used in the cells below — consider narrowing the filter.
warnings.filterwarnings("ignore")
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Load the training and test tables from the CSV files in the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

### Visualization

## Data Cleaning

In [3]:
# Frequency of each previous-year rating in the training table.
# value_counts() leaves missing values out by default, so NaN rows are not listed here.
train_df["previous_year_rating"].value_counts()

3.0    18618
5.0    11741
4.0     9877
1.0     6223
2.0     4225
Name: previous_year_rating, dtype: int64

In [4]:
# Replace missing previous-year ratings with the sentinel value -9999 in both tables.
# The result is assigned back to the column instead of calling fillna(inplace=True) on
# the `df[col]` selection: that chained in-place form works on an intermediate object
# and is not guaranteed to update the parent frame (it does not under pandas
# Copy-on-Write).
train_df["previous_year_rating"] = train_df["previous_year_rating"].fillna(-9999)
test_df["previous_year_rating"] = test_df["previous_year_rating"].fillna(-9999)

In [5]:
# Categorical encoding: region number, then one-hot encoding (OHE)

In [6]:
# Derive a numeric `reg_num` feature from the first run of digits in each `region` string.
# - The pattern is a raw string: "\d" in a plain literal is an invalid escape sequence
#   (the regular expression itself is unchanged).
# - expand=False makes str.extract return a Series rather than a one-column DataFrame,
#   so a plain column is assigned.
# NOTE: astype(int) raises if a region value contains no digits (extract yields NaN).
train_df["reg_num"] = train_df["region"].str.extract(r"(\d+)", expand=False).astype(int)

test_df["reg_num"] = test_df["region"].str.extract(r"(\d+)", expand=False).astype(int)

In [7]:
# One-hot encode `department`. The training table defines the dummy columns; the test
# dummies are reindexed onto that same column set (categories absent from the test file
# become all-zero columns, categories unseen in training are dropped) so both tables end
# up with identical encoded columns even when a category is missing from one file.
# The "dep_" prefix is kept unchanged, so generated names look like "dep__<category>".
dep_train = pd.get_dummies(train_df["department"], prefix="dep_")
dep_test = pd.get_dummies(test_df["department"], prefix="dep_").reindex(
    columns=dep_train.columns, fill_value=0
)
train_df = pd.concat([train_df, dep_train], axis=1)
test_df = pd.concat([test_df, dep_test], axis=1)

In [8]:
# One-hot encode `gender`. The training table defines the dummy columns; the test
# dummies are reindexed onto that same column set (categories absent from the test file
# become all-zero columns, categories unseen in training are dropped) so both tables end
# up with identical encoded columns even when a category is missing from one file.
# The "gen_" prefix is kept unchanged.
gen_train = pd.get_dummies(train_df["gender"], prefix="gen_")
gen_test = pd.get_dummies(test_df["gender"], prefix="gen_").reindex(
    columns=gen_train.columns, fill_value=0
)
train_df = pd.concat([train_df, gen_train], axis=1)
test_df = pd.concat([test_df, gen_test], axis=1)

In [9]:
# One-hot encode `recruitment_channel`. The training table defines the dummy columns; the
# test dummies are reindexed onto that same column set (categories absent from the test
# file become all-zero columns, categories unseen in training are dropped) so both tables
# end up with identical encoded columns even when a category is missing from one file.
# The "rec_" prefix is kept unchanged.
rec_train = pd.get_dummies(train_df["recruitment_channel"], prefix="rec_")
rec_test = pd.get_dummies(test_df["recruitment_channel"], prefix="rec_").reindex(
    columns=rec_train.columns, fill_value=0
)
train_df = pd.concat([train_df, rec_train], axis=1)
test_df = pd.concat([test_df, rec_test], axis=1)

In [10]:
# Encode the education level as a numeric code, using one shared mapping for both tables.
# Values that are not keys of the mapping (including missing ones) come out as NaN.
education_codes = {"Bachelor's": 2, "Master's & above": 4, "Below Secondary": 1}
train_df["education"] = train_df["education"].map(education_codes)
test_df["education"] = test_df["education"].map(education_codes)

In [11]:
# Replace missing education codes with the sentinel value -9999 in both tables.
# The result is assigned back to the column instead of calling fillna(inplace=True) on
# the `df[col]` selection: that chained in-place form works on an intermediate object
# and is not guaranteed to update the parent frame (it does not under pandas
# Copy-on-Write).
train_df["education"] = train_df["education"].fillna(-9999)
test_df["education"] = test_df["education"].fillna(-9999)

In [12]:
# Drop the raw categorical columns and the employee ID

In [13]:
# Keep the test-set employee IDs for the submission file, then remove the identifier
# and the raw categorical columns from both tables.
emp_id = test_df["employee_id"]
unused_columns = ["department", "region", "gender", "recruitment_channel", "employee_id"]
train_df = train_df.drop(columns=unused_columns)
test_df = test_df.drop(columns=unused_columns)

## Feature Engineering

## Data Prep

In [14]:
# Separate the `is_promoted` target from the predictors; the test table is used as-is.
X_train = train_df.drop(columns=["is_promoted"])
y_train = train_df["is_promoted"]

X_test = test_df

## Cross-validation (CV)

In [16]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import xgboost as xgb

In [17]:
# Random forest: mean F1 score over 5 cross-validation folds.
# NOTE(review): KFold is used without shuffling or stratification; if `is_promoted` is
# imbalanced, a stratified splitter may give steadier fold scores — confirm.
rf_model = RandomForestClassifier(class_weight="balanced", random_state=7)
kf = KFold(n_splits=5)
scores = cross_val_score(
    estimator=rf_model,
    X=X_train,
    y=y_train,
    cv=kf,
    scoring="f1",
)
x = scores.mean()
x

0.41465104967028077

In [34]:
# XGBoost: mean F1 score over 5 cross-validation folds.
# NOTE(review): KFold is used without shuffling or stratification; if `is_promoted` is
# imbalanced, a stratified splitter may give steadier fold scores — confirm.
xgb_model = xgb.XGBClassifier(
    scale_pos_weight=4,
    n_jobs=-1,
    silent=1,
    random_state=7,
)
kf = KFold(n_splits=5)
scores = cross_val_score(
    estimator=xgb_model,
    X=X_train,
    y=y_train,
    cv=kf,
    scoring="f1",
)
x = scores.mean()
x

0.49158747319754009

## Model

In [20]:
# Fit the class-weighted random forest on the full training set, then predict the
# test set.
rf_model = RandomForestClassifier(
    n_jobs=-1,
    class_weight="balanced",
    random_state=7,
).fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

In [22]:
# Fit the XGBoost classifier on the full training set, then predict the test set.
# This rebinds `y_pred`, which the random-forest cell also assigns.
# NOTE(review): `silent` looks like a legacy XGBoost option — confirm the installed
# version still accepts it.
model = xgb.XGBClassifier(
    scale_pos_weight=4,
    n_jobs=-1,
    silent=0,
    random_state=7,
)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

## Submission

In [23]:
# Assemble the two-column submission table (employee ID, predicted label) and write it
# to CSV without the index column.
submission = pd.DataFrame({"employee_id": emp_id, "is_promoted": y_pred})
submission.to_csv("sub_xgb3.csv", index=False)

In [21]:
# Tabulate the fitted random forest's feature importances, one row per training column,
# ordered from most to least important.
feature_importances = pd.DataFrame(
    {"importance": rf_model.feature_importances_},
    index=X_train.columns,
).sort_values(by="importance", ascending=False)

In [22]:
# Display the random-forest feature importances, largest first.
feature_importances

Unnamed: 0,importance
avg_training_score,0.259672
KPIs_met >80%,0.142093
age,0.115305
reg_num,0.101183
length_of_service,0.085014
previous_year_rating,0.081591
awards_won?,0.033918
dep__Sales & Marketing,0.023249
no_of_trainings,0.022309
education,0.022264
