In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Location of the training CSV, declared once so both frames below are
# guaranteed to come from the same file.
TRAIN_CSV = '../input/train_LZdllcl.csv'

# data1: working frame for the exploration and feature engineering cells.
data1 = pd.read_csv(TRAIN_CSV)
# data3: independent copy of the raw rows (label-encoded further down for the
# baseline model). Copying avoids parsing the same file a second time.
data3 = data1.copy()
data1.head()

In [None]:
# Preview the department / region / education / gender / recruitment_channel
# columns side by side.
categorical_preview = [
    'department',
    'region',
    'education',
    'gender',
    'recruitment_channel',
]
data1[categorical_preview].head()


In [None]:
# Columns treated as categorical.
# (Fixed: the last entry was misspelt 'recruiment_channel', which is not a
# column name used anywhere else in the notebook.)
cat_col = ['department', 'region', 'education', 'gender', 'recruitment_channel']

# Full feature list for the baseline model: the categorical columns followed
# by the numeric ones. `cat_col + [...]` builds a new list, so cat_col itself
# is left untouched.
cols = cat_col + [
    'no_of_trainings',
    'age',
    'previous_year_rating',
    'length_of_service',
    'KPIs_met >80%',
    'awards_won?',
    'avg_training_score',
]

In [None]:
def _label_codes(column):
    """Map each distinct value of ``column`` to an integer code.

    Codes are 0, 1, 2, ... assigned in the order ``column.unique()`` yields
    the values.
    """
    return {value: code for code, value in enumerate(column.unique())}


# One value -> integer lookup per categorical column of data3.
dict_department = _label_codes(data3['department'])
dict_region = _label_codes(data3['region'])
dict_recr = _label_codes(data3['recruitment_channel'])
dict_edu = _label_codes(data3['education'])
dict_gen = _label_codes(data3['gender'])
            

In [None]:
# Replace each categorical column of data3 by its integer code, using the
# lookup dict built for that column.
label_maps = {
    'department': dict_department,
    'region': dict_region,
    'recruitment_channel': dict_recr,
    'education': dict_edu,
    'gender': dict_gen,
}
for column_name, codes in label_maps.items():
    data3[column_name] = data3[column_name].map(codes)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,matthews_corrcoef
# 80/20 hold-out split of data3 with a fixed seed.
baseline_X = data3[cols]
baseline_y = data3['is_promoted']
x_train, x_test, y_train, y_test = train_test_split(
    baseline_X,
    baseline_y,
    test_size=0.20,
    random_state=123,
)
x_train.head()

In [None]:
from xgboost import XGBClassifier
# XGBoost classifier with default hyper-parameters, fitted on the training split.
clf = XGBClassifier()
clf = clf.fit(x_train[cols], y_train)


In [None]:
# Score the fitted model on the hold-out rows: per-class report, then the
# Matthews correlation coefficient.
baseline_pred = clf.predict(x_test[cols])
print(classification_report(y_test, baseline_pred))
print(matthews_corrcoef(y_test, baseline_pred))

In [None]:
data1.dtypes
# The frame contains both continuous (numeric) and categorical variables.
# We will start with the numerical variables.

There are broadly two variable categories-
 - Categorical Variables
 - Numerical/Continuous Variables

There are two major classes of categorical data- Nominal and Ordinal

**Nominal**- there is no concept of order among the values, e.g. racial types: Asian, American, European, etc. There is no order here.
**Ordinal**- there is some sense of order among the values, e.g. shoe sizes S, M, L, XL, XXL.

**#Handling Categorical Variables**- A few popular methods
1. Creation of Dummies
2. Mean Encoding

**Dummy creation**- Here we create dummy (indicator) columns for the unique values that the variable can take. If the variable can take m unique values, we create m-1 dummies; the omitted value is implied when all the dummies are 0.


Let's take the column **department**. It has 9 unique values in total, so we will create 8 dummies. The problem with this approach is that if the number of unique values the variable can take is high (high cardinality), it can increase the number of columns drastically.

In [None]:
# Number of distinct department values (m).
print(len(data1.department.unique()))
# drop_first=True keeps m-1 indicator columns, matching the m-1 dummies
# described in the text: the dropped level is implied when every remaining
# indicator is 0.
dum_department = pd.get_dummies(data1['department'], prefix='department', drop_first=True)
dum_department.head()

**Mean Encoding**- each value of a categorical variable is replaced by the proportion of positive labels observed for that value. The problem with this method is overfitting, because the encoding is computed from the target itself. Mean encoding is one of the key transformations applied to categorical variables when using Gradient Boosting: there the tree height is monitored and always kept low, since allowing taller trees leads to overfitting.

In [None]:
# First rows of the categorical column next to the target.
data1.loc[:, ['department', 'is_promoted']].head()


In [None]:
# Rows per department, as a two-column frame: department, count.
dpt = (
    data1['department']
    .value_counts()
    .rename_axis('department')
    .reset_index(name='count')
)

# Sum of is_promoted per department.
target_label = data1.groupby('department', as_index=False)['is_promoted'].sum()

# Mean encoding = is_promoted sum / row count, computed over all rows of data1.
final_encoded = (
    dpt.merge(target_label, on='department', how='left')
    .assign(mean_encoded=lambda frame: frame['is_promoted'] / frame['count'])
    [['department', 'mean_encoded']]
)
final_encoded

In [None]:
# Attach each row's department-level mean encoding (left join keeps every
# row of data1).
data1 = data1.merge(final_encoded, how='left', on='department')
data1.loc[:, ['department', 'mean_encoded']].head()

Most machine learning algorithms recognise only numbers. Therefore, all non-numeric ordinal variables need to be transformed into numeric ones. This can be achieved with a dictionary and the pandas `map` method.

> Before starting any modelling, one must first check the problem context and try to get as much information as possible. For the given dataset we are provided with the average training score. But before we comment on this feature's importance, we should take a step back and think about how a person would be promoted in a multi-state, multi-department company-
* Promotion would happen department wise
* Promotion would be region wise
* The number of promotions would depend on the particular department. Some departments would inherently promote more people than others


In [None]:
# avg_training_score distribution: promoted rows in red, the rest in green.
promoted_scores = data1.loc[data1['is_promoted'] == 1, 'avg_training_score']
not_promoted_scores = data1.loc[data1['is_promoted'] == 0, 'avg_training_score']
sns.distplot(promoted_scores, color='r')
sns.distplot(not_promoted_scores, color='g')
# Observation: the idea behind plotting the distribution separately for each
# class is to identify regions where the two curves do not overlap. If such
# pockets of non-overlap exist, the variable can clearly differentiate
# (classify) the target and would add value to the model.

In [None]:
# Mean avg_training_score of each (region, department) group, aligned
# row-by-row with data1. transform('mean') replaces the earlier
# merge-then-rename of the _x/_y suffixed columns: it yields the same
# 'mean_reg_dpt' column, but re-running the cell just overwrites it instead
# of producing a second, duplicate column.
data1['mean_reg_dpt'] = (
    data1.groupby(['region', 'department'])['avg_training_score'].transform('mean')
)
# Training score relative to the row's region/department average.
data1['new_avg_trng_score'] = data1['avg_training_score'] / data1['mean_reg_dpt']
data1.head()

In [None]:
# Distribution of the engineered score for each class, drawn on one explicit
# Axes so both curves and the legend share it.
fig, ax = plt.subplots(figsize=(16, 7))
sns.distplot(data1[data1.is_promoted == 0]['new_avg_trng_score'], color='g', label='Not Promoted', ax=ax)
sns.distplot(data1[data1.is_promoted == 1]['new_avg_trng_score'], color='r', label='Promoted', ax=ax)
# The label= arguments are only rendered once a legend is drawn.
ax.legend();

# As we can see, the engineered feature should be able to classify more
# accurately, as the regions of non-overlap are more clearly defined.

In [None]:
# Summary statistics of the age column.
age_summary = data1['age'].describe()
print(age_summary)

In [None]:
# Age distribution: promoted rows in red, the rest in green.
promoted_age = data1.loc[data1['is_promoted'] == 1, 'age']
not_promoted_age = data1.loc[data1['is_promoted'] == 0, 'age']
sns.distplot(promoted_age, color='r')
sns.distplot(not_promoted_age, color='g')

**Binning Strategies and handling numeric variables**

Binning is of two types- **Fixed** and **Adaptive**

**Fixed Binning**, as the name suggests, is fixed: the bin boundaries are predefined, which may lead to imperfect bins with irregular density and very few observations in some bins.

**Adaptive Binning**- Quantile-based binning is a good strategy to use for adaptive binning. Quantiles are specific values or cut-points which help in partitioning the continuous-valued distribution of a specific numeric field into discrete contiguous bins or intervals. Thus, q-quantiles partition a numeric attribute into q partitions holding (roughly) equal numbers of observations. Popular examples of quantiles include the 2-quantile, known as the median, which divides the data distribution into two equal bins; the 4-quantiles, known as the quartiles, which divide the data into 4 equal bins; and the 10-quantiles, also known as the deciles, which create 10 equal-frequency bins (equal counts, not equal width).

In [None]:
# Decile cut points 0.0, 0.1, ..., 1.0; labels=False stores the integer bin
# index instead of an interval label.
decile_edges = [step / 10 for step in range(11)]
data1['age_bin'] = pd.qcut(data1['age'], q=decile_edges, labels=False)

In [None]:
# Distribution of the age bin index: promoted rows in red, the rest in green.
promoted_age_bin = data1.loc[data1['is_promoted'] == 1, 'age_bin']
not_promoted_age_bin = data1.loc[data1['is_promoted'] == 0, 'age_bin']
sns.distplot(promoted_age_bin, color='r')
sns.distplot(not_promoted_age_bin, color='g')


#A few things to take care of while handling numerical variables.
* Clipping values beyond a certain threshold (outlier clipping)- It's a very simple exercise. One looks at the quantile plot and decides the value beyond which the variable is not adding more information, e.g. the number of likes seen in light of a product getting picked by a customer. Likes can be bucketed into certain quantiles based on the data.
* Binning

In [None]:
# length_of_service distribution: promoted rows in red, the rest in green.
promoted_service = data1.loc[data1['is_promoted'] == 1, 'length_of_service']
not_promoted_service = data1.loc[data1['is_promoted'] == 0, 'length_of_service']
sns.distplot(promoted_service, color='r')
sns.distplot(not_promoted_service, color='g')

Based on the plot, it can be said that the age bins are not adding value. But our modelling is not bivariate modelling, so it may happen that age becomes important once it is seen together with other variables.
From a domain standpoint, age should be one of the factors that matter when promoting people. Individually it doesn't make sense to promote people based on age, but together with other variables it may be useful.

In [None]:
data1.columns

# Features shared by both candidate feature sets.
base_features = [
    'gender',
    'no_of_trainings',
    'age',
    'length_of_service',
    'KPIs_met >80%',
    'awards_won?',
]
# cols: shared features plus the raw training score.
cols = base_features + ['avg_training_score']
# cols_a: shared features plus the two engineered region/department columns.
cols_a = base_features + ['mean_reg_dpt', 'new_avg_trng_score']
category_cols = [
    'gender',
    'recruitment_channel',
    'region',
    'department',
]

In [None]:
# Distinct values currently stored in the gender column.
data1['gender'].unique()

In [None]:
# NOTE: data2 is an alias of data1, not a copy, so the gender encoding below
# is also visible through data1. Cells further down fit models on
# data1[cols_a], which includes 'gender', so the aliasing is kept on purpose.
data2 = data1

# Encode gender only while the column still holds the raw labels. Without the
# guard, re-running this cell pushes the already-encoded 0/1 values through
# the dict again and silently turns the whole column into NaN.
if not pd.api.types.is_numeric_dtype(data2['gender']):
    data2['gender'] = data2['gender'].map({'m': 1, 'f': 0})

# One-hot encode with the first level dropped. No prefix is supplied, so the
# dummy columns are named after the raw category values; the prefix_sep=
# argument previously passed here had no effect without a prefix.
dum_recr = pd.get_dummies(data2['recruitment_channel'], drop_first=True)
dum_dpt = pd.get_dummies(data2['department'], drop_first=True)

# Dummy columns first, then every existing column of the frame.
data2 = pd.concat([dum_dpt, dum_recr, data2], axis=1)


In [None]:
# Extend the feature list with the dummy column names. dict.fromkeys drops
# repeated names while keeping first-seen order, so re-running this cell does
# not append the dummy columns a second time (duplicate feature names).
cols = list(dict.fromkeys(cols + list(dum_recr.columns) + list(dum_dpt.columns)))

In [None]:
from sklearn.model_selection import train_test_split
# 80/20 hold-out split of data2 restricted to the columns in cols, fixed seed.
dummy_X = data2[cols]
x_train, x_test, y_train, y_test = train_test_split(
    dummy_X,
    data2['is_promoted'],
    test_size=0.20,
    random_state=123,
)
x_train.head()

In [None]:
# Automated feature selection using standard scikit-learn style packages. One of the most popular algorithms for this is Random Forest.
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier


In [None]:
# Re-split on the cols_a feature set (same seed and test fraction), fit
# XGBoost with default settings, and print the hold-out metrics.
engineered_X = data1[cols_a]
x_train, x_test, y_train, y_test = train_test_split(
    engineered_X,
    data1['is_promoted'],
    test_size=0.20,
    random_state=123,
)
clf = XGBClassifier().fit(x_train[cols_a], y_train)
xgb_pred = clf.predict(x_test[cols_a])
print(classification_report(y_test, xgb_pred))
print(matthews_corrcoef(y_test, xgb_pred))

The bar plot below of feature name against feature importance, as produced by many machine learning packages (Random Forest, LASSO Regression, CatBoost, etc.), helps in reducing the feature scope.

In [None]:
# Re-binds category_cols: the four-column list assigned earlier is replaced
# by a list containing only 'gender'.
category_cols = ['gender']

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fit a random forest on the cols_a features and report hold-out metrics.
# random_state is pinned to the seed already used for the train/test split so
# that the scores and the importance ranking are reproducible between runs.
clf = RandomForestClassifier(random_state=123).fit(x_train[cols_a], y_train)
rf_pred = clf.predict(x_test[cols_a])
print(classification_report(y_test, rf_pred))
print(matthews_corrcoef(y_test, rf_pred))

# One horizontal bar per feature in cols_a, sized by the forest's importance.
ax = sns.barplot(y=cols_a, x=clf.feature_importances_)
ax.set(xlabel='feature importance', ylabel='feature');
# Gender, awards_won, recruitment channel and no_of_trainings received are a
# few of the features marked as least important.
# NOTE(review): recruitment_channel is not part of cols_a, so it cannot appear
# in this plot - confirm which feature set this remark refers to.
# Let's analyse them before we cross them off our list.
# gender - this suggests that, in the dataset we have, getting promoted is
# gender-insensitive; more a matter of work culture.