In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from plotly.offline import init_notebook_mode, iplot, plot
import plotly as py
init_notebook_mode(connected=True)
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import seaborn as sns

import matplotlib.pyplot as plt

In [None]:
# Read the train and test splits of the HR Analytics dataset from the Kaggle input folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/hr-analytics-analytics-vidya/train.csv',
        '/kaggle/input/hr-analytics-analytics-vidya/test.csv',
    )
)

# Understanding the Data

In [None]:
# Preview the first rows of the training frame to see the available columns.
train.head()

In [None]:
# Schema overview: total number of columns and how many columns there are per dtype.
print(
    "Train data set dtypes: \n",
    f"Total Cols: {train.shape[1]}",
    f"{train.dtypes.value_counts()}",
    '-' * 30,
    sep="\n",
)

In [None]:
# Summary statistics of the training frame.
train.describe()

# Missing Values

In [None]:
# Count nulls per column once, then plot the counts as a bar chart.
missing_counts = train.isnull().sum()

data = go.Bar(x=missing_counts.index, y=missing_counts, name="Missing Values")
layout = go.Layout(barmode="group")
fig = go.Figure(data=data, layout=layout)
iplot(fig)

# Target Column

In [None]:
# Donut chart of the target distribution (promoted vs. not promoted).
target_counts = train['is_promoted'].value_counts()
labels, values = target_counts.index, target_counts

target_pie = go.Pie(labels=labels, values=values, hole=0.3)
fig = go.Figure(data=[target_pie])
fig.show()

The "Not Promoted" class dominates: it is roughly 9 times as frequent as the "Promoted" class.

# Categorical

In [None]:
# One donut chart per object-typed (categorical) column, showing the share of each level.
categorical = train.select_dtypes(include='object')
for col in categorical.columns:
    level_counts = train[col].value_counts()
    labels, values = level_counts.index, level_counts
    pie = go.Pie(labels=labels, values=values, hole=0.3, name=col)
    fig = go.Figure(data=[pie])
    fig.show()

# Numeric

In [None]:
# Keep every non-object column except the row identifier, which carries no signal.
numeric_cols = train.select_dtypes(exclude='object').drop(columns='employee_id')

In [None]:
# Histogram of every numeric column, laid out on a grid two panels wide.
# The number of rows is derived from the number of columns, so the cell keeps
# working if columns are added or removed (a fixed 4x2 grid would raise for a
# ninth trace).
hist_grid_cols = 2
hist_columns = list(numeric_cols.columns)
hist_grid_rows = max(1, -(-len(hist_columns) // hist_grid_cols))  # ceiling division

fig = make_subplots(rows=hist_grid_rows, cols=hist_grid_cols)

for i, col in enumerate(hist_columns):
    # add_trace with row/col replaces the deprecated append_trace.
    fig.add_trace(
        go.Histogram(x=train[col], nbinsx=100, name=col),
        row=(i // hist_grid_cols) + 1,
        col=(i % hist_grid_cols) + 1,
    )

fig.update_layout(
    title_text='Numeric_Values',
    height=225 * hist_grid_rows,  # 900 px for the usual four rows
    width=800
)
fig.show()

# Outliers

In [None]:
# For each continuous column draw four views of its distribution side by side:
# box plot, boxen plot, violin plot and strip plot.
con_col = ['age', 'length_of_service', 'avg_training_score']
distribution_plots = (sns.boxplot, sns.boxenplot, sns.violinplot)
for col in con_col:
    fig, axs = plt.subplots(1, 4, figsize=(16, 5))
    # The first three panels share the same call signature.
    for draw, panel in zip(distribution_plots, axs):
        draw(y=train[col], data=train, ax=panel)
    # The strip plot goes in the last panel with its own marker styling.
    sns.stripplot(y=train[col], data=train, size=4, color=".3", linewidth=0, ax=axs[3])

# Bi-Variate Analysis

In [None]:
# Count plot of every categorical column (region left out), split by promotion
# outcome. Each bar is annotated with its share of all training rows.
cols = categorical.drop('region', axis=1)
panel_cols = 2
panel_rows = max(1, -(-cols.shape[1] // panel_cols))  # ceiling division
total_rows = len(train)

plt.figure(figsize=(20, 12))
for j, i in enumerate(cols, start=1):
    plt.subplot(panel_rows, panel_cols, j)
    ax1 = sns.countplot(data=train, x=i, hue="is_promoted")
    for p in ax1.patches:
        height = p.get_height()
        # Skip NaN or zero-height patches (empty category/hue combinations or
        # zero-sized helper patches) so no stray "nan"/"0.00" labels are drawn.
        if not height > 0:
            continue
        ax1.text(
            p.get_x() + p.get_width() / 2.,
            height + 3,
            '{:1.2f}'.format(height / total_rows),
            ha="center",
            rotation=0,
        )

In [None]:
# Pairwise correlations between the numeric columns of the original (imbalanced) data.
plt.figure(figsize=(12, 8))
corr = numeric_cols.corr()
sns.heatmap(data=corr, annot=True, linewidth=0.5, cmap='Blues')

# Data Processing

NOTE: This is only basic preprocessing; its main purpose is to prepare the data for the next step, oversampling and undersampling.

In [None]:
# Basic preprocessing. The same grouping/binning is applied to train and test
# through one helper so the two frames cannot drift apart (previously the
# recruitment-channel grouping was applied to test only, leaving train with a
# different set of categories).

# Coarser groups for the categorical columns. Values missing from a mapping
# become NaN after .map().
mapp = {'Finance':'Analytics & Others','HR':'Analytics & Others','R&D':'Analytics & Others','Legal':'Analytics & Others','Procurement':'T&P','Technology':'T&P','Sales & Marketing':'S&M',
       'Operations':'Operations','Analytics':'Analytics & Others'}
mapp_1= {"Bachelor's":'UG & Below',"Master's & above":"Master's & above",'Below Secondary':'UG & Below'}
mapp_2= {'other':'other','sourcing':'s&r','referred':'s&r'}


def _group_and_bin(df):
    """Return a copy of ``df`` with grouped categories, binned numerics and no ``region``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw ``department``, ``education``,
        ``recruitment_channel``, ``age``, ``length_of_service``,
        ``no_of_trainings``, ``avg_training_score`` and ``region`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    out = df.copy()  # work on a copy so filtered slices do not trigger SettingWithCopyWarning
    out['department'] = out['department'].map(mapp)
    out['education'] = out['education'].map(mapp_1)
    out['recruitment_channel'] = out['recruitment_channel'].map(mapp_2)
    # pd.cut bins are right-inclusive: e.g. '<5' covers (0, 5] and 'twenties' covers (18, 30].
    out['age'] = pd.cut(out['age'],bins=[18,30,40,100],labels=['twenties','thirties','forty+'])
    out['length_of_service'] = pd.cut(out['length_of_service'],bins=[0,5,100],labels=['<5','5+'])
    out['no_of_trainings'] = pd.cut(out['no_of_trainings'],bins=[0,1,100],labels=['1','2+'])
    out['avg_training_score'] = pd.cut(out['avg_training_score'],bins=[0,50,60,70,80,100],labels=['5','6','7','8','9+'])
    return out.drop('region',axis=1)


# Row filtering is applied to the training data only: drop rows with missing
# values and the very long tenures (35 years or more).
train = train.dropna()
train = train[train['length_of_service'] < 35]

train = _group_and_bin(train)
test = _group_and_bin(test)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Object-typed columns still need to be turned into integer codes.
cats = [name for name in train.columns if train[name].dtypes == 'object']
print('Categories', cats)

for c in cats:
    # Stringify both splits once; NaN becomes the literal label 'nan'.
    train_labels = list(train[c].astype(str))
    test_labels = list(test[c].astype(str))
    # Fit on the union so a given label gets the same code in train and test.
    le = LabelEncoder()
    le.fit(train_labels + test_labels)
    train[c] = le.transform(train_labels)
    test[c] = le.transform(test_labels)
train.head()

In [None]:
# Integer-encode the binned columns. A fresh encoder is fitted per column on the
# labels of train and test together, so a given bin label always maps to the
# same code in both frames. This also removes the dependency on an encoder
# object left over from an earlier loop.
binned_cols = ['age', 'length_of_service', 'no_of_trainings', 'avg_training_score']
for binned_col in binned_cols:
    train_labels = train[binned_col].astype(str)
    test_labels = test[binned_col].astype(str)
    bin_encoder = LabelEncoder()
    bin_encoder.fit(pd.concat([train_labels, test_labels]))
    train[binned_col] = bin_encoder.transform(train_labels)
    test[binned_col] = bin_encoder.transform(test_labels)

# The row identifier is not a feature.
train = train.drop('employee_id',axis=1)
test = test.drop('employee_id',axis=1)

# Sampling

![](https://cdn-images-1.medium.com/max/1600/1*P93SeDGPGw0MhwvCcvVcXA.png)

### Our dataset contains only roughly 9 percent positive outcomes. Such an imbalance can cause classifiers to ignore the small class while concentrating on classifying the large one accurately.
There are two methods to overcome this: oversampling and undersampling.


# Oversampling

In [None]:
# Class counts of the target before oversampling.
print("Before Sampling")
train['is_promoted'].value_counts()

In [None]:
from imblearn.over_sampling import SMOTE

# Separate the target from the features: the label must not be fed to SMOTE as
# if it were a feature.
y_train_ada = train['is_promoted']
X_train_ada = train.drop(columns='is_promoted')

# A fixed seed makes the synthetic samples reproducible across runs.
X_oversampled, oversampled_trainY = SMOTE(random_state=42).fit_resample(X_train_ada, y_train_ada)

# Re-attach the resampled target so the following cells can keep using
# oversampled_df['is_promoted']. Assign by position to avoid any index mismatch.
oversampled_df = X_oversampled.assign(is_promoted=np.asarray(oversampled_trainY))

In [None]:
# Class counts of the target in the oversampled frame.
print("After Sampling")
oversampled_df['is_promoted'].value_counts()

In [None]:
# Correlation heatmap of the oversampled data.
plt.figure(figsize=(12, 8))
oversampled_corr = oversampled_df.corr()
sns.heatmap(data=oversampled_corr, annot=True, linewidth=0.5, cmap='Blues')

# Undersampling

In [None]:
# Class counts of the target before undersampling.
print("Before Sampling")
train['is_promoted'].value_counts()

In [None]:
# Random undersampling: keep every promoted row and draw an equally sized
# random subset of the non-promoted rows.
randomn_df = train.sample(frac=1,random_state=4)  # shuffle all rows

is_promoted = randomn_df.loc[randomn_df['is_promoted'] == 1]

# Size the majority sample from the actual minority count instead of a
# hard-coded number, so the classes stay balanced if the preprocessing changes.
non_promoted = randomn_df.loc[randomn_df['is_promoted'] == 0].sample(n=len(is_promoted),random_state=42)

undersample_df = pd.concat([is_promoted, non_promoted])

In [None]:
# Class counts of the target in the undersampled frame.
print("After Sampling")
undersample_df['is_promoted'].value_counts()

In [None]:
# Correlation heatmap of the undersampled data.
plt.figure(figsize=(12, 8))
undersampled_corr = undersample_df.corr()
sns.heatmap(data=undersampled_corr, annot=True, linewidth=0.5, cmap='Blues')

## Correlation with is_promoted is stronger in the undersampled and oversampled datasets than in the original imbalanced dataset

In [None]:
# Closing message. Bare text in a code cell is a SyntaxError, so print it instead.
print("Thank you!")