In [None]:
import matplotlib.pyplot as plt
import numpy as np 
import seaborn as sns
import pandas as pd 
import warnings
warnings.filterwarnings("ignore")
import os 

# Print the full path of every file available under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))


In [None]:
# NOTE: aug_test has no target column and this is not a competition, so predictions on it
# cannot be scored. Only aug_train is used throughout this notebook.

TRAIN_CSV = '/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_train.csv'
df = pd.read_csv(TRAIN_CSV)
df.head()

In [None]:
# Fraction of missing values in each column (mean of the boolean null mask).
df.isna().mean()

**Observation 1**

1. Lots of null values; I'm dropping them.
2. Company_size has ordinal values.  
3. Lots of unique values in the features. We will check them after we drop the null values.


# Data visualization

In [None]:
# Remove every row that contains at least one missing value (dropna's default behaviour).
# inplace=True mutates df itself, so all later cells see only the complete rows.
df.dropna(inplace=True)

In [None]:
# Mean training hours per company type, sorted ascending for the lollipop chart.
# The numeric column is selected *before* aggregating: calling DataFrame.mean() inside
# groupby.apply also hands the string grouping column to mean(), which raises a
# TypeError on pandas 2.x (numeric_only no longer silently drops such columns).
df_raw = (
    df.groupby('company_type')[['training_hours']]
    .mean()
    .sort_values('training_hours')
)

# Draw plot: one vertical stem plus a marker per company type
fig, ax = plt.subplots(figsize=(10,7), dpi= 80)
ax.vlines(x=df_raw.index, ymin=0, ymax=df_raw.training_hours, color='firebrick', alpha=0.7, linewidth=2)
ax.scatter(x=df_raw.index, y=df_raw.training_hours, s=75, color='firebrick', alpha=0.7)
ax.set_xlabel('Company Type')
ax.set_ylabel('Mean training hours')
ax.set_title('Mean training hours by company type')
# Annotate each marker with its value, placed slightly above the point
for row in df_raw.itertuples():
    ax.text(row.Index, row.training_hours+.5, s=round(row.training_hours, 2), horizontalalignment= 'center', verticalalignment='bottom', fontsize=14)

plt.show()


In [None]:
# Give the raw "no_enrollment" code a display-friendly label.
# The replacement is scoped to enrolled_university so that no other column can be
# rewritten by accident (a frame-wide df.replace touches every column).
df['enrolled_university'] = df['enrolled_university'].replace('no_enrollment', 'No enrollment')
# Number of rows per enrolment status; feeds the donut chart drawn on `ax`.
g_count = df['enrolled_university'].value_counts()

fig, ax = plt.subplots(figsize=(8, 8))

def make_autopct(values):
    """Build an ``autopct`` callback that labels a pie wedge with percent and count.

    Parameters
    ----------
    values : iterable of numbers
        The wedge sizes passed to the pie chart. Their sum is taken once, when
        the callback is created, so one-shot iterables work and the total is
        not recomputed for every wedge.

    Returns
    -------
    callable
        A function taking the wedge percentage ``pct`` and returning a string
        of the form ``'12.34%\\n(56)'``: the percentage with two decimals and,
        on a second line, the absolute count reconstructed from ``pct``.
    """
    total = sum(values)  # loop-invariant: computed once, not per wedge

    def my_autopct(pct):
        # Recover the absolute count from the percentage; round to the nearest int
        val = int(round(pct*total/100.0))
        return '{p:.2f}%\n({v:d})'.format(p=pct,v=val)
    return my_autopct

# One colour per wedge of the enrolment chart
genre_col = ['darkgreen','crimson','orange']

# A white disc drawn over the pie centre turns the pie into a donut
center_circle = plt.Circle((0, 0), 0.7, color='white')
# Draw on the explicit axes created for this figure rather than the implicit current axes
ax.pie(x=g_count.values, labels=g_count.index, autopct=make_autopct(g_count.values),
       textprops={'size': 15}, pctdistance=0.5, colors=genre_col)
ax.add_artist(center_circle)

fig.suptitle('Distribution of Enrolled University', fontsize=20)
# plt.show() renders on notebook (non-GUI) backends; fig.show() only warns there
plt.show()

In [None]:
# The open-ended text buckets are replaced by a numeric stand-in so that both
# columns can be cast to int.
experience_buckets = {'>20': 21, '<1': 0}
last_new_job_buckets = {'>4': 5, 'never': 0}

df['experience'] = df['experience'].replace(experience_buckets).astype(int)
df['last_new_job'] = df['last_new_job'].replace(last_new_job_buckets).astype(int)

In [None]:
sns.set()
# Training hours and years of experience are different quantities, so each panel
# keeps its own y-scale instead of sharing one axis.
fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharey=False)
fig.suptitle('Male/Female/others', fontsize=20)

# One outlined (unfilled) bar panel per metric, grouped by gender
gender_panels = [('training_hours', 'Training hours'), ('experience', 'experience')]
for panel_ax, (metric, panel_title) in zip(axes, gender_panels):
    sns.barplot(ax=panel_ax, x=df['gender'], y=df[metric],linewidth=2.5, facecolor=(1, 1, 1, 0),errcolor=".2", edgecolor=".2")
    panel_ax.set_title(panel_title)

# Render the figure explicitly so the cell does not echo a Text(...) repr
plt.show()

In [None]:
from matplotlib import cm
from math import log10
# Concentric ring chart: one ring per company_size value, where the coloured arc
# length is proportional to how many rows have that value.
# value -> row count, as a plain dict
df_m=df['company_size'].value_counts().to_dict()
labels = list(df_m.keys())
data = list(df_m.values())

# n: number of rings to draw
n = len(data)
# k: largest power of ten not exceeding the biggest count
k = 10 ** int(log10(max(data)))
# m: smallest multiple of k strictly above the biggest count; used as the
# "full circle" total for every ring so no ring closes completely
m = k * (1 + max(data) // k)
# r: outer radius of the outermost ring; w: width of each ring
r = 1.5
w = r / n 
# One colour per ring, sampled evenly from the terrain colormap
colors = [cm.terrain(i / n) for i in range(n)]

fig, ax = plt.subplots()
ax.axis("equal")
fig.suptitle('Count of Company', fontsize=10)

for i in range(n):
    # Each ring is a two-wedge pie: a white remainder (m - count) and the coloured count.
    # Rings shrink inward by w per step; label text is made fully transparent so the
    # labels are only visible through the legend.
    innerring, _ = ax.pie([m - data[i], data[i]], radius = r - i * w, startangle = 90, labels = ["", labels[i]], labeldistance = 1 - 1 / (1.5 * (n - i)), textprops = {"alpha": 0}, colors = ["white", colors[i]])
    # Turn the wedges into ring segments of width w with white separators
    plt.setp(innerring, width = w, edgecolor = "white")

plt.legend()
plt.show()

In [None]:
# Mean of each numeric metric per gender: metrics as rows, genders as columns.
gender_metrics = ['city_development_index','training_hours','experience']
df_pvt = df[['city_development_index','gender','training_hours','experience']]
df_pvt.pivot_table(values=gender_metrics, columns=['gender'], aggfunc='mean')

In [None]:
# Mean training hours per major discipline, sorted ascending for the dot plot.
# The numeric column is selected *before* aggregating: calling DataFrame.mean() inside
# groupby.apply also hands the string grouping column to mean(), which raises a
# TypeError on pandas 2.x.
df_raw = (
    df.groupby('major_discipline')[['training_hours']]
    .mean()
    .sort_values('training_hours')
)

fig, ax = plt.subplots(figsize=(10,5), dpi= 80)
# Guide lines span the plotted means plus a small margin, instead of a fixed
# 11-26 range that does not depend on the data.
guide_margin = 1
guide_min = df_raw.training_hours.min() - guide_margin
guide_max = df_raw.training_hours.max() + guide_margin
ax.hlines(y=df_raw.index, xmin=guide_min, xmax=guide_max, color='gray', alpha=0.7, linewidth=1, linestyles='dashdot')
ax.scatter(y=df_raw.index, x=df_raw.training_hours, s=75, color='firebrick', alpha=0.7)
ax.set_xlabel('Mean training hours')
ax.set_ylabel('Major discipline')
fig.suptitle('Training hours', fontsize=20)
plt.show()

In [None]:
# Number of employees per last_new_job value, ordered by that value.
df_dict = df['last_new_job'].value_counts().sort_index().to_dict()
df_d = pd.DataFrame({'x_axis': list(df_dict), 'y_axis': list(df_dict.values())})

plt.plot('x_axis', 'y_axis', data=df_d, linestyle='dotted', marker='o')
plt.xlabel("Years since Last New Job")
plt.ylabel("Count of Employees")
plt.show()

**Observation 2**

1. Training hours vary with degree.
2. Women have higher training hours than men.
3. Years of experience are also higher for men.
4. Training hours in early-stage startups are also high compared to the public sector.

# Data preparation for the algorithm

In [None]:
# Ordinal values
# Each level is replaced by its rank through Series.map; a value that is not a
# key of the lookup table becomes NaN.

# NOTE(review): '10/49' is the spelling used as a key here — presumably it matches the raw data; verify.
company_size={'<10':1, '10/49':2, '50-99':3, '100-500':4, '500-999':5, '1000-4999':6, '5000-9999':7, '10000+':8}
education_level={'Graduate':1,'Masters':2,'Phd':3}
enrolled_university={'No enrollment':1, 'Part time course':2, 'Full time course':3}
relevent_experience={'No relevent experience':0, 'Has relevent experience':1}

for ordinal_column, rank_lookup in (
    ('company_size', company_size),
    ('education_level', education_level),
    ('enrolled_university', enrolled_university),
    ('relevent_experience', relevent_experience),
):
    df[ordinal_column] = df[ordinal_column].map(rank_lookup)

In [None]:
# Replacing with count
# Every category is swapped for the number of rows in df that carry it.
# NOTE(review): the counts are taken over the whole frame, i.e. before the
# train/test split performed later in the notebook.
for nominal_column in ('company_type', 'major_discipline', 'city', 'gender'):
    category_counts = df[nominal_column].value_counts()
    df[nominal_column] = df[nominal_column].map(category_counts).astype(int)


In [None]:
# Since I'm using XGBoost no standardization is required

from sklearn.model_selection import train_test_split

from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
# Gradient-boosted tree classifier with trees limited to depth 3
model = XGBClassifier(max_depth=3)

# pop() hands back the label column and removes it from df in the same step;
# the identifier column is then dropped so df holds only the features.
df_y = df.pop('target')
df.drop(columns=['enrollee_id'], inplace=True)

In [None]:
from sklearn.model_selection import train_test_split 
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(df, df_y, test_size=0.2, random_state=71)
X_train, X_test, y_train, y_test = split_parts

# Fit on the training part, then predict labels for the held-out part
model.fit(X_train, y_train)
y_pred = model.predict(X_test)


In [None]:
# Compute the three evaluation summaries on the held-out set, then print them.
confusion_counts = confusion_matrix(y_test, y_pred)
accuracy_percent = accuracy_score(y_test, y_pred) * 100
per_class_report = classification_report(y_test, y_pred)

print('Confusion matrix \n {}'.format(confusion_counts))
print('Accuracy score {:.2f}'.format(accuracy_percent))
print(per_class_report)