In [None]:
# import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%config Completer.use_jedi = False      #active autocomplete in jupyter

**Features**

enrollee_id : Unique ID for candidate

city: City code

city_development_index: Development index of the city (scaled)

gender: Gender of candidate

relevent_experience: Relevant experience of candidate

enrolled_university: Type of University course enrolled if any

education_level: Education level of candidate

major_discipline :Education major discipline of candidate

experience: Candidate total experience in years

company_size: No of employees in current employer's company

company_type : Type of current employer

last_new_job: Difference in years between previous job and current job

training_hours: training hours completed

target: 0 – Not looking for job change, 1 – Looking for a job change

In [None]:
# Read the two CSV splits of the dataset from the Kaggle input directory.
train_csv = '../input/hr-analytics-job-change-of-data-scientists/aug_train.csv'
test_csv = '../input/hr-analytics-job-change-of-data-scientists/aug_test.csv'
hr_train = pd.read_csv(train_csv)
hr_test = pd.read_csv(test_csv)

# Print the column dtypes of both frames to check how they were parsed.
print(hr_train.dtypes, hr_test.dtypes)


In [None]:
# Peek at the first two rows of the training split.
hr_train.head(2)

In [None]:
# Join both datasets into one frame for the exploratory analysis.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent. Row labels of each input are kept as they are
# (no ignore_index), which matches what append did.
hr_df = pd.concat([hr_train, hr_test])
hr_df.head(2)

In [None]:
# I verified that all records are unique, so let's quickly summarize the data to see what it contains.
hr_df.describe()

In [None]:
# Summary of the object-typed columns of the combined frame.
hr_df.describe(include='object')

In [None]:
# Several variables contain null values; depending on the type of analysis we
# may need to impute or remove them. Build a one-row table with the percentage
# of nulls in every object-typed column.
hr_df_objct = hr_df.select_dtypes(include='object').isnull().sum()
# Walk the (column, null count) pairs directly. hr_df_objct is labelled by
# column name, so indexing it with an integer position (`hr_df_objct[i]`)
# relied on a positional fallback that pandas has deprecated.
nulls = {
    column: "{:.1f}%".format(null_count / len(hr_df) * 100)
    for column, null_count in hr_df_objct.items()
}
nulls_df = pd.DataFrame(nulls, index=['nulls'])
nulls_df
# We will keep this information for now; it might be needed later.

# Univariate exploratory data analysis

## Continuous variables

In [None]:
# One histogram per numeric column of the combined frame, side by side.
fig, ax = plt.subplots(1, 3, figsize=(16, 5))

ax[0].hist(hr_df['training_hours'], bins=40)
ax[0].set_title(' Training hours histogram')

# hr_df stacks the train and test splits; presumably the test rows carry no
# target, so missing values are dropped explicitly rather than left for
# matplotlib/numpy to deal with.
ax[1].hist(hr_df['target'].dropna())
ax[1].set_title(' Target histogram')

ax[2].hist(hr_df['city_development_index'])
# This panel used to be titled ' Target histogram' as well, although it shows
# the city development index.
ax[2].set_title(' City development index histogram')

fig.suptitle('Continous variables distribution', fontsize=16, y=1.1)
plt.show()

Training hours follow a right-skewed, roughly exponential distribution: on average candidates take about 65 hours of training, but a few candidates have taken more than 250 hours. Could this have an impact on other variables, such as the one related to seeking a new job?

In [None]:
# From the describe() of the object variables we know that experience has too
# many levels, so it is plotted separately; city is left out of this grid too.
hr_df_objcts2 = hr_df.select_dtypes(include='object').drop(columns=['experience', 'city'])

fig, axes = plt.subplots(4, 2, figsize=(15, 20), sharey=False)
# Pair every remaining column with one panel of the 4x2 grid. zip stops at the
# shorter of the two sequences, so a column count that differs from the grid
# size no longer raises IndexError the way the manual counter did.
for panel, column in zip(axes.ravel(), hr_df_objcts2.columns):
    panel.tick_params(axis='x', rotation=45)
    sns.countplot(ax=panel, x=column, data=hr_df_objcts2)
# The layout only has to be computed once, after every panel is drawn.
fig.tight_layout()

### Comments
1) Within the data science world there are more male than female people.

2) Most of the candidates in the dataset have some level of experience.

3) Most of the candidates are not enrolled in university.

4) Most of the candidates are graduates, however we have some other interesting educational levels such as PHD or primary school.

5) As expected, most candidates are related to STEM world.

6) Company sizes are fairly evenly distributed; however, we could say that an important share of candidates work in small-to-medium companies (10-500 employees).

7) Data scientists are in demand almost exclusively at private companies; no surprise here.

8) Most candidates have been in their current position for less than 4 years.

# Data analysis 
OK, now the fun part begins: let's inspect the relationships among the variables and the response variable, to check whether any relationship exists and whether some of those variables affect the likelihood of looking for a new job.
Some of the previous findings from `describe` and the EDA will be taken into account, depending on the specific analysis.

##  Relation and analysis among independent variables
As we have several variables, we will only create plots for the ones that make the most sense to analyze.

In [None]:
# Count plot of each categorical column, split by relevant experience.
fig, axes = plt.subplots(3, 3, figsize=(15, 10), sharey=False)
panels = axes.ravel()
# At most 8 columns are plotted, as before. The earlier nested loop enforced
# this with a `break` that only left the inner loop and happened to work
# because the limit was reached in the last grid row.
columns = hr_df_objcts2.columns[:8]
for panel, column in zip(panels, columns):
    panel.tick_params(axis='x', rotation=45)
    sns.countplot(ax=panel, x=column, hue='relevent_experience', data=hr_df_objcts2)
# Hide the grid cells that received no plot instead of showing empty frames.
for panel in panels[len(columns):]:
    panel.set_visible(False)
# Compute the layout once, after all panels are drawn.
fig.tight_layout()
            

### Comments:
1) It's curious to see that candidates not enrolled in university have relevant experience considerably more often than those enrolled in a course (full or part time).

2) People with only a high school or primary school education do not have much experience in the data science world.

In [None]:
# Count plot of each categorical column, split by education level.
fig, axes = plt.subplots(3, 3, figsize=(15, 10), sharey=False)
panels = axes.ravel()
# At most 8 columns are plotted, as before. The earlier nested loop enforced
# this with a `break` that only left the inner loop and happened to work
# because the limit was reached in the last grid row.
columns = hr_df_objcts2.columns[:8]
for panel, column in zip(panels, columns):
    panel.tick_params(axis='x', rotation=45)
    # The Axes returned by countplot was stored in an unused variable; dropped.
    sns.countplot(ax=panel, x=column, hue='education_level', data=hr_df_objcts2)
# Hide the grid cells that received no plot instead of showing empty frames.
for panel in panels[len(columns):]:
    panel.set_visible(False)
# Compute the layout once, after all panels are drawn.
fig.tight_layout()

### Comments:
1) Almost all candidates from a STEM discipline have at least a graduate degree.

### Independent vs dependent variables (categorical)
For this analysis we will exclusively use the dataframe that contains the labelled data.

In [None]:
# Store the target as an integer in the training frame (later cells read it).
hr_train['target'] = hr_train['target'].astype(int)
# Work on a separate frame without the identifier and the city columns.
hr_train2 = hr_train.drop(columns=['enrollee_id', 'city', 'city_development_index'])

fig, axes = plt.subplots(3, 3, figsize=(15, 10), sharey=False)
panels = axes.ravel()
# Only the first 8 columns of hr_train2 are plotted, as before. The earlier
# nested loop enforced this with a `break` that only left the inner loop and
# happened to work because the limit was reached in the last grid row.
features = hr_train2.columns[:8]
for panel, feature in zip(panels, features):
    # Mean of the 0/1 target per level = share of rows with target == 1.
    to_plot = hr_train2.groupby(feature, as_index=False)['target'].mean()
    panel.tick_params(axis='x', rotation=45)
    sns.barplot(ax=panel, x=feature, y='target', data=to_plot, orient='v')
# Hide the grid cells that received no plot instead of showing empty frames.
for panel in panels[len(features):]:
    panel.set_visible(False)
# Compute the layout once, after all panels are drawn.
fig.tight_layout()

## Comments:
All comments below are subject to statistical evaluation; however, it is likely that the null hypothesis would be rejected and that there is a statistically significant difference among levels:

1) Gender does not have any influence on searching for a new job.

2) People with no relevant experience are more likely to be looking for a new job.

3) Candidates enrolled in a full-time course are more likely to search for a new job than those enrolled part time or not enrolled at all.

4) Graduates are more likely to search for a new job than the other education levels.

5) There is no clear pattern between target and major discipline.

6) Candidates with few years of experience (<=5) are more likely to search for a job than candidates with more years of experience.

7) Apparently, candidates prefer to work at medium-sized companies, as those at the tails (10-49 and 5k-10k) are more likely to look for a new job.

8) Company type does not have a clear influence; the sector is not relevant to whether a candidate looks for a new job.

### Independent vs dependent variables (continuous)

In [None]:
# First, transform experience into a continuous variable to explore some relations.

def experience_to_continous(row):
    """Convert one `experience` label into an integer number of years.

    Parameters
    ----------
    row : str or None
        A single value of the `experience` column: '>20', '<1', or a string
        that holds an integer.

    Returns
    -------
    int or None
        21 for '>20', 1 for '<1', None for a missing value (None or NaN),
        otherwise ``int(row)``.

    Raises
    ------
    ValueError
        If `row` is a non-missing value that ``int`` cannot parse.
    """
    # A missing entry is usually a float NaN rather than None, so an
    # `is None` test alone lets it fall through to int() and raise.
    if row is None or pd.isna(row):
        return None
    if row == '>20':
        return 21
    if row == '<1':
        # NOTE: this is the same value that the literal label '1' maps to.
        return 1
    return int(row)
# Convert the non-missing experience labels; rows removed by dropna() receive
# NaN when the result is aligned back onto hr_train by index.
hr_train['experience_cont'] = (
    hr_train['experience']
    .dropna()
    .apply(experience_to_continous)
)

In [None]:
# Keep only the non-object columns of the training frame and drop every row
# that has a missing value in any of them.
hr_train3=hr_train.select_dtypes(exclude='object').dropna()
def discrete_development(row):
    """Bucket a city development index into one of four labelled ranges.

    Parameters
    ----------
    row : float
        A single city development index value.

    Returns
    -------
    str or None
        '0-0.25' for 0 <= row <= 0.25, '0.25-0.5' for 0.25 < row <= 0.5,
        '0.5-0.75' for 0.5 < row <= 0.75, '>0.75' for anything larger.
        None when the value is missing or negative, i.e. belongs to no range.
    """
    # NaN fails every comparison and a negative value fails the first one, so
    # both used to end up in the catch-all branch and were labelled '>0.75'.
    if pd.isna(row) or row < 0:
        return None
    # Each check below only runs when the previous upper bound was exceeded,
    # so only the upper bound has to be tested.
    if row <= 0.25:
        return '0-0.25'
    if row <= 0.5:
        return '0.25-0.5'
    if row <= 0.75:
        return '0.5-0.75'
    return '>0.75'
    
# Label each row with its development-index range. The labels are computed on
# hr_train and aligned onto hr_train3 by index when assigned.
hr_train3['discrete_index'] = hr_train['city_development_index'].apply(discrete_development)

# Mean target per range, drawn as vertical bars.
to_plot = hr_train3.groupby('discrete_index', as_index=False)['target'].mean()
sns.barplot(data=to_plot, x='discrete_index', y='target', orient='v')

In [None]:
# Two-colour palette and a plain white, grid-free background for the plots below.
colors = ["#CC0000", "#830137"]
white_canvas = {
    'axes.facecolor': '#FFFFFF',
    'figure.facecolor': '#FFFFFF',
    'axes.grid': False,
}
sns.set(rc=white_canvas)
# Set the palette after sns.set(), which is the order the settings were applied in before.
sns.set_palette(sns.color_palette(colors))
def biv_densityplot(dataframe, target_variable):
    """Draw one stacked, filled KDE plot per numeric column, split by the target.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to plot. It is not modified.
    target_variable : str
        Name of the column used as `hue`; its values are converted to strings.

    Returns
    -------
    None
        Every figure is displayed with ``plt.show()``.
    """
    # Public replacement for the private `_get_numeric_data()`. 'bool' is
    # listed because that helper presumably treats boolean columns as numeric
    # too (TODO confirm). The copy keeps the hue column below from being
    # written into the caller's frame.
    num_data = dataframe.select_dtypes(include=['number', 'bool']).copy()
    num_data[target_variable] = dataframe[target_variable].apply(str)

    for current_col in num_data.columns:
        if current_col == target_variable:
            continue
        fig, ax = plt.subplots()
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)
        # The call used to pass fill=False together with the deprecated
        # shade=True; seaborn lets `shade` override `fill`, so the plot was
        # filled in practice. Ask for that directly and draw on `ax`
        # explicitly instead of relying on the current axes.
        sns.kdeplot(
            data=num_data,
            x=current_col,
            hue=target_variable,
            fill=True,
            multiple='stack',
            ax=ax,
        )
        ax.set_xlabel(current_col)
        ax.set_ylabel('')
        plt.show()

# Plot every column except the identifier, split by the target.
not_identifier = hr_train.columns != 'enrollee_id'
biv_densityplot(hr_train.loc[:, not_identifier], 'target')

**Comments:**

1) For plot 1, it looks like city development has some influence on whether a candidate changes jobs: the lower the index, the more likely the candidate is to want another job.

2) For plot 2, candidates with few training hours are more likely to change job

3) For plot 3, we previously analyzed this variable as an object (categorical) variable, but the conclusions are the same: data scientists with less than 10 years of experience are more likely to change jobs.