In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Relative location of the training CSV; kept in a name so the path is easy to spot and change.
train_csv_path = '../input/hr-analytics-job-change-of-data-scientists/aug_train.csv'
train = pd.read_csv(train_csv_path)
# Preview the first rows to check that the file parsed as expected.
train.head()

### Features

`enrollee_id`: Unique ID for enrollee

`city`: City code

`city_development_index`: Development index of the city (scaled)

`gender`: Gender of enrollee

`relevent_experience`: Relevant experience of enrollee

`enrolled_university`: Type of university course enrolled in, if any

`education_level`: Education level of enrollee

`major_discipline`: Education major discipline of enrollee

`experience`: Enrollee's total experience in years

`company_size`: Number of employees in current employer's company

`company_type`: Type of current employer

`last_new_job`: Difference in years between previous job and current job

`training_hours`: Training hours completed

`target`: 0 – Not looking for a job change, 1 – Looking for a job change

In [None]:
# Overview of the frame: column names, non-null counts and dtypes.
train.info()

In [None]:
# Summary statistics restricted to the object-dtype (string) columns.
train.describe(include = 'O')

In [None]:
# Print "[column] (n)" for every object-dtype column, where n is the number of
# distinct values in that column (missing values count as one value).
object_columns = [name for name in train.columns if train[name].dtype == 'object']
for name in object_columns:
    n_distinct = train[name].nunique(dropna = False)
    print('[{}] ({})'.format(name, n_distinct))

### Categorical features vs. target

In [None]:
def count_subplots(data, feature1, hue = 'target', ylim = None, xlim = None):
    """Draw two stacked count plots for one column and show the figure.

    The top panel counts the rows per ``feature1`` value; the bottom panel
    shows the same counts split by ``hue``.

    Parameters
    ----------
    data : DataFrame
        Frame handed to ``sns.countplot``; must contain ``feature1`` and ``hue``.
    feature1 : str
        Column plotted on the x-axis of both panels.
    hue : str, default 'target'
        Column used to split the bars in the bottom panel. When it is
        ``'target'`` the legend entries are relabelled to
        'Not looking for jobs' / 'Looking for jobs'.
    ylim, xlim : optional
        Axis limits, applied to both panels when given.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    fig, axes = plt.subplots(2, figsize = (18, 15))

    # Top panel has no hue split, bottom panel is split by `hue`.
    for ax, panel_hue in zip(axes, (None, hue)):
        # Data variables are passed by keyword: recent seaborn releases only
        # accept `data` positionally.
        sns.countplot(x = feature1, hue = panel_hue, data = data, ax = ax)
        ax.set_title('{} Count Plot'.format(feature1), size = 20)
        ax.set_xlabel(feature1, size = 15)
        ax.set_ylabel('Count', size = 15)
        ax.tick_params(labelsize = 15)
        # Set limits on each axes explicitly; plt.ylim/plt.xlim would only
        # touch whichever axes happens to be current.
        if ylim is not None:
            ax.set_ylim(ylim)
        if xlim is not None:
            ax.set_xlim(xlim)

    if hue == 'target':
        # Reuse the handles seaborn registered for the hue levels so each
        # label is tied to the right colour.
        # NOTE(review): assumes the hue levels are ordered 0 then 1 - confirm.
        handles, _ = axes[1].get_legend_handles_labels()
        axes[1].legend(handles, ['Not looking for jobs', 'Looking for jobs'],
                       loc = 'upper right', prop = {'size' : 15})

    plt.show()

In [None]:
# gender: counts overall (top panel) and split by target (bottom panel).
count_subplots(train, 'gender')

In [None]:
# relevent_experience: counts overall (top panel) and split by target (bottom panel).
count_subplots(train, 'relevent_experience')

In [None]:
# enrolled_university: counts overall (top panel) and split by target (bottom panel).
count_subplots(train, 'enrolled_university')

In [None]:
# education_level: counts overall (top panel) and split by target (bottom panel).
count_subplots(train, 'education_level')

In [None]:
# experience: counts overall (top panel) and split by target (bottom panel).
count_subplots(train, 'experience')

In [None]:
# company_size: counts overall (top panel) and split by target (bottom panel).
count_subplots(train, 'company_size')

In [None]:
# company_type: counts overall (top panel) and split by target (bottom panel).
count_subplots(train, 'company_type')

In [None]:
def factor_plots(data, feature1, feature2 = None, col = None, hue = None, kind = 'point', ylim = None, xlim = None):
    """Draw a faceted categorical plot with seaborn and show it.

    Parameters
    ----------
    data : DataFrame
        Frame handed to seaborn; must contain every column named below.
    feature1 : str
        Column mapped to the x-axis.
    feature2 : str, optional
        Column mapped to the y-axis (leave as None for ``kind='count'``).
    col : str, optional
        Column whose values define one facet (subplot column) each.
    hue : str, optional
        Column used for colour grouping.
    kind : str, default 'point'
        Plot kind forwarded to seaborn (e.g. 'point', 'count', 'violin').
    ylim, xlim : optional
        Axis limits, applied to every facet when given.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # `sns.factorplot` was renamed to `catplot` and later removed from seaborn.
    # Data variables are passed by keyword because recent releases only accept
    # `data` positionally.
    g = sns.catplot(x = feature1, y = feature2, col = col, hue = hue, kind = kind, data = data)

    # Resize the grid's own figure instead of relying on the pyplot
    # "current figure" state.
    g.fig.set_size_inches(20, 6)

    # FacetGrid.set applies the limits to every facet; plt.ylim/plt.xlim
    # would only affect the last axes drawn.
    if ylim is not None:
        g.set(ylim = ylim)
    if xlim is not None:
        g.set(xlim = xlim)

    g.fig.tight_layout()

    plt.show()

In [None]:
# relevent_experience counts coloured by target, one facet per enrolled_university value.
factor_plots(train, 'relevent_experience',  kind = 'count', hue = 'target', col = 'enrolled_university')

#### We can see that the combination of 'Full time course' and 'No relevent experience' is strongly associated with the target

In [None]:
# relevent_experience counts coloured by target, one facet per education_level value.
factor_plots(train, 'relevent_experience',  kind = 'count', hue = 'target', col = 'education_level')

This shows that 'Graduate' combined with 'No relevent experience' is very strongly associated with the target, and the same holds at the Master's level.

In [None]:
# gender counts coloured by target, one facet per relevent_experience value.
factor_plots(train, 'gender',  kind = 'count', hue = 'target', col = 'relevent_experience')

In [None]:
# Same overview as the earlier train.info() cell, repeated here to show the
# column dtypes again before the numerical-features section.
train.info()

## Numerical features

In [None]:
# Correlation matrix of the numeric columns only. The frame still holds
# object-dtype columns, and DataFrame.corr() raises on those in pandas >= 2.0,
# so they are filtered out first. enrollee_id is dropped because it is an
# identifier (see the feature list above), not a measurement.
numeric_corr = (
    train
    .drop(columns = 'enrollee_id')
    .select_dtypes(include = 'number')
    .corr()
)
sns.heatmap(numeric_corr, annot = True);

In [None]:
# Violin plot of training_hours for each relevent_experience value, split by target.
factor_plots(train, 'relevent_experience', 'training_hours', kind = 'violin', hue = 'target')

In [None]:
# Violin plot of city_development_index for each relevent_experience value, split by target.
factor_plots(train, 'relevent_experience', 'city_development_index', kind = 'violin', hue = 'target')

### The 'city_development_index' feature has a very strong correlation with the target.