# Load Library and Get Data

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

In [None]:
import pandas as pd
# Read aug_train.csv from the Kaggle input directory into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/hr-analytics-job-change-of-data-scientists/aug_train.csv',
)

# Exploratory Data Analysis (EDA)

In [None]:
# Display summary statistics of the loaded DataFrame.
data.describe()

In [None]:
# Display the first rows of the DataFrame as a quick sanity check.
data.head()

# Impact of Education on Company Type

In [None]:
# Count how often each (education level, company type) pair occurs and plot it.
# dropna() is chained onto the selection rather than applied in place on a slice,
# which avoids pandas' SettingWithCopyWarning.
temp = data[['education_level', 'company_type']].dropna(axis=0)
# reset_index(name=...) names the count column explicitly, so the result no longer
# depends on whether this pandas version labels it 0 or 'count'.
value_counts = temp.value_counts().reset_index(name='count')
bar, ax = plt.subplots(figsize=(20, 12))
# Draw on the axes created above instead of relying on the implicit current axes.
sns.barplot(x='education_level', y='count', data=value_counts, hue='company_type', ax=ax)
plt.xticks(rotation=90)
plt.xlabel('Education Level')
plt.ylabel('Frequency')
plt.title('Impact of Education on Company Type', fontsize=20)

# Top Three Discipline

In [None]:
# Plot the three most frequent majors, once for target == 1 and once for target == 0.
title = ['Top Three Discipline for leaving Company', 'Top Three Discipline for Not Leaving Company']
# Tie each title to its target value explicitly; pairing by position with
# data['target'].unique() would swap the titles if the row order ever changed.
# target == 1.0 is treated as "leaving", consistent with the rest of this notebook.
title_by_target = dict(zip([1.0, 0.0], title))
for target, plot_title in title_by_target.items():
    major_discipline = data.loc[data['target'] == target, 'major_discipline']
    # Name both output columns explicitly: the default names produced by
    # value_counts().to_frame().reset_index() differ between pandas 1.x and 2.x.
    top_three = (
        major_discipline.value_counts()
        .head(3)
        .rename_axis('discipline')
        .reset_index(name='count')
    )
    bar, ax = plt.subplots(figsize=(20, 12))
    sns.barplot(x='discipline', y='count', data=top_three, ax=ax)
    plt.xlabel('Major Discipline')
    plt.ylabel('Frequency')
    plt.title(plot_title, fontsize=20)


# Distribution over Gender

In [None]:
# Pie chart of the gender split among rows with target == 1.
gender_counts = data.loc[data['target'] == 1, 'gender'].value_counts()
bar, ax = plt.subplots(figsize=(20, 12))
plt.pie(x=gender_counts, labels=gender_counts.index, autopct="%.1f%%")
plt.title('Impact of Gender on leaving company', fontsize=20)

# Box Plot with respect to Training Hours

In [None]:
# Draw one box plot of training hours for each object-dtype column,
# skipping 'city' and 'last_new_job'.
for temp in data.columns:
    if data[temp].dtypes == 'object' and temp not in ['city', 'last_new_job']:
        sns.boxplot(x=temp, y='training_hours', data=data)
        # The title is set per plot: a single plt.title() call before the loop
        # only labels the first figure, because each plt.show() finishes the
        # current figure and the next boxplot starts a fresh one.
        plt.title('Box Plot with repect to Training Hours', fontsize=20)
        plt.xticks(rotation=90)
        # Turn a column name such as 'company_type' into 'Company Type'.
        plt.xlabel(' '.join(x.capitalize() for x in temp.split('_')))
        plt.ylabel('Training Hours')
        plt.show()
    
       

# Impact of Company type on leaving company

In [None]:
# Pie chart of the company-type split among rows with target == 1.
type_counts = data.loc[data['target'] == 1, 'company_type'].value_counts()
bar, ax = plt.subplots(figsize=(20, 12))
plt.pie(x=type_counts, labels=type_counts.index, autopct="%.1f%%")
plt.title('Impact of Company type on leaving company', fontsize=20)

# Tenure Working in Company V/S Switching Company

In [None]:
# Show how many rows fall under each distinct 'last_new_job' value.
data['last_new_job'].value_counts()

In [None]:
# Build a two-column frame pairing a tenure class with a leave/stay label per row.
class_one = 'less than a year'
class_two = 'more than a year'
# Label all rows in one vectorized pass instead of a Python-level iterrows() loop.
# The rules are unchanged: only the string '1' maps to class_one (every other value,
# including missing ones, maps to class_two), and only target == 0.0 maps to
# 'Will Not Leave'.
job_change_frequency = pd.DataFrame({
    'Classes': np.where(data['last_new_job'] == '1', class_one, class_two),
    'Target': np.where(data['target'] == 0.0, 'Will Not Leave', 'Will Leave'),
})
job_change_frequency

In [None]:
# Count each (Classes, Target) combination and plot the counts grouped by tenure class.
# reset_index(name='temp') fixes the count column's name directly; the previous
# rename(columns={0: 'temp'}) does nothing on pandas versions that already call
# the column 'count', which made y='temp' fail.
df = job_change_frequency.value_counts().reset_index(name='temp')
sns.barplot(x='Target', y='temp', data=df, hue='Classes')
plt.xlabel('Target')
plt.ylabel('Frequency/Count')
plt.title('Tenure Working in Company V/S Swiching Company')

# Impact of Company Size on Target

In [None]:
# Count rows for every (company size, target) combination.
size_details = list()
# Missing sizes are dropped first: NaN never compares equal with ==, so it could
# only ever contribute rows with a count of zero.
size_values = data['company_size'].dropna().unique()
for element in size_values:
    for target in data['target'].unique():
        matches = (data['company_size'] == element) & (data['target'] == target)
        temp_target = 'Will Leave' if target == 1.0 else 'Will Not Leave'
        # Summing the boolean mask gives the number of matching rows.
        size_details.append([element, temp_target, int(matches.sum())])
company_size = pd.DataFrame(size_details, columns=['size', 'target', 'count'])
company_size

In [None]:
# Grouped bar chart of the per-size counts, split by target label.
size_ax = sns.barplot(data=company_size, x='target', y='count', hue='size')
size_ax.set_xlabel('Target')
size_ax.set_ylabel('Count/Frequency')
size_ax.set_title('Impact of Company Size on Target')

# Correlation Matrix

In [None]:
# Remove the 'enrollee_id' column before computing correlations.
# errors='ignore' makes the cell safe to re-run after the column is already gone.
data = data.drop(columns=['enrollee_id'], errors='ignore')

In [None]:
# Correlation heatmap over the numeric columns only.
# Selecting numeric dtypes first keeps this working on pandas versions where
# DataFrame.corr() no longer silently skips string columns and raises instead.
correlation = data.select_dtypes(include='number').corr()
sns.heatmap(correlation, annot=True)
plt.title('Correlation Matrix')

# Top 5 city with frequent job change

In [None]:
# Count, per city, how many rows have target == 1.0, then rank cities by that count.
# A single groupby replaces one full-frame filter per city. sort=False keeps the
# cities in order of first appearance, matching the order data['city'].unique()
# produced, and cities with no matching rows still appear with a frequency of 0.
city_data = (
    data['target'].eq(1.0)
    .groupby(data['city'], sort=False)
    .sum()
    .rename_axis('city_name')
    .reset_index(name='frequency')
)
sorted_city_frequency = city_data.sort_values(by='frequency', ascending=False)

In [None]:
# Bar chart of the `top` highest-ranked cities from the sorted frequency table.
top = 5
top_five = sorted_city_frequency.head(top)
city_ax = sns.barplot(data=top_five, x='city_name', y='frequency')
city_ax.set_xlabel('City Name')
city_ax.set_ylabel('Job Chamge Frequency')
city_ax.set_title('Top 5 city with frequent job change')

# Training Hours Histogram

In [None]:
# Display the median of the 'training_hours' column.
data['training_hours'].median(axis=0)

In [None]:
# Histogram of training hours with a KDE curve on a density scale.
# sns.distplot is deprecated in seaborn; histplot(kde=True, stat='density') is the
# replacement suggested for its default histogram-plus-KDE output.
sns.histplot(data['training_hours'], kde=True, stat='density')
plt.title('Training Hours Histogram', fontsize=20)
plt.xlabel('Training Hours')

# Target Distribution  

In [None]:
# Bar chart of how many rows carry each target value.
target_ax = sns.countplot(data=data, x='target')
target_ax.set_xlabel('Target')
target_ax.set_ylabel('Count')
target_ax.set_title('Class Distribution')

# Experience Category over Target

In [None]:
# Display the distinct values of the 'relevent_experience' column (spelling as in the data).
data['relevent_experience'].unique()

In [None]:
# For every experience category, count the rows with target == 1 and target == 0.
# Each category contributes its 'target_1' row first, then its 'target_0' row.
target_labels = (('target_1', 1), ('target_0', 0))
experience_rows = [
    [
        level,
        label,
        data[(data['relevent_experience'] == level) & (data['target'] == value)].shape[0],
    ]
    for level in data['relevent_experience'].unique()
    for label, value in target_labels
]
experience_data = pd.DataFrame(experience_rows, columns=['experience', 'target', 'count'])
experience_data

In [None]:
# Grouped bar chart of the per-category counts, one bar per target label.
experience_ax = sns.barplot(data=experience_data, x='experience', y='count', hue='target')
experience_ax.set_xlabel('Experience Category')
experience_ax.set_ylabel('Count')