# EDA for HR Job Change Data Scientist Dataset
### Course Project
### Creator: DCX
**data source:** https://www.kaggle.com/arashnic/hr-analytics-job-change-of-data-scientists?select=aug_test.csv

<h1>The whole EDA framework contains 3 main sequential modules:</h1>


*   Overall Statistic Report
*   Univariate Analysis
*   Bivariate Analysis (contains Hypothesis Testing)




In [None]:
pip install dataprep

In [None]:
from dataprep.datasets import get_dataset_names
from dataprep.datasets import load_dataset
from dataprep.eda import create_report,plot,plot_missing
import pandas as pd
import numpy as np
import scipy.stats as stats


### 0. Load the dataset

In [None]:
# Load aug_train.csv from the dataset's input directory into the working DataFrame.
df = pd.read_csv(
    filepath_or_buffer='../input/hr-analytics-job-change-of-data-scientists/aug_train.csv',
)

<h2>1. Overall Statistic Report</h2>


In [None]:
# Display the overall statistics report.
display(plot(df, display=['Stats', 'Insights']))
# df.info() prints its summary itself and returns None, so wrapping it in
# display() would only append a stray "None" to the cell output.
df.info()

# Store and display the numerical and non-numerical columns of df.
# Both lists follow df's own column order, so the dropdown options (and their
# default selection) are identical on every run; a set difference would give
# an arbitrary order.
num_cols = list(df.select_dtypes(include=['number']).columns)
non_num_cols = [col for col in df.columns if col not in num_cols]

print(f'Num cols = {num_cols}')
print(f'Non-num cols = {non_num_cols}')

In [None]:
# Missing-value overview: summary stats, bar chart and heat map.
print('The missing Value in Dataset\n')
missing_report = plot_missing(df, display=['Stats', 'Bar Chart', 'Heat Map'])
display(missing_report)

<h2>2. Univariate Analysis</h2>



*   for Non-Numerical Cols : display 'stats','pie chart', 'value table'.


*   for Numerical Cols : display 'stats','KDE Plot', 'Q-Q Plot','Box Plot'.



In [None]:
import ipywidgets as widgets

<h3>2.1 Univariate Analysis --- Non-Numerical Columns</h3>

In [None]:
# Dropdown for picking which non-numerical column to explore.
choose_non_num = widgets.Dropdown(
    description='Choose the Non-Numerical Column You Want to Explore',
    options=non_num_cols,
    disabled=False,
    style=dict(description_width='initial'),
)
# Leaving the widget as the cell's last expression renders it.
choose_non_num

In [None]:
# Echo the current dropdown selection, then show its univariate summary.
print(f'The Non-Numerical Column You Choose is: {choose_non_num.value}\n')
non_num_report = plot(
    df,
    choose_non_num.value,
    display=['Stats', 'Pie Chart', 'Value Table'],
)
display(non_num_report)

<h3>2.2 Univariate Analysis -- Numerical Column</h3>

In [None]:
# Dropdown for picking which numerical column to explore.
choose_num = widgets.Dropdown(
    description='Choose the Numerical Column You Want to Explore',
    options=num_cols,
    disabled=False,
    style=dict(description_width='initial'),
)
# Leaving the widget as the cell's last expression renders it.
choose_num

In [None]:
# Calculate the skewness of the selected column.
choose_skewness = df[choose_num.value].skew()

# Classify the direction of the skew.
# NaN is handled first: it fails both comparisons below and would otherwise
# be reported as "Skewed Left".
if pd.isna(choose_skewness):
    skewness_direction = 'Undefined (Skewness is NaN)'
elif choose_skewness > 0:
    skewness_direction = 'Skewed Right'
elif choose_skewness == 0:
    # Zero skewness only means the distribution is symmetric; it does not by
    # itself establish normality.
    skewness_direction = 'Symmetric (No Skew)'
else:
    skewness_direction = 'Skewed Left'

In [None]:
# Report the selected numerical column and its skewness, then show its
# distribution plots. (The label previously said "Non-Numerical" although
# this cell reports on the numerical dropdown.)
print(f'The Numerical Column You Choose is: {choose_num.value}\n')
print('The Normal Distribution Skewness = 0')
print(f'The {choose_num.value} Skewness = {choose_skewness}')
print(f'The Skewness Direction = {skewness_direction}\n')
display(plot(df, choose_num.value, display=['Stats', 'KDE Plot', 'Normal Q-Q Plot', 'Box Plot']))


<h2>3. Bivariate Analysis</h2>



*   Overall Correlation Heatmap
*   For Selected Bivariate Relationship
    *   Numerical-Numerical
    *   Numerical-Categorical
    *   Categorical-Categorical



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Pairwise relationships: lower-triangle grid with KDE curves on the diagonal.
sns.pairplot(df, diag_kind='kde', corner=True)


In [None]:
# Correlation heatmap.
fig, axs = plt.subplots(figsize=(8, 7))

# Correlate the numeric columns only. df also holds non-numerical columns,
# and recent pandas versions raise on them in DataFrame.corr() instead of
# silently skipping them.
numeric_corr = df.select_dtypes(include=['number']).corr()

sns.heatmap(numeric_corr, annot=True, linewidths=.7, cmap='coolwarm', fmt='.2f', ax=axs)

<h3>3.1 Numerical-Numerical</h3>


*   scatter plot
*   hexbin plot



In [None]:
# Reminder of the available columns, one list per line.
print(
    f'Num cols = {num_cols}',
    f'Non-num cols = {non_num_cols}',
    sep='\n',
)

In [None]:
# First numerical column of the pair to compare.
choose_num_1 = widgets.Dropdown(
    description='Choose the 1st Numerical Column You Want to Compare',
    options=num_cols,
    disabled=False,
    style=dict(description_width='initial'),
)
display(choose_num_1)

# Second numerical column of the pair to compare.
choose_num_2 = widgets.Dropdown(
    description='Choose the 2nd Numerical Column You Want to Compare',
    options=num_cols,
    disabled=False,
    style=dict(description_width='initial'),
)
display(choose_num_2)

In [None]:
# Echo both selections (one per line), then keep them together as a pair.
print(choose_num_1.value, choose_num_2.value, sep='\n')

num_num_select = [choose_num_1.value, choose_num_2.value]
print(num_num_select)

In [None]:
# Plot from df directly rather than df[num_num_select]: when both dropdowns
# hold the same column (their initial state, since both default to the first
# option) that selection has two identically named columns, so a lookup by
# column name is no longer a single series.

# The scatter plot.
sns.jointplot(data=df, x=choose_num_1.value, y=choose_num_2.value, marginal_ticks=True)

# The hexbin plot.
sns.jointplot(data=df, x=choose_num_1.value, y=choose_num_2.value, marginal_ticks=True, kind='hex', gridsize=15)

<h3>3.2 Numerical-Categorical</h3>



*   visualization
    *   box plot
    *   line chart
*   normal transformation and Kruskal



In [None]:
# Reminder of the available columns, one list per line.
print(
    f'Num cols = {num_cols}',
    f'Non-num cols = {non_num_cols}',
    sep='\n',
)

In [None]:
# Numerical side of the comparison.
choose_num_1 = widgets.Dropdown(
    description='Choose the 1st Numerical Column You Want to Compare',
    options=num_cols,
    disabled=False,
    style=dict(description_width='initial'),
)
display(choose_num_1)

# Non-numerical (categorical) side of the comparison.
choose_non_num_2 = widgets.Dropdown(
    description='Choose the 2nd Non-Numerical Column You Want to Compare',
    options=non_num_cols,
    disabled=False,
    style=dict(description_width='initial'),
)
display(choose_non_num_2)

In [None]:
# Echo the selected numerical and non-numerical columns, one per line.
print(choose_num_1.value, choose_non_num_2.value, sep='\n')


In [None]:
# Visualization of the selected numerical column against the selected
# non-numerical column. The call is the cell's last expression, so its
# return value is what the notebook renders.
plot(df,choose_num_1.value,choose_non_num_2.value)

In [None]:
# PowerTransformer is imported here for the normality transformation that follows.
from sklearn.preprocessing import PowerTransformer

# Skewness of the chosen numerical column before any transformation.
num_skew = df[choose_num_1.value].skew()
print(
    f'The chosen numerical col [{choose_num_1.value}] Skewness = {num_skew}'
)

In [None]:
# Fit a PowerTransformer (constructed with its default settings) on the chosen
# numerical column and keep the transformed values.
transform=PowerTransformer()
# The double brackets select a one-column DataFrame, i.e. a 2-D input; the
# result is flattened to 1-D later, before it is used as a column.
tr_num_1=transform.fit_transform(df[[choose_num_1.value]])

In [None]:
# Side-by-side density plots of the column before and after the power transform.
fig, (ax1, ax2) = plt.subplots(figsize=(15, 5), ncols=2)
fig.suptitle('Normality Transformation')

# Left panel: the raw column.
sns.kdeplot(ax=ax1, data=df[choose_num_1.value])
ax1.set_title('Before Normality Transformation')

# Right panel: the transformed values.
sns.kdeplot(ax=ax2, data=tr_num_1)
ax2.set_title('After Normality Transformation')

In [None]:
# Assemble the two-column frame used by the Kruskal test:
# the transformed numerical values next to the selected categorical column.
num_1 = choose_num_1.value
non_num_2 = choose_non_num_2.value

# The transformer output is 2-D; flatten it so it can serve as a single column.
tr_num_1 = tr_num_1.flatten()

df_kk = pd.DataFrame()
df_kk[num_1] = tr_num_1
df_kk[non_num_2] = df[non_num_2]

In [None]:
# Number of non-missing numerical values in each category.
df_kk.groupby(by=non_num_2).count()

In [None]:
# Map each category to its numerical sample for the Kruskal test.
# Each value is a one-element list wrapping the sample, so consumers read it
# with [0].
# NOTE(review): unique() also yields NaN when the column has missing values;
# NaN never compares equal, so that key ends up with an empty sample.
df_kk_dict = {}
category_values = df_kk[non_num_2]
for cat in category_values.unique():
    in_category = category_values == cat
    df_kk_dict[cat] = [list(df_kk.loc[in_category, num_1])]

In [None]:
# Pick categories by their position in df_kk_dict. Only cat_1 and cat_3 are
# compared in the test below, so three categories are enough; the previous
# unconditional lookup of a fourth one raised IndexError on 3-category columns.
categories = list(df_kk_dict)
if len(categories) < 3:
    raise ValueError(
        f'[{non_num_2}] has only {len(categories)} categories; '
        'at least 3 are needed to compare the 1st and the 3rd.'
    )

cat_1, cat_2, cat_3 = categories[:3]
# cat_4 stays defined for backward compatibility; None when there is no 4th category.
cat_4 = categories[3] if len(categories) > 3 else None

print(cat_1,
      cat_3)

In [None]:
# The Kruskal-Wallis H-test for the hypothesis testing: compare the numerical
# samples of the two selected categories (cat_1 and cat_3).
# Run the test once and read both fields from the same result instead of
# computing it twice.
kk_result = stats.kruskal(df_kk_dict[cat_1][0], df_kk_dict[cat_3][0])
pvalue = kk_result.pvalue
kk_stat = kk_result.statistic

print(f'Numerical Column:{num_1}')
print(f'Categorical Column:{non_num_2}')
print(f'Categories: {cat_1} , {cat_3}\n')

# Decision at the 5% significance level.
if pvalue < 0.05:
    print('Reject Null Hypothesis')
    print(f'There Exists Significant Difference for {num_1} in terms of: [{cat_1}], [{cat_3}]\n')
else:
    print('Fail to Reject Null Hypothesis')
    print(f'There is No Significant Difference for {num_1} in terms of: [{cat_1}], [{cat_3}]\n')

print(f'P-value = {pvalue}')
print(f'Statistics = {kk_stat}')


<h3>3.3 Categorical-Categorical</h3>


*   Visualization
*   Chi-Square Test:
https://www.youtube.com/watch?v=L1QPBGoDmT0



In [None]:
# Reminder of the non-numerical columns available for the two dropdowns below.
print(f'Non-num cols = {non_num_cols}')

In [None]:
# User input: the two non-numerical columns to compare.

# First non-numerical column.
choose_non_num_1 = widgets.Dropdown(
    description='Choose the 1st Non-Numerical Column You Want to Compare',
    options=non_num_cols,
    disabled=False,
    style=dict(description_width='initial'),
)
display(choose_non_num_1)

# Second non-numerical column.
choose_non_num_2 = widgets.Dropdown(
    description='Choose the 2nd Non-Numerical Column You Want to Compare',
    options=non_num_cols,
    disabled=False,
    style=dict(description_width='initial'),
)
display(choose_non_num_2)

In [None]:
# Read both dropdown selections and echo them.
non_num_1, non_num_2 = choose_non_num_1.value, choose_non_num_2.value

print(non_num_1, non_num_2)

In [None]:
# Plot the two selected non-numerical columns against each other, requesting
# only the stacked bar chart and the heat map. The call is the cell's last
# expression, so its return value is what the notebook renders.
plot(df,non_num_1,non_num_2,display=['Stacked Bar Chart','Heat Map'])

In [None]:
# Chi-square test, step 1: build the contingency table of observed
# frequencies (rows: first selected column, columns: second selected column).
from scipy.stats import chi2_contingency

chi_contigency = pd.crosstab(index=df[non_num_1], columns=df[non_num_2])
# Show the table as the cell output.
chi_contigency

In [None]:
# Chi-square test, step 2: test of independence on the contingency table.
c, p, dof, expected = chi2_contingency(chi_contigency)

# Decision at the 5% significance level.
if not (p < 0.05):
    print('Fail to Reject Null Hypothesis')
    print(f'The:\n [{non_num_1}],[{non_num_2}] are independent\n')
else:
    print('Reject Null Hypothesis')
    print(f'The:\n [{non_num_1}],[{non_num_2}] are not independent\n')

print(f'The P-value = {p}')