<h1>HR Analytics: Job Change of Data Scientists

In [None]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
from scipy.stats import chi2_contingency
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, OneHotEncoder, MaxAbsScaler, LabelEncoder
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import mutual_info_classif

<h2>The hidden cell below defines a class named 'AutoEdaBinaryClassif' to perform auto EDA for any binary classification problem.

In [None]:
class AutoEdaBinaryClassif():
    def __init__(self):
        pass
    
    
    def primary_eda(self,data):
        '''
        Show a first overview of the input data frame: the first records (transposed),
        the number of rows and columns, the data type of every column and the
        summary statistics of the numeric and of the object (categorical) columns.

        params:
        data: input data frame.

        Returns: None.

        '''
        frame = data.copy()

        print("First few records")
        display(frame.head().T)

        n_rows, n_cols = frame.shape
        print(f'Dataset has {n_rows} records/rows and {n_cols} features/columns\n')

        # One "column: dtype" entry per column, in column order.
        dtype_names = frame.dtypes.astype('str')
        data_types = [f'{column}: {dtype}' for column, dtype in dtype_names.items()]
        print(f'Data types: {data_types}\n\n\nSummary of numeric variables:')
        display(frame.describe().T)

        print('Summary of categorical variables:')
        display(frame.describe(include='object').T)
    
    
    def missing_value_summary(self, data, show_plot=True, plot_width=8, plot_height=8):
        '''
        This method returns the missing value summary of an input data frame, prints it
        as a single line and optionally plots a heatmap of the missing values across
        columns and indices.

        params:
        data: input data frame. Column labels may be of any type; they are converted
        to text only for the printed line.
        show_plot: boolean to display the missing value plot, default value = True.
        plot_width: width of the heatmap, default value = 8.
        plot_height: height of the heatmap, default value = 8.

        Returns: Data frame with one row per column of 'data' and the columns
        'column', 'missing_count' and 'missing_percent' (rounded to 2 decimals).

        '''
        miss_val_cnt = data.isnull().sum().astype('int')
        miss_val_per = np.round((miss_val_cnt / len(data)) * 100, 2)
        missing_value_summary = pd.concat([miss_val_cnt, miss_val_per], axis=1).reset_index()
        missing_value_summary.columns = ['column', 'missing_count', 'missing_percent']

        # Build "column:count(percent%)" from the text form of every field so that
        # non-string column labels (e.g. integers) do not break the concatenation.
        miss_str = "; ".join(
            f'{column}:{count}({percent}%)'
            for column, count, percent in zip(
                missing_value_summary['column'].astype('str'),
                missing_value_summary['missing_count'].astype('str'),
                missing_value_summary['missing_percent'].astype('str'),
            )
        )
        print(f'Missing values: Format- column:count(percent)\n\n{miss_str}')

        if show_plot:
            plt.figure(figsize=(plot_width, plot_height))
            sns.heatmap(data.isnull(), cbar=False)
            plt.title('Missing values')
            plt.show()
        return missing_value_summary
    
    
    def class_separation(self, data, categorical_variables, method='pca', plot_alpha=0.5, target='target'):
        '''
        This method uses a 2-D scatter plot to visualize the class separation
        by reducing the data dimensions to 2, using PCA/SVD.

        Rows containing NaN are dropped first. For 'pca' the categorical variables are
        ordinal encoded and all features are min-max scaled; for any other value of
        'method' they are one-hot encoded, max-abs scaled and reduced with truncated SVD.

        params:
        data: input data frame.
        categorical_variables: list containing categorical variable/column names of 'data'.
        method: technique to be used for dimensionality reduction. Takes two values 'pca' or 'svd'.
        default value = 'pca'. Any value other than 'pca' selects SVD.
        plot_alpha: specifies the transparency of the markers in the scatter plot.
        default value = 0.5.
        target: name of the target column, default value = 'target'. Every other
        column is used as a feature, wherever the target sits in the frame.

        Returns: None.

        Raises: ValueError if no rows remain after dropping rows with NaN.

        '''
        categorical_variables = list(categorical_variables)
        data = data.dropna()
        print(f'{len(data)} examples remaining after dropping examples with NaN')
        if data.empty:
            raise ValueError('No examples left after dropping rows with NaN')

        # Select features by name instead of assuming the target is the last column.
        X = data.drop(columns=[target])
        y = LabelEncoder().fit_transform(data[target])

        if method == 'pca':
            encoder_name, encoder = 'ordinal_encoder', OrdinalEncoder()
            scaler = MinMaxScaler()
            reducer = PCA(n_components=2, random_state=11)
        else:
            # One-hot output may be sparse; MaxAbsScaler and TruncatedSVD accept that.
            encoder_name, encoder = 'ohe', OneHotEncoder()
            scaler = MaxAbsScaler()
            reducer = TruncatedSVD(n_components=2, random_state=11)

        col_transform = ColumnTransformer(transformers=[(encoder_name, encoder, categorical_variables)],
                                          remainder='passthrough')
        X = scaler.fit_transform(col_transform.fit_transform(X))

        components = pd.DataFrame(reducer.fit_transform(X), columns=["component_1", "component_2"])
        components['y'] = y

        print(f'Total explained variance ratio: {reducer.explained_variance_ratio_.sum()}\nTotal explained variance ratio less than 0.6 may not be reliable')

        plt.figure(figsize=(8, 8))
        sns.scatterplot(data=components, x='component_1', y="component_2", hue='y', alpha=plot_alpha)

            
    def density_plots(self, data, numeric_variables, plot_width = 15, plot_height = 20):
        '''
        Draw one KDE (density) subplot per numeric variable, laid out in two columns.

        params:
        data: input data frame.
        numeric_variables: list containing numeric variable/column names of 'data'.
        plot_width: width of the plot, default value = 15.
        plot_height: height of the plot, default value = 20.

        Returns: None

        '''
        columns = numeric_variables.copy()
        frame = data.copy()

        # Two plots per row; an odd number of variables needs one extra row.
        full_rows, leftover = divmod(len(columns), 2)
        n_rows = full_rows + 1 if leftover else full_rows

        frame[columns].plot(kind='kde',
                            subplots=True,
                            layout=(n_rows, 2),
                            figsize=(plot_width, plot_height),
                            sharex=False,
                            sharey=False,
                            color='black')
        
    
    def cdf(self, data, numeric_variables, plot_width=15, plot_height=20):
        '''
        This method plots the empirical cumulative distribution function (CDF) of the
        specified numeric variables, two plots per row.

        The CDF is the cumulative sum of the relative frequency of every distinct
        value (missing values are ignored by 'value_counts'). It is rounded to 3
        decimals after accumulating, so the curve ends at 1.0.

        params:
        data: input data frame.
        numeric_variables: list containing numeric variable/column names of 'data'.
        plot_width: width of the plot, default value = 15.
        plot_height: height of the plot, default value = 20.

        Returns: None. Nothing is drawn when 'numeric_variables' is empty.

        '''
        numeric_variables = list(numeric_variables)
        if not numeric_variables:
            return

        rows_in_plot = (len(numeric_variables) + 1) // 2
        # squeeze=False always yields a 2-D axes array, so one code path covers
        # both the single-row and the multi-row layout.
        fig, ax = plt.subplots(rows_in_plot, 2, figsize=(plot_width, plot_height), squeeze=False)
        axes = ax.ravel()

        for axis, variable in zip(axes, numeric_variables):
            proportions = data[variable].value_counts(normalize=True).sort_index()
            # Round after accumulating; rounding each proportion first lets the
            # rounding errors add up along the curve.
            val_cnts = np.round(proportions.cumsum(), 3)
            val_cnts.plot(kind='line', ax=axis, title=f'CDF of {variable}', color='black', xlabel=variable, ylabel='cdf')
            axis.yaxis.grid(color='lightgray', linestyle='dashed')
            axis.xaxis.grid(color='lightgray', linestyle='dashed')

        # Hide the unused cell of the grid when the number of variables is odd.
        for axis in axes[len(numeric_variables):]:
            axis.axis('off')
        plt.show()
        
        
    
    
    def normality_test(self, data, numeric_variables):
        '''
        This method performs a test for normality using 'normaltest' function of scipy.stats module
        and prints, for every variable, the verdict at a 0.05 significance level
        followed by the test result.

        Missing values are dropped before testing. With NaN present 'normaltest'
        returns a NaN p-value, which would otherwise be reported as 'Gaussian'.

        params:
        data: input data frame.
        numeric_variables: list containing numeric variable/column names of 'data'.

        Returns: None

        '''
        print("The test for normality is performed using 'normaltest' function of scipy.stats\n\nSignificance Level (alpha) : 0.05\n\n" +
        "h0:Sample comes from a normal distribution\nh1:Sample doesn't come from a normal distribution\n\n")
        for variable in numeric_variables:
            sample = data[variable].dropna()
            # Run the test once and reuse the result for both verdict and report.
            result = stats.normaltest(sample)
            verdict = 'Non-Gaussian' if result[1] < 0.05 else 'Gaussian'
            print(f"{variable}: {verdict}  {result}")
       
    
    def qqplots(self, data, numeric_variables):
        '''
        Show one Q-Q (probability) plot per specified numeric variable, each in its
        own figure titled with the variable name.

        params:
        data: input data frame.
        numeric_variables: list containing numeric variable/column names of 'data'.

        Returns: None.

        '''
        columns = numeric_variables.copy()
        frame = data.copy()
        for column in columns:
            stats.probplot(frame[column], plot=plt)
            plt.title(column)
            plt.show()
           
        
    def boxplots(self, data, numeric_variables, plot_width=15, plot_height=20):
        '''
        Draw one box plot per specified numeric variable, laid out in two columns.

        params:
        data: input data frame.
        numeric_variables: list containing numeric variable/column names of 'data'.
        plot_width: width of the plot, default value = 15.
        plot_height: height of the plot, default value = 20.

        Returns: None.

        '''
        columns = numeric_variables.copy()
        frame = data.copy()

        # Two plots per row; an odd number of variables needs one extra row.
        full_rows, leftover = divmod(len(columns), 2)
        n_rows = full_rows + 1 if leftover else full_rows

        frame[columns].plot(kind='box',
                            subplots=True,
                            layout=(n_rows, 2),
                            figsize=(plot_width, plot_height),
                            color='black')
        
        
    def skewness_test(self, data, numeric_variables):
        '''
        This method performs a test for skewness ('skewtest' of scipy.stats, 0.05
        significance level) and also displays the skewness value.

        Missing values are dropped before testing. With NaN present both the test
        and the skewness return NaN, which would otherwise be reported as 'Not Skewed'.

        params:
        data: input data frame.
        numeric_variables: list containing numeric variable/column names of 'data'.

        Returns: None

        '''
        for variable in numeric_variables:
            sample = data[variable].dropna()
            p_value = stats.skewtest(sample)[1]
            verdict = 'Skewed' if p_value < 0.05 else 'Not Skewed'
            print(f"{variable}: {verdict}  {stats.skew(sample)}")
            
    
    def feature_correlation(self, data, numeric_variables, corr_matrix_width = 10, corr_matrix_height = 10, plot = True, plot_width = 10, plot_height = 10):
        '''
        This method displays the Pearson & Kendall rank correlation coefficients of the specified
        numeric features of the input data frame. The correlation coefficients are displayed using
        seaborn's heatmap; only the lower triangle (without the diagonal) is shown.

        params:
        data: input data frame.
        numeric_variables: list containing numeric variable/column names of 'data'.
        corr_matrix_width: width of correlation heatmaps, default value = 10.
        corr_matrix_height: height of correlation heatmaps, default value = 10.
        plot: boolean to control the display of pairplots, default value = True.
        plot_width: total width of the pair plot, default value = 10.
        plot_height: total height of the pair plot, default value = 10.

        Returns: None.

        '''
        numeric_variables = list(numeric_variables)
        numeric_data = data[numeric_variables]

        for method, title in (('pearson', 'Pearson correlation'),
                              ('kendall', 'Kendall rank correlation')):
            corr = numeric_data.corr(method=method)
            # Mask by position (upper triangle + diagonal), not by correlation value:
            # a value-based mask leaves cells with a correlation of exactly 0 visible.
            mask = np.triu(np.ones_like(corr, dtype=bool))
            plt.figure(figsize=(corr_matrix_width, corr_matrix_height))
            sns.heatmap(corr, mask=mask,
                        annot=True, fmt='.2f',
                        cbar=False, cmap=['white'], linewidths=0.01, linecolor='black', square=True)
            plt.title(title)
            plt.show()

        if plot and numeric_variables:
            # pairplot builds its own figure, so the size has to be passed to it:
            # 'height' is per facet and 'aspect' is width / height of a facet.
            n_vars = len(numeric_variables)
            sns.pairplot(numeric_data, height=plot_height / n_vars, aspect=plot_width / plot_height)
            plt.show()
            
        
    
    def numeric_variables_vs_target(self, data, numeric_variables, target, plot_width=15, plot_height=20):
        '''
        This method plots violin plots of specified numeric variables vs target variable,
        two plots per row. The median of every class is overlaid as a dashed line and
        annotated with its value.

        params:
        data: input data frame.
        numeric_variables: list containing numeric variable/column names of 'data'.
        target: string specifying the name of target variable. The class labels may
        be of any type; they are not required to be coded as 0/1.
        plot_width: width of the plot, default value = 15.
        plot_height: height of the plot, default value = 20.

        Returns: None. Nothing is drawn when 'numeric_variables' is empty.

        '''
        numeric_variables = list(numeric_variables)
        if not numeric_variables:
            return

        rows_in_plot = (len(numeric_variables) + 1) // 2
        # squeeze=False always yields a 2-D axes array, so one code path covers
        # both the single-row and the multi-row layout.
        fig, ax = plt.subplots(rows_in_plot, 2, figsize=(plot_width, plot_height), squeeze=False)
        axes = ax.ravel()

        for axis, variable in zip(axes, numeric_variables):
            med = data[[variable, target]].groupby(target).median()
            classes = list(med.index)
            # Fix the violin order to the order of the medians, then draw the
            # medians at the violin positions (0, 1, ...) instead of at the
            # target values, which only coincide for 0/1 coded targets.
            sns.violinplot(x=target, y=variable, data=data, order=classes, ax=axis, palette=["gray", "lightgray"])
            positions = list(range(len(classes)))
            axis.plot(positions, med[variable].values, color='black', linewidth=3, linestyle="--")
            for position, value in zip(positions, med[variable]):
                axis.text(x=position + 0.05, y=value + 0.01, s=f'{np.round(value, 2)}',
                          fontsize=10, color='white', backgroundcolor='black')
            axis.set_title(variable.upper() + " by " + target)

        # Hide the unused cell of the grid when the number of variables is odd.
        for axis in axes[len(numeric_variables):]:
            axis.axis('off')
        plt.show()
        
        
    def categorical_variable_distribution(self, data, categorical_variables, plot=True, vars_to_exclude_from_plots=None, plot_width=10, plot_height=10):
        '''
        This method prints the percentage distribution of every specified categorical
        variable and optionally plots it as horizontal bar charts, two plots per row.
        Missing values are shown as their own category 'NaN/Missing'.

        params:
        data: input data frame.
        categorical_variables: list containing categorical variable/column names of 'data'.
        plot: boolean to control the display of the bar charts, default value = True.
        vars_to_exclude_from_plots: variables that are printed but not plotted
        (e.g. high-cardinality ones), default value = None (plot everything).
        Names that are not in 'categorical_variables' are ignored.
        plot_width: width of the plot, default value = 10.
        plot_height: height of the plot, default value = 10.

        Returns: None.

        '''
        categorical_variables = list(categorical_variables)
        excluded = list(vars_to_exclude_from_plots or [])

        # Fill only the analysed columns, as plain objects. Filling the whole frame
        # with a string fails on 'category' dtype columns and needlessly touches
        # the numeric columns.
        filled = {variable: data[variable].astype(object).fillna('NaN/Missing')
                  for variable in categorical_variables}

        for variable in categorical_variables:
            column = filled[variable]
            val_cnt = pd.DataFrame(np.round(column.value_counts(normalize=True) * 100, 2))
            val_cnt.columns = ['Proportion']
            n_unique = column.nunique()
            excl_str = ""
            if 'NaN/Missing' in list(val_cnt.index):
                # The placeholder is not a real category.
                n_unique -= 1
                excl_str = "(excluding 'NaN/Missing')"
            print(f"{variable}: {n_unique} unique categories{excl_str}\n{val_cnt}\n\n")

        if not plot:
            return
        plot_variables = [variable for variable in categorical_variables if variable not in excluded]
        if not plot_variables:
            return

        rows_in_plot = (len(plot_variables) + 1) // 2
        # squeeze=False always yields a 2-D axes array, so one code path covers
        # both the single-row and the multi-row layout.
        fig, ax = plt.subplots(rows_in_plot, 2, figsize=(plot_width, plot_height), squeeze=False)
        axes = ax.ravel()

        for axis, variable in zip(axes, plot_variables):
            counts = filled[variable].value_counts().sort_values()
            cat_dist = np.round((counts / counts.sum()) * 100, 1)
            cat_dist.plot(kind="barh", ax=axis, title=variable, color='black')
            # Bars sit at positions 0..n-1 in the order of 'cat_dist'.
            for position, share in enumerate(cat_dist.values):
                axis.text(y=position - 0.02, x=share, s=f'{np.round(share, 2)}%')
            axis.set_title(variable)

        # Hide the unused cell of the grid when the number of variables is odd.
        for axis in axes[len(plot_variables):]:
            axis.axis('off')
        plt.show()
            
            
    def categorical_variables_vs_target(self, data, categorical_variables, target, crosstabs=True, crosstab_show_missing=False, crosstab_proportion=True, col_proportion=True, plots=True, vars_to_exclude_from_plots=None,plot_width=15,plot_height=20):
        '''
        This method relates every specified categorical variable to the target:
        it prints a Chi-Square test of independence (0.05 significance level),
        optionally plots the class shares per category as stacked bars and
        optionally prints the crosstab of every variable with the target.

        params:
        data: input data frame.
        categorical_variables: list containing categorical variable/column names of 'data'.
        target: string specifying the name of target variable.
        crosstabs: boolean to control the printing of crosstabs, default value = True.
        crosstab_show_missing: boolean; if True missing values appear in the crosstabs
        as the category 'NaN/Missing', default value = False.
        crosstab_proportion: boolean; if True crosstabs hold percentages instead of
        counts, default value = True.
        col_proportion: boolean; if True the percentages are column-wise (per target
        class), otherwise row-wise (per category), default value = True.
        plots: boolean to control the display of the stacked bar charts, default value = True.
        vars_to_exclude_from_plots: variables left out of the plots only; they are still
        tested and still appear in the crosstabs. default value = None (plot everything).
        Names that are not in 'categorical_variables' are ignored.
        plot_width: width of the plot, default value = 15.
        plot_height: height of the plot, default value = 20.

        Returns: None.

        '''
        categorical_variables = list(categorical_variables)
        excluded = list(vars_to_exclude_from_plots or [])

        for variable in categorical_variables:
            ct = pd.crosstab(columns=data[variable], index=data[target])
            p = chi2_contingency(ct)[1]
            heading = f'Chi-Square test between {variable} & {target}'
            rule = '-' * len(heading)
            print(f"\n{rule}")
            print(heading)
            print(rule)
            print(f"\nH0: THERE IS NO RELATIONSHIP BETWEEN {target.upper()} & {variable.upper()}\nH1: THERE IS RELATIONSHIP BETWEEN {target.upper()} & {variable.upper()}")
            print(f"\nP-VALUE: {p}")
            print("REJECT H0" if p<0.05 else "FAILED TO REJECT H0")

        # The exclusion list applies to the plots only; a separate list keeps the
        # crosstabs below independent of whether plots are drawn.
        plot_variables = [variable for variable in categorical_variables if variable not in excluded]
        if plots and plot_variables:
            rows_in_plot = (len(plot_variables) + 1) // 2
            # squeeze=False always yields a 2-D axes array, so one code path covers
            # both the single-row and the multi-row layout.
            fig, ax = plt.subplots(rows_in_plot, 2, figsize=(plot_width, plot_height), squeeze=False)
            axes = ax.ravel()

            for axis, variable in zip(axes, plot_variables):
                # Scale the fractions to percent so the values match the axis label.
                shares = pd.crosstab(columns=data[variable], index=data[target], normalize="columns") * 100
                shares.T.plot(kind="bar", stacked=True, color=["black", "gray"], ax=axis)
                axis.set_ylabel("% of observations")
                axis.set_xlabel("")
                axis.set_title(f'{variable} vs {target}')

            # Hide the unused cell of the grid when the number of variables is odd.
            for axis in axes[len(plot_variables):]:
                axis.axis('off')
            plt.show()

        if crosstabs:
            if not crosstab_proportion:
                normalize, label = False, 'Crosstab'
            elif col_proportion:
                normalize, label = 'columns', 'Crosstab (Column-wise Proportion)'
            else:
                normalize, label = 'index', 'Crosstab (Row-wise Proportion)'

            target_values = data[target]
            if crosstab_show_missing and target_values.isnull().any():
                target_values = target_values.astype(object).fillna('NaN/Missing')

            for variable in categorical_variables:
                heading = f'{label} between {variable} & {target}'
                rule = '-' * len(heading)
                print(f"\n{rule}")
                print(heading)
                print(rule)

                values = data[variable]
                if crosstab_show_missing:
                    # Fill only this column, as plain objects; filling the whole
                    # frame with a string fails on 'category' dtype columns.
                    values = values.astype(object).fillna('NaN/Missing')
                ct = pd.crosstab(index=values, columns=target_values, normalize=normalize)
                if crosstab_proportion:
                    ct = np.round(ct * 100, 2)
                print(ct)
                
                
    def target_distribution(self, data, target):
        '''
        This method plots the number of records per target class and annotates
        every bar with its count and its percentage of all records.

        params:
        data: input data frame.
        target: string specifying the name of target variable. The class labels may
        be of any type; they are not required to be coded as 0/1.

        Returns: None.

        '''
        tgt_cnt = data[target].value_counts().sort_index()
        tgt_prop = np.round(data[target].value_counts(normalize=True) * 100, 1)
        labels = list(tgt_cnt.index)

        # Fix the bar order so that bar positions and annotations are guaranteed
        # to refer to the same class.
        sns.countplot(data=data, x=target, order=labels, palette=['black', 'gray'])
        for position, label in enumerate(labels):
            count = tgt_cnt.loc[label]
            plt.text(x=position - 0.2, y=count / 2,
                     s=f'{count:,} ({tgt_prop.loc[label]}%)', backgroundcolor='white')
        plt.title('Target Distribution')
        
        
    def mutual_info(self, data, categorical_variables, target, plot_width=15, plot_height=10):
        '''
        This method estimates the mutual information between every feature and the
        target with sklearn's 'mutual_info_classif' and plots it as a bar chart.

        Categorical variables are label encoded first. A feature is treated as
        discrete when it is one of 'categorical_variables' or has an integer dtype.

        params:
        data: input data frame.
        categorical_variables: list containing categorical variable/column names of 'data'.
        target: string specifying the name of target variable.
        plot_width: width of the plot, default value = 15.
        plot_height: height of the plot, default value = 10.

        Returns: Data frame with the columns 'feature' and 'MI', sorted by 'MI'
        in descending order.

        '''
        categorical_variables = list(categorical_variables)
        X = data.drop(columns=[target]).copy()
        y = data[target].values

        for variable in categorical_variables:
            X[variable] = LabelEncoder().fit_transform(X[variable])

        # Decide discreteness from the declared categorical list (plus integer
        # columns) rather than from one exact integer width of the encoder output.
        discrete_mask = np.array([
            (column in categorical_variables) or pd.api.types.is_integer_dtype(X[column])
            for column in X.columns
        ])

        scores = mutual_info_classif(X=X, y=y, discrete_features=discrete_mask, random_state=11)
        mutual_info_df = pd.DataFrame({'feature': X.columns, 'MI': scores}).sort_values(by='MI', ascending=False)

        # barh draws the first row at the bottom, so plot in ascending order and
        # annotate the bars in that same order.
        plot_df = mutual_info_df.sort_values(by='MI')
        plot_df.plot(x='feature', y='MI', kind='barh', figsize=(plot_width, plot_height), color='black', title='Mutual information')
        for position, score in enumerate(plot_df['MI']):
            plt.text(y=position - 0.01, x=score, s=np.round(score, 4))
        plt.show()

        return mutual_info_df
    
    def missing_value_analysis(self, data, col_to_analyze, cols_to_use, target):
        '''
        This method shows how the missing values of one column are distributed over
        the categories of other columns. For every category of every column in
        'cols_to_use' it computes the percentage of records in which
        'col_to_analyze' is missing, and displays the percentages, sorted in
        descending order, as two annotated heatmaps (first and second half).

        params:
        data: input data frame.
        col_to_analyze: name of the column whose missing values are analyzed.
        cols_to_use: list of column names whose categories are used for grouping.
        Records with a missing value in the grouping column are left out of that
        grouping. 'col_to_analyze' itself is skipped if listed.
        target: name of the target variable; accepted for interface compatibility,
        it is not used by the analysis.

        Returns: None. Nothing is drawn when there is nothing to group by.

        '''
        is_missing = data[col_to_analyze].isnull()

        parts = []
        for col in cols_to_use:
            if col == col_to_analyze:
                continue
            # Mean of a boolean mask per group = share of missing values per category.
            percent = np.round(is_missing.groupby(data[col]).mean() * 100, 2)
            percent.index = [f'{col}: {category}' for category in percent.index]
            parts.append(percent)
        if not parts:
            return

        # DataFrame.append no longer exists in pandas 2.x; concatenate once instead.
        df = pd.concat(parts).to_frame(name=col_to_analyze)
        df = df.sort_values(by=col_to_analyze, ascending=False)

        # Split the (wide) result into two heatmaps to keep the cells readable.
        n = int(np.ceil(len(df) / 2))
        for chunk in (df.iloc[:n, :], df.iloc[n:, :]):
            if chunk.empty:
                continue
            plt.figure(figsize=(20, 2))
            plt.title(f'Missing Values Analysis of {col_to_analyze}')
            sns.heatmap(chunk.T, annot=True, cbar=False, linewidths=0.01, fmt='.1f', cmap=['white'], linecolor='black', square=True)
            plt.show()


In [None]:
# Load the training data; the enrollee identifier carries no information for the analysis.
data = pd.read_csv(
    '../input/hr-analytics-job-change-of-data-scientists/aug_train.csv'
).drop(columns=['enrollee_id'])

<h2>Creating an instance of the AutoEdaBinaryClassif class

In [None]:
eda = AutoEdaBinaryClassif()

<h2>Preliminary EDA

In [None]:
print(eda.primary_eda.__doc__)

In [None]:
eda.primary_eda(data)

<h3>The data types of all the variables look good. 'city' and 'experience' have high cardinality. Neither of the numeric variables has missing values, but the counts of the categorical variables show that some of them do.

<h2>Missing Values Summary

In [None]:
print(eda.missing_value_summary.__doc__)

In [None]:
missing_data = eda.missing_value_summary(data,plot_width=10,plot_height=12)

<h3>'company_type', 'company_size', 'gender' and 'major_discipline' have a lot of missing values.

<h2>Storing Categorical & Numeric Features in Separate Variables

In [None]:
# The label column is numeric, so it has to be taken out of the numeric feature list.
target = 'target'

categorical_variables = data.select_dtypes('object').columns.tolist()

numeric_variables = data.select_dtypes(['float64', 'int64']).columns.tolist()
numeric_variables.remove('target')

print(
    f'Categorical Variables ({len(categorical_variables)}):\n{categorical_variables}\n\n'
    f'Numeric Variables ({len(numeric_variables)}):\n{numeric_variables}\n\n'
    f'Target:\n{target}'
)

<h2>Missing value analysis to identify MAR (missing at random) and NMAR (not missing at random) cases for variables with a high percentage of missing values

In [None]:
eda.missing_value_analysis(data=data,
                       col_to_analyze='major_discipline',
                       cols_to_use=['gender','relevent_experience','enrolled_university',
                                    'education_level','experience',
                                    'company_size','company_type','last_new_job'],
                       target=target)

<h3>In the above heatmap, we analyzed the missing values for 'major_discipline' variable. We find that 100% of the values are missing in 'major_discipline', where 'education_level' is High school and primary school. This is fine, as people who have primary school as their highest education can't have a major discipline. So this case must be marked separately.

In [None]:
eda.missing_value_analysis(data=data,
                       col_to_analyze='gender',
                       cols_to_use=['relevent_experience','enrolled_university',
                                    'education_level','experience','major_discipline',
                                    'company_size','company_type','last_new_job'],
                       target=target)

<h3>People with 'experience' less than or equal to 2 haven't disclosed their gender. We can't find any reason behind it.

In [None]:
eda.missing_value_analysis(data=data,
                       col_to_analyze='company_type',
                       cols_to_use=['gender','relevent_experience','enrolled_university',
                                    'education_level','experience','major_discipline',
                                    'company_size','last_new_job'],
                       target=target)

<h2>Visualizing Class Separation

In [None]:
print(eda.class_separation.__doc__)

<h3>Class Separation Using PCA

In [None]:
eda.class_separation(data=data,
                 categorical_variables=categorical_variables,
                 method='pca',
                 plot_alpha=0.5)

<h3>Class Separation With SVD

In [None]:
eda.class_separation(data=data,
                 categorical_variables=categorical_variables,
                 method='svd',
                 plot_alpha=0.5)

<h2>Distribution of Numeric Variables

In [None]:
print(eda.density_plots.__doc__)

In [None]:
eda.density_plots(data=data,
                  numeric_variables=numeric_variables,
                  plot_width = 12,
                  plot_height = 5)

<h2>Cumulative Distribution Function (CDF) of Numeric Variables

In [None]:
eda.cdf(data=data,
        numeric_variables=numeric_variables,
        plot_height=5)

<h3>The above CDF shows that ~50% of the 'city_development_index' values are less than 0.9, while 80% of the 'training_hours' values are less than 100.

<h2>Normality Tests

In [None]:
print(eda.normality_test.__doc__)

In [None]:
eda.normality_test(data=data,
                   numeric_variables=numeric_variables)

<h2>Q-Q Plots

In [None]:
print(eda.qqplots.__doc__)

In [None]:
eda.qqplots(data=data,
           numeric_variables=numeric_variables)

<h2>Box Plots To Detect Outliers

In [None]:
print(eda.boxplots.__doc__)

In [None]:
eda.boxplots(data=data,
            numeric_variables=numeric_variables,
            plot_width=10,
            plot_height=4)

<h2>Skewness Test

In [None]:
print(eda.skewness_test.__doc__)

In [None]:
eda.skewness_test(data=data,
           numeric_variables=numeric_variables)

<h2>Correlation Between Features

In [None]:
print(eda.feature_correlation.__doc__)

In [None]:
eda.feature_correlation(data=data,
           numeric_variables=numeric_variables,
           corr_matrix_width=3,
           corr_matrix_height=3)

<h3>We can see no relation between 'city_development_index' and 'training_hours'

<h2>Numeric Variables Vs Target

In [None]:
print(eda.numeric_variables_vs_target.__doc__)

In [None]:
eda.numeric_variables_vs_target(data=data,
                                numeric_variables=numeric_variables,
                                target=target,
                                plot_width=12,
                                plot_height=5)

<h3><ol><li>Relatively more employees stay with the companies located in cities with higher development index.</li>
    <li>Training hours have no major impact on the target variable. People staying with and leaving the company have a similar distribution of training hours.</li></ol></h3>

<h2>Categorical Variable Distribution

In [None]:
eda.categorical_variable_distribution(data=data,
                                     categorical_variables=categorical_variables,
                                     plot_width=20,
                                     plot_height=35,
                                     vars_to_exclude_from_plots=['city'])

<h2>Categorical Variables Vs Target

In [None]:
eda.categorical_variables_vs_target(data=data,
                                    categorical_variables=categorical_variables,
                                    target=target,
                                    vars_to_exclude_from_plots=['city'],
                                    plot_width=20,
                                    plot_height=35,
                                    col_proportion=False,
                                    crosstab_show_missing=False)

<h3><ol><li>No strong relationship between target and gender.</li>
    <li>People with relevant experience are more likely to stay with their current company</li>
    <li>People with higher 'experience' are likely to stay with their current company.</li>
    <li>People enrolled in a full time course are more likely to leave.</li>
    <li>Graduates and people with masters are more likely to leave.</li></ol></h3>

<h2>Target Distribution

In [None]:
eda.target_distribution(data=data,
                       target=target)

<h3>The target classes are imbalanced.

In [None]:
mutual_info = eda.mutual_info(data=data,
               categorical_variables=categorical_variables,
               target=target,
               plot_height=7)

<h1>WORK IN PROGRESS