In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import missingno as msno
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.ensemble import IsolationForest
from scipy import stats
import os
import kaleido
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Global plot theming for the notebook. The three calls run in this order;
# NOTE(review): sns.set presumably supersedes overlapping settings from the
# matplotlib style sheet selected on the first line - confirm before reordering.
plt.style.use('seaborn-v0_8-whitegrid')  # select the matplotlib style sheet
sns.set(style="whitegrid")  # apply seaborn's "whitegrid" style
sns.set_palette("Blues_r")  # make "Blues_r" the default seaborn palette

In [None]:
# Make sure the folder that receives exported figures exists.
# exist_ok=True keeps the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs('visualisasi', exist_ok=True)

In [3]:
def save_plotly(fig, filename):
    """Export a figure to ``visualisasi/<filename>.png`` and return it.

    Parameters
    ----------
    fig
        Object exposing ``write_image(path)`` (e.g. a Plotly figure).
    filename : str
        Base name of the image file, without the ``.png`` extension.

    Returns
    -------
    The same ``fig`` object that was passed in, so calls can be chained.
    """
    # Create the output folder on demand so this helper does not depend on a
    # separate setup cell having been executed first.
    os.makedirs('visualisasi', exist_ok=True)
    fig.write_image(f"visualisasi/{filename}.png")
    return fig

In [4]:
def styling_ax(ax, title, xlabel=None, ylabel=None):
    """Apply the notebook's common look to an axes object and return it.

    Sets a bold title, optionally sets the x/y axis labels (only when a
    truthy label is given), adjusts the tick label size and hides the top
    and right spines.

    Parameters
    ----------
    ax
        Axes-like object to style (modified in place).
    title : str
        Title text.
    xlabel, ylabel : str, optional
        Axis labels; skipped when ``None`` or empty.

    Returns
    -------
    The same ``ax`` object.
    """
    ax.set_title(title, fontsize=14, fontweight='bold', pad=20)

    # Look the setter up only when there is a label to apply.
    for setter_name, label_text in (('set_xlabel', xlabel), ('set_ylabel', ylabel)):
        if label_text:
            getattr(ax, setter_name)(label_text, fontsize=12)

    ax.tick_params(labelsize=10)

    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

    return ax

# Load Dataset

In [5]:
# Read the employee dataset from a CSV file located via a relative path.
df = pd.read_csv('./data/employee_data.csv')

In [6]:
# Report the number of rows and columns, then preview the first rows.
print(f"Jumlah baris data: {len(df)}")
print(f"Jumlah kolom: {len(df.columns)}")

df.head()

Jumlah baris data: 1470
Jumlah kolom: 35


Unnamed: 0,EmployeeId,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,1,38,,Travel_Frequently,1444,Human Resources,1,4,Other,1,...,2,80,1,7,2,3,6,2,1,2
1,2,37,1.0,Travel_Rarely,1141,Research & Development,11,2,Medical,1,...,1,80,0,15,2,1,1,0,0,0
2,3,51,1.0,Travel_Rarely,1323,Research & Development,4,4,Life Sciences,1,...,3,80,3,18,2,4,10,0,2,7
3,4,42,0.0,Travel_Frequently,555,Sales,26,3,Marketing,1,...,4,80,1,23,2,4,20,4,4,8
4,5,40,,Travel_Rarely,1194,Research & Development,2,4,Medical,1,...,2,80,3,20,2,3,5,3,0,2


In [10]:
# Print a heading, then show the dtype of every column as the cell output.
print("\nTipe data setiap kolom:")
df.dtypes


Tipe data setiap kolom:


EmployeeId                    int64
Age                           int64
Attrition                   float64
BusinessTravel               object
DailyRate                     int64
Department                   object
DistanceFromHome              int64
Education                     int64
EducationField               object
EmployeeCount                 int64
EnvironmentSatisfaction       int64
Gender                       object
HourlyRate                    int64
JobInvolvement                int64
JobLevel                      int64
JobRole                      object
JobSatisfaction               int64
MaritalStatus                object
MonthlyIncome                 int64
MonthlyRate                   int64
NumCompaniesWorked            int64
Over18                       object
OverTime                     object
PercentSalaryHike             int64
PerformanceRating             int64
RelationshipSatisfaction      int64
StandardHours                 int64
StockOptionLevel            

In [11]:
print("\nStatistik Deskriptif:")
# Leave the transposed summary as the cell's last expression so Jupyter renders
# it as a table; print() falls back to plain text that wraps wide frames.
df.describe().T


Statistik Deskriptif:
                           count          mean          std     min      25%  \
EmployeeId                1470.0    735.500000   424.496761     1.0   368.25   
Age                       1470.0     36.923810     9.135373    18.0    30.00   
Attrition                 1058.0      0.169187     0.375094     0.0     0.00   
DailyRate                 1470.0    802.485714   403.509100   102.0   465.00   
DistanceFromHome          1470.0      9.192517     8.106864     1.0     2.00   
Education                 1470.0      2.912925     1.024165     1.0     2.00   
EmployeeCount             1470.0      1.000000     0.000000     1.0     1.00   
EnvironmentSatisfaction   1470.0      2.721769     1.093082     1.0     2.00   
HourlyRate                1470.0     65.891156    20.329428    30.0    48.00   
JobInvolvement            1470.0      2.729932     0.711561     1.0     2.00   
JobLevel                  1470.0      2.063946     1.106940     1.0     1.00   
JobSatisfaction  

In [36]:
def plot_numeric_summary(df):
    """Build, export and return a grid of summary-statistic bar charts.

    For each of seven fixed numeric columns one subplot is drawn containing
    five bars: mean, median, minimum, maximum and standard deviation. The
    figure is exported through ``save_plotly`` under the name
    ``numeric_summary_subplots`` before being returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing every column listed in ``important_vars``; a missing
        column raises ``KeyError`` from the column lookup.

    Returns
    -------
    The figure created by ``make_subplots`` with all bar traces added.
    """
    important_vars = ['Age', 'MonthlyIncome', 'DistanceFromHome', 'YearsAtCompany',
                      'JobLevel', 'JobSatisfaction', 'WorkLifeBalance']

    n_cols = 3
    # Ceiling division: enough rows to hold every variable (3 for 7 variables).
    n_rows = -(-len(important_vars) // n_cols)

    fig = make_subplots(
        rows=n_rows,
        cols=n_cols,
        subplot_titles=important_vars,
        vertical_spacing=0.1,
        horizontal_spacing=0.1
    )

    colors = {'Mean': 'royalblue', 'Median': 'darkorange', 'Min': 'green',
              'Max': 'red', 'StdDev': 'purple'}

    for i, var in enumerate(important_vars):
        row_idx, col_idx = divmod(i, n_cols)

        # Named `summary` rather than `stats` so the scipy `stats` module
        # imported at the top of the notebook is not shadowed.
        summary = {
            'Mean': df[var].mean(),
            'Median': df[var].median(),
            'Min': df[var].min(),
            'Max': df[var].max(),
            'StdDev': df[var].std()
        }

        for stat, value in summary.items():
            fig.add_trace(
                go.Bar(
                    x=[stat],
                    y=[value],
                    name=stat,
                    marker_color=colors[stat],
                    # Same group for one statistic across all subplots, so the
                    # single legend entry toggles every matching bar.
                    legendgroup=stat,
                    # Only the first subplot contributes legend entries to
                    # avoid one duplicate entry per subplot.
                    showlegend=(i == 0)
                ),
                row=row_idx + 1, col=col_idx + 1
            )

    fig.update_layout(
        title_text="Statistik Deskriptif - Variabel Numerik",
        title_font_size=18,
        height=800,
        width=1000,
        legend_title="Statistik",
        plot_bgcolor='white'
    )

    save_plotly(fig, "numeric_summary_subplots")
    return fig

In [37]:
# Build the numeric summary figure (the helper also exports it as a PNG),
# then display it.
numeric_summary_fig = plot_numeric_summary(df)
numeric_summary_fig.show()

In [34]:
def plot_categorical_summary(df):
    """Build, export and return bar charts of category counts.

    One subplot per categorical column (seven fixed columns laid out on a
    3x3 grid) shows how often each category value occurs. The figure is
    exported through ``save_plotly`` as ``categorical_summary``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing every column listed in ``categorical_cols``.

    Returns
    -------
    The figure created by ``make_subplots`` with one bar trace per column.
    """
    color_palette = [
        'royalblue',
        'tomato',
        'mediumseagreen',
        'darkorchid',
        'darkorange',
        'darkturquoise',
        'crimson'
    ]
    categorical_cols = ['BusinessTravel', 'Department', 'EducationField',
                        'Gender', 'JobRole', 'MaritalStatus', 'OverTime']

    fig = make_subplots(rows=3, cols=3, subplot_titles=categorical_cols)

    # Columns and colours are paired by position; both lists have seven items.
    for idx, (column, bar_color) in enumerate(zip(categorical_cols, color_palette)):
        grid_row, grid_col = divmod(idx, 3)

        # Two-column frame: category value and its frequency.
        counts = (
            df[column]
            .value_counts()
            .rename_axis(column)
            .reset_index(name='Count')
        )

        bars = go.Bar(
            x=counts[column],
            y=counts['Count'],
            marker_color=bar_color,
            showlegend=False
        )
        fig.add_trace(bars, row=grid_row + 1, col=grid_col + 1)

    fig.update_layout(
        title_text="Distribusi Variabel Kategorikal",
        title_font_size=18,
        height=900,
        width=900,
        plot_bgcolor='white'
    )

    save_plotly(fig, "categorical_summary")
    return fig

In [None]:
# Build the categorical summary figure (the helper also exports it as a PNG),
# then display it.
categorical_summary_fig = plot_categorical_summary(df)
categorical_summary_fig.show()

# Data Cleaning

In [43]:
# Count missing entries per column and list only the columns that have any.
print("Jumlah nilai yang hilang dalam dataset:")
missing_values = df.isna().sum()
print(missing_values.loc[missing_values.gt(0)])

Jumlah nilai yang hilang dalam dataset:
Attrition    412
dtype: int64


In [None]:
# Frequency of each Attrition value; dropna=False keeps the missing (NaN)
# entries as their own row in the counts.
attrition_distribution = df['Attrition'].value_counts(dropna=False)
print("Distribusi nilai Attrition:")
print(attrition_distribution)

Distribusi nilai Attrition:
Attrition
0.0    879
NaN    412
1.0    179
Name: count, dtype: int64


## Handle Missing Values