# Exploratory Data Analysis - Employee Dimension Data
This notebook performs EDA on the `dim_employees_anon.csv` file from the Data Edge Internal Data folder.

In [1]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

In [2]:
# Set Data Directory Path
data_dir = r"c:\Users\guine\Documents\BC#4\Hyper_python\BI project\Data Edge Internal Data"
file_path = os.path.join(data_dir, "dim_employees_anon.csv")

# Verify file exists
if os.path.exists(file_path):
    print(f"✓ File found: {file_path}")
    print(f"File size: {os.path.getsize(file_path)} bytes")
else:
    print(f"✗ File not found: {file_path}")

✓ File found: c:\Users\guine\Documents\BC#4\Hyper_python\BI project\Data Edge Internal Data\dim_employees_anon.csv
File size: 488 bytes


In [3]:
# Load Employee Data
try:
    df_employees = pd.read_csv(file_path)
    print("✓ Data loaded successfully!")
    print(f"Dataset shape: {df_employees.shape}")
except Exception as e:
    print(f"✗ Error loading data: {e}")

✓ Data loaded successfully!
Dataset shape: (13, 6)


## Dataset Overview

In [4]:
# Basic Information
print("=== BASIC INFORMATION ===")
print(f"Shape: {df_employees.shape}")
print(f"Columns: {list(df_employees.columns)}")
print(f"Data types:\n{df_employees.dtypes}")
print(f"\nMemory usage: {df_employees.memory_usage(deep=True).sum() / 1024:.2f} KB")

=== BASIC INFORMATION ===
Shape: (13, 6)
Columns: ['employee_id', 'employee_code', 'first_name', 'last_name', 'is_active', 'practice']
Data types:
employee_id       int64
employee_code     int64
first_name       object
last_name        object
is_active          bool
practice         object
dtype: object

Memory usage: 2.74 KB


In [5]:
# First Few Rows
print("=== FIRST 5 ROWS ===")
display(df_employees.head())

print("\n=== LAST 5 ROWS ===")
display(df_employees.tail())

=== FIRST 5 ROWS ===


Unnamed: 0,employee_id,employee_code,first_name,last_name,is_active,practice
0,20,22,Luke,Rhinehart,True,
1,21,23,Alexander,Macedon,True,
2,12,18,Rachel,Lamb,True,Analytics
3,2,9,Bingo,Storm,True,Analytics
4,9,15,Astarion,Baldersson,True,Analytics



=== LAST 5 ROWS ===


Unnamed: 0,employee_id,employee_code,first_name,last_name,is_active,practice
8,11,3,ilya,Altman,True,Business
9,4,19,Gunnar,Gunnarsson,True,Business
10,10,12,Dag,Ström,True,Data
11,6,4,Jana,Bjork,True,Data
12,8,1,Gabriel,Lennartsson,True,Data


## Data Quality Assessment

In [6]:
# Missing Values Analysis
print("=== MISSING VALUES ===")
missing_data = df_employees.isnull().sum()
missing_percent = (missing_data / len(df_employees)) * 100
missing_df = pd.DataFrame({
    'Missing Count': missing_data,
    'Missing Percentage': missing_percent
}).sort_values('Missing Count', ascending=False)

print(missing_df[missing_df['Missing Count'] > 0])

# Duplicates
print(f"\n=== DUPLICATES ===")
print(f"Total duplicated rows: {df_employees.duplicated().sum()}")
print(f"Percentage of duplicates: {(df_employees.duplicated().sum() / len(df_employees)) * 100:.2f}%")

=== MISSING VALUES ===
          Missing Count  Missing Percentage
practice              2           15.384615

=== DUPLICATES ===
Total duplicated rows: 0
Percentage of duplicates: 0.00%


## Statistical Summary

In [9]:
df_employees.practice.value_counts()

practice
Analytics    4
Business     4
Data         3
Name: count, dtype: int64