# Exploratory Data Analysis - Activity Time Data
This notebook performs EDA on the `stg_qbis__activity_time.csv` file from the Data Edge Internal Data folder.

In [2]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path

# Notebook-wide pandas display defaults: no limit on displayed columns,
# and max_rows set to 100.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# Plot styling: apply the 'seaborn-v0_8' matplotlib style first, then set the
# seaborn "husl" colour palette.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

In [3]:
# Set Data Directory Path
# The data folder can be overridden with the DATA_EDGE_DATA_DIR environment
# variable so the notebook is not tied to one machine; when the variable is
# unset, the original local path is used unchanged.
data_dir = os.environ.get(
    "DATA_EDGE_DATA_DIR",
    r"c:\Users\guine\Documents\BC#4\Hyper_python\BI project\Data Edge Internal Data",
)
file_path = os.path.join(data_dir, "stg_qbis__activity_time.csv")

# Verify file exists (report only; nothing is raised here).
# isfile rather than exists, so a directory with that name is not reported as found.
if os.path.isfile(file_path):
    print(f"✓ File found: {file_path}")
    print(f"File size: {os.path.getsize(file_path)} bytes")
else:
    print(f"✗ File not found: {file_path}")

✓ File found: c:\Users\guine\Documents\BC#4\Hyper_python\BI project\Data Edge Internal Data\stg_qbis__activity_time.csv
File size: 52639 bytes


In [4]:
# Load activity time data from the CSV at file_path
try:
    df_qbis_time = pd.read_csv(file_path)
except Exception as e:
    # Report the failure, then re-raise: swallowing it would leave
    # df_qbis_time undefined and surface later as an unrelated NameError.
    print(f"✗ Error loading data: {e}")
    raise
else:
    # Only reached when the read succeeded.
    print("✓ Data loaded successfully!")
    print(f"Dataset shape: {df_qbis_time.shape}")

✓ Data loaded successfully!
Dataset shape: (866, 8)


## Dataset Overview

In [6]:
# Basic Information
# Assemble the overview (shape, column names, dtypes, deep memory usage in KB)
# as a list of lines and emit it with a single print call.
overview_lines = [
    "=== BASIC INFORMATION ===",
    f"Shape: {df_qbis_time.shape}",
    f"Columns: {list(df_qbis_time.columns)}",
    f"Data types:\n{df_qbis_time.dtypes}",
    f"\nMemory usage: {df_qbis_time.memory_usage(deep=True).sum() / 1024:.2f} KB",
]
print("\n".join(overview_lines))

=== BASIC INFORMATION ===
Shape: (866, 8)
Columns: ['activity_time_id', 'employee_id', 'activity_id', 'activity_date', 'minutes', 'factor_value', 'notes_internal', 'processed_at']
Data types:
activity_time_id      int64
employee_id           int64
activity_id           int64
activity_date        object
minutes               int64
factor_value        float64
notes_internal       object
processed_at         object
dtype: object

Memory usage: 191.56 KB


In [7]:
# First Few Rows
# Show both ends of the frame, each under its own banner.
previews = (
    ("=== FIRST 5 ROWS ===", df_qbis_time.head()),
    ("\n=== LAST 5 ROWS ===", df_qbis_time.tail()),
)
for banner, rows in previews:
    print(banner)
    display(rows)

=== FIRST 5 ROWS ===


Unnamed: 0,activity_time_id,employee_id,activity_id,activity_date,minutes,factor_value,notes_internal,processed_at
0,750,5,20,2025-05-10,0,0.0,Data Innovation Summit + Interna möten fredag,2025-09-08 05:26:08.566517 UTC
1,1256,3,17,2025-06-26,30,1.0,,2025-09-08 05:26:08.566517 UTC
2,1255,3,17,2025-06-25,30,1.0,,2025-09-08 05:26:08.566517 UTC
3,1257,3,17,2025-06-27,30,1.0,,2025-09-08 05:26:08.566517 UTC
4,1564,3,21,2025-08-21,30,1.0,,2025-09-08 05:26:08.566517 UTC



=== LAST 5 ROWS ===


Unnamed: 0,activity_time_id,employee_id,activity_id,activity_date,minutes,factor_value,notes_internal,processed_at
861,230,4,14,2025-04-02,540,1.0,,2025-09-08 05:26:08.566517 UTC
862,222,4,14,2025-04-07,540,1.0,,2025-09-08 05:26:08.566517 UTC
863,209,4,14,2025-04-22,600,1.0,,2025-09-08 05:26:08.566517 UTC
864,210,4,14,2025-04-23,600,1.0,,2025-09-08 05:26:08.566517 UTC
865,359,5,18,2025-05-05,960,1.0,"FYI Lägger 8 tim extra som korrigering, då jag...",2025-09-08 05:26:08.566517 UTC


## Data Quality Assessment

In [8]:
# Missing Values Analysis
print("=== MISSING VALUES ===")
row_total = len(df_qbis_time)
missing_data = df_qbis_time.isna().sum()  # null count per column
missing_percent = missing_data.div(row_total).mul(100)
missing_df = pd.DataFrame(
    {'Missing Count': missing_data, 'Missing Percentage': missing_percent}
).sort_values(by='Missing Count', ascending=False)

# Only list the columns that actually contain missing values
has_missing = missing_df['Missing Count'] > 0
print(missing_df.loc[has_missing])

# Duplicates: count fully duplicated rows once and reuse the figure
duplicate_rows = df_qbis_time.duplicated().sum()
print("\n=== DUPLICATES ===")
print(f"Total duplicated rows: {duplicate_rows}")
print(f"Percentage of duplicates: {(duplicate_rows / row_total) * 100:.2f}%")

=== MISSING VALUES ===
                Missing Count  Missing Percentage
notes_internal            863            99.65358

=== DUPLICATES ===
Total duplicated rows: 0
Percentage of duplicates: 0.00%


## Statistical Summary

In [11]:
# Number of rows recorded per employee_id, most frequent first
df_qbis_time["employee_id"].value_counts()

employee_id
3     174
5      86
2      85
4      82
6      81
9      80
11     79
8      78
10     67
12     54
Name: count, dtype: int64