# Exploratory Data Analysis - HubSpot Sales Pipeline Stages
This notebook performs EDA on the `dim__hubspot_sales_pipeline_stages.csv` file from the Data Edge Internal Data folder.

In [1]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path

# Notebook-wide display configuration
# pandas: never truncate columns, cap printed rows at 100
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# plotting: seaborn-flavoured matplotlib style with the "husl" colour palette
sns.set_palette("husl") if False else plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

In [2]:
# Set Data Directory Path
# The folder can be overridden through the DATA_EDGE_DATA_DIR environment variable so the
# notebook is runnable on machines other than the author's; when the variable is not set,
# the original local path is used as the fallback.
data_dir = os.environ.get(
    "DATA_EDGE_DATA_DIR",
    r"c:\Users\guine\Documents\BC#4\Hyper_python\BI project\Data Edge Internal Data",
)
file_path = os.path.join(data_dir, "dim__hubspot_sales_pipeline_stages.csv")

# Verify file exists (report only; the load cell is responsible for failing on a bad path)
if os.path.exists(file_path):
    print(f"✓ File found: {file_path}")
    print(f"File size: {os.path.getsize(file_path)} bytes")
else:
    print(f"✗ File not found: {file_path}")

✓ File found: c:\Users\guine\Documents\BC#4\Hyper_python\BI project\Data Edge Internal Data\dim__hubspot_sales_pipeline_stages.csv
File size: 347 bytes


In [6]:
# Load HubSpot sales pipeline stage data from file_path into df_sales_hub
try:
    df_sales_hub = pd.read_csv(file_path)
except Exception as e:
    print(f"✗ Error loading data: {e}")
    # Re-raise so execution stops here; otherwise df_sales_hub stays undefined and
    # every later cell fails with a NameError that hides the real cause.
    raise
else:
    # Only reached when read_csv succeeded, so df_sales_hub is guaranteed to exist.
    print("✓ Data loaded successfully!")
    print(f"Dataset shape: {df_sales_hub.shape}")

✓ Data loaded successfully!
Dataset shape: (6, 6)


## Dataset Overview

In [7]:
# Basic Information: shape, column names, dtypes and deep memory footprint
column_names = list(df_sales_hub.columns)
total_bytes = df_sales_hub.memory_usage(deep=True).sum()  # deep=True also counts object/string payloads

print("=== BASIC INFORMATION ===")
print(f"Shape: {df_sales_hub.shape}")
print(f"Columns: {column_names}")
print(f"Data types:\n{df_sales_hub.dtypes}")
print(f"\nMemory usage: {total_bytes / 1024:.2f} KB")

=== BASIC INFORMATION ===
Shape: (6, 6)
Columns: ['pipeline_stage_id', 'pipeline_stage_order', 'pipeline_stage', 'close_probability', 'stage_is_archived', 'deal_is_closed']
Data types:
pipeline_stage_id        object
pipeline_stage_order      int64
pipeline_stage           object
close_probability       float64
stage_is_archived          bool
deal_is_closed             bool
dtype: object

Memory usage: 1.02 KB


In [8]:
# Preview both ends of the table (head/tail default to 5 rows each)
for banner, preview in (
    ("=== FIRST 5 ROWS ===", df_sales_hub.head()),
    ("\n=== LAST 5 ROWS ===", df_sales_hub.tail()),
):
    print(banner)
    display(preview)

=== FIRST 5 ROWS ===


Unnamed: 0,pipeline_stage_id,pipeline_stage_order,pipeline_stage,close_probability,stage_is_archived,deal_is_closed
0,1102499,1,Qualification,0.2,False,False
1,12008384,0,Lead,0.1,False,False
2,qualifiedtobuy,3,Proposal,0.6,False,False
3,appointmentscheduled,2,Exploration,0.4,False,False
4,closedlost,5,Closed Lost,0.0,False,True



=== LAST 5 ROWS ===


Unnamed: 0,pipeline_stage_id,pipeline_stage_order,pipeline_stage,close_probability,stage_is_archived,deal_is_closed
1,12008384,0,Lead,0.1,False,False
2,qualifiedtobuy,3,Proposal,0.6,False,False
3,appointmentscheduled,2,Exploration,0.4,False,False
4,closedlost,5,Closed Lost,0.0,False,True
5,closedwon,4,Closed Won,1.0,False,True


## Data Quality Assessment

In [9]:
# Missing Values Analysis: per-column null count and share of rows, largest first
row_count = len(df_sales_hub)

print("=== MISSING VALUES ===")
missing_data = df_sales_hub.isnull().sum()
missing_percent = (missing_data / row_count) * 100
missing_df = pd.DataFrame(
    {'Missing Count': missing_data, 'Missing Percentage': missing_percent}
).sort_values('Missing Count', ascending=False)

# Show only the columns that actually contain nulls
has_missing = missing_df['Missing Count'] > 0
print(missing_df[has_missing])

# Duplicates: fully identical rows, counted once and reused for both lines
duplicate_count = df_sales_hub.duplicated().sum()
duplicate_percent = (duplicate_count / row_count) * 100
print("\n=== DUPLICATES ===")
print(f"Total duplicated rows: {duplicate_count}")
print(f"Percentage of duplicates: {duplicate_percent:.2f}%")

=== MISSING VALUES ===
Empty DataFrame
Columns: [Missing Count, Missing Percentage]
Index: []

=== DUPLICATES ===
Total duplicated rows: 0
Percentage of duplicates: 0.00%


## Statistical Summary

=== IS_ACTIVE COLUMN SUMMARY ===
