## WGU D599: Data Preparation and Exploration
#### John D. Pickering

In [1]:
# import dependencies
import json
import csv
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import ast
import numpy as np
import shap
import plotly
from scipy.stats import zscore
import seaborn as sns
from collections import Counter
import re
from typing import Dict, List, Tuple, Any
import warnings
warnings.filterwarnings('ignore')

In [2]:
# read dataset into pandas as df
df = pd.read_csv('Employee Turnover Dataset.csv', low_memory=False)

In [3]:
# A1 - Identify the number of records and variables (columns)
# Rows: 10199
# Columns: 16
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10199 entries, 0 to 10198
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   EmployeeNumber                10199 non-null  int64  
 1   Age                           10199 non-null  int64  
 2   Tenure                        10199 non-null  int64  
 3   Turnover                      10199 non-null  object 
 4   HourlyRate                    10199 non-null  object 
 5   HoursWeekly                   10199 non-null  int64  
 6   CompensationType              10199 non-null  object 
 7   AnnualSalary                  10199 non-null  float64
 8   DrivingCommuterDistance       10199 non-null  int64  
 9   JobRoleArea                   10199 non-null  object 
 10  Gender                        10199 non-null  object 
 11  MaritalStatus                 10199 non-null  object 
 12  NumCompaniesPreviouslyWorked  9534 non-null   float64
 13  A

In [4]:
# A2 - List each variable and indicate the variable’s data type 
# (quantitative/numerical or qualitative/categorical) and data subtype (i.e., continuous/discrete or nominal/ordinal).
def variable_type_summary(df):
    summary = pd.DataFrame({
        'Column': df.columns,
        'Pandas_Dtype': df.dtypes.astype(str),
        'Non_Null_Count': df.notnull().sum()
    })

    summary['Variable_Type'] = summary['Pandas_Dtype'].apply(lambda x:
        'Quantitative' if 'int' in x or 'float' in x else
        'Qualitative'
    )

    def guess_subtype(col):
        if df[col].dtype in ['int64', 'float64']:
            unique_vals = df[col].dropna().unique()
            if df[col].dtype == 'int64' and len(unique_vals) < 20:
                return 'Discrete'
            else:
                return 'Continuous'
        elif df[col].dtype == 'object' or df[col].dtype.name == 'category':
            n_unique = df[col].nunique()
            if n_unique < 10:
                unique_vals = df[col].dropna().unique()
                return 'Ordinal' if sorted(unique_vals) == list(unique_vals) else 'Nominal'
            else:
                return 'Nominal'
        return 'Unknown'

    summary['Subtype'] = summary['Column'].apply(guess_subtype)

    return summary[['Column', 'Pandas_Dtype', 'Variable_Type', 'Subtype']]

summary_table = variable_type_summary(df)
summary_table


Unnamed: 0,Column,Pandas_Dtype,Variable_Type,Subtype
EmployeeNumber,EmployeeNumber,int64,Quantitative,Continuous
Age,Age,int64,Quantitative,Continuous
Tenure,Tenure,int64,Quantitative,Continuous
Turnover,Turnover,object,Qualitative,Nominal
HourlyRate,HourlyRate,object,Qualitative,Nominal
HoursWeekly,HoursWeekly,int64,Quantitative,Discrete
CompensationType,CompensationType,object,Qualitative,Ordinal
AnnualSalary,AnnualSalary,float64,Quantitative,Continuous
DrivingCommuterDistance,DrivingCommuterDistance,int64,Quantitative,Continuous
JobRoleArea,JobRoleArea,object,Qualitative,Nominal


In [5]:
# A3 - Identify a sample of observable values for each variable.
df.head(5).T

Unnamed: 0,0,1,2,3,4
EmployeeNumber,1,2,3,4,5
Age,28,33,22,23,40
Tenure,6,2,1,1,6
Turnover,Yes,Yes,No,No,No
HourlyRate,$24.37,$24.37,$22.52,$22.52,$88.77
HoursWeekly,40,40,40,40,40
CompensationType,Salary,Salary,Salary,Salary,Salary
AnnualSalary,50689.6,50689.6,46841.6,46841.6,284641.6
DrivingCommuterDistance,89,89,35,35,12
JobRoleArea,Research,Research,Information_Technology,Information_Technology,Sales


In [6]:
# B1 - Explain how you inspected the dataset to detect the following data quality issues: 
# Get total rows of duplicated data
df.duplicated().sum()

np.int64(99)

In [7]:
# B1 - Show duplicated data
df[df.duplicated()]

Unnamed: 0,EmployeeNumber,Age,Tenure,Turnover,HourlyRate,HoursWeekly,CompensationType,AnnualSalary,DrivingCommuterDistance,JobRoleArea,Gender,MaritalStatus,NumCompaniesPreviouslyWorked,AnnualProfessionalDevHrs,PaycheckMethod,TextMessageOptIn
10100,1,28,6,Yes,$24.37,40,Salary,50689.6,89,Research,Female,Married,3.0,7.0,Mail Check,Yes
10101,2,33,2,Yes,$24.37,40,Salary,50689.6,89,Research,Female,Married,6.0,7.0,Mail Check,Yes
10102,3,22,1,No,$22.52,40,Salary,46841.6,35,Information_Technology,Female,Single,1.0,8.0,Mailed Check,Yes
10103,4,23,1,No,$22.52,40,Salary,46841.6,35,Information_Technology,Female,Single,3.0,,Mailed Check,Yes
10104,5,40,6,No,$88.77,40,Salary,284641.6,12,Sales,Prefer Not to Answer,Single,7.0,,Mail Check,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10194,95,48,13,Yes,$85.40,40,Salary,177632.0,31,Research,Male,Single,7.0,5.0,Mail Check,
10195,96,54,17,No,$85.40,40,Salary,177632.0,31,Research,Male,Single,2.0,25.0,Mail Check,Yes
10196,97,44,6,No,$71.90,40,Salary,149552.0,32,Marketing,Male,Married,6.0,,Mail Check,Yes
10197,98,58,19,No,$71.90,40,Salary,149552.0,32,Marketing,Male,Married,5.0,23.0,Mail Check,Yes


In [8]:
def check_missing_for_column(df, column_name):
    if column_name not in df.columns:
        return f"Column '{column_name}' not found in DataFrame."
    
    total_rows = len(df)
    missing_count = df[column_name].isna().sum()
    missing_percent = round((missing_count / total_rows) * 100, 2)
    
    return {
        'Column': column_name,
        'Missing Count': missing_count,
        'Missing %': f"{missing_percent}%"
    }
check_missing_for_column(df, 'EmployeeNumber')

{'Column': 'EmployeeNumber', 'Missing Count': np.int64(0), 'Missing %': '0.0%'}

In [9]:
# 1 - EmployeeNumber
# Check first 5 rows of data
df['EmployeeNumber'].head()


0    1
1    2
2    3
3    4
4    5
Name: EmployeeNumber, dtype: int64

In [10]:
df.isna().sum()

EmployeeNumber                     0
Age                                0
Tenure                             0
Turnover                           0
HourlyRate                         0
HoursWeekly                        0
CompensationType                   0
AnnualSalary                       0
DrivingCommuterDistance            0
JobRoleArea                        0
Gender                             0
MaritalStatus                      0
NumCompaniesPreviouslyWorked     665
AnnualProfessionalDevHrs        1969
PaycheckMethod                     0
TextMessageOptIn                2266
dtype: int64

In [11]:
# B1 - Explain how you inspected the dataset to detect the following data quality issues: 
# missing values
# Get the number of missing values p/comlumn
df.isna().sum()

EmployeeNumber                     0
Age                                0
Tenure                             0
Turnover                           0
HourlyRate                         0
HoursWeekly                        0
CompensationType                   0
AnnualSalary                       0
DrivingCommuterDistance            0
JobRoleArea                        0
Gender                             0
MaritalStatus                      0
NumCompaniesPreviouslyWorked     665
AnnualProfessionalDevHrs        1969
PaycheckMethod                     0
TextMessageOptIn                2266
dtype: int64

In [12]:
# B1 - missing values 
# Show percent of values missing by column.
df.isna().mean() * 100

EmployeeNumber                   0.000000
Age                              0.000000
Tenure                           0.000000
Turnover                         0.000000
HourlyRate                       0.000000
HoursWeekly                      0.000000
CompensationType                 0.000000
AnnualSalary                     0.000000
DrivingCommuterDistance          0.000000
JobRoleArea                      0.000000
Gender                           0.000000
MaritalStatus                    0.000000
NumCompaniesPreviouslyWorked     6.520247
AnnualProfessionalDevHrs        19.305814
PaycheckMethod                   0.000000
TextMessageOptIn                22.217864
dtype: float64

In [13]:
# B1 - inconsistent entries
#  list all unique values in each categorical column
for col in df.select_dtypes(include='object'):
    print(f"{col}:", df[col].unique())

Turnover: ['Yes' 'No']
HourlyRate : ['$24.37 ' '$22.52 ' '$88.77 ' ... '$30.86 ' '$95.07 ' '$93.05 ']
CompensationType: ['Salary']
JobRoleArea: ['Research' 'Information_Technology' 'Sales' 'Human_Resources'
 'Laboratory' 'Manufacturing' 'Healthcare' 'Marketing'
 'InformationTechnology' 'HumanResources' 'Information Technology'
 'Human Resources']
Gender: ['Female' 'Prefer Not to Answer' 'Male']
MaritalStatus: ['Married' 'Single' 'Divorced']
PaycheckMethod: ['Mail Check' 'Mailed Check' 'Direct_Deposit' 'DirectDeposit'
 'Direct Deposit' 'Mail_Check' 'MailedCheck']
TextMessageOptIn: ['Yes' nan 'No']


In [14]:
# B1 - inconsistent entries
# Find rare categories
def find_rare_categories(df, column, threshold=10):
    value_counts = df[column].value_counts(dropna=False)
    rare = value_counts[value_counts < threshold]
    return rare.reset_index().rename(columns={'index': column, column: 'Count'})

# Check rare JobRoleArea values
rare_job_roles = find_rare_categories(df, 'JobRoleArea', threshold=10)
print(rare_job_roles)

Empty DataFrame
Columns: [Count, count]
Index: []


In [15]:
# B1 - Formatting Errors
# Check for data types to ensure each field is listed correctly
df.dtypes

EmployeeNumber                    int64
Age                               int64
Tenure                            int64
Turnover                         object
HourlyRate                       object
HoursWeekly                       int64
CompensationType                 object
AnnualSalary                    float64
DrivingCommuterDistance           int64
JobRoleArea                      object
Gender                           object
MaritalStatus                    object
NumCompaniesPreviouslyWorked    float64
AnnualProfessionalDevHrs        float64
PaycheckMethod                   object
TextMessageOptIn                 object
dtype: object

In [16]:
# B1 - Formatting issues
# 1. Check for leading/trailing whitespace in string columns (before cleaning)
string_columns = df.select_dtypes(include='object').columns
for col in string_columns:
    whitespace_issues = df[col].apply(lambda x: isinstance(x, str) and (x != x.strip()))
    if whitespace_issues.any():
        print(f"Column '{col}' has entries with leading/trailing whitespace.")

# 2. Check for inconsistent casing
for col in string_columns:
    unique_vals = df[col].dropna().unique()
    if any(v != v.title() for v in unique_vals if isinstance(v, str)):
        print(f"Column '{col}' has inconsistent casing:")
        print(pd.Series(unique_vals))

# 3. Check for special characters or formatting symbols in string columns
import re
for col in string_columns:
    if df[col].astype(str).str.contains(r'[\$%#@!&*]', regex=True).any():
        print(f"Column '{col}' contains special characters.")

# 4. Check for unexpected numeric types stored as objects
for col in string_columns:
    sample = df[col].dropna().sample(n=min(100, df[col].dropna().shape[0]), random_state=1)
    if sample.apply(lambda x: str(x).replace('.', '', 1).isdigit()).mean() > 0.8:
        print(f"Column '{col}' may be numeric but stored as object.")

# 5. Check for placeholder or dummy values (e.g., 'N/A', 'unknown', '-')
placeholder_values = ['n/a', 'na', 'unknown', '-', '--', 'none', 'null']
for col in string_columns:
    found = df[col].astype(str).str.lower().isin(placeholder_values).sum()
    if found > 0:
        print(f"Column '{col}' has {found} placeholder or dummy values.")


Column 'HourlyRate ' has entries with leading/trailing whitespace.
Column 'JobRoleArea' has inconsistent casing:
0                   Research
1     Information_Technology
2                      Sales
3            Human_Resources
4                 Laboratory
5              Manufacturing
6                 Healthcare
7                  Marketing
8      InformationTechnology
9             HumanResources
10    Information Technology
11           Human Resources
dtype: object
Column 'Gender' has inconsistent casing:
0                  Female
1    Prefer Not to Answer
2                    Male
dtype: object
Column 'PaycheckMethod' has inconsistent casing:
0        Mail Check
1      Mailed Check
2    Direct_Deposit
3     DirectDeposit
4    Direct Deposit
5        Mail_Check
6       MailedCheck
dtype: object
Column 'HourlyRate ' contains special characters.


In [19]:
summary_df = inspect_columns(df, df.columns)



       0
0   None
1   None
2   None
3   None
4   None
5   None
6   None
7   None
8   None
9   None
10  None
11  None
12  None
13  None
14  None
15  None


In [None]:
import pandas as pd
import numpy as np
from collections import Counter
import re
from typing import Dict, List, Tuple, Any

def inspect_data_quality(df: pd.DataFrame, 
                        numeric_columns: List[str] = None,
                        categorical_columns: List[str] = None,
                        outlier_method: str = 'iqr',
                        outlier_threshold: float = 1.5) -> Dict[str, Any]:
    """
    Comprehensive data quality inspection function that checks for:
    - Duplicate entries
    - Missing values
    - Inconsistent entries
    - Formatting errors
    - Outliers
    
    Parameters:
    -----------
    df : pd.DataFrame
        Input dataframe to inspect
    numeric_columns : List[str], optional
        List of numeric column names. If None, will auto-detect
    categorical_columns : List[str], optional
        List of categorical column names. If None, will auto-detect
    outlier_method : str, default 'iqr'
        Method for outlier detection ('iqr', 'zscore', 'modified_zscore')
    outlier_threshold : float, default 1.5
        Threshold for outlier detection
        
    Returns:
    --------
    Dict[str, Any] : Comprehensive report of data quality issues
    """
    
    report = {
        'dataset_overview': {},
        'duplicates': {},
        'missing_values': {},
        'inconsistent_entries': {},
        'formatting_errors': {},
        'outliers': {},
        'summary': {}
    }
    
    # Dataset Overview
    report['dataset_overview'] = {
        'total_rows': len(df),
        'total_columns': len(df.columns),
        'columns': list(df.columns),
        'data_types': df.dtypes.to_dict()
    }
    
    # Auto-detect column types if not specified
    if numeric_columns is None:
        numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
    if categorical_columns is None:
        categorical_columns = df.select_dtypes(include=['object', 'category']).columns.tolist()
    
    # 1. DUPLICATE ENTRIES
    print("🔍 Checking for duplicate entries...")
    
    # Full row duplicates
    full_duplicates = df.duplicated()
    report['duplicates']['full_row_duplicates'] = {
        'count': full_duplicates.sum(),
        'percentage': (full_duplicates.sum() / len(df)) * 100,
        'duplicate_indices': df[full_duplicates].index.tolist()
    }
    
    # Column-wise duplicate analysis
    column_duplicates = {}
    for col in df.columns:
        col_dups = df[col].duplicated()
        column_duplicates[col] = {
            'count': col_dups.sum(),
            'percentage': (col_dups.sum() / len(df)) * 100,
            'unique_values': df[col].nunique(),
            'unique_percentage': (df[col].nunique() / len(df)) * 100
        }
    
    report['duplicates']['column_wise'] = column_duplicates
    
    # 2. MISSING VALUES
    print("🔍 Checking for missing values...")
    
    missing_stats = {}
    for col in df.columns:
        missing_count = df[col].isnull().sum()
        missing_stats[col] = {
            'count': int(missing_count),
            'percentage': (missing_count / len(df)) * 100,
            'missing_indices': df[df[col].isnull()].index.tolist()
        }
        
        # Check for different representations of missing values
        if df[col].dtype == 'object':
            potential_missing = df[col].isin(['', ' ', 'NULL', 'null', 'NaN', 'nan', 'N/A', 'n/a', 'None', 'none'])
            if potential_missing.sum() > 0:
                missing_stats[col]['potential_missing_representations'] = {
                    'count': int(potential_missing.sum()),
                    'values': df[potential_missing][col].value_counts().to_dict()
                }
    
    report['missing_values'] = missing_stats
    
    # 3. INCONSISTENT ENTRIES
    print("🔍 Checking for inconsistent entries...")
    
    inconsistency_report = {}
    
    for col in categorical_columns:
        if col in df.columns:
            inconsistencies = {}
            
            # Case variations
            if df[col].dtype == 'object':
                values = df[col].dropna().astype(str)
                case_variations = {}
                
                # Group by lowercase to find case variations
                lowercase_groups = values.str.lower().value_counts()
                for lower_val in lowercase_groups.index:
                    original_variations = values[values.str.lower() == lower_val].unique()
                    if len(original_variations) > 1:
                        case_variations[lower_val] = original_variations.tolist()
                
                if case_variations:
                    inconsistencies['case_variations'] = case_variations
                
                # Whitespace issues
                whitespace_issues = {}
                for val in values.unique():
                    if val != val.strip():
                        whitespace_issues[val] = val.strip()
                
                if whitespace_issues:
                    inconsistencies['whitespace_issues'] = whitespace_issues
                
                # Similar values (potential typos)
                from difflib import SequenceMatcher
                unique_vals = values.unique()
                similar_pairs = []
                
                for i, val1 in enumerate(unique_vals):
                    for val2 in unique_vals[i+1:]:
                        similarity = SequenceMatcher(None, str(val1).lower(), str(val2).lower()).ratio()
                        if 0.8 <= similarity < 1.0:  # High similarity but not identical
                            similar_pairs.append({
                                'value1': val1,
                                'value2': val2,
                                'similarity': similarity,
                                'count1': (values == val1).sum(),
                                'count2': (values == val2).sum()
                            })
                
                if similar_pairs:
                    inconsistencies['similar_values'] = similar_pairs
            
            if inconsistencies:
                inconsistency_report[col] = inconsistencies
    
    report['inconsistent_entries'] = inconsistency_report
    
    # 4. FORMATTING ERRORS
    print("🔍 Checking for formatting errors...")
    
    formatting_errors = {}
    
    for col in df.columns:
        col_errors = {}
        
        if df[col].dtype == 'object':
            values = df[col].dropna().astype(str)
            
            # Check for mixed data types in string columns
            numeric_pattern = re.compile(r'^-?\d+\.?\d*$')
            date_pattern = re.compile(r'\d{1,4}[-/]\d{1,2}[-/]\d{1,4}')
            email_pattern = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')
            
            mixed_types = {
                'numeric_like': values[values.str.match(numeric_pattern, na=False)].tolist(),
                'date_like': values[values.str.match(date_pattern, na=False)].tolist(),
                'email_like': values[values.str.match(email_pattern, na=False)].tolist()
            }
            
            # Remove empty lists
            mixed_types = {k: v for k, v in mixed_types.items() if v}
            if mixed_types:
                col_errors['mixed_data_types'] = mixed_types
            
            # Check for unusual characters or encoding issues
            unusual_chars = []
            for val in values.unique()[:100]:  # Check first 100 unique values
                if any(ord(char) > 127 for char in str(val)):  # Non-ASCII characters
                    unusual_chars.append(val)
            
            if unusual_chars:
                col_errors['unusual_characters'] = unusual_chars[:10]  # Show first 10
        
        # Check numeric columns stored as strings
        elif col in numeric_columns and df[col].dtype == 'object':
            non_numeric = df[~df[col].str.match(r'^-?\d+\.?\d*$', na=False)][col].dropna()
            if len(non_numeric) > 0:
                col_errors['non_numeric_in_numeric_column'] = non_numeric.tolist()[:10]
        
        if col_errors:
            formatting_errors[col] = col_errors
    
    report['formatting_errors'] = formatting_errors
    
    # 5. OUTLIERS
    print("🔍 Checking for outliers...")
    
    outlier_report = {}
    
    for col in numeric_columns:
        if col in df.columns and df[col].dtype in ['int64', 'float64']:
            col_data = df[col].dropna()
            
            if len(col_data) == 0:
                continue
                
            outliers = {}
            
            if outlier_method == 'iqr':
                Q1 = col_data.quantile(0.25)
                Q3 = col_data.quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - outlier_threshold * IQR
                upper_bound = Q3 + outlier_threshold * IQR
                
                outlier_mask = (col_data < lower_bound) | (col_data > upper_bound)
                outlier_values = col_data[outlier_mask]
                
                outliers['method'] = 'IQR'
                outliers['bounds'] = {'lower': lower_bound, 'upper': upper_bound}
                
            elif outlier_method == 'zscore':
                z_scores = np.abs((col_data - col_data.mean()) / col_data.std())
                outlier_mask = z_scores > outlier_threshold
                outlier_values = col_data[outlier_mask]
                
                outliers['method'] = 'Z-Score'
                outliers['threshold'] = outlier_threshold
                
            elif outlier_method == 'modified_zscore':
                median = col_data.median()
                mad = np.median(np.abs(col_data - median))
                modified_z_scores = 0.6745 * (col_data - median) / mad
                outlier_mask = np.abs(modified_z_scores) > outlier_threshold
                outlier_values = col_data[outlier_mask]
                
                outliers['method'] = 'Modified Z-Score'
                outliers['threshold'] = outlier_threshold
            
            if len(outlier_values) > 0:
                outliers.update({
                    'count': len(outlier_values),
                    'percentage': (len(outlier_values) / len(col_data)) * 100,
                    'values': outlier_values.tolist(),
                    'indices': outlier_values.index.tolist(),
                    'statistics': {
                        'min_outlier': outlier_values.min(),
                        'max_outlier': outlier_values.max(),
                        'mean_outlier': outlier_values.mean()
                    }
                })
                
                outlier_report[col] = outliers
    
    report['outliers'] = outlier_report
    
    # 6. SUMMARY
    print("📊 Generating summary...")
    
    total_issues = 0
    issue_categories = []
    
    if report['duplicates']['full_row_duplicates']['count'] > 0:
        total_issues += report['duplicates']['full_row_duplicates']['count']
        issue_categories.append('duplicates')
    
    missing_issues = sum([stats['count'] for stats in report['missing_values'].values()])
    if missing_issues > 0:
        total_issues += missing_issues
        issue_categories.append('missing_values')
    
    if report['inconsistent_entries']:
        total_issues += len(report['inconsistent_entries'])
        issue_categories.append('inconsistent_entries')
    
    if report['formatting_errors']:
        total_issues += len(report['formatting_errors'])
        issue_categories.append('formatting_errors')
    
    outlier_issues = sum([stats['count'] for stats in report['outliers'].values()])
    if outlier_issues > 0:
        total_issues += outlier_issues
        issue_categories.append('outliers')
    
    report['summary'] = {
        'total_issues_found': total_issues,
        'issue_categories': issue_categories,
        'data_quality_score': max(0, 100 - (total_issues / len(df)) * 100),
        'recommendations': []
    }
    
    # Add recommendations
    recommendations = []
    if report['duplicates']['full_row_duplicates']['count'] > 0:
        recommendations.append("Remove duplicate rows to improve data integrity")
    if missing_issues > 0:
        recommendations.append("Handle missing values through imputation or removal")
    if report['inconsistent_entries']:
        recommendations.append("Standardize categorical values and fix case/whitespace issues")
    if report['formatting_errors']:
        recommendations.append("Clean formatting errors and ensure consistent data types")
    if outlier_issues > 0:
        recommendations.append("Investigate outliers - they may indicate data errors or genuine extreme values")
    
    report['summary']['recommendations'] = recommendations
    
    print("✅ Data quality inspection completed!")
    return report

def print_quality_report(report: Dict[str, Any]) -> None:
    """
    Print a formatted version of the data quality report
    """
    print("=" * 80)
    print("📋 DATA QUALITY INSPECTION REPORT")
    print("=" * 80)
    
    # Dataset Overview
    print("\n📊 DATASET OVERVIEW")
    print("-" * 40)
    overview = report['dataset_overview']
    print(f"Total Rows: {overview['total_rows']:,}")
    print(f"Total Columns: {overview['total_columns']}")
    print(f"Columns: {', '.join(overview['columns'])}")
    
    # Summary
    print("\n🎯 SUMMARY")
    print("-" * 40)
    summary = report['summary']
    print(f"Data Quality Score: {summary['data_quality_score']:.1f}/100")
    print(f"Total Issues Found: {summary['total_issues_found']:,}")
    print(f"Issue Categories: {', '.join(summary['issue_categories']) if summary['issue_categories'] else 'None'}")
    
    # Recommendations
    if summary['recommendations']:
        print("\n💡 RECOMMENDATIONS")
        print("-" * 40)
        for i, rec in enumerate(summary['recommendations'], 1):
            print(f"{i}. {rec}")
    
    # Detailed findings
    print("\n🔍 DETAILED FINDINGS")
    print("-" * 40)
    
    # Duplicates
    dup_count = report['duplicates']['full_row_duplicates']['count']
    print(f"Duplicate Rows: {dup_count:,} ({report['duplicates']['full_row_duplicates']['percentage']:.2f}%)")
    
    # Missing Values
    missing_cols = [col for col, stats in report['missing_values'].items() if stats['count'] > 0]
    print(f"Columns with Missing Values: {len(missing_cols)}")
    if missing_cols:
        for col in missing_cols[:5]:  # Show top 5
            stats = report['missing_values'][col]
            print(f"  • {col}: {stats['count']:,} missing ({stats['percentage']:.2f}%)")
    
    # Inconsistencies
    inconsistent_cols = len(report['inconsistent_entries'])
    print(f"Columns with Inconsistencies: {inconsistent_cols}")
    
    # Formatting Errors
    format_error_cols = len(report['formatting_errors'])
    print(f"Columns with Formatting Errors: {format_error_cols}")
    
    # Outliers
    outlier_cols = len(report['outliers'])
    print(f"Columns with Outliers: {outlier_cols}")
    if outlier_cols > 0:
        for col, stats in list(report['outliers'].items())[:3]:  # Show top 3
            print(f"  • {col}: {stats['count']:,} outliers ({stats['percentage']:.2f}%)")
    
    print("\n" + "=" * 80)

# Example usage:
if __name__ == "__main__":
    # Load your dataset
    df = pd.read_csv('Employee Turnover Dataset.csv')
    
    # Define column types (adjust based on your data)
    numeric_cols = ['EmployeeNumber', 'Age', 'Tenure', 'HoursWeekly', 
                   'AnnualSalary', 'DrivingCommuterDistance', 
                   'NumCompaniesPreviouslyWorked', 'AnnualProfessionalDevHrs']
    
    categorical_cols = ['Turnover', 'HourlyRate ', 'CompensationType', 
                       'JobRoleArea', 'Gender', 'MaritalStatus', 
                       'PaycheckMethod', 'TextMessageOptIn']
    
    # Run inspection
    quality_report = inspect_data_quality(
        df, 
        numeric_columns=numeric_cols,
        categorical_columns=categorical_cols,
        outlier_method='iqr',
        outlier_threshold=1.5
    )
    
    # Print formatted report
    print_quality_report(quality_report)

In [None]:
# B1 - Formatting issues
df['Gender'].value_counts()  # or compare .str.lower() vs .str.title()

In [None]:
# B1 - Outliers
# Annual Salary - Look for outliers in an inv
sns.boxplot(x=df['AnnualSalary'])

In [None]:
# B1 - Outliers 
# Age
sns.boxplot(x=df['Age'])

In [None]:
# B1 - Outliers
# Driving Communter Distance
sns.boxplot(x=df['DrivingCommuterDistance'])

In [None]:
# B1 - Outliers
# Annual Professional DevHrs
sns.boxplot(x=df['AnnualProfessionalDevHrs'])

In [None]:
z_scores = zscore(df['AnnualSalary'].dropna())
outliers = df[(abs(z_scores) > 3)]

In [None]:
# ----------------------------------------
# C - Clean the data
# ----------------------------------------

# Step 0: Strip column names
df.columns = df.columns.str.strip()

# Step 1: Remove Duplicates
df_cleaned = df.drop_duplicates().copy()

# Step 2: Handle Missing Values
if 'AnnualProfessionalDevHrs' in df_cleaned.columns:
    median_dev_hours = df_cleaned['AnnualProfessionalDevHrs'].median()
    df_cleaned.loc[:, 'AnnualProfessionalDevHrs'] = df_cleaned['AnnualProfessionalDevHrs'].fillna(median_dev_hours)

# Step 3: Fix Inconsistent Entries
if 'PaycheckMethod' in df_cleaned.columns:
    df_cleaned.loc[:, 'PaycheckMethod'] = (
        df_cleaned['PaycheckMethod']
        .astype(str)
        .str.strip()
        .replace({
            'Mailed Check': 'Mail Check',
            'Mail_Check': 'Mail Check',
            'Mailedcheck': 'Mail Check',
            'DirectDeposit': 'Direct Deposit',
            'Direct_Deposit': 'Direct Deposit'
        })
    )

if 'JobRoleArea' in df_cleaned.columns:
    df_cleaned.loc[:, 'JobRoleArea'] = df_cleaned['JobRoleArea'].replace({
        'InformationTechnology': 'Information Technology',
        'Information_Technology': 'Information Technology',
        'HumanResources': 'Human Resources',
        'Human_Resources': 'Human Resources'
    })

text_columns = ['Gender', 'MaritalStatus', 'CompensationType', 'JobRoleArea', 'TextMessageOptIn', 'PaycheckMethod']
for col in text_columns:
    if col in df_cleaned.columns:
        df_cleaned.loc[:, col] = df_cleaned[col].astype(str).str.strip().str.title()

# Step 4: Fix Formatting
if 'HourlyRate' in df_cleaned.columns and df_cleaned['HourlyRate'].dtype == 'object':
    df_cleaned.loc[:, 'HourlyRate'] = (
        df_cleaned['HourlyRate']
        .astype(str)
        .str.replace('$', '', regex=False)
        .str.strip()
        .astype(float)
    )

# Safely strip whitespace from all object-type string fields
for col in df_cleaned.select_dtypes(include='object').columns:
    df_cleaned.loc[:, col] = df_cleaned[col].astype(str).str.strip()

# Step 5: Handle Outliers
if 'AnnualSalary' in df_cleaned.columns:
    Q1 = df_cleaned['AnnualSalary'].quantile(0.25)
    Q3 = df_cleaned['AnnualSalary'].quantile(0.75)
    IQR = Q3 - Q1
    upper_bound = Q3 + 1.5 * IQR
    df_cleaned.loc[:, 'AnnualSalary'] = df_cleaned['AnnualSalary'].apply(lambda x: min(x, upper_bound))


In [None]:
# Check for unique values post cleaning
for col in df_cleaned.select_dtypes(include='object'):
    print(f"{col}:", df_cleaned[col].unique())

In [None]:
# Export cleaned file
df_cleaned.to_csv('Employee_Turnover_Cleaned.csv', index=False)
print('Cleaned File exported')