### D599 Data Preparation and Exploration Task 1
#### John D. Pickering
#### Environment: Jupyter Notebook
#### Language: Python with Pandas

In [1]:
# import dependencies
import json
import csv
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import ast
import numpy as np
import plotly
from scipy.stats import zscore
import seaborn as sns
from collections import Counter
import re
from typing import Dict, List, Tuple, Any
import warnings

warnings.filterwarnings('ignore')

In [2]:
# Read dataseet into pandas
df = pd.read_csv('Employee Turnover Dataset.csv', low_memory=False)

In [3]:
# Clean up column names: strip spaces, lowercase, replace internal spaces
df.columns = (
    df.columns.str.strip()   # remove leading/trailing whitespace
              .str.replace(" ", "_")  # replace spaces with underscores
              .str.lower()   # optional: standardize casing
)

In [4]:
# A1 - Identify the number of records and variables (columns) + ensure dataset has been read into Pandas
# Rows: 10199
# Columns: 16
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10199 entries, 0 to 10198
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   employeenumber                10199 non-null  int64  
 1   age                           10199 non-null  int64  
 2   tenure                        10199 non-null  int64  
 3   turnover                      10199 non-null  object 
 4   hourlyrate                    10199 non-null  object 
 5   hoursweekly                   10199 non-null  int64  
 6   compensationtype              10199 non-null  object 
 7   annualsalary                  10199 non-null  float64
 8   drivingcommuterdistance       10199 non-null  int64  
 9   jobrolearea                   10199 non-null  object 
 10  gender                        10199 non-null  object 
 11  maritalstatus                 10199 non-null  object 
 12  numcompaniespreviouslyworked  9534 non-null   float64
 13  a

In [5]:
# B1 - Explain how you inspected the dataset to detect the following data quality issues: 
# Get total rows of duplicated data
df.duplicated().sum()

np.int64(99)

In [6]:
# A1 - Show variable data for review
df.head(5).T

Unnamed: 0,0,1,2,3,4
employeenumber,1,2,3,4,5
age,28,33,22,23,40
tenure,6,2,1,1,6
turnover,Yes,Yes,No,No,No
hourlyrate,$24.37,$24.37,$22.52,$22.52,$88.77
hoursweekly,40,40,40,40,40
compensationtype,Salary,Salary,Salary,Salary,Salary
annualsalary,50689.6,50689.6,46841.6,46841.6,284641.6
drivingcommuterdistance,89,89,35,35,12
jobrolearea,Research,Research,Information_Technology,Information_Technology,Sales


In [7]:
# Show Numeric Analysis
df.describe()
# Can see that there is an issue with Min/Max for Driving Commuter Distance
# Can see negatives for AnnualSalary

Unnamed: 0,employeenumber,age,tenure,hoursweekly,annualsalary,drivingcommuterdistance,numcompaniespreviouslyworked,annualprofessionaldevhrs
count,10199.0,10199.0,10199.0,10199.0,10199.0,10199.0,9534.0,8230.0
mean,5001.960977,44.028826,8.992744,40.0,120947.568526,45.411903,4.21481,14.938518
std,2942.709195,10.217864,5.511985,0.0,77566.715759,54.01175,2.481994,6.087415
min,1.0,21.0,1.0,40.0,-33326.4,-275.0,1.0,5.0
25%,2451.5,37.0,5.0,40.0,63252.8,13.0,2.0,10.0
50%,5001.0,44.0,8.0,40.0,101566.4,42.0,4.0,15.0
75%,7550.5,53.0,13.0,40.0,153878.4,71.0,6.0,20.0
max,10100.0,61.0,20.0,40.0,339950.4,950.0,9.0,25.0


In [8]:
# A2 - List each variable and indicate the variable’s data type and attempt 
# (quantitative/numerical or qualitative/categorical) and data subtype (i.e., continuous/discrete or nominal/ordinal).
def variable_type_summary(df):
    summary = pd.DataFrame({
        'Column': df.columns,
        'Pandas_Dtype': df.dtypes.astype(str),
        'Non_Null_Count': df.notnull().sum()
    })

    summary['Variable_Type'] = summary['Pandas_Dtype'].apply(lambda x:
        'Quantitative' if 'int' in x or 'float' in x else
        'Qualitative'
    )

    def guess_subtype(col):
        if df[col].dtype in ['int64', 'float64']:
            unique_vals = df[col].dropna().unique()
            if df[col].dtype == 'int64' and len(unique_vals) < 20:
                return 'Discrete'
            else:
                return 'Continuous'
        elif df[col].dtype == 'object' or df[col].dtype.name == 'category':
            n_unique = df[col].nunique()
            if n_unique < 10:
                unique_vals = df[col].dropna().unique()
                return 'Ordinal' if sorted(unique_vals) == list(unique_vals) else 'Nominal'
            else:
                return 'Nominal'
        return 'Unknown'

    summary['Subtype'] = summary['Column'].apply(guess_subtype)

    return summary[['Column', 'Pandas_Dtype', 'Variable_Type', 'Subtype']]

summary_table = variable_type_summary(df)
summary_table

Unnamed: 0,Column,Pandas_Dtype,Variable_Type,Subtype
employeenumber,employeenumber,int64,Quantitative,Continuous
age,age,int64,Quantitative,Continuous
tenure,tenure,int64,Quantitative,Continuous
turnover,turnover,object,Qualitative,Nominal
hourlyrate,hourlyrate,object,Qualitative,Nominal
hoursweekly,hoursweekly,int64,Quantitative,Discrete
compensationtype,compensationtype,object,Qualitative,Ordinal
annualsalary,annualsalary,float64,Quantitative,Continuous
drivingcommuterdistance,drivingcommuterdistance,int64,Quantitative,Continuous
jobrolearea,jobrolearea,object,Qualitative,Nominal


In [9]:
# B2 - Check Missing values by column
def missing_values_by_column(dataframe):
    missing_counts = dataframe.isnull().sum()
    missing_percentage = (missing_counts / len(dataframe)) * 100
    missing_df = pd.DataFrame({
        'Missing Values': missing_counts,
        'Percentage': missing_percentage
    }).sort_values(by='Missing Values', ascending=False)
    return missing_df

# Run the function
missing_df = missing_values_by_column(df)

# Display the results
print(missing_df)

                              Missing Values  Percentage
textmessageoptin                        2266   22.217864
annualprofessionaldevhrs                1969   19.305814
numcompaniespreviouslyworked             665    6.520247
employeenumber                             0    0.000000
hourlyrate                                 0    0.000000
age                                        0    0.000000
tenure                                     0    0.000000
turnover                                   0    0.000000
annualsalary                               0    0.000000
compensationtype                           0    0.000000
hoursweekly                                0    0.000000
drivingcommuterdistance                    0    0.000000
maritalstatus                              0    0.000000
gender                                     0    0.000000
jobrolearea                                0    0.000000
paycheckmethod                             0    0.000000


In [10]:
# B2 - check inconsistent entries by checking unique values
def check_inconsistent_entries(dataframe):
    for col in dataframe.columns:
        unique_vals = dataframe[col].dropna().unique()
        print(f"\nColumn: {col}")
        print(f"Unique count: {len(unique_vals)}")
        print("Unique values:", unique_vals)

# Run the check
check_inconsistent_entries(df)
# Based on the data below I found that three columns have inconsistent entrires
# HourlyRate - Should be numeric, set as float.  
# JobRoleArea - Has too many unique values.  Need to create one entry for Information_Technology and human resources


Column: employeenumber
Unique count: 10100
Unique values: [    1     2     3 ... 10098 10099 10100]

Column: age
Unique count: 41
Unique values: [28 33 22 23 40 45 34 37 24 30 38 47 55 59 29 35 44 54 36 32 41 56 21 27
 50 31 46 48 39 57 52 53 58 49 42 60 43 61 26 51 25]

Column: tenure
Unique count: 20
Unique values: [ 6  2  1 16  9  3  8  4 20 10  7 15  5 18 17 14 13 19 12 11]

Column: turnover
Unique count: 2
Unique values: ['Yes' 'No']

Column: hourlyrate
Unique count: 5244
Unique values: ['$24.37 ' '$22.52 ' '$88.77 ' ... '$30.86 ' '$95.07 ' '$93.05 ']

Column: hoursweekly
Unique count: 1
Unique values: [40]

Column: compensationtype
Unique count: 1
Unique values: ['Salary']

Column: annualsalary
Unique count: 5538
Unique values: [ 50689.6  46841.6 284641.6 ... 337745.6 164902.4 333544. ]

Column: drivingcommuterdistance
Unique count: 120
Unique values: [  89   35   12    0   76   15    2   36   60   14   75    5  910   28
   -4   33   79   50   13   57   82    4   42   -5   64   

In [11]:
#B2 - View Formatting Errors
def find_formatting_errors(df, force_numeric_cols=None, numeric_object_threshold: float = 0.9):
    """
    Identify potential formatting errors in a DataFrame with tightened reporting.
    """
    results = {}
    force_numeric_cols = force_numeric_cols or []

    for col in df.columns:
        s = df[col]

        # --- 1. Forced numeric columns ---
        if col in force_numeric_cols:
            numeric_version = pd.to_numeric(s, errors="coerce")
            parse_rate = numeric_version.notna().sum() / max(1, s.notna().sum())

            if parse_rate < 1.0:
                results[col] = {
                    "type": "dtype_mismatch_numeric_expected",
                    "issue": "Column should be numeric but contains non-numeric values",
                    "parse_rate": round(100 * parse_rate, 2),
                    "examples_failed": s[numeric_version.isna()].dropna().unique().tolist()[:10],
                }
            else:
                if not pd.api.types.is_numeric_dtype(s):
                    results[col] = {
                        "type": "dtype_mismatch_numeric_expected",
                        "issue": "Likely numeric column stored as object",
                        "parse_rate": round(100 * parse_rate, 2),
                        "examples": s.dropna().unique().tolist()[:10],
                    }
                else:
                    # If it's already numeric and clean, log it as OK
                    results[col] = {
                        "type": "numeric_ok",
                        "issue": "Column is numeric and contains valid values",
                        "parse_rate": 100.0,
                    }
            continue  # done with this col

        # --- 2. Object columns (not forced numeric) ---
        if pd.api.types.is_object_dtype(s):
            numeric_version = pd.to_numeric(s, errors="coerce")
            parse_rate = numeric_version.notna().sum() / max(1, s.notna().sum())

            if parse_rate >= numeric_object_threshold:
                results[col] = {
                    "type": "dtype_mismatch_numeric_expected",
                    "issue": "Likely numeric column stored as object",
                    "parse_rate": round(100 * parse_rate, 2),
                    "examples": s.dropna().unique().tolist()[:10],
                }
            elif 0 < numeric_version.notna().sum() < s.notna().sum():
                results[col] = {
                    "type": "mixed_numeric_categorical",
                    "issue": "Column mixes numeric-like and categorical values",
                    "examples_non_numeric": s[numeric_version.isna()].dropna().unique().tolist()[:10],
                    "examples_numeric": s[numeric_version.notna()].dropna().unique().tolist()[:10],
                }
            else:
                # Pure categorical → check inconsistencies
                non_null = s.dropna()
                if not non_null.empty:
                    original_unique = set(non_null.unique())
                    normalized = (
                        non_null.str.strip()
                        .str.replace("_", " ", regex=False)
                        .str.lower()
                    )
                    normalized_unique = set(normalized.unique())
                    if len(original_unique) != len(normalized_unique):
                        results[col] = {
                            "type": "categorical_inconsistencies",
                            "issue": "Possible casing/spacing/underscore inconsistencies",
                            "original_values": sorted(list(original_unique))[:20],
                            "suggested_normalized": sorted(list(normalized_unique))[:20],
                        }

        # --- 3. True numeric columns ---
        elif pd.api.types.is_numeric_dtype(s):
            invalid_negatives = s[s < 0]
            if not invalid_negatives.empty:
                results[col] = {
                    "type": "invalid_negative_numeric",
                    "issue": "Negative values may be invalid",
                    "invalid_negatives": invalid_negatives.unique().tolist(),
                }

    return results

# Run the check
formatting_issues = find_formatting_errors(df, force_numeric_cols=["hourlyrate"]) 

# Display results
for col, info in formatting_issues.items():
    print(f"\nColumn: {col}")
    print(f"Type: {info['type']}")
    print(f"Issue: {info['issue']}")
    if "original_values" in info:
        print("Original values:", info["original_values"])
        print("Suggested normalized:", info["suggested_normalized"])
    if "invalid_negatives" in info:
        print("Invalid negatives:", info["invalid_negatives"])


Column: hourlyrate
Type: dtype_mismatch_numeric_expected
Issue: Column should be numeric but contains non-numeric values

Column: annualsalary
Type: invalid_negative_numeric
Issue: Negative values may be invalid
Invalid negatives: [-15896.0, -28660.8, -15022.4, -10433.6, -14475.2, -16540.8, -15001.6, -10392.0, -13601.6, -13352.0, -13435.2, -9705.6, -13268.8, -16041.6, -12056.0, -14828.8, -9497.6, -12852.8, -10932.8, -12624.0, -9268.8, -33326.4, -9580.8, -9872.0, -15494.4, -14980.8, -13331.2, -15099.2, -9643.2, -13560.0, -14558.4, -11452.8, -10953.6, -33222.4, -12748.8, -10641.6, -15334.4, -16374.4, -13414.4, -10412.8, -16353.6, -12868.8, -10246.4, -15521.6, -16270.4, -10828.8, -33056.0]

Column: drivingcommuterdistance
Type: invalid_negative_numeric
Issue: Negative values may be invalid
Invalid negatives: [-4, -5, -8, -7, -10, -2, -12, -3, -11, -6, -9, -14, -13, -1, -15, -125, -275]

Column: jobrolearea
Type: categorical_inconsistencies
Issue: Possible casing/spacing/underscore incons

In [12]:
# Create backup before any cleaning is done
df_backup = df.copy()

In [13]:
import pandas as pd
import re

# Update data types to numerice as needed.  
# I force "drivingcommuterdistance", "hoursweekly" so they go from int64 to float64 for future reporting purposes. 
def _clean_numeric_like(series: pd.Series) -> pd.Series:
    """
    Cleans numeric-like strings:
    - Removes currency symbols, commas, whitespace
    - Removes text units like 'hrs', 'hour', '/hr'
    - Keeps digits, dots, and minus signs
    - Coerces to numeric
    """
    s = series.astype(str)

    # remove common units/words
    s = s.str.replace(r'(hours?|hrs?|/hr)\b', '', flags=re.IGNORECASE, regex=True)

    # remove currency, commas, percent, and other symbols except digits . and -
    s = s.str.replace(r'[^0-9.\-]', '', regex=True)

    # fix multiple minus/dot issues (basic normalization)
    s = s.str.replace(r'(?<=.)\-', '', regex=True)

    return pd.to_numeric(s, errors='coerce')


# --- main cleaner ---
def clean_and_convert_numeric(
    df: pd.DataFrame,
    numeric_threshold: float = 0.95,
    force_numeric_cols: list | None = None,
    force_float_cols: list | None = None,
    preview_examples: int = 5
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Convert likely-numeric columns to numeric dtypes,
    with support for forcing specific columns to numeric or float.
    Adds a 'Reason' column to the audit.
    """
    cleaned = df.copy()
    audit_rows = []

    force_set = set(force_numeric_cols or [])
    force_float_set = set(force_float_cols or [])

    for col in cleaned.columns:
        s = cleaned[col]
        old_dtype = str(s.dtype)
        n_nonnull = int(s.notna().sum())

        direct_num = pd.to_numeric(s, errors='coerce') if not pd.api.types.is_numeric_dtype(s) else s
        direct_ok = int(direct_num.notna().sum()) if n_nonnull else 0
        direct_rate = (direct_ok / n_nonnull) if n_nonnull else 0.0

        if (not pd.api.types.is_numeric_dtype(s)) and (direct_rate < numeric_threshold or col in force_set):
            cleaned_num = _clean_numeric_like(s)
            cleaned_ok = int(cleaned_num.notna().sum()) if n_nonnull else 0
            cleaned_rate = (cleaned_ok / n_nonnull) if n_nonnull else 0.0

            should_convert = (col in force_set) or (cleaned_rate >= numeric_threshold and cleaned_rate >= direct_rate)

            if should_convert:
                cleaned[col] = cleaned_num
                new_dtype = str(cleaned[col].dtype)
                introduced_nans = max(0, n_nonnull - cleaned_ok)
                reason = "Forced numeric" if col in force_set else "Cleaned & converted"
                audit_rows.append({
                    "Column": col,
                    "Old Dtype": old_dtype,
                    "New Dtype": new_dtype,
                    "Reason": reason,
                    "Non-Null Before": n_nonnull,
                    "Parsable After": cleaned_ok,
                    "Parse Rate %": round(100 * cleaned_rate, 2),
                    "Introduced NaNs": introduced_nans,
                })
            else:
                audit_rows.append({
                    "Column": col,
                    "Old Dtype": old_dtype,
                    "New Dtype": old_dtype,
                    "Reason": "Left as-is (low parse rate)",
                    "Non-Null Before": n_nonnull,
                    "Parsable After": cleaned_ok,
                    "Parse Rate %": round(100 * cleaned_rate, 2),
                    "Introduced NaNs": None,
                })
        else:
            if pd.api.types.is_numeric_dtype(s):
                audit_rows.append({
                    "Column": col,
                    "Old Dtype": old_dtype,
                    "New Dtype": old_dtype,
                    "Reason": "Already numeric",
                    "Non-Null Before": n_nonnull,
                    "Parsable After": n_nonnull,
                    "Parse Rate %": 100.00,
                    "Introduced NaNs": 0,
                })
            else:
                should_convert = (col in force_set) or (direct_rate >= numeric_threshold)
                if should_convert:
                    cleaned[col] = direct_num
                    new_dtype = str(cleaned[col].dtype)
                    introduced_nans = max(0, n_nonnull - direct_ok)
                    reason = "Forced numeric" if col in force_set else "Direct conversion"
                    audit_rows.append({
                        "Column": col,
                        "Old Dtype": old_dtype,
                        "New Dtype": new_dtype,
                        "Reason": reason,
                        "Non-Null Before": n_nonnull,
                        "Parsable After": direct_ok,
                        "Parse Rate %": round(100 * direct_rate, 2),
                        "Introduced NaNs": introduced_nans,
                    })
                else:
                    audit_rows.append({
                        "Column": col,
                        "Old Dtype": old_dtype,
                        "New Dtype": old_dtype,
                        "Reason": "Left as-is (low parse rate)",
                        "Non-Null Before": n_nonnull,
                        "Parsable After": direct_ok,
                        "Parse Rate %": round(100 * direct_rate, 2),
                        "Introduced NaNs": None,
                    })

  # Force float conversion (even if already numeric)
    for col in force_float_set:
        if col in cleaned.columns:
            old_dtype = str(cleaned[col].dtype)
            cleaned[col] = cleaned[col].astype(float)
            new_dtype = str(cleaned[col].dtype)

            # If column already exists in audit, update last entry
            if any(a["Column"] == col for a in audit_rows):
                for a in audit_rows:
                    if a["Column"] == col:
                        a.update({
                            "New Dtype": new_dtype,
                            "Reason": "Forced float"
                        })
            else:
                # If somehow missing, add new row
                audit_rows.append({
                    "Column": col,
                    "Old Dtype": old_dtype,
                    "New Dtype": new_dtype,
                    "Reason": "Forced float",
                    "Non-Null Before": int(cleaned[col].notna().sum()),
                    "Parsable After": int(cleaned[col].notna().sum()),
                    "Parse Rate %": 100.00,
                    "Introduced NaNs": 0,
                })

    audit = pd.DataFrame(audit_rows).sort_values(
        by=["Column"], ascending=True
    ).reset_index(drop=True)

    return cleaned, audit

    audit = pd.DataFrame(audit_rows).sort_values(
        by=["Column"], ascending=True
    ).reset_index(drop=True)

    return cleaned, audit
# Run the cleaner audit with forced floats
df, type_audit = clean_and_convert_numeric(
    df,
    force_float_cols=["drivingcommuterdistance", "hoursweekly"]
)


# Inspect audit table
type_audit

Unnamed: 0,Column,Old Dtype,New Dtype,Reason,Non-Null Before,Parsable After,Parse Rate %,Introduced NaNs
0,age,int64,int64,Already numeric,10199,10199,100.0,0.0
1,annualprofessionaldevhrs,float64,float64,Already numeric,8230,8230,100.0,0.0
2,annualsalary,float64,float64,Already numeric,10199,10199,100.0,0.0
3,compensationtype,object,object,Left as-is (low parse rate),10199,0,0.0,
4,drivingcommuterdistance,int64,float64,Forced float,10199,10199,100.0,0.0
5,employeenumber,int64,int64,Already numeric,10199,10199,100.0,0.0
6,gender,object,object,Left as-is (low parse rate),10199,0,0.0,
7,hourlyrate,object,float64,Cleaned & converted,10199,10199,100.0,0.0
8,hoursweekly,int64,float64,Forced float,10199,10199,100.0,0.0
9,jobrolearea,object,object,Left as-is (low parse rate),10199,0,0.0,


In [14]:
# Sanity Check to ensure HourlyRate has conveted to Numeric
df.describe()

Unnamed: 0,employeenumber,age,tenure,hourlyrate,hoursweekly,annualsalary,drivingcommuterdistance,numcompaniespreviouslyworked,annualprofessionaldevhrs
count,10199.0,10199.0,10199.0,10199.0,10199.0,10199.0,10199.0,9534.0,8230.0
mean,5001.960977,44.028826,8.992744,52.792995,40.0,120947.568526,45.411903,4.21481,14.938518
std,2942.709195,10.217864,5.511985,23.94194,0.0,77566.715759,54.01175,2.481994,6.087415
min,1.0,21.0,1.0,17.21,40.0,-33326.4,-275.0,1.0,5.0
25%,2451.5,37.0,5.0,30.955,40.0,63252.8,13.0,2.0,10.0
50%,5001.0,44.0,8.0,48.83,40.0,101566.4,42.0,4.0,15.0
75%,7550.5,53.0,13.0,73.98,40.0,153878.4,71.0,6.0,20.0
max,10100.0,61.0,20.0,98.07,40.0,339950.4,950.0,9.0,25.0


In [15]:
# View data as sanity check after conversion
df.head(5).T

Unnamed: 0,0,1,2,3,4
employeenumber,1,2,3,4,5
age,28,33,22,23,40
tenure,6,2,1,1,6
turnover,Yes,Yes,No,No,No
hourlyrate,24.37,24.37,22.52,22.52,88.77
hoursweekly,40.0,40.0,40.0,40.0,40.0
compensationtype,Salary,Salary,Salary,Salary,Salary
annualsalary,50689.6,50689.6,46841.6,46841.6,284641.6
drivingcommuterdistance,89.0,89.0,35.0,35.0,12.0
jobrolearea,Research,Research,Information_Technology,Information_Technology,Sales


In [None]:

# Set Business Rules for absurd values as a dict

rules = {
    "age": {"min": 18, "max": 80, "integer": True, "fix": "clip", "clip_min": 18, "clip_max": 80},
    "tenure": {"min": 0, "max": 60, "integer": True, "fix": "clip", "clip_min": 0, "clip_max": 60},
    "hoursweekly": {"min": 0, "max": 80, "integer": True, "fix": "clip", "clip_min": 0, "clip_max": 80},
    "annualsalary": {"min": 0, "max": None, "integer": False, "fix": "abs"},
    "hourlyrate": {"min": 0, "max": 200, "integerz": False, "fix": "clip", "clip_min": 0, "clip_max": 200},
    "drivingcommuterdistance": {
        "min": 0, "max": 200, "integer": True,
        "fix": "abs_or_nan", "abs_threshold": 100
    },
    "numcompaniespreviouslyworked": {"min": 0, "max": 50, "integer": True, "fix": "clip", "clip_min": 0, "clip_max": 50},
    "annualprofessionaldevhrs": {"min": 0, "max": 1000, "integer": True, "fix": "clip", "clip_min": 0, "clip_max": 1000},
}

In [16]:
# Set up functions for Absured, Outliers, and cleaning process

# ----------------------------
# Utility: absurd mask & fixer
# ----------------------------
def _absurd_mask(s: pd.Series, r: dict) -> pd.Series:
    m = pd.Series(False, index=s.index)
    if r.get("min") is not None:
        m |= s < r["min"]
    if r.get("max") is not None:
        m |= s > r["max"]
    if r.get("integer", False):
        if not pd.api.types.is_integer_dtype(s):
            finite = s.notna() & np.isfinite(s)
            non_integer = finite & (s != np.floor(s))
            m |= non_integer
    return m

def _apply_absurd_fix(s: pd.Series, r: dict, m: pd.Series) -> pd.Series:
    fix = r.get("fix", None)
    if fix is None or m.sum() == 0:
        return s

    if callable(fix):
        return fix(s)

    if fix == "abs":
        neg_mask = m & (s < 0)
        s.loc[neg_mask] = s.loc[neg_mask].abs()
        return s

    if fix == "abs_or_nan":
        thresh = r.get("abs_threshold", 100)
        neg_mask = s < 0
        small_neg = neg_mask & (s.abs() < thresh)
        s.loc[small_neg] = s.loc[small_neg].abs()
        s.loc[neg_mask & ~small_neg] = np.nan
        # values above max → NaN
        if r.get("max") is not None:
            s.loc[s > r["max"]] = np.nan
        # non-negative below min → NaN
        if r.get("min") is not None:
            s.loc[(s < r["min"]) & (s >= 0)] = np.nan
        return s

    if fix == "clip":
        s = s.clip(lower=r.get("clip_min", None), upper=r.get("clip_max", None))
        return s

    if fix == "nan":
        s.loc[m] = np.nan
        return s

    return s

# ----------------------------
# Utility: outlier detect/cap
# ----------------------------
def detect_and_cap_outliers(
    s: pd.Series,
    method: str = "iqr",
    iqr_factor: float = 1.5,
    z_thresh: float = 3.0,
    cap: bool = True,
    ddof: int = 0
):
    x = s.astype(float)
    finite = np.isfinite(x)

    if method == "iqr":
        q1 = np.nanpercentile(x[finite], 25)
        q3 = np.nanpercentile(x[finite], 75)
        iqr = q3 - q1
        lower = q1 - iqr_factor * iqr
        upper = q3 + iqr_factor * iqr
        out_mask = (x < lower) | (x > upper)
    elif method == "zscore":
        mu = np.nanmean(x[finite])
        sd = np.nanstd(x[finite], ddof=ddof)
        if sd == 0 or np.isnan(sd):
            lower = upper = np.nan
            out_mask = pd.Series(False, index=s.index)
        else:
            z = (x - mu) / sd
            out_mask = np.abs(z) > z_thresh
            lower = mu - z_thresh * sd
            upper = mu + z_thresh * sd
    else:
        raise ValueError("method must be 'iqr' or 'zscore'")

    n_out = int(out_mask.sum())
    if cap and np.isfinite(lower) and np.isfinite(upper):
        x_cap = x.clip(lower=lower, upper=upper)
    else:
        x_cap = x

    return out_mask, lower, upper, n_out, x_cap.astype(s.dtype)


# ------------------------
# Start cleaning process
# -----------------------------------------
def run_task1_cleaning(
    df: pd.DataFrame,
    rules: dict,
    outlier_cols: list[str] | None = None,
    outlier_method: str = "iqr",
    iqr_factor: float = 1.5,
    z_thresh: float = 3.0,
    cap_outliers: bool = True,
    impute_plan: dict | None = None,
    categorical_standardizers: dict | None = None
):
    """
    Returns:
      df_clean : cleaned dataframe
      task_log : long-form log with B1/B2/C1 info per column + duplicates
    """

    logs = []
    df_clean = df.copy()

    # ---------- Duplicates (global) ----------
    # B1: looked via df.duplicated()
    dup_count = int(df_clean.duplicated().sum())
    logs.append({
        "column": "__ALL__",
        "step": "B2",
        "metric": "duplicate_count",
        "before": dup_count,
        "after": None,
        "action": "Checked with df.duplicated().sum()"
    })

    # C1: remove duplicates
    if dup_count > 0:
        df_clean = df_clean.drop_duplicates().reset_index(drop=True)
    logs.append({
        "column": "__ALL__",
        "step": "C1",
        "metric": "duplicates_removed",
        "before": dup_count,
        "after": 0,
        "action": "Removed with df.drop_duplicates()"
    })

    # ---------- Column-by-column ----------
    if outlier_cols is None:
        outlier_cols = [c for c in df_clean.columns if pd.api.types.is_numeric_dtype(df_clean[c])]

    # defaults for imputation
    # Decided to set drivingcommuterdistance to NaN because 0 seems to say that the emaployee is remote. NaN tells us that we are not sure of the status of the data
    
    if impute_plan is None:
        impute_plan = {
            "textmessageoptin": "Unknown",
            "numcompaniespreviouslyworked": "median",
            "annualprofessionaldevhrs": 0
        }

    # standardizers for categorical inconsistencies I identified during inspection
    if categorical_standardizers is None:
        categorical_standardizers = {
            "paycheckmethod": {
                "DirectDeposit": "Direct Deposit",
                "Direct_Deposit": "Direct Deposit",
                "Mail_Check": "Mail Check",
                "Mailed Check": "Mail Check",
                "MailedCheck": "Mail Check"
            },
            "jobrolearea": {
                "Information_Technology": "Information Technology",
                "InformationTechnology": "Information Technology",
                "Human_Resources": "Human Resources",
                "HumanResources": "Human Resources"
                
                
            }
        }

    for col in df_clean.columns:
        s = df_clean[col]
        dtype = str(s.dtype)

        # ---------- Inconsistencies / formatting (categorical only) ----------
        if pd.api.types.is_object_dtype(s):
            unique_before = s.nunique(dropna=False)
            # Strip whitespace (B1/C1)
            df_clean[col] = s.apply(lambda x: x.strip() if isinstance(x, str) else x)
            action_notes = ["Stripped leading/trailing whitespace"]

            # Apply canonical maps if provided
            if col in categorical_standardizers:
                df_clean[col] = df_clean[col].replace(categorical_standardizers[col])
                action_notes.append("Standardized known variants via mapping")

            # Title-case & underscore->space normalization pass (non-destructive idea)
            # (Only demonstrate; you already standardized explicit items above)
            # df_clean[col] = df_clean[col].apply(lambda x: x.replace("_", " ").title() if isinstance(x, str) else x)

            unique_after = df_clean[col].nunique(dropna=False)
            logs.append({
                "column": col,
                "step": "B2",
                "metric": "unique_values_count",
                "before": unique_before,
                "after": unique_after,
                "action": "; ".join(action_notes)
            })

            # Missing values (B2/C1)
            miss_before = int(df_clean[col].isna().sum())
            if col in impute_plan:
                strat = impute_plan[col]
                if strat == "median":
                    # for categorical this won't be used; kept for numeric names appearing here
                    pass
                else:
                    df_clean[col] = df_clean[col].fillna(strat)
                    action = f"Imputed missing with '{strat}'"
            else:
                action = "No imputation rule (left as-is)"

            miss_after = int(df_clean[col].isna().sum())
            logs.append({
                "column": col,
                "step": "B2",
                "metric": "missing_count",
                "before": miss_before,
                "after": miss_after,
                "action": action
            })

        # ---------- Numeric columns ----------
        if pd.api.types.is_numeric_dtype(s):
            # Missing values (B2)
            miss_before = int(s.isna().sum())

            # Absurd values (B1/B2): use provided rules if present
            r = rules.get(col, None)
            if r is not None:
                m_before = _absurd_mask(df_clean[col], r)
                absurd_before = int(m_before.sum())
                # C1: fix absurd values
                if absurd_before > 0:
                    df_clean[col] = _apply_absurd_fix(df_clean[col], r, m_before)
                # Recompute after
                absurd_after = int(_absurd_mask(df_clean[col], r).sum())

                logs.append({
                    "column": col,
                    "step": "B2",
                    "metric": "absurd_count",
                    "before": absurd_before,
                    "after": absurd_after,
                    "action": f"Applied absurd fix: {r.get('fix')} (bounds: {r.get('min')}–{r.get('max')})"
                })
            else:
                absurd_after = np.nan
                logs.append({
                    "column": col,
                    "step": "B2",
                    "metric": "absurd_count",
                    "before": 0,
                    "after": 0,
                    "action": "No absurd rule specified"
                })

            # Imputation for numeric columns
            if col in impute_plan:
                strat = impute_plan[col]
                if strat == "median":
                    val = df_clean[col].median()
                    df_clean[col] = df_clean[col].fillna(val)
                    imp_action = f"Imputed missing with median ({val})"
                else:
                    df_clean[col] = df_clean[col].fillna(strat)
                    imp_action = f"Imputed missing with {strat}"
            else:
                imp_action = "No imputation rule (left as-is)"

            miss_after = int(df_clean[col].isna().sum())
            logs.append({
                "column": col,
                "step": "B2",
                "metric": "missing_count",
                "before": miss_before,
                "after": miss_after,
                "action": imp_action
            })

            # Outliers (B1/B2): detect then cap (C1)
            if col in outlier_cols:
                out_mask, lower, upper, n_out, s_capped = detect_and_cap_outliers(
                    df_clean[col],
                    method=outlier_method,
                    iqr_factor=iqr_factor,
                    z_thresh=z_thresh,
                    cap=cap_outliers
                )
                # Apply cap
                if cap_outliers:
                    df_clean[col] = s_capped
                logs.append({
                    "column": col,
                    "step": "B2",
                    "metric": "outliers_count",
                    "before": n_out,
                    "after": 0 if cap_outliers else n_out,
                    "action": f"Outliers via {outlier_method.upper()} "
                              f"(lower={round(lower,3) if pd.notna(lower) else None}, "
                              f"upper={round(upper,3) if pd.notna(upper) else None}); "
                              f"{'capped to thresholds' if cap_outliers else 'flagged only'}"
                })

    task_log = pd.DataFrame(logs)
    return df_clean, task_log


In [17]:
# Set max display for all tables
pd.set_option("display.max_colwidth", None)  
# Run absurd, outlier, and cleaning on dataset.
df, task_log = run_task1_cleaning(
    df,
    rules=rules,
    outlier_cols=None,        # or a specific list
    outlier_method="iqr",     # or "zscore"
    iqr_factor=1.5,
    cap_outliers=True
)
# Show Task log
task_log

NameError: name 'rules' is not defined

In [None]:

# B2 - check inconsistent entrie after cleaning as sanity check
def check_inconsistent_entries(dataframe):
    for col in dataframe.columns:
        unique_vals = dataframe[col].dropna().unique()
        print(f"\nColumn: {col}")
        print(f"Unique count: {len(unique_vals)}")
        print("Unique values:", unique_vals)

# Run the check
check_inconsistent_entries(df)
# Check to ensure all values 

In [None]:
# Overall duplicate count (before removal is logged)
task_log[task_log.metric == "duplicate_count"][["before"]].tail(1)

In [None]:
# Per-column missing values (before/after)
task_log[task_log.metric == "missing_count"][["column","before","after","action"]].sort_values("column")

In [None]:
# Per-column inconsistent/formatting (categorical unique counts before/after)
task_log[task_log.metric == "unique_values_count"][["column","before","after","action"]].sort_values("column")

In [None]:
# Per-column absurd values (before/after)
task_log[task_log.metric == "absurd_count"][["column","before","after","action"]].sort_values("column")

In [None]:
# Per-column outliers (detected and (if enabled) capped)
# Display maxcolwidth
task_log[task_log.metric == "outliers_count"][["column","before","after","action"]].sort_values("column")

In [None]:
#B2 - View Formatting Errors after cleaning as sanity check
def find_formatting_errors(df, force_numeric_cols=None, numeric_object_threshold: float = 0.9):
    """
    Identify potential formatting errors in a DataFrame with tightened reporting.
    """
    results = {}
    force_numeric_cols = force_numeric_cols or []

    for col in df.columns:
        s = df[col]

        # --- 1. Forced numeric columns ---
        if col in force_numeric_cols:
            numeric_version = pd.to_numeric(s, errors="coerce")
            parse_rate = numeric_version.notna().sum() / max(1, s.notna().sum())

            if parse_rate < 1.0:
                results[col] = {
                    "type": "dtype_mismatch_numeric_expected",
                    "issue": "Column should be numeric but contains non-numeric values",
                    "parse_rate": round(100 * parse_rate, 2),
                    "examples_failed": s[numeric_version.isna()].dropna().unique().tolist()[:10],
                }
            else:
                if not pd.api.types.is_numeric_dtype(s):
                    results[col] = {
                        "type": "dtype_mismatch_numeric_expected",
                        "issue": "Likely numeric column stored as object",
                        "parse_rate": round(100 * parse_rate, 2),
                        "examples": s.dropna().unique().tolist()[:10],
                    }
                else:
                    # If it's already numeric and clean, log it as OK
                    results[col] = {
                        "type": "numeric_ok",
                        "issue": "Column is numeric and contains valid values",
                        "parse_rate": 100.0,
                    }
            continue  # done with this col

        # --- 2. Object columns (not forced numeric) ---
        if pd.api.types.is_object_dtype(s):
            numeric_version = pd.to_numeric(s, errors="coerce")
            parse_rate = numeric_version.notna().sum() / max(1, s.notna().sum())

            if parse_rate >= numeric_object_threshold:
                results[col] = {
                    "type": "dtype_mismatch_numeric_expected",
                    "issue": "Likely numeric column stored as object",
                    "parse_rate": round(100 * parse_rate, 2),
                    "examples": s.dropna().unique().tolist()[:10],
                }
            elif 0 < numeric_version.notna().sum() < s.notna().sum():
                results[col] = {
                    "type": "mixed_numeric_categorical",
                    "issue": "Column mixes numeric-like and categorical values",
                    "examples_non_numeric": s[numeric_version.isna()].dropna().unique().tolist()[:10],
                    "examples_numeric": s[numeric_version.notna()].dropna().unique().tolist()[:10],
                }
            else:
                # Pure categorical → check inconsistencies
                non_null = s.dropna()
                if not non_null.empty:
                    original_unique = set(non_null.unique())
                    normalized = (
                        non_null.str.strip()
                        .str.replace("_", " ", regex=False)
                        .str.lower()
                    )
                    normalized_unique = set(normalized.unique())
                    if len(original_unique) != len(normalized_unique):
                        results[col] = {
                            "type": "categorical_inconsistencies",
                            "issue": "Possible casing/spacing/underscore inconsistencies",
                            "original_values": sorted(list(original_unique))[:20],
                            "suggested_normalized": sorted(list(normalized_unique))[:20],
                        }

        # --- 3. True numeric columns ---
        elif pd.api.types.is_numeric_dtype(s):
            invalid_negatives = s[s < 0]
            if not invalid_negatives.empty:
                results[col] = {
                    "type": "invalid_negative_numeric",
                    "issue": "Negative values may be invalid",
                    "invalid_negatives": invalid_negatives.unique().tolist(),
                }

    return results

# Run the check
formatting_issues = find_formatting_errors(df, force_numeric_cols=["hourlyrate"]) 

# Display results
for col, info in formatting_issues.items():
    print(f"\nColumn: {col}")
    print(f"Type: {info['type']}")
    print(f"Issue: {info['issue']}")
    if "original_values" in info:
        print("Original values:", info["original_values"])
        print("Suggested normalized:", info["suggested_normalized"])
    if "invalid_negatives" in info:
        print("Invalid negatives:", info["invalid_negatives"])

In [None]:
# Export to CSV (no index column)
df.to_csv("Employee_Turnover_Cleaned.csv", index=False)

print("Cleaned dataset exported successfully.")