In [16]:
import pandas as pd
import numpy as np
import re

# Sample employee records used to exercise the cleaning steps below.
# The data is deliberately messy: one record appears twice, a joining date is
# given as 'Unknown' and another is missing, department names are spelled in
# several ways, and one salary is stored as a formatted string.
employee_data = {
    'emp_id': [101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 101],
    'name': [
        'Alice Johnson', 'Bob Smith', 'Charlie Lee', 'David Kim',
        'Emily Chen', 'Frank Green', 'Grace Davis', 'Hannah White',
        'Ivan Petrov', 'Laura Miller', 'Mark Wilson', 'Alice Johnson',
    ],
    'email': [
        'alice@company.com', 'bob@company.com', 'charlie@company.com',
        'david.k@company.com', 'emily.c@company.com', 'frank@company.com',
        'grace@company.com', 'hannah.white@company.com',
        'ivan.petrov@company.com', 'laura.m@company.com',
        'mark.w@company.com', 'alice@company.com',
    ],
    'joining_date': [
        '2020-03-01', '2021-06-15', '2025-01-01', 'Unknown',
        '2018-11-23', '2022-09-01', '2023-04-10', '2024-02-20',
        np.nan, '2021-02-28', '2022-10-10', '2020-03-01',
    ],
    'department': [
        'Human Resources', 'HR', 'Finance', 'hr',
        'IT', 'Marketing', 'Information Technology', 'it',
        'Finance', 'marketing', 'Finance', 'Human Resources',
    ],
    'salary': [
        55000, '60,000', 61000, 65000,
        54000, 80000, 72000, 52000,
        70000, 75000, 90000, 55000,
    ],
}

# Build the working DataFrame and remember its size for the final summary.
df = pd.DataFrame(employee_data)
initial_rows = df.shape[0]

print("Initial DataFrame head:")
print(df.head())
print("\nInitial DataFrame info:")
df.info()

# Drop rows that repeat an earlier row exactly (the first occurrence is kept).
is_duplicate = df.duplicated()
duplicates_count = is_duplicate.sum()
if duplicates_count <= 0:
    print("\nNo duplicate rows found.")
else:
    df = df[~is_duplicate]
    print(f"\n{duplicates_count} duplicate row(s) removed.")

print("\nMissing values before cleaning:")
print(df.isna().sum())

# A record without an employee id is unusable; the literal string 'Unknown'
# is turned into NaN so it is handled like any other missing value.
df = df.dropna(subset=['emp_id']).replace('Unknown', np.nan)

# Strip currency symbols and thousands separators, parse the result as a
# number (unparseable text becomes NaN), then blank out salaries that are
# zero or negative so they also count as missing.
cleaned_salary = pd.to_numeric(
    df['salary'].astype(str).str.replace('[$,]', '', regex=True),
    errors='coerce',
)
df['salary'] = cleaned_salary.mask(cleaned_salary <= 0)

# Deliberately simple shape check: one '@', a non-empty local part, and a
# domain that contains at least one dot. Whitespace is not allowed anywhere.
_EMAIL_PATTERN = re.compile(r"[^@\s]+@[^@\s]+\.[^@\s]+")


def is_valid_email(email):
    """Return True if *email* looks like a syntactically valid address.

    Missing values (None / NaN) are reported as invalid. Any other value is
    converted to ``str`` and must match the pattern in full.

    The whole string is matched rather than just a prefix, so trailing
    garbage such as a second ``@`` ("a@b.c@d") or a trailing newline is
    rejected instead of being accepted on the strength of a valid prefix.
    """
    if pd.isna(email):
        return False
    return _EMAIL_PATTERN.fullmatch(str(email)) is not None

# Blank out e-mail addresses that fail validation so the fill step below
# treats them like any other missing value.
df['email'] = df['email'].apply(lambda x: x if is_valid_email(x) else np.nan)

# Impute missing salaries with the median of the remaining salaries.
median_salary = df['salary'].median()
df['salary'] = df['salary'].fillna(median_salary)

# Missing text fields get an explicit 'Unknown' label.
df['name'] = df['name'].fillna('Unknown')
df['email'] = df['email'].fillna('Unknown')
df['department'] = df['department'].fillna('Unknown')

# Missing joining dates get the placeholder date 1900-01-01.
df['joining_date'] = df['joining_date'].fillna('1900-01-01')

print("\nMissing values after filling:")
print(df.isnull().sum())

# Parse joining dates; anything that cannot be parsed becomes NaT.
df['joining_date'] = pd.to_datetime(df['joining_date'], errors='coerce')

# Normalise department names: unify the casing, then expand the known
# abbreviations. The aliases are matched against the WHOLE value, not as
# substrings, so a department that merely contains the letters "It" or "Hr"
# (e.g. "Italy Desk") is left untouched.
department_aliases = {
    'Hr': 'Human Resources',
    'It': 'Information Technology',
}
df['department'] = df['department'].str.title().replace(department_aliases)

final_rows = len(df)
print("\n--- Data Cleaning Summary ---")
print(f"Initial number of rows: {initial_rows}")
print(f"Final number of rows: {final_rows}")
print(f"Rows dropped: {initial_rows - final_rows}")
print(f"Number of duplicate rows removed: {duplicates_count}")
print("\nFinal DataFrame info:")
df.info()
print("\nFinal DataFrame head:")
print(df.head())

# Persist the cleaned table without the DataFrame index column.
df.to_csv("cleaned_employee_data.csv", index=False)
print("\nCleaned data has been saved to 'cleaned_employee_data.csv'.")


Initial DataFrame head:
   emp_id           name                email joining_date       department  \
0     101  Alice Johnson    alice@company.com   2020-03-01  Human Resources   
1     102      Bob Smith      bob@company.com   2021-06-15               HR   
2     103    Charlie Lee  charlie@company.com   2025-01-01          Finance   
3     104      David Kim  david.k@company.com      Unknown               hr   
4     105     Emily Chen  emily.c@company.com   2018-11-23               IT   

   salary  
0   55000  
1  60,000  
2   61000  
3   65000  
4   54000  

Initial DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   emp_id        12 non-null     int64 
 1   name          12 non-null     object
 2   email         12 non-null     object
 3   joining_date  11 non-null     object
 4   department    12 non-null     object
 5   sal