### **Import Libraries and Modules**

In [None]:
import pandas as pd #data manipulation
import numpy as np

In [None]:
# 1. Read the data

file = "/content/drive/MyDrive/Colab Notebooks/employee_data_with_quality_issues.xlsx"
# read_excel already returns a DataFrame, so no extra pd.DataFrame(...) wrapper
# is needed.
df = pd.read_excel(file)
print("Original shape:", df.shape)
# Only the last expression of a cell is rendered, so a bare df.head() placed
# before this line would display nothing; preview the rows in a separate cell.
df.size

Original shape: (20, 5)


100

In [None]:
# 2. Check the summary statistics of the data
df.describe()

Unnamed: 0,Name,Age,Salary,Join Date,Department
count,19,16,19,20,20
unique,19,15,18,19,8
top,John Doe,29,55000,2020-01-15,IT
freq,1,2,2,2,5


In [None]:
# 3. Keep only the rows that have a value in every column, stored as an
#    independent copy so the loaded frame `df` is left untouched.
# NOTE(review): this runs before the placeholder clean-up in the later cells,
# so only cells that are already empty are caught here; the last cell calls
# dropna() again.
df_clean = df.dropna(axis=0, how="any").copy()
df_clean

Unnamed: 0,Name,Age,Salary,Join Date,Department
0,John Doe,29,55000,15-01-2020,Sales
1,john doe,29,55000,2020-01-15,sales
2,Jane Smith,thirty,62000,2019/03/22,Marketing
4,Ana-Marie,41,not available,2017-11-05,HR
5,Chris P,22,39000,2021-02-30,IT
6,Sara K,135,72000,2016-05-18,Finance
8,Li Wei,27,51000,2018-04-09,marketing
10,James O'Neil,44,82000,2014/06/12,HR
12,Robert Jr,17,30000,2022-01-01,Sales
13,Karen,200,70000,2010-10-10,HR


In [None]:
# 4. Standardize column names: trim surrounding whitespace, lower-case,
#    and turn spaces into underscores.
df_clean.columns = (
    df_clean.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)
)
df_clean.columns

Index(['name', 'age', 'salary', 'join_date', 'department'], dtype='object')

In [None]:
# 5. Clean NAME & Department column

# Name: trim, Title Case, then collapse runs of whitespace to a single space.
# astype(str) renders a missing value as the text 'nan' ('Nan' after title()),
# which a later dropna() would not recognise, so cells that were missing in
# the input are masked back to NaN at the end of the chain.
name_raw = df_clean['name']
df_clean['name'] = (
    name_raw
    .astype(str)
    .str.strip()
    .str.title()
    .str.replace(r'\s+', ' ', regex=True)
    .where(name_raw.notna())
)

# Department: trim, Title Case, then map known variants to canonical labels.
# The lookup keys must be written in their post-title() form: a literal 'nan'
# has already become 'Nan' by the time replace() runs, so that is the key used.
department_raw = df_clean['department']
df_clean['department'] = (
    department_raw
    .astype(str)
    .str.strip()
    .str.title()
    .replace({
        'Hr': 'HR', 'It': 'IT', 'Sale': 'Sales', 'Marketting': 'Marketing',
        'Fin': 'Finance', 'H.R.': 'HR', '': np.nan, 'Nan': np.nan
    })
    .where(department_raw.notna())  # keep genuinely missing cells as NaN
)

df_clean

Unnamed: 0,name,age,salary,join_date,department
0,John Doe,29,55000,15-01-2020,Sales
1,John Doe,29,55000,2020-01-15,Sales
2,Jane Smith,thirty,62000,2019/03/22,Marketing
4,Ana-Marie,41,not available,2017-11-05,HR
5,Chris P,22,39000,2021-02-30,IT
6,Sara K,135,72000,2016-05-18,Finance
8,Li Wei,27,51000,2018-04-09,Marketing
10,James O'Neil,44,82000,2014/06/12,HR
12,Robert Jr,17,30000,2022-01-01,Sales
13,Karen,200,70000,2010-10-10,HR


In [None]:
# 6. Clean AGE column
# Lookup that turns a spelled-out number into digits and placeholder strings
# into NaN before the numeric conversion.
age_map = {
    'thirty': 30,
    'NaN': np.nan,
    '': np.nan,
    'unknown': np.nan,
    'N/A': np.nan
}

# Anything still non-numeric after the lookup is coerced to NaN.
age_numeric = pd.to_numeric(df_clean['age'].replace(age_map), errors='coerce')

# Ages outside 16..70 (both ends inclusive) are blanked out.
df_clean['age'] = age_numeric.where(age_numeric.between(16, 70))
df_clean

Unnamed: 0,name,age,salary,join_date,department
0,John Doe,29.0,55000,15-01-2020,Sales
1,John Doe,29.0,55000,2020-01-15,Sales
2,Jane Smith,30.0,62000,2019/03/22,Marketing
4,Ana-Marie,41.0,not available,2017-11-05,HR
5,Chris P,22.0,39000,2021-02-30,IT
6,Sara K,,72000,2016-05-18,Finance
8,Li Wei,27.0,51000,2018-04-09,Marketing
10,James O'Neil,44.0,82000,2014/06/12,HR
12,Robert Jr,17.0,30000,2022-01-01,Sales
13,Karen,,70000,2010-10-10,HR


In [None]:
# 7. Clean SALARY column
# ────────────────────────────────────────────────

# Work on a normalised text view: lower-case, with '$', ',' and whitespace
# removed. Placeholders such as 'not available' (and real NaN, which
# astype(str) renders as 'nan') fail numeric parsing below and become NaN.
salary_text = (
    df_clean['salary']
    .astype(str)
    .str.lower()
    .str.replace(r'[\$,\s]', '', regex=True)
)

# A trailing 'k' means thousands. Scale the parsed number instead of pasting
# '000' onto the text, so that '5.5k' becomes 5500 rather than 5.5.
in_thousands = salary_text.str.endswith('k')
salary_value = pd.to_numeric(
    salary_text.str.replace(r'k$', '', regex=True), errors='coerce'
)
salary_value = salary_value.where(~in_thousands, salary_value * 1000)

# A negative salary doesn't make sense: blank out every negative value rather
# than a single hard-coded amount.
df_clean['salary'] = salary_value.mask(salary_value < 0)

df_clean

Unnamed: 0,name,age,salary,join_date,department
0,John Doe,29.0,55000.0,15-01-2020,Sales
1,John Doe,29.0,55000.0,2020-01-15,Sales
2,Jane Smith,30.0,62000.0,2019/03/22,Marketing
4,Ana-Marie,41.0,,2017-11-05,HR
5,Chris P,22.0,39000.0,2021-02-30,IT
6,Sara K,,72000.0,2016-05-18,Finance
8,Li Wei,27.0,51000.0,2018-04-09,Marketing
10,James O'Neil,44.0,82000.0,2014/06/12,HR
12,Robert Jr,17.0,30000.0,2022-01-01,Sales
13,Karen,,70000.0,2010-10-10,HR


In [None]:

import calendar

from datetime import datetime

def validate_date(date_str):
    """Normalise a date value to an ISO ``YYYY-MM-DD`` string.

    Accepted text layouts are ``YYYY-MM-DD`` and ``DD-MM-YYYY``, with either
    ``-`` or ``/`` as the separator. A ``datetime`` instance is formatted
    directly.

    Parameters
    ----------
    date_str : str | datetime | Any
        The raw cell value.

    Returns
    -------
    str | float
        The date formatted as ``'%Y-%m-%d'``, or ``np.nan`` when the value is
        missing, is not text/datetime, does not have exactly three parts, or
        is not a real calendar date (e.g. ``2021-02-30``).
    """
    # Already a date object: just format it. The guard covers datetime
    # subclasses whose strftime refuses to format (raises ValueError).
    if isinstance(date_str, datetime):
        try:
            return date_str.strftime('%Y-%m-%d')
        except ValueError:
            return np.nan

    # NaN / None / numbers have no usable text representation here.
    if not isinstance(date_str, str):
        return np.nan

    # Normalize separators
    parts = date_str.strip().replace('/', '-').split('-')
    if len(parts) != 3:
        return np.nan

    # Detect format: a 4-character first field is taken to be the year
    if len(parts[0]) == 4:
        # YYYY-MM-DD
        year, month, day = parts
    else:
        # DD-MM-YYYY
        day, month, year = parts

    # int() rejects non-numeric fields and datetime() rejects impossible
    # dates; both raise ValueError (OverflowError for absurdly large numbers).
    try:
        dt = datetime(int(year), int(month), int(day))
    except (ValueError, OverflowError):
        return np.nan
    return dt.strftime('%Y-%m-%d')

# Run every join date through validate_date, element by element
df_clean['join_date'] = df_clean['join_date'].map(validate_date)


df_clean


Unnamed: 0,name,age,salary,join_date,department
0,John Doe,29.0,55000.0,2020-01-15,Sales
1,John Doe,29.0,55000.0,2020-01-15,Sales
2,Jane Smith,30.0,62000.0,2019-03-22,Marketing
4,Ana-Marie,41.0,,2017-11-05,HR
5,Chris P,22.0,39000.0,,IT
6,Sara K,,72000.0,2016-05-18,Finance
8,Li Wei,27.0,51000.0,2018-04-09,Marketing
10,James O'Neil,44.0,82000.0,2014-06-12,HR
12,Robert Jr,17.0,30000.0,2022-01-01,Sales
13,Karen,,70000.0,2010-10-10,HR


In [None]:
# Final pass: treat empty strings as missing, remove exact duplicate rows,
# then discard every row that still has a missing value.
df_clean = (
    df_clean
    .replace("", pd.NA)
    .drop_duplicates()
    .dropna()
    .copy()
)

df_clean

Unnamed: 0,name,age,salary,join_date,department
0,John Doe,29.0,55000.0,2020-01-15,Sales
2,Jane Smith,30.0,62000.0,2019-03-22,Marketing
8,Li Wei,27.0,51000.0,2018-04-09,Marketing
10,James O'Neil,44.0,82000.0,2014-06-12,HR
12,Robert Jr,17.0,30000.0,2022-01-01,Sales
14,Tommy,28.0,55000.0,2018-08-08,IT
