Question: Data Cleaning Challenge
You are given a dataset with the following issues:

Missing values in some columns.
Incorrect data types in certain columns.
Duplicate entries in the dataset.
Outliers in numerical columns.


In [1]:
import pandas as pd
import numpy as np

# Assemble the sample table one record at a time. The records deliberately
# contain the problems to be cleaned: NaN gaps in Age, Salary and Joining Date,
# dates stored as plain strings, and a last record that repeats the first.
column_names = ['Name', 'Age', 'Salary', 'Joining Date', 'Department']
records = [
    ('Alice', 25, 50000, '2020-01-15', 'Sales'),
    ('Bob', 32, np.nan, '2019-06-22', 'HR'),
    ('Charlie', 29, 65000, '2018-03-12', 'IT'),
    ('David', np.nan, 72000, '2020-08-01', 'IT'),
    ('Eve', 30, 71000, np.nan, 'Sales'),
    ('Alice', 25, 50000, '2020-01-15', 'Sales'),
]

# Transpose the records into a column-name -> list-of-values mapping
data = {
    name: list(values)
    for name, values in zip(column_names, zip(*records))
}

df = pd.DataFrame(data)

# Show the raw table before any cleaning is applied
print("Original Dataset:", df, sep="\n")


Original Dataset:
      Name   Age   Salary Joining Date Department
0    Alice  25.0  50000.0   2020-01-15      Sales
1      Bob  32.0      NaN   2019-06-22         HR
2  Charlie  29.0  65000.0   2018-03-12         IT
3    David   NaN  72000.0   2020-08-01         IT
4      Eve  30.0  71000.0          NaN      Sales
5    Alice  25.0  50000.0   2020-01-15      Sales


In [2]:
# Handle missing values.
#
# Numeric gaps are imputed with the column mean. The filled Series is assigned
# back to the column rather than calling fillna(..., inplace=True) on
# df['Age']: that chained form operates on an intermediate Series, emits a
# FutureWarning in pandas 2.x, and leaves df unchanged once Copy-on-Write is
# active (the default from pandas 3.0).
#
# NOTE(review): the means are computed while the repeated 'Alice' record is
# still in the frame (duplicates are only removed in a later cell), so that
# record is counted twice in each mean -- confirm this is intended.

# Fill missing Age values with the mean
df['Age'] = df['Age'].fillna(df['Age'].mean())

# Fill missing Salary values with the mean
df['Salary'] = df['Salary'].fillna(df['Salary'].mean())

# Drop rows where Joining Date is missing (in place, directly on df)
df.dropna(subset=['Joining Date'], inplace=True)

print("\nDataset after handling missing values:")
print(df)



Dataset after handling missing values:
      Name   Age   Salary Joining Date Department
0    Alice  25.0  50000.0   2020-01-15      Sales
1      Bob  32.0  61600.0   2019-06-22         HR
2  Charlie  29.0  65000.0   2018-03-12         IT
3    David  28.2  72000.0   2020-08-01         IT
5    Alice  25.0  50000.0   2020-01-15      Sales


In [3]:
# Parse the 'Joining Date' strings into timestamps so the column carries a
# datetime dtype instead of holding plain text
df['Joining Date'] = df['Joining Date'].pipe(pd.to_datetime)

# List every column's dtype to confirm the conversion took effect
print("\nDataset after converting 'Joining Date' to datetime:", df.dtypes, sep="\n")



Dataset after converting 'Joining Date' to datetime:
Name                    object
Age                    float64
Salary                 float64
Joining Date    datetime64[ns]
Department              object
dtype: object


In [4]:
# Count rows that exactly repeat an earlier row across all columns
n_duplicates = df.duplicated().sum()
print("\nNumber of duplicate rows:", n_duplicates)

# Keep the first occurrence of each row and discard later repeats,
# modifying df in place
df.drop_duplicates(keep='first', inplace=True)

print("\nDataset after removing duplicates:", df, sep="\n")



Number of duplicate rows: 1

Dataset after removing duplicates:
      Name   Age   Salary Joining Date Department
0    Alice  25.0  50000.0   2020-01-15      Sales
1      Bob  32.0  61600.0   2019-06-22         HR
2  Charlie  29.0  65000.0   2018-03-12         IT
3    David  28.2  72000.0   2020-08-01         IT
