In [1]:
#2. Data Cleaning
import numpy as np
import pandas as pd 

# Data-cleaning walkthrough on a small in-memory DataFrame:
#   1. build sample data containing NaNs and an empty-string city,
#   2. impute / drop the missing values,
#   3. drop duplicate rows (on a separate demo frame),
#   4. convert the imputed 'Age' column to an integer dtype.

# 1. Create a sample DataFrame
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace', 'Heidi'],
    'Age': [24, 30, np.nan, 28, 22, 35, 29, 40],
    'City': ['New York', 'Los Angeles', 'Chicago', 'New York', 'Boston', 'Los Angeles', 'Chicago', ''],
    'Salary': [50000, 60000, 55000, 62000, np.nan, 70000, 58000, 75000],
    'Department': ['IT', 'HR', 'IT', 'Sales', 'HR', 'IT', 'Sales', 'IT'],
}
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)
print("\n" + "="*30 + "\n")

# 2. Handle Missing Values
# Check for missing values (an empty string is NOT counted as missing here)
print("Missing values before cleaning:")
print(df.isnull().sum())

# Fill missing 'Age' with the column mean.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# selected column: the chained in-place form is deprecated and does not
# update `df` when pandas Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].mean())

# Fill missing 'Salary' with a specific value (e.g., 0)
df['Salary'] = df['Salary'].fillna(0)

# Drop rows where 'City' is an empty string.
# .copy() makes the filtered frame independent of the original so the column
# assignment in step 4 does not raise SettingWithCopyWarning.
df = df[df['City'] != ''].copy()
print("\nDataFrame after handling missing values:")
print(df)
print("\n" + "="*30 + "\n")

# 3. Handle Duplicates (if any)
# Create a DataFrame with duplicates for demonstration
df_dup = pd.DataFrame({'A': [1, 2, 2, 3], 'B': ['x', 'y', 'y', 'z']})
print("DataFrame with duplicates:")
print(df_dup)
# Keeps the first occurrence of each duplicated row; original index labels
# are preserved.
df_dup = df_dup.drop_duplicates()
print("\nDataFrame after dropping duplicates:")
print(df_dup)
print("\n" + "="*30 + "\n")

# 4. Data Type Conversion
# Convert 'Age' to integer type (only valid once the NaNs are filled).
# NOTE: astype(int) truncates toward zero, so the imputed mean 29.71 becomes 29.
df['Age'] = df['Age'].astype(int)
print("DataFrame after converting 'Age' to int:")
print(df.dtypes)
print("\n" + "="*30 + "\n")



Original DataFrame:
      Name   Age         City   Salary Department
0    Alice  24.0     New York  50000.0         IT
1      Bob  30.0  Los Angeles  60000.0         HR
2  Charlie   NaN      Chicago  55000.0         IT
3    David  28.0     New York  62000.0      Sales
4      Eve  22.0       Boston      NaN         HR
5    Frank  35.0  Los Angeles  70000.0         IT
6    Grace  29.0      Chicago  58000.0      Sales
7    Heidi  40.0               75000.0         IT


Missing values before cleaning:
Name          0
Age           1
City          0
Salary        1
Department    0
dtype: int64

DataFrame after handling missing values:
      Name        Age         City   Salary Department
0    Alice  24.000000     New York  50000.0         IT
1      Bob  30.000000  Los Angeles  60000.0         HR
2  Charlie  29.714286      Chicago  55000.0         IT
3    David  28.000000     New York  62000.0      Sales
4      Eve  22.000000       Boston      0.0         HR
5    Frank  35.000000  Los Ange