# Data Cleaning and Preprocessing

<!--
Author: RSK World
Website: https://rskworld.in
Email: help@rskworld.in
Phone: +91 93305 39277
Description: Comprehensive guide to data cleaning and preprocessing with Pandas
-->

## Introduction

This notebook covers essential data cleaning techniques including handling missing values, removing duplicates, data type conversion, and handling outliers.



In [None]:
# Author: RSK World | Website: https://rskworld.in | Email: help@rskworld.in | Phone: +91 93305 39277

import pandas as pd
import numpy as np

# Build a small sample DataFrame that deliberately contains missing values
# (None) in the Name, Age, Salary and Experience columns, so the cleaning
# techniques demonstrated in this notebook have something to operate on.
data = {
    'Name': ['Alice', 'Bob', None, 'David', 'Eve', 'Frank', None],
    'Age': [25, 30, None, 28, None, 45, 35],
    'Salary': [50000, None, 70000, 55000, 65000, None, 60000],
    'City': ['New York', 'London', 'Tokyo', 'Paris', 'Sydney', 'Berlin', 'Mumbai'],
    'Department': ['IT', 'IT', 'HR', 'HR', 'IT', 'HR', 'IT'],
    'Experience': [2, 5, None, 3, 8, 12, 6],
}

df = pd.DataFrame(data)
print("Original DataFrame with missing values:")
print(df)
print("\nDataFrame info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
df.info()



## Detecting Missing Values


In [None]:
# Author: RSK World | Website: https://rskworld.in | Email: help@rskworld.in | Phone: +91 93305 39277

# Summarise missing values. The boolean null mask and the per-column counts
# are computed once and reused by every report below.
null_mask = df.isnull()
null_counts = null_mask.sum()

# Per-column number of missing entries
print("=== Missing Values Count ===")
print(null_counts)

# Per-column share of missing entries, as a percentage of all rows
print("\n=== Missing Values Percentage ===")
print((null_counts / len(df)) * 100)

# Per-column flag: does the column contain at least one missing entry?
print("\n=== Check if any missing values ===")
print(null_mask.any())

# Grand total of missing cells across the whole DataFrame
print("\n=== Total missing values ===")
print(null_counts.sum())



## Handling Missing Values


In [None]:
# Author: RSK World | Website: https://rskworld.in | Email: help@rskworld.in | Phone: +91 93305 39277

# Method 1: Drop rows with any missing values
# (how='any' is the default; spelled out here to contrast with Method 2)
print("=== Drop all rows with missing values ===")
df_dropna = df.dropna(how='any')
print(df_dropna)

# Method 2: Drop rows where all values are missing
print("\n=== Drop rows where all values are missing ===")
df_dropna_all = df.dropna(how='all')
print(df_dropna_all)

# Method 3: Drop rows with missing values in specific columns
required_columns = ['Name', 'Age']
print("\n=== Drop rows with missing Name or Age ===")
df_dropna_subset = df.dropna(subset=required_columns)
print(df_dropna_subset)



In [None]:
# Author: RSK World | Website: https://rskworld.in | Email: help@rskworld.in | Phone: +91 93305 39277

# Columns shown when comparing the directional fills below
fill_preview_columns = ['Name', 'Age']

# Method 4: Fill missing values with a constant
# The dict form of fillna only touches the listed column and returns a new
# DataFrame, leaving the original `df` unchanged.
df_fill_constant = df.fillna({'Name': 'Unknown'})
print("=== Fill Name with 'Unknown' ===")
print(df_fill_constant)

# Method 5: Fill missing values with forward fill (ffill)
# Each gap takes the last valid value that precedes it.
age_forward_filled = df['Age'].ffill()
df_ffill = df.copy()
df_ffill['Age'] = age_forward_filled
print("\n=== Forward fill Age ===")
print(df_ffill[fill_preview_columns])

# Method 6: Fill missing values with backward fill (bfill)
# Each gap takes the next valid value that follows it.
age_backward_filled = df['Age'].bfill()
df_bfill = df.copy()
df_bfill['Age'] = age_backward_filled
print("\n=== Backward fill Age ===")
print(df_bfill[fill_preview_columns])



In [None]:
# Author: RSK World | Website: https://rskworld.in | Email: help@rskworld.in | Phone: +91 93305 39277

# Method 7: Fill with mean, median, or mode
# Work on a copy so the original `df` keeps its missing values.
df_fill_stats = df.copy()

# Assign the filled column back instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form acts on a
# temporary Series, raises a FutureWarning in pandas 2.x, and silently leaves
# the DataFrame unchanged under Copy-on-Write (the default from pandas 3.0).
df_fill_stats['Age'] = df_fill_stats['Age'].fillna(df_fill_stats['Age'].mean())
df_fill_stats['Salary'] = df_fill_stats['Salary'].fillna(df_fill_stats['Salary'].median())
# mode() returns a Series (there may be ties); [0] takes its first entry.
df_fill_stats['Name'] = df_fill_stats['Name'].fillna(df_fill_stats['Name'].mode()[0])

print("=== Fill with statistics ===")
print("Age filled with mean:", df['Age'].mean())
print("Salary filled with median:", df['Salary'].median())
print(df_fill_stats)



## Removing Duplicates


In [None]:
# Author: RSK World | Website: https://rskworld.in | Email: help@rskworld.in | Phone: +91 93305 39277

# Create DataFrame with duplicates
# Rows 2 and 4 repeat rows 0 and 1 exactly.
duplicate_records = [
    ('Alice', 25, 'New York'),
    ('Bob', 30, 'London'),
    ('Alice', 25, 'New York'),
    ('David', 28, 'Paris'),
    ('Bob', 30, 'London'),
]
df_dup = pd.DataFrame(duplicate_records, columns=['Name', 'Age', 'City'])

print("=== Original DataFrame with duplicates ===")
print(df_dup)
# duplicated() flags every repeat of an earlier row; summing counts them.
duplicate_count = df_dup.duplicated().sum()
print(f"\nDuplicates: {duplicate_count}")

# Remove duplicates (the first occurrence of each row is kept)
print("\n=== After removing duplicates ===")
df_no_dup = df_dup.drop_duplicates(keep='first')
print(df_no_dup)

# Remove duplicates based on specific columns
identity_columns = ['Name', 'Age']
print("\n=== Remove duplicates based on Name and Age ===")
df_no_dup_subset = df_dup.drop_duplicates(subset=identity_columns, keep='first')
print(df_no_dup_subset)



## Data Type Conversion


In [None]:
# Author: RSK World | Website: https://rskworld.in | Email: help@rskworld.in | Phone: +91 93305 39277

# Create DataFrame with mixed types
# Every column starts out as strings, as is typical after reading raw text
# data; the conversions below give each column its proper dtype.
df_types = pd.DataFrame({
    'Age': ['25', '30', '35', '28'],
    'Salary': ['50000', '60000', '70000', '55000'],
    'Active': ['True', 'False', 'True', 'True'],
    'Date': ['2023-01-01', '2023-02-01', '2023-03-01', '2023-04-01']
})

print("=== Original data types ===")
print(df_types.dtypes)
print("\nDataFrame:")
print(df_types)

# Convert data types
df_types['Age'] = df_types['Age'].astype(int)
df_types['Salary'] = df_types['Salary'].astype(float)
# Do NOT call .astype(bool) directly on the strings: any non-empty string is
# truthy, so 'False' would become True. Map the text to real booleans first;
# the trailing astype(bool) only pins the resulting dtype.
df_types['Active'] = df_types['Active'].map({'True': True, 'False': False}).astype(bool)
df_types['Date'] = pd.to_datetime(df_types['Date'])

print("\n=== After type conversion ===")
print(df_types.dtypes)
print("\nDataFrame:")
print(df_types)

