In [1]:
# 🧹 Cleaning Data in Pandas

# Real-world data is often messy — it may contain missing values, duplicates, incorrect types, or extra whitespace.

# Pandas provides powerful tools to clean and prepare data for analysis.

In [2]:
import pandas as pd
import numpy as np

# Toy employee table seeded with the problems that get cleaned up later:
# a missing name (None), missing numbers (np.nan) and stray spaces in "City".
data = dict(
    Name=["Alice", "Bob", "Charlie", "David", None],
    Age=[25, np.nan, 35, 40, 30],
    City=[" New York", "London ", "Paris", "Berlin", "Paris"],
    Salary=[50000, 60000, np.nan, 70000, 60000],
    Department=["HR", "IT", "Finance", "IT", "HR"],
)

# Each key becomes a column; each list supplies that column's values.
df = pd.DataFrame.from_dict(data)
df

Unnamed: 0,Name,Age,City,Salary,Department
0,Alice,25.0,New York,50000.0,HR
1,Bob,,London,60000.0,IT
2,Charlie,35.0,Paris,,Finance
3,David,40.0,Berlin,70000.0,IT
4,,30.0,Paris,60000.0,HR


In [3]:
## 🔍 Checking for Missing Data
# Use `.isnull()`, `.notnull()`, and `df.isnull().sum()` to detect missing values.

# Boolean mask with the same shape as df: True marks a missing cell (None/NaN).
# Only a cell's final expression is rendered, so the mask is kept in a variable
# and reused rather than being computed once and silently thrown away.
missing_mask = df.isnull()

# True counts as 1, so summing each column gives its number of missing values.
missing_mask.sum()

Name          1
Age           1
City          0
Salary        1
Department    0
dtype: int64

In [6]:
## 🧯 Drop or Fill Missing Values

# - Drop rows → `df.dropna()`  
# - Fill values → `df.fillna(value)`  

# Option 1 - drop: keep only the rows that have a value in every column.
# The result is stored separately, so df itself still holds all rows.
df_dropped = df.dropna()

# Option 2 - fill: put a placeholder where the name is missing ...
df["Name"] = df["Name"].fillna("Unknown")

# ... and use the average of the known ages where the age is missing.
mean_age = df["Age"].mean()
df["Age"] = df["Age"].fillna(mean_age)
df

Unnamed: 0,Name,Age,City,Salary,Department
0,Alice,25.0,New York,50000.0,HR
1,Bob,32.5,London,60000.0,IT
2,Charlie,35.0,Paris,,Finance
3,David,40.0,Berlin,70000.0,IT
4,Unknown,30.0,Paris,60000.0,HR


In [7]:
## ✂️ Strip Extra Whitespace

# Use `.str.strip()` to clean text columns.

# .str.strip() trims leading and trailing whitespace from every value,
# which cleans up entries such as " New York" and "London ".
city_stripped = df["City"].str.strip()
df["City"] = city_stripped
df

Unnamed: 0,Name,Age,City,Salary,Department
0,Alice,25.0,New York,50000.0,HR
1,Bob,32.5,London,60000.0,IT
2,Charlie,35.0,Paris,,Finance
3,David,40.0,Berlin,70000.0,IT
4,Unknown,30.0,Paris,60000.0,HR


In [8]:
## 🔁 Drop Duplicates

# Use `df.duplicated()` and `df.drop_duplicates()` to handle duplicate rows.

# One boolean per row: True flags a row that repeats an earlier one
duplicate_flags = df.duplicated()
print(duplicate_flags)

# Keep a single copy of each distinct row
df = df.drop_duplicates()

0    False
1    False
2    False
3    False
4    False
dtype: bool


In [11]:
## 🧠 Convert Data Types

# Sometimes columns are read with the wrong type. Use `astype()` to fix them.

# Age was stored as float because the column originally contained NaN;
# now that the gap is filled it can be cast to whole numbers.
# Note: astype(int) drops the fractional part, so the filled-in 32.5 becomes 32.
age_as_int = df["Age"].astype(int)
df["Age"] = age_as_int
df

Unnamed: 0,Name,Age,City,Salary,Department
0,Alice,25,New York,50000.0,HR
1,Bob,32,London,60000.0,IT
2,Charlie,35,Paris,,Finance
3,David,40,Berlin,70000.0,IT
4,Unknown,30,Paris,60000.0,HR


In [12]:
# Select the rows whose Salary is still missing
salary_missing = df["Salary"].isnull()
df.loc[salary_missing]

Unnamed: 0,Name,Age,City,Salary,Department
2,Charlie,35,Paris,,Finance


In [13]:
# Replace the missing salary with the average of the known salaries
salary_mean = df["Salary"].mean()
df["Salary"] = df["Salary"].fillna(salary_mean)

In [14]:
# Display df again to confirm the Salary gap has been filled
df

Unnamed: 0,Name,Age,City,Salary,Department
0,Alice,25,New York,50000.0,HR
1,Bob,32,London,60000.0,IT
2,Charlie,35,Paris,60000.0,Finance
3,David,40,Berlin,70000.0,IT
4,Unknown,30,Paris,60000.0,HR


In [None]:
# ✅ Use `.isnull()`, `.fillna()`, and `.dropna()` to handle missing values  
# ✅ Use `.str.strip()` to clean strings  
# ✅ Use `.drop_duplicates()` to remove duplicates  
# ✅ Use `.astype()` to fix data types  

