## Cleaning Data
### Handle Missing Values (NaN)

In [79]:
import pandas as pd
# Load the raw dataset from raw_data.csv (a path relative to the working
# directory) into `df`, the frame the cells below read from.
df = pd.read_csv("raw_data.csv")

In [80]:
# Display the frame as loaded, before any cleaning, to see where the gaps are.
df

Unnamed: 0,id,name,age,country,gender,income
0,1,John Doe,29.0,USA,Male,55000.0
1,1,John Doe,29.0,USA,Male,55000.0
2,2,Jane Smith,,Canada,Female,62000.0
3,3,Alex,,USA,Unknown,47000.0
4,4,Maria Garcia,34.0,Spain,Female,
5,5,Li Wei,27.0,China,Male,51000.0
6,6,,45.0,India,Female,73000.0
7,7,Ahmed Khan,38.0,,Male,68000.0
8,8,Rachel Lee,29.0,USA,Female,62000.0
9,9,Carlos Ruiz,,Mexico,Male,45000.0


In [81]:
# 1. df.isnull() / df.isna() - Returns a frame of the same shape holding
#    True where a cell is missing (NaN) and False everywhere else.
df.isnull()

Unnamed: 0,id,name,age,country,gender,income
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,True,False,False,False
3,False,False,True,False,False,False
4,False,False,False,False,False,True
5,False,False,False,False,False,False
6,False,True,False,False,False,False
7,False,False,False,True,False,False
8,False,False,False,False,False,False
9,False,False,True,False,False,False


In [82]:
# 2. df.isnull().sum() - Returns the count of NaNs per column
#    (summing the boolean mask adds up the True cells column by column).
df.isnull().sum()

id         0
name       1
age        3
country    1
gender     1
income     1
dtype: int64

In [83]:
# 3. df.dropna() - Drops every row that contains at least one missing value.
#    The result is only displayed, not assigned, so df itself keeps its NaNs.
df.dropna()

Unnamed: 0,id,name,age,country,gender,income
0,1,John Doe,29.0,USA,Male,55000.0
1,1,John Doe,29.0,USA,Male,55000.0
5,5,Li Wei,27.0,China,Male,51000.0
8,8,Rachel Lee,29.0,USA,Female,62000.0


In [84]:
# 4. df.dropna(axis=1) - Drops every column that contains at least one missing
#    value. Only "id" is complete in this data, so it is the sole column left.
df.dropna(axis=1)

Unnamed: 0,id
0,1
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9


In [85]:
# 5. df.fillna(val) - Fills NaN with a value and returns a new object; the
#    original is untouched unless the result is assigned back.
#
# Other ways to call it (kept as comments: only a cell's last expression is
# displayed, so a bare call placed mid-cell is computed and then thrown away):
# df.fillna("Missing")
# df["age"].fillna(0)
# df.iloc[0:,0:5].fillna("Missing")

# Work on a copy so that the raw frame `df` stays as loaded.
cleaned_data = df.copy()
# Replace missing ages with the column mean (mean() skips NaN by default).
age_mean = cleaned_data["age"].mean()
cleaned_data["age"] = cleaned_data["age"].fillna(age_mean)

# Print both frames to confirm that only the copy was filled.
print(cleaned_data)
print() # Space
print(df)

    id          name    age country   gender   income
0    1      John Doe  29.00     USA     Male  55000.0
1    1      John Doe  29.00     USA     Male  55000.0
2    2    Jane Smith  32.75  Canada   Female  62000.0
3    3          Alex  32.75     USA  Unknown  47000.0
4    4  Maria Garcia  34.00   Spain   Female      NaN
5    5        Li Wei  27.00   China     Male  51000.0
6    6           NaN  45.00   India   Female  73000.0
7    7    Ahmed Khan  38.00     NaN     Male  68000.0
8    8    Rachel Lee  29.00     USA   Female  62000.0
9    9   Carlos Ruiz  32.75  Mexico     Male  45000.0
10  10   Emily Davis  31.00     USA      NaN  58000.0

    id          name   age country   gender   income
0    1      John Doe  29.0     USA     Male  55000.0
1    1      John Doe  29.0     USA     Male  55000.0
2    2    Jane Smith   NaN  Canada   Female  62000.0
3    3          Alex   NaN     USA  Unknown  47000.0
4    4  Maria Garcia  34.0   Spain   Female      NaN
5    5        Li Wei  27.0   Chin

In [86]:
# 6. df.ffill() - Forward fill: every NaN takes the last valid value above it
#    in the same column (e.g. the missing name in row 6 becomes "Li Wei").
#    Note that for per-person fields this copies another row's value.
df.ffill()

Unnamed: 0,id,name,age,country,gender,income
0,1,John Doe,29.0,USA,Male,55000.0
1,1,John Doe,29.0,USA,Male,55000.0
2,2,Jane Smith,29.0,Canada,Female,62000.0
3,3,Alex,29.0,USA,Unknown,47000.0
4,4,Maria Garcia,34.0,Spain,Female,47000.0
5,5,Li Wei,27.0,China,Male,51000.0
6,6,Li Wei,45.0,India,Female,73000.0
7,7,Ahmed Khan,38.0,India,Male,68000.0
8,8,Rachel Lee,29.0,USA,Female,62000.0
9,9,Carlos Ruiz,29.0,Mexico,Male,45000.0


In [87]:
# 7. df.bfill() - Backward fill: every NaN takes the next valid value below it
#    in the same column (e.g. the missing name in row 6 becomes "Ahmed Khan").
df.bfill()

Unnamed: 0,id,name,age,country,gender,income
0,1,John Doe,29.0,USA,Male,55000.0
1,1,John Doe,29.0,USA,Male,55000.0
2,2,Jane Smith,34.0,Canada,Female,62000.0
3,3,Alex,34.0,USA,Unknown,47000.0
4,4,Maria Garcia,34.0,Spain,Female,51000.0
5,5,Li Wei,27.0,China,Male,51000.0
6,6,Ahmed Khan,45.0,India,Female,73000.0
7,7,Ahmed Khan,38.0,USA,Male,68000.0
8,8,Rachel Lee,29.0,USA,Female,62000.0
9,9,Carlos Ruiz,31.0,Mexico,Male,45000.0


### Handle Duplicates

In [88]:
# 1. df.duplicated() - Flags duplicate rows: returns a boolean Series that is
#    True for a row repeating an earlier one; the first occurrence stays False.
df.duplicated()
# Restrict the comparison to certain columns by selecting them first:
# df["name"].duplicated() # checks for duplicate values in the name column only
# df[["name","country"]].duplicated() # checks for duplicates across the 2 columns together

0     False
1      True
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
dtype: bool

In [89]:
# 2. df.drop_duplicates() - Removes duplicate rows, keeping the first occurrence
#    (row 1 goes, row 0 stays). Returns a new frame; df is not reassigned here.
df.drop_duplicates()

Unnamed: 0,id,name,age,country,gender,income
0,1,John Doe,29.0,USA,Male,55000.0
2,2,Jane Smith,,Canada,Female,62000.0
3,3,Alex,,USA,Unknown,47000.0
4,4,Maria Garcia,34.0,Spain,Female,
5,5,Li Wei,27.0,China,Male,51000.0
6,6,,45.0,India,Female,73000.0
7,7,Ahmed Khan,38.0,,Male,68000.0
8,8,Rachel Lee,29.0,USA,Female,62000.0
9,9,Carlos Ruiz,,Mexico,Male,45000.0
10,10,Emily Davis,31.0,USA,,58000.0


### Handle Data Types & Date-Time

In [90]:
# 1. df.dtypes - Lists the dtype of every column. Text columns show up as
#    object; age is float64, not int64, because it contains NaN.
df.dtypes

id           int64
name        object
age        float64
country     object
gender      object
income     float64
dtype: object

In [91]:
# 2. df.astype(new_type) - Changes dtype

# df.astype('str')

# Keep in mind:
# Int64 → Pandas nullable integer
# df["age"].astype('Int64')

# int64 → NumPy integer (does NOT allow NaN), so the NaNs in "age" have to be
# replaced before the cast. Fill ONLY the "age" column: a frame-wide fillna(0)
# would also write the integer 0 into the missing name/country/gender/income
# cells of the copy.
df2 = df.copy()
df2["age"] = df2["age"].fillna(0).astype("int64")
df2.dtypes


id           int64
name        object
age          int64
country     object
gender      object
income     float64
dtype: object

In [92]:
# 3. pd.to_datetime() - parses a date/time string (or other date-like value)
#    into a pandas datetime object; a single string gives a Timestamp.
date_str = "2026-02-02"
date = pd.to_datetime(date_str)
# Show the parsed value together with its type.
print(f"{date!s} {type(date)!s}")

2026-02-02 00:00:00 <class 'pandas._libs.tslibs.timestamps.Timestamp'>


### Handle Strings

In [93]:
# 1. .str.lower() - Lower-cases every string in the column; missing values
#    stay NaN. Returns a new Series, so df["name"] itself is unchanged.
df["name"].str.lower()

0         john doe
1         john doe
2       jane smith
3             alex
4     maria garcia
5           li wei
6              NaN
7       ahmed khan
8       rachel lee
9      carlos ruiz
10     emily davis
Name: name, dtype: object

In [94]:
# 2. .str.upper() - Upper-cases every string in the column; missing values
#    stay NaN. Returns a new Series, so df["name"] itself is unchanged.
df["name"].str.upper()

0         JOHN DOE
1         JOHN DOE
2       JANE SMITH
3             ALEX
4     MARIA GARCIA
5           LI WEI
6              NaN
7       AHMED KHAN
8       RACHEL LEE
9      CARLOS RUIZ
10     EMILY DAVIS
Name: name, dtype: object

In [95]:
# 3. .str.capitalize() - Upper-cases the first character of each string and
#    lower-cases the rest, so "John Doe" becomes "John doe".
df["name"].str.capitalize()

0         John doe
1         John doe
2       Jane smith
3             Alex
4     Maria garcia
5           Li wei
6              NaN
7       Ahmed khan
8       Rachel lee
9      Carlos ruiz
10     Emily davis
Name: name, dtype: object

In [None]:
# 4. .str.strip() - Removes leading/trailing whitespace
# df["name"].str.strip()

# 5. .str.split(sep) - Splits each string into a list of parts at every occurrence of the separator
df["name"].str.split(" ") # Space separator - splits each string in the name column at every space

# df3 = pd.DataFrame({"email": ["rohit@gmail.com", "abc@yahoo.com"]})
# df3["email"].str.split("@") # @ Separator - splits each string at every @

0         [John, Doe]
1         [John, Doe]
2       [Jane, Smith]
3              [Alex]
4     [Maria, Garcia]
5           [Li, Wei]
6                 NaN
7       [Ahmed, Khan]
8       [Rachel, Lee]
9      [Carlos, Ruiz]
10     [Emily, Davis]
Name: name, dtype: object

In [None]:
# 6. .str.contains() - Checks, row by row, whether each string contains the
#    given pattern. Rows where the value is missing come back as NaN.
#
# Case-sensitive example (kept as a comment: only the cell's last expression
# is displayed, so a bare call above it would be evaluated and discarded):
# df["country"].str.contains("US")

# case=False makes the match case-insensitive, so "india" also matches "India".
df["country"].str.contains("india",case=False)

0     False
1     False
2     False
3     False
4     False
5     False
6      True
7       NaN
8     False
9     False
10    False
Name: country, dtype: object