### Transforming Data (feature Engineering)

In [70]:
# 1. apply() - used to apply a function to each value in a Series (single column) or DataFrame (row-wise or column wise).
import pandas as pd

df = pd.DataFrame({
    'Age':[18,22,30]
})

# Add 10 to each value of dataframe
df['Age_plus_10'] = df['Age'].apply(lambda x: x + 10)

print(df)

# Another Example
df2 = pd.read_csv("raw_data.csv")

# Adding another column with taxes percentage they have to pay
df2["tax"]=df2["income"].apply(

    lambda x: "20%" if x >= 50000 else "10%"
    # if income is 50,000 or more, return "20%" otherwise "10%"
)
df2["tax amnt"]=df2["income"].apply(
    lambda x: ((x)*20/100) if x >= 50_000 else ((x)*10/100)
)
df2

   Age  Age_plus_10
0   18           28
1   22           32
2   30           40


Unnamed: 0,id,name,age,country,gender,income,tax,tax amnt
0,1,John Doe,29.0,USA,Male,55000.0,20%,11000.0
1,1,John Doe,29.0,USA,Male,55000.0,20%,11000.0
2,2,Jane Smith,,Canada,Female,62000.0,20%,12400.0
3,3,Alex,,USA,Unknown,47000.0,10%,4700.0
4,4,Maria Garcia,34.0,Spain,Female,,10%,
5,5,Li Wei,27.0,China,Male,51000.0,20%,10200.0
6,6,,45.0,India,Female,73000.0,20%,14600.0
7,7,Ahmed Khan,38.0,,Male,68000.0,20%,13600.0
8,8,Rachel Lee,29.0,USA,Female,62000.0,20%,12400.0
9,9,Carlos Ruiz,,Mexico,Male,45000.0,10%,4500.0


In [71]:
# 2. map() - used on a pandas Series to replace or transform its values.
df3 = pd.DataFrame({
    'City':["Delhi","Mumbai","Bangalore","Chennai"]
})
# fillna() - Fills Nan with a value
df3["City"] = df3["City"].map({"Delhi":"DL","Mumbai":"MB"}).fillna(df3["City"])
print(df3)

# Another Example
gender_map = {"Male":"M","Female":"F","Unknown":"U"}
# fillna() is used to fill U where NaN is present
df2["gender"] = df2["gender"].map(gender_map).fillna("U")
df2

        City
0         DL
1         MB
2  Bangalore
3    Chennai


Unnamed: 0,id,name,age,country,gender,income,tax,tax amnt
0,1,John Doe,29.0,USA,M,55000.0,20%,11000.0
1,1,John Doe,29.0,USA,M,55000.0,20%,11000.0
2,2,Jane Smith,,Canada,F,62000.0,20%,12400.0
3,3,Alex,,USA,U,47000.0,10%,4700.0
4,4,Maria Garcia,34.0,Spain,F,,10%,
5,5,Li Wei,27.0,China,M,51000.0,20%,10200.0
6,6,,45.0,India,F,73000.0,20%,14600.0
7,7,Ahmed Khan,38.0,,M,68000.0,20%,13600.0
8,8,Rachel Lee,29.0,USA,F,62000.0,20%,12400.0
9,9,Carlos Ruiz,,Mexico,M,45000.0,10%,4500.0


In [72]:

# 3. assign() - Create new columns or modify existing columns.
# returns a copy of the DataFrame with the new (or modified) columns.
# does not change the original DataFrame unless you assign it back.

df4 = pd.DataFrame({
    "Name":["Rohit","Namo","Aryan","Manya"],
    "Income":[55_000,50_000,56_000,60_000]
})
print(df4)
df4 = df4.assign(New_Income = df4["Income"] + 10_000)
df4

    Name  Income
0  Rohit   55000
1   Namo   50000
2  Aryan   56000
3  Manya   60000


Unnamed: 0,Name,Income,New_Income
0,Rohit,55000,65000
1,Namo,50000,60000
2,Aryan,56000,66000
3,Manya,60000,70000


In [73]:
# 4. replace(old,new) - Replace specific values in a Series or DataFrame
# returns a copy, only the specified values are changed, other values remain the same; 
# to modify the original DataFrame, assign the result back to the original column or DataFrame.
df2["country"] = df2["country"].replace("USA","United States")
df2

Unnamed: 0,id,name,age,country,gender,income,tax,tax amnt
0,1,John Doe,29.0,United States,M,55000.0,20%,11000.0
1,1,John Doe,29.0,United States,M,55000.0,20%,11000.0
2,2,Jane Smith,,Canada,F,62000.0,20%,12400.0
3,3,Alex,,United States,U,47000.0,10%,4700.0
4,4,Maria Garcia,34.0,Spain,F,,10%,
5,5,Li Wei,27.0,China,M,51000.0,20%,10200.0
6,6,,45.0,India,F,73000.0,20%,14600.0
7,7,Ahmed Khan,38.0,,M,68000.0,20%,13600.0
8,8,Rachel Lee,29.0,United States,F,62000.0,20%,12400.0
9,9,Carlos Ruiz,,Mexico,M,45000.0,10%,4500.0


In [93]:
# 5. rename(rows/column={old:new}) - rename specified columns or row 
df5 = pd.DataFrame({
    "A":[1,2,3],
    "B":[4,5,6],
    "C":[7,8,9]
})
df_renamed = df5.rename(columns={"A":"X","B":"Y"})
print(df_renamed)
df_renamed=df5.rename(index={0:"Zero"})
print(df_renamed)

# Renames all columns at once
df6 = pd.DataFrame({
    "A":[1,2,3],
    "B":[4,5,6],
    "C":[7,8,9]
})
df6.columns = ["First","Second","Third"] # new column names must match the number of cols available
df6

   X  Y  C
0  1  4  7
1  2  5  8
2  3  6  9
      A  B  C
Zero  1  4  7
1     2  5  8
2     3  6  9


Unnamed: 0,First,Second,Third
0,1,4,7
1,2,5,8
2,3,6,9


In [None]:
# 6. sort_values()
df2["income"].sort_values() # shows specific column in inreasing order

df2.sort_values("income") # shows overall data in increasing order based on income column

df2.sort_values("income",ascending=False) # in decreasing order

df2.sort_values(["income","age"]) 
# Pandas first sorts all rows by income (ascending by default)
# If two rows have the same income, then: It sorts those rows by age in ascending order

Unnamed: 0,id,name,age,country,gender,income,tax,tax amnt
9,9,Carlos Ruiz,,Mexico,M,45000.0,10%,4500.0
3,3,Alex,,United States,U,47000.0,10%,4700.0
5,5,Li Wei,27.0,China,M,51000.0,20%,10200.0
0,1,John Doe,29.0,United States,M,55000.0,20%,11000.0
1,1,John Doe,29.0,United States,M,55000.0,20%,11000.0
10,10,Emily Davis,31.0,United States,U,58000.0,20%,11600.0
8,8,Rachel Lee,29.0,United States,F,62000.0,20%,12400.0
2,2,Jane Smith,,Canada,F,62000.0,20%,12400.0
7,7,Ahmed Khan,38.0,,M,68000.0,20%,13600.0
6,6,,45.0,India,F,73000.0,20%,14600.0


In [106]:
# 7. sort_index()
df2.sort_index()
df2.sort_index(ascending=False)

Unnamed: 0,id,name,age,country,gender,income,tax,tax amnt
10,10,Emily Davis,31.0,United States,U,58000.0,20%,11600.0
9,9,Carlos Ruiz,,Mexico,M,45000.0,10%,4500.0
8,8,Rachel Lee,29.0,United States,F,62000.0,20%,12400.0
7,7,Ahmed Khan,38.0,,M,68000.0,20%,13600.0
6,6,,45.0,India,F,73000.0,20%,14600.0
5,5,Li Wei,27.0,China,M,51000.0,20%,10200.0
4,4,Maria Garcia,34.0,Spain,F,,10%,
3,3,Alex,,United States,U,47000.0,10%,4700.0
2,2,Jane Smith,,Canada,F,62000.0,20%,12400.0
1,1,John Doe,29.0,United States,M,55000.0,20%,11000.0


In [110]:
# 8. reset_index() - resets the index of a DataFrame back to the default numeric index and also saves previous index in a new column "index"
sorted_df = df2.sort_index(ascending=False)
sorted_df.reset_index() # to not save the previous index values
sorted_df.reset_index(drop=True)

Unnamed: 0,id,name,age,country,gender,income,tax,tax amnt
0,10,Emily Davis,31.0,United States,U,58000.0,20%,11600.0
1,9,Carlos Ruiz,,Mexico,M,45000.0,10%,4500.0
2,8,Rachel Lee,29.0,United States,F,62000.0,20%,12400.0
3,7,Ahmed Khan,38.0,,M,68000.0,20%,13600.0
4,6,,45.0,India,F,73000.0,20%,14600.0
5,5,Li Wei,27.0,China,M,51000.0,20%,10200.0
6,4,Maria Garcia,34.0,Spain,F,,10%,
7,3,Alex,,United States,U,47000.0,10%,4700.0
8,2,Jane Smith,,Canada,F,62000.0,20%,12400.0
9,1,John Doe,29.0,United States,M,55000.0,20%,11000.0


In [122]:
# 9. rank() - assigns a rank (position) to each value in a Series or DataFrame based on its order

sorted_df["Ranking"] = sorted_df["income"].rank()
# Pandas looks at the income column.
# It orders the values from lowest to highest (default behavior).
# It assigns a rank number to each value:
# Lowest income → rank 1
# Highest income → largest rank
# If two incomes are equal, they get the average rank (default).
# The ranks are stored in a new column called Ranking.
sorted_df

Unnamed: 0,id,name,age,country,gender,income,tax,tax amnt,Ranking
10,10,Emily Davis,31.0,United States,U,58000.0,20%,11600.0,6.0
9,9,Carlos Ruiz,,Mexico,M,45000.0,10%,4500.0,1.0
8,8,Rachel Lee,29.0,United States,F,62000.0,20%,12400.0,7.5
7,7,Ahmed Khan,38.0,,M,68000.0,20%,13600.0,9.0
6,6,,45.0,India,F,73000.0,20%,14600.0,10.0
5,5,Li Wei,27.0,China,M,51000.0,20%,10200.0,3.0
4,4,Maria Garcia,34.0,Spain,F,,10%,,
3,3,Alex,,United States,U,47000.0,10%,4700.0,2.0
2,2,Jane Smith,,Canada,F,62000.0,20%,12400.0,7.5
1,1,John Doe,29.0,United States,M,55000.0,20%,11000.0,4.5


In [121]:
# When rank(ascending=False),
# Highest income → Rank 1
# Lowest income -> largest Rank
sorted_df["Ranking"] = sorted_df["income"].rank(ascending=False)
sorted_df

Unnamed: 0,id,name,age,country,gender,income,tax,tax amnt,Ranking
10,10,Emily Davis,31.0,United States,U,58000.0,20%,11600.0,5.0
9,9,Carlos Ruiz,,Mexico,M,45000.0,10%,4500.0,10.0
8,8,Rachel Lee,29.0,United States,F,62000.0,20%,12400.0,3.5
7,7,Ahmed Khan,38.0,,M,68000.0,20%,13600.0,2.0
6,6,,45.0,India,F,73000.0,20%,14600.0,1.0
5,5,Li Wei,27.0,China,M,51000.0,20%,10200.0,8.0
4,4,Maria Garcia,34.0,Spain,F,,10%,,
3,3,Alex,,United States,U,47000.0,10%,4700.0,9.0
2,2,Jane Smith,,Canada,F,62000.0,20%,12400.0,3.5
1,1,John Doe,29.0,United States,M,55000.0,20%,11000.0,6.5


In [123]:
# When rank(method="dense") - Uses whole numbers (1,2,3) not (2.5,3.5), Equal values get the same rank, The next rank increases by 1
sorted_df["Ranking"] = sorted_df["income"].rank(method="dense")
sorted_df

Unnamed: 0,id,name,age,country,gender,income,tax,tax amnt,Ranking
10,10,Emily Davis,31.0,United States,U,58000.0,20%,11600.0,5.0
9,9,Carlos Ruiz,,Mexico,M,45000.0,10%,4500.0,1.0
8,8,Rachel Lee,29.0,United States,F,62000.0,20%,12400.0,6.0
7,7,Ahmed Khan,38.0,,M,68000.0,20%,13600.0,7.0
6,6,,45.0,India,F,73000.0,20%,14600.0,8.0
5,5,Li Wei,27.0,China,M,51000.0,20%,10200.0,3.0
4,4,Maria Garcia,34.0,Spain,F,,10%,,
3,3,Alex,,United States,U,47000.0,10%,4700.0,2.0
2,2,Jane Smith,,Canada,F,62000.0,20%,12400.0,6.0
1,1,John Doe,29.0,United States,M,55000.0,20%,11000.0,4.0


In [127]:
# When rank(method="min") - uses whole numbers, equal values get lowest rank 
# if incomes are 50,000, 60,000, 60,000, and 70,000, the income 50,000 gets rank 1, both 60,000 values get rank 2, and 70,000 gets rank 4. 
# The rank 3 is skipped because method="min" allows gaps after tied values.
sorted_df["Ranking"] = sorted_df["income"].rank(method="min")
sorted_df

Unnamed: 0,id,name,age,country,gender,income,tax,tax amnt,Ranking
10,10,Emily Davis,31.0,United States,U,58000.0,20%,11600.0,6.0
9,9,Carlos Ruiz,,Mexico,M,45000.0,10%,4500.0,1.0
8,8,Rachel Lee,29.0,United States,F,62000.0,20%,12400.0,7.0
7,7,Ahmed Khan,38.0,,M,68000.0,20%,13600.0,9.0
6,6,,45.0,India,F,73000.0,20%,14600.0,10.0
5,5,Li Wei,27.0,China,M,51000.0,20%,10200.0,3.0
4,4,Maria Garcia,34.0,Spain,F,,10%,,
3,3,Alex,,United States,U,47000.0,10%,4700.0,2.0
2,2,Jane Smith,,Canada,F,62000.0,20%,12400.0,7.0
1,1,John Doe,29.0,United States,M,55000.0,20%,11000.0,4.0


In [None]:
# When rank(method="max") - uses whole numbers, equal values get highest rank 
# if incomes are 50,000, 60,000, 60,000, and 70,000:
#  the income 50,000 gets rank 1, both 60,000 values get rank 3, and 70,000 gets rank 4. 
# The rank 2 is skipped because method="max" allows gaps after tied values.
sorted_df["Ranking"] = sorted_df["income"].rank(method="max")
sorted_df

Unnamed: 0,id,name,age,country,gender,income,tax,tax amnt,Ranking
10,10,Emily Davis,31.0,United States,U,58000.0,20%,11600.0,6.0
9,9,Carlos Ruiz,,Mexico,M,45000.0,10%,4500.0,1.0
8,8,Rachel Lee,29.0,United States,F,62000.0,20%,12400.0,8.0
7,7,Ahmed Khan,38.0,,M,68000.0,20%,13600.0,9.0
6,6,,45.0,India,F,73000.0,20%,14600.0,10.0
5,5,Li Wei,27.0,China,M,51000.0,20%,10200.0,3.0
4,4,Maria Garcia,34.0,Spain,F,,10%,,
3,3,Alex,,United States,U,47000.0,10%,4700.0,2.0
2,2,Jane Smith,,Canada,F,62000.0,20%,12400.0,8.0
1,1,John Doe,29.0,United States,M,55000.0,20%,11000.0,5.0


In [135]:
# 10. Reorder columns
new_order=df2[["id","name","age","gender","country","tax"]]
# Selecting columns returns a new DataFrame.
# The list must include all columns we want to keep  
# columns not listed will be dropped.
print(new_order)

# Reordering does not affect the original DataFrame
# unless the result is assigned back to it.
print(df2)

    id          name   age gender        country  tax
0    1      John Doe  29.0      M  United States  20%
1    1      John Doe  29.0      M  United States  20%
2    2    Jane Smith   NaN      F         Canada  20%
3    3          Alex   NaN      U  United States  10%
4    4  Maria Garcia  34.0      F          Spain  10%
5    5        Li Wei  27.0      M          China  20%
6    6           NaN  45.0      F          India  20%
7    7    Ahmed Khan  38.0      M            NaN  20%
8    8    Rachel Lee  29.0      F  United States  20%
9    9   Carlos Ruiz   NaN      M         Mexico  10%
10  10   Emily Davis  31.0      U  United States  20%
    id          name   age        country gender   income  tax  tax amnt
0    1      John Doe  29.0  United States      M  55000.0  20%   11000.0
1    1      John Doe  29.0  United States      M  55000.0  20%   11000.0
2    2    Jane Smith   NaN         Canada      F  62000.0  20%   12400.0
3    3          Alex   NaN  United States      U  47000.0  1