In [2]:
import pandas as pd
import numpy as np

# ================== Data Transformation ==================
> changing or deriving new data from existing data.

## ----------- 1. Apply -----------

In [199]:
data = {
    "Name": ["Alice", "Bob", "Charlie", "David"],
    "Gender": ['F', 'M', 'M', 'M'],
    "Age": [25, 32, 37, 29],
    "Salary": [50000, 60000, 75000, 52000]
}
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Gender,Age,Salary
0,Alice,F,25,50000
1,Bob,M,32,60000
2,Charlie,M,37,75000
3,David,M,29,52000


### map()
> for Series only

In [None]:
# map on a Series
df["Name_Upper"] = df["Name"].map(str.upper)
df

Unnamed: 0,Name,Gender,Age,Salary,Salary_Bucket,Age_Group,Total,Name_Salary,Name_Upper
0,Alice,F,25,50000,Low,Young,50025,Alice-50000,ALICE
1,Bob,M,32,60000,Low,Senior,60032,Bob-60000,BOB
2,Charlie,M,37,75000,High,Senior,75037,Charlie-75000,CHARLIE
3,David,M,29,52000,Low,Young,52029,David-52000,DAVID


In [None]:
# Map categories
df["Salary_Level"] = df["Salary"].map({50000: "Low", 60000: "Medium", 75000: "High"})
df

Unnamed: 0,Name,Gender,Age,Salary,Salary_Bucket,Age_Group,Total,Name_Salary,Salary_Level
0,Alice,F,25,50000,Low,Young,50025,Alice-50000,Low
1,Bob,M,32,60000,Low,Senior,60032,Bob-60000,Medium
2,Charlie,M,37,75000,High,Senior,75037,Charlie-75000,High
3,David,M,29,52000,Low,Young,52029,David-52000,


### apply()
> flexible, works with DataFrame/Series

In [200]:
# apply to Series
df["Salary_Bucket"] = df["Salary"].apply(lambda x: "High" if x > 60000 else "Low")
df

Unnamed: 0,Name,Gender,Age,Salary,Salary_Bucket
0,Alice,F,25,50000,Low
1,Bob,M,32,60000,Low
2,Charlie,M,37,75000,High
3,David,M,29,52000,Low


In [201]:
df["Age_Group"] = df["Age"].apply(lambda x: "Young" if x < 30 else "Senior")
df

Unnamed: 0,Name,Gender,Age,Salary,Salary_Bucket,Age_Group
0,Alice,F,25,50000,Low,Young
1,Bob,M,32,60000,Low,Senior
2,Charlie,M,37,75000,High,Senior
3,David,M,29,52000,Low,Young


#### > If you apply directly on a DataFrame (not a single column), then the function is applied on rows or columns, depending on axis

In [202]:
# apply to DataFrame row wise
df['Total'] = df[['Age', 'Salary']].apply(sum)
df

Unnamed: 0,Name,Gender,Age,Salary,Salary_Bucket,Age_Group,Total
0,Alice,F,25,50000,Low,Young,
1,Bob,M,32,60000,Low,Senior,
2,Charlie,M,37,75000,High,Senior,
3,David,M,29,52000,Low,Young,


In [203]:
# apply to DataFrame column wise
df['Total'] = df[['Age', 'Salary']].apply(sum, axis=1)
df

Unnamed: 0,Name,Gender,Age,Salary,Salary_Bucket,Age_Group,Total
0,Alice,F,25,50000,Low,Young,50025
1,Bob,M,32,60000,Low,Senior,60032
2,Charlie,M,37,75000,High,Senior,75037
3,David,M,29,52000,Low,Young,52029


In [204]:
# Combine multiple columns row-wise
df["Name_Salary"] = df.apply(lambda row: f"{row['Name']}-{row['Salary']}", axis=1)
df

Unnamed: 0,Name,Gender,Age,Salary,Salary_Bucket,Age_Group,Total,Name_Salary
0,Alice,F,25,50000,Low,Young,50025,Alice-50000
1,Bob,M,32,60000,Low,Senior,60032,Bob-60000
2,Charlie,M,37,75000,High,Senior,75037,Charlie-75000
3,David,M,29,52000,Low,Young,52029,David-52000


### applymap()
> element-wise on entire DataFrame

In [62]:
df_str = df[["Name", "Age_Group"]]
df_str.applymap(str.lower)

  df_str.applymap(str.lower)


Unnamed: 0,Name,Age_Group
0,alice,young
1,bob,senior
2,charlie,senior
3,david,young


In [63]:
df_str = df[['Age','Salary']]
df_str.applymap(lambda x: x*2)

  df_str.applymap(lambda x: x*2)


Unnamed: 0,Age,Salary
0,50,100000
1,64,120000
2,74,150000
3,58,104000


## ----------- 2. Vectorized Operations -----------
> Faster than looping, directly applied on columns.

In [64]:
data = {
    "Name": ["Alice", "Bob", "Charlie", "David"],
    "Gender": ['F', 'M', 'M', 'M'],
    "Age": [25, 32, 37, 29],
    "Salary": [50000, 60000, 75000, 52000]
}
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Gender,Age,Salary
0,Alice,F,25,50000
1,Bob,M,32,60000
2,Charlie,M,37,75000
3,David,M,29,52000


In [65]:
# Arithmetic operations
df["Salary_in_K"] = df["Salary"] / 1000
df

Unnamed: 0,Name,Gender,Age,Salary,Salary_in_K
0,Alice,F,25,50000,50.0
1,Bob,M,32,60000,60.0
2,Charlie,M,37,75000,75.0
3,David,M,29,52000,52.0


In [66]:
# Create new column using multiple columns
df["Age_Salary_Ratio"] = np.round(df["Salary"] / df["Age"], 2)
df

Unnamed: 0,Name,Gender,Age,Salary,Salary_in_K,Age_Salary_Ratio
0,Alice,F,25,50000,50.0,2000.0
1,Bob,M,32,60000,60.0,1875.0
2,Charlie,M,37,75000,75.0,2027.03
3,David,M,29,52000,52.0,1793.1


In [67]:
# Broadcasting with scalar
df["Bonus"] = df["Salary"] * 0.40
df

Unnamed: 0,Name,Gender,Age,Salary,Salary_in_K,Age_Salary_Ratio,Bonus
0,Alice,F,25,50000,50.0,2000.0,20000.0
1,Bob,M,32,60000,60.0,1875.0,24000.0
2,Charlie,M,37,75000,75.0,2027.03,30000.0
3,David,M,29,52000,52.0,1793.1,20800.0


## ----------- 3. Replace Values -----------

In [189]:
df = pd.read_csv("weather_data2.csv")
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32 F,6 mph,Rain
1,1/2/2017,-99999,7,Sunny
2,1/3/2017,28 C,-99999,Snow
3,1/4/2017,-99999,7,0
4,1/5/2017,32,-88888,Rain
5,1/6/2017,31,2,Sunny
6,1/6/2017,34,5,0


#### Replacing specific values

In [190]:
new_df = df.replace('-99999', value=np.nan)
new_df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32 F,6 mph,Rain
1,1/2/2017,,7,Sunny
2,1/3/2017,28 C,,Snow
3,1/4/2017,,7,0
4,1/5/2017,32,-88888,Rain
5,1/6/2017,31,2,Sunny
6,1/6/2017,34,5,0


In [191]:
# Replace everywhere the condition occurs
new_df = df.replace(
    to_replace=['-99999','-88888'], 
    value=0
)
new_df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32 F,6 mph,Rain
1,1/2/2017,0,7,Sunny
2,1/3/2017,28 C,0,Snow
3,1/4/2017,0,7,0
4,1/5/2017,32,0,Rain
5,1/6/2017,31,2,Sunny
6,1/6/2017,34,5,0


#### regex :::: Replacing text of 'units' with empty str in temperature and windspeed col

In [192]:
# when windspeed is 6 mph, 7 mph etc. & temperature is 32 F, 28 F etc.
df.replace(
    {'temperature': '[A-Za-z]', 'windspeed': '[a-z]'},
    value = '', 
    regex=True,
    inplace=True
) 
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,-99999,7,Sunny
2,1/3/2017,28,-99999,Snow
3,1/4/2017,-99999,7,0
4,1/5/2017,32,-88888,Rain
5,1/6/2017,31,2,Sunny
6,1/6/2017,34,5,0


#### Replacing per col

In [193]:
new_df = df.replace(
    {
        'temperature': '-99999',
        'windspeed': '-99999',     # Not considered
        'windspeed': '-88888',
        'event': '0'
    }, 
    value = np.nan
)
new_df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/2/2017,,7.0,Sunny
2,1/3/2017,28.0,-99999.0,Snow
3,1/4/2017,,7.0,
4,1/5/2017,32.0,,Rain
5,1/6/2017,31.0,2.0,Sunny
6,1/6/2017,34.0,5.0,


In [194]:
new_df = df.replace(
    {
        'temperature': '-99999',
        'windspeed': ['-99999', '-88888'],
        'event': '0'
    }, 
    value = "Replaced Val"
)
new_df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,Replaced Val,7,Sunny
2,1/3/2017,28,Replaced Val,Snow
3,1/4/2017,Replaced Val,7,Replaced Val
4,1/5/2017,32,Replaced Val,Rain
5,1/6/2017,31,2,Sunny
6,1/6/2017,34,5,Replaced Val


#### Replacing using mapping

In [195]:
new_df = df.replace(
    {
        '-99999': "Replaced Val",
        '-88888': "Replaced Val",
        'no event': 'Sunny',
    })
new_df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,Replaced Val,7,Sunny
2,1/3/2017,28,Replaced Val,Snow
3,1/4/2017,Replaced Val,7,0
4,1/5/2017,32,Replaced Val,Rain
5,1/6/2017,31,2,Sunny
6,1/6/2017,34,5,0


#### Replacing list with another list

In [196]:
df = pd.DataFrame({
    'score': ['exceptional','average', 'good', 'poor', 'average', 'exceptional'],
    'student': ['rob', 'maya', 'parthiv', 'tom', 'julian', 'erica']
})
df

Unnamed: 0,score,student
0,exceptional,rob
1,average,maya
2,good,parthiv
3,poor,tom
4,average,julian
5,exceptional,erica


In [197]:
df.replace(['poor', 'average', 'good', 'exceptional'], [1,2,3,4])

  df.replace(['poor', 'average', 'good', 'exceptional'], [1,2,3,4])


Unnamed: 0,score,student
0,4,rob
1,2,maya
2,3,parthiv
3,1,tom
4,2,julian
5,4,erica


In [198]:
# Replace values
df["score"] = df["score"].replace({'good':'exceptional', 'average':'good'})
df

Unnamed: 0,score,student
0,exceptional,rob
1,good,maya
2,exceptional,parthiv
3,poor,tom
4,good,julian
5,exceptional,erica


## ----------- 4. Rename -----------

In [162]:
df.rename(columns={'Salary':'Annual_Salary'}, inplace=True)
df

Unnamed: 0,score,student
0,exceptional,rob
1,average,maya
2,good,parthiv
3,poor,tom
4,average,julian
5,exceptional,erica


In [163]:
df.rename(index={0:'First'}, inplace=True)
df

Unnamed: 0,score,student
First,exceptional,rob
1,average,maya
2,good,parthiv
3,poor,tom
4,average,julian
5,exceptional,erica


## ----------- 5. Change Datatype -----------

In [164]:
data = {
    "Name": ["Alice", "Bob", "Charlie", "David"],
    "Gender": ['F', 'M', 'M', 'M'],
    "Age": [25, 61, 37, 15],
    "Salary": [50000, 60000, 75000, 52000]
}
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Gender,Age,Salary
0,Alice,F,25,50000
1,Bob,M,61,60000
2,Charlie,M,37,75000
3,David,M,15,52000


In [165]:
print(df.dtypes)
print("\n-----------------")
df['Age'] = df['Age'].astype(float)
print(df.dtypes)
df

Name      object
Gender    object
Age        int64
Salary     int64
dtype: object

-----------------
Name       object
Gender     object
Age       float64
Salary      int64
dtype: object


Unnamed: 0,Name,Gender,Age,Salary
0,Alice,F,25.0,50000
1,Bob,M,61.0,60000
2,Charlie,M,37.0,75000
3,David,M,15.0,52000


## ----------- 6. Binning Data (Discretization) -----------

In [166]:
# Cut into fixed bins
df['Age_group'] = pd.cut(df['Age'], bins=[0,18,35,60,100], labels=['Teen','Young','Adult','Senior'])
df

Unnamed: 0,Name,Gender,Age,Salary,Age_group
0,Alice,F,25.0,50000,Young
1,Bob,M,61.0,60000,Senior
2,Charlie,M,37.0,75000,Adult
3,David,M,15.0,52000,Teen


In [167]:
# Quantile-based binning
df['Salary_quantile'] = pd.qcut(df['Salary'], q=4, labels=['Low','Mid','High','Very High'])
df

Unnamed: 0,Name,Gender,Age,Salary,Age_group,Salary_quantile
0,Alice,F,25.0,50000,Young,Low
1,Bob,M,61.0,60000,Senior,High
2,Charlie,M,37.0,75000,Adult,Very High
3,David,M,15.0,52000,Teen,Mid


## ----------- 7. Sorting -----------

In [168]:
df.sort_values(by='Salary', inplace=True)
df

Unnamed: 0,Name,Gender,Age,Salary,Age_group,Salary_quantile
0,Alice,F,25.0,50000,Young,Low
3,David,M,15.0,52000,Teen,Mid
1,Bob,M,61.0,60000,Senior,High
2,Charlie,M,37.0,75000,Adult,Very High


In [169]:
df.sort_index(ascending=False, inplace=True)
df

Unnamed: 0,Name,Gender,Age,Salary,Age_group,Salary_quantile
3,David,M,15.0,52000,Teen,Mid
2,Charlie,M,37.0,75000,Adult,Very High
1,Bob,M,61.0,60000,Senior,High
0,Alice,F,25.0,50000,Young,Low


## ----------- 8. String Operations -----------

In [170]:
data = {
    "Name": ["Alice    ", "Bob", "     Charlie", "David +"],
    "Gender": ['F', 'M', 'M', 'M'],
    "Age": [25, 61, 37, 15],
    "Salary": [50000, 60000, 75000, 52000],
    "Email": ['alice@gcc.com', 'bob@gcc.com', 'charlie@gcc.com', 'david@gcc.com']
}
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Gender,Age,Salary,Email
0,Alice,F,25,50000,alice@gcc.com
1,Bob,M,61,60000,bob@gcc.com
2,Charlie,M,37,75000,charlie@gcc.com
3,David +,M,15,52000,david@gcc.com


In [171]:
df['Name'] = df['Name'].str.strip()           # remove spaces
df

Unnamed: 0,Name,Gender,Age,Salary,Email
0,Alice,F,25,50000,alice@gcc.com
1,Bob,M,61,60000,bob@gcc.com
2,Charlie,M,37,75000,charlie@gcc.com
3,David +,M,15,52000,david@gcc.com


In [172]:
df['Name_upper'] = df['Name'].str.upper()     # convert to uppercase
df

Unnamed: 0,Name,Gender,Age,Salary,Email,Name_upper
0,Alice,F,25,50000,alice@gcc.com,ALICE
1,Bob,M,61,60000,bob@gcc.com,BOB
2,Charlie,M,37,75000,charlie@gcc.com,CHARLIE
3,David +,M,15,52000,david@gcc.com,DAVID +


In [173]:
df["Name_Length"] = df["Name"].str.len()
df

Unnamed: 0,Name,Gender,Age,Salary,Email,Name_upper,Name_Length
0,Alice,F,25,50000,alice@gcc.com,ALICE,5
1,Bob,M,61,60000,bob@gcc.com,BOB,3
2,Charlie,M,37,75000,charlie@gcc.com,CHARLIE,7
3,David +,M,15,52000,david@gcc.com,DAVID +,7


In [174]:
df["Name_First3"] = df["Name"].str[:3]
df

Unnamed: 0,Name,Gender,Age,Salary,Email,Name_upper,Name_Length,Name_First3
0,Alice,F,25,50000,alice@gcc.com,ALICE,5,Ali
1,Bob,M,61,60000,bob@gcc.com,BOB,3,Bob
2,Charlie,M,37,75000,charlie@gcc.com,CHARLIE,7,Cha
3,David +,M,15,52000,david@gcc.com,DAVID +,7,Dav


In [175]:
df['Email_domain'] = df['Email'].str.split('@').str[1]
df

Unnamed: 0,Name,Gender,Age,Salary,Email,Name_upper,Name_Length,Name_First3,Email_domain
0,Alice,F,25,50000,alice@gcc.com,ALICE,5,Ali,gcc.com
1,Bob,M,61,60000,bob@gcc.com,BOB,3,Bob,gcc.com
2,Charlie,M,37,75000,charlie@gcc.com,CHARLIE,7,Cha,gcc.com
3,David +,M,15,52000,david@gcc.com,DAVID +,7,Dav,gcc.com


In [176]:
# Search
df["Has_A"] = df["Name"].str.contains("a", case=False)
df

Unnamed: 0,Name,Gender,Age,Salary,Email,Name_upper,Name_Length,Name_First3,Email_domain,Has_A
0,Alice,F,25,50000,alice@gcc.com,ALICE,5,Ali,gcc.com,True
1,Bob,M,61,60000,bob@gcc.com,BOB,3,Bob,gcc.com,False
2,Charlie,M,37,75000,charlie@gcc.com,CHARLIE,7,Cha,gcc.com,True
3,David +,M,15,52000,david@gcc.com,DAVID +,7,Dav,gcc.com,True


In [177]:
# Replace
df["Replace_A"] = df["Name"].str.replace("a", "@", case=False)
df

Unnamed: 0,Name,Gender,Age,Salary,Email,Name_upper,Name_Length,Name_First3,Email_domain,Has_A,Replace_A
0,Alice,F,25,50000,alice@gcc.com,ALICE,5,Ali,gcc.com,True,@lice
1,Bob,M,61,60000,bob@gcc.com,BOB,3,Bob,gcc.com,False,Bob
2,Charlie,M,37,75000,charlie@gcc.com,CHARLIE,7,Cha,gcc.com,True,Ch@rlie
3,David +,M,15,52000,david@gcc.com,DAVID +,7,Dav,gcc.com,True,D@vid +


## ----------- 9. Datetime Operations -----------

In [178]:
data = {
    "Name": ["Alice", "Bob", "Charlie", "David"],
    "Gender": ['F', 'M', 'M', 'M'],
    "Age": [25, 61, 37, 15],
    "Salary": [50000, 60000, 75000, 52000],
    "JoiningDate": ['2017-01-02', '2017-01-10', '2017-01-01', '2017-01-11']
}
df = pd.DataFrame(data)
print(df.dtypes)
df

Name           object
Gender         object
Age             int64
Salary          int64
JoiningDate    object
dtype: object


Unnamed: 0,Name,Gender,Age,Salary,JoiningDate
0,Alice,F,25,50000,2017-01-02
1,Bob,M,61,60000,2017-01-10
2,Charlie,M,37,75000,2017-01-01
3,David,M,15,52000,2017-01-11


In [179]:
df['JoiningDate'] = pd.to_datetime(df['JoiningDate'])
print(df.dtypes)
df

Name                   object
Gender                 object
Age                     int64
Salary                  int64
JoiningDate    datetime64[ns]
dtype: object


Unnamed: 0,Name,Gender,Age,Salary,JoiningDate
0,Alice,F,25,50000,2017-01-02
1,Bob,M,61,60000,2017-01-10
2,Charlie,M,37,75000,2017-01-01
3,David,M,15,52000,2017-01-11


In [180]:
df['Year'] = df['JoiningDate'].dt.year
df

Unnamed: 0,Name,Gender,Age,Salary,JoiningDate,Year
0,Alice,F,25,50000,2017-01-02,2017
1,Bob,M,61,60000,2017-01-10,2017
2,Charlie,M,37,75000,2017-01-01,2017
3,David,M,15,52000,2017-01-11,2017


In [181]:
df['Month'] = df['JoiningDate'].dt.month
df

Unnamed: 0,Name,Gender,Age,Salary,JoiningDate,Year,Month
0,Alice,F,25,50000,2017-01-02,2017,1
1,Bob,M,61,60000,2017-01-10,2017,1
2,Charlie,M,37,75000,2017-01-01,2017,1
3,David,M,15,52000,2017-01-11,2017,1


In [182]:
df['DayOfWeek'] = df['JoiningDate'].dt.day_name()
df

Unnamed: 0,Name,Gender,Age,Salary,JoiningDate,Year,Month,DayOfWeek
0,Alice,F,25,50000,2017-01-02,2017,1,Monday
1,Bob,M,61,60000,2017-01-10,2017,1,Tuesday
2,Charlie,M,37,75000,2017-01-01,2017,1,Sunday
3,David,M,15,52000,2017-01-11,2017,1,Wednesday


In [183]:
df["Is_Weekend"] = df["JoiningDate"].dt.dayofweek >= 5
df

Unnamed: 0,Name,Gender,Age,Salary,JoiningDate,Year,Month,DayOfWeek,Is_Weekend
0,Alice,F,25,50000,2017-01-02,2017,1,Monday,False
1,Bob,M,61,60000,2017-01-10,2017,1,Tuesday,False
2,Charlie,M,37,75000,2017-01-01,2017,1,Sunday,True
3,David,M,15,52000,2017-01-11,2017,1,Wednesday,False


## ----------- 10. Scaling & Normalization -----------

In [184]:
data = {
    "Name": ["Alice", "Bob", "Charlie", "David"],
    "Gender": ['F', 'M', 'M', 'M'],
    "Age": [25, 61, 37, 15],
    "Salary": [50000, 60000, 75000, 52000]
}
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Gender,Age,Salary
0,Alice,F,25,50000
1,Bob,M,61,60000
2,Charlie,M,37,75000
3,David,M,15,52000


In [185]:
# Min-Max scaling (manual)
df['Age_scaled'] = (df['Age'] - df['Age'].min()) / (df['Age'].max() - df['Age'].min())
df

Unnamed: 0,Name,Gender,Age,Salary,Age_scaled
0,Alice,F,25,50000,0.217391
1,Bob,M,61,60000,1.0
2,Charlie,M,37,75000,0.478261
3,David,M,15,52000,0.0


In [186]:
# Z-score normalization
df['Salary_zscore'] = (df['Salary'] - df['Salary'].mean()) / df['Salary'].std()
df

Unnamed: 0,Name,Gender,Age,Salary,Age_scaled,Salary_zscore
0,Alice,F,25,50000,0.217391,-0.81468
1,Bob,M,61,60000,1.0,0.066055
2,Charlie,M,37,75000,0.478261,1.387158
3,David,M,15,52000,0.0,-0.638533
