<a href="https://colab.research.google.com/github/samer-glitch/Trustworthy-AI-Data-Pipeline-Framework/blob/main/5_Feature_Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Read the Titanic CSV directly from its raw GitHub location into a DataFrame
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

# Derive 'FamilySize' as SibSp + Parch + 1
# (the extra 1 presumably counts the passenger themselves)
df['FamilySize'] = df['SibSp'].add(df['Parch']).add(1)

# Print the two inputs next to the derived column as a quick sanity check
print(df.loc[:, ['SibSp', 'Parch', 'FamilySize']].head())

   SibSp  Parch  FamilySize
0      1      0           2
1      1      0           2
2      0      0           1
3      1      0           2
4      0      0           1


In [None]:
# Extracting titles from the 'Name' column.
# The pattern skips everything up to the first comma, then captures the text
# that follows up to the next '.' or ',' (e.g. "Braund, Mr. Owen Harris" -> " Mr");
# surrounding whitespace is stripped afterwards.
# Using the vectorised .str accessor means a missing or comma-less name yields
# NaN instead of raising inside a per-row lambda.
df['Title'] = (
    df['Name']
    .str.extract(r'^[^,]*,([^,.]*)', expand=False)
    .str.strip()
)

# Display the first few rows to verify the new feature
df[['Name', 'Title']].head()

Unnamed: 0,Name,Title
0,"Braund, Mr. Owen Harris",Mr
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",Mrs
2,"Heikkinen, Miss. Laina",Miss
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",Mrs
4,"Allen, Mr. William Henry",Mr


In [None]:
import numpy as np

# Log transformation of 'Fare' to reduce skewness.
# np.log1p(x) computes log(1 + x) on the whole column at once; the +1 offset keeps
# zero fares finite (log1p(0) == 0), and missing fares stay NaN.
df['Fare_log'] = np.log1p(df['Fare'])

# Compare before and after transformation
df[['Fare', 'Fare_log']].head()

Unnamed: 0,Fare,Fare_log
0,7.25,2.110213
1,71.2833,4.280593
2,7.925,2.188856
3,53.1,3.990834
4,8.05,2.202765


In [None]:
# Binning the 'Age' feature into categories.
# Bins are right-closed: (0, 12] Child, (12, 18] Teenager, (18, 60] Adult,
# (60, inf) Senior. include_lowest=True also closes the first bin on the left,
# so an age of exactly 0 is labelled 'Child' instead of falling outside every
# bin and becoming NaN. Missing ages still produce a missing AgeGroup.
bins = [0, 12, 18, 60, np.inf]
labels = ['Child', 'Teenager', 'Adult', 'Senior']
df['AgeGroup'] = pd.cut(df['Age'], bins=bins, labels=labels, include_lowest=True)

# Display the first few rows to verify the new feature
df[['Age', 'AgeGroup']].head()

Unnamed: 0,Age,AgeGroup
0,22.0,Adult
1,38.0,Adult
2,26.0,Adult
3,35.0,Adult
4,35.0,Adult


In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Creating polynomial features of degree 2 for 'Age'
poly = PolynomialFeatures(degree=2, include_bias=False)
age_filled = df[['Age']].fillna(df['Age'].median())  # Handling missing values
age_poly = poly.fit_transform(age_filled)

# Wrap the transformer output in a frame that shares df's index so the
# assignment below aligns row-for-row even if df is not on a default RangeIndex.
age_poly_df = pd.DataFrame(age_poly, columns=['Age', 'Age^2'], index=df.index)

# Attach only the squared term. The degree-1 output repeats 'Age' (median-filled),
# and concatenating it would leave df with two columns named 'Age', so that
# df['Age'] returns a DataFrame rather than a Series. Plain column assignment
# also keeps this cell safe to re-run.
# Note: df['Age'] keeps its missing values; 'Age^2' is computed from the
# median-filled ages (still available as age_poly_df['Age']).
df['Age^2'] = age_poly_df['Age^2']

# Display the first few rows
print(df[['Age', 'Age^2']].head())

    Age   Age   Age^2
0  22.0  22.0   484.0
1  38.0  38.0  1444.0
2  26.0  26.0   676.0
3  35.0  35.0  1225.0
4  35.0  35.0  1225.0


In [None]:
# Show the first five rows of df, including the columns added by the cells above
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,Title,Fare_log,AgeGroup,Age.1,Age^2
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,2,Mr,2.110213,Adult,22.0,484.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2,Mrs,4.280593,Adult,38.0,1444.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,Miss,2.188856,Adult,26.0,676.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,2,Mrs,3.990834,Adult,35.0,1225.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1,Mr,2.202765,Adult,35.0,1225.0
