<a href="https://colab.research.google.com/github/samer-glitch/Trustworthy-AI-Data-Pipeline-Framework/blob/main/2_Data_Cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load the dataset from a remote CSV file into a DataFrame
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
df = pd.read_csv(url)

# Report the number of missing values per column before any cleaning
print("Missing Values:\n", df.isnull().sum())

# Impute missing values in 'Age' using the column median.
# The result is assigned back to the column instead of calling
# df['Age'].fillna(..., inplace=True): that form operates on an intermediate
# Series, triggers a pandas FutureWarning, and no longer updates df under
# pandas 3.0 (Copy-on-Write).
df['Age'] = df['Age'].fillna(df['Age'].median())

# Impute missing values in 'Embarked' using the mode (most frequent value);
# mode() returns a Series, so [0] selects its first entry.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Drop the 'Cabin' column due to a high percentage of missing values
df.drop(columns=['Cabin'], inplace=True)

# Verify that the missing values have been handled
print("\nMissing Values After Imputation:\n", df.isnull().sum())


Missing Values:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Missing Values After Imputation:
 PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)


In [None]:
import numpy as np

# Identify outliers in 'Fare' using the IQR method: a value is an outlier
# when it lies more than 1.5 * IQR below Q1 or above Q3.
Q1 = df['Fare'].quantile(0.25)
Q3 = df['Fare'].quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

below_lower = df['Fare'] < lower_fence
above_upper = df['Fare'] > upper_fence
outliers = below_lower | above_upper

print("Outliers in 'Fare':", outliers.sum())

# Cap outliers: values beyond a fence are replaced by that fence value
df.loc[above_upper, 'Fare'] = upper_fence
df.loc[below_lower, 'Fare'] = lower_fence

# Re-evaluate the fences against the capped column to confirm nothing remains
remaining_outliers = (df['Fare'] < lower_fence) | (df['Fare'] > upper_fence)
print("\nOutliers in 'Fare' After Capping:", remaining_outliers.sum())


Outliers in 'Fare': 116

Outliers in 'Fare' After Capping: 0


In [None]:
# Count fully duplicated rows (every column equal to an earlier row)
duplicates_before = df.duplicated().sum()
print("Duplicate Rows Before Removal:", duplicates_before)

# Drop the duplicated rows from df in place
df.drop_duplicates(inplace=True)

# Recount to confirm no duplicated rows are left
duplicates_after = df.duplicated().sum()
print("Duplicate Rows After Removal:", duplicates_after)


Duplicate Rows Before Removal: 0
Duplicate Rows After Removal: 0
