In [2]:
import pandas as pd
import numpy as np

# Load the Titanic dataset (expects "titanic.csv" in the working directory).
titanic = pd.read_csv("titanic.csv")

# Introduce missing values: blank out roughly 20% of all cells at random.
# The fixed seed keeps the injected pattern reproducible between runs.
np.random.seed(42)
mask = np.random.rand(*titanic.shape) < 0.2  # 20% missing values
# DataFrame.mask returns a new frame with NaN wherever `mask` is True, so
# columns that must widen their dtype to hold NaN are upcast on the copy
# rather than through an in-place write into the original columns.
titanic = titanic.mask(mask)

# Introduce outliers
titanic.loc[titanic['Fare'] > 300, 'Fare'] *= 2  # Increase fare for outliers

# Display the first few rows of the dataset.
# NOTE: at this point the frame already contains the injected NaNs and
# inflated fares; "Original" here means "before cleaning".
print("Original Titanic Dataset:")
print(titanic.head())

# Handling Missing Values
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form operates on an intermediate
# object and does not reliably update `titanic` (it is a no-op under pandas
# Copy-on-Write).
# Age is imputed with the column mean; sibling/spouse count with the most
# frequent value (first mode if several tie).
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].mean())
titanic['Siblings/Spouses Aboard'] = titanic['Siblings/Spouses Aboard'].fillna(
    titanic['Siblings/Spouses Aboard'].mode()[0]
)

# Handling Outliers
# Tukey fences: anything beyond 1.5 * IQR outside the quartiles is treated as
# an outlier. quantile() ignores NaN, so missing fares do not affect the bounds.
Q1 = titanic['Fare'].quantile(0.25)
Q3 = titanic['Fare'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Cap fares to the fences rather than dropping rows; NaN fares stay NaN.
titanic['Fare'] = titanic['Fare'].clip(lower=lower_bound, upper=upper_bound)

# Display the cleaned dataset
print("\nCleaned Titanic Dataset:")
print(titanic.head())

Original Titanic Dataset:
   Survived  Pclass                     Name     Sex   Age  \
0       0.0     3.0   Mr. Owen Harris Braund    male   NaN   
1       1.0     1.0                      NaN  female  38.0   
2       1.0     3.0    Miss. Laina Heikkinen  female  26.0   
3       1.0     1.0                      NaN  female  35.0   
4       NaN     3.0  Mr. William Henry Allen    male  35.0   

   Siblings/Spouses Aboard  Parents/Children Aboard   Fare  
0                      NaN                      NaN  7.250  
1                      1.0                      NaN    NaN  
2                      NaN                      0.0  7.925  
3                      NaN                      0.0    NaN  
4                      NaN                      0.0  8.050  

Cleaned Titanic Dataset:
   Survived  Pclass                     Name     Sex        Age  \
0       0.0     3.0   Mr. Owen Harris Braund    male  29.666798   
1       1.0     1.0                      NaN  female  38.000000   
2       