# In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from scipy import stats

# Step 1: Load the dataset into the Python environment
titanic_df = pd.read_csv('titanic.csv')  # Replace 'titanic.csv' with your actual file path

# Step 2: Make 'PassengerId' as the index column
titanic_df.set_index('PassengerId', inplace=True)

# Step 3: Check the basic details of the dataset
print("Basic details of the dataset:")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would emit a stray "None" line.
titanic_df.info()
print("\nSummary statistics:")
print(titanic_df.describe())

# Step 4: Fill in missing values in all columns
# Mean imputation only works on numeric data, so numeric and non-numeric
# columns are handled separately. Working on a copy keeps the raw frame intact
# and preserves the 'PassengerId' index set in Step 2.
titanic_df_filled = titanic_df.copy()
numeric_columns = titanic_df.select_dtypes(include='number').columns
categorical_columns = titanic_df.select_dtypes(exclude='number').columns

imputer = SimpleImputer(strategy='mean')
if len(numeric_columns) > 0:
    # NOTE: by default SimpleImputer drops columns that are entirely missing,
    # which would make this assignment fail; such columns need to be removed
    # beforehand.
    titanic_df_filled[numeric_columns] = imputer.fit_transform(
        titanic_df[numeric_columns]
    )

for column in categorical_columns:
    most_frequent = titanic_df_filled[column].mode(dropna=True)
    # mode() is empty when the column has no observed values; nothing
    # sensible can be filled in then, so the column is left unchanged.
    if not most_frequent.empty:
        titanic_df_filled[column] = titanic_df_filled[column].fillna(
            most_frequent.iloc[0]
        )

# Step 5: Check and handle outliers in at least 3 columns
outlier_columns = ['Age', 'Fare', 'SibSp']  # Replace with your choice of columns
for column in outlier_columns:
    z_scores = stats.zscore(titanic_df_filled[column])
    # Values more than 3 standard deviations from the mean are treated as
    # outliers and replaced by the column median.
    outliers = abs(z_scores) > 3
    titanic_df_filled.loc[outliers, column] = titanic_df_filled[column].median()

# Step 6: Min-Max scaling on the feature set with 'Survived' as the target
scaler = MinMaxScaler()
# Only numeric features can be min-max scaled; non-numeric columns would need
# to be encoded first and are therefore left out of the scaled feature set.
features = titanic_df_filled.drop('Survived', axis=1).select_dtypes(include='number')
scaled_features = scaler.fit_transform(features)

# Display the processed data
print("\nProcessed dataset:")
print(titanic_df_filled.head())






# Output: FileNotFoundError: ignored