In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Check if the file exists
if not os.path.exists(titanic_train_path):
    print(f"Error: train.csv not found at {titanic_train_path}")
    print("Please ensure you have downloaded and unzipped titanic.zip,")
    print("and placed train.csv into the 'data/raw/' directory of your project.")
else:
    # Load the dataset
    df_titanic = pd.read_csv(titanic_train_path)
    print("Titanic train.csv data loaded successfully!")
    print(f"Dataset shape: {df_titanic.shape} (rows, columns)")

    # Display the first few rows of the data
    print("\nFirst 5 rows of the dataset:")
    print(df_titanic.head())

    # Display basic information about the dataset (column names, non-null counts, data types)
    print("\nDataset Information:")
    df_titanic.info()

    # Display descriptive statistics for numerical columns
    print("\nDescriptive statistics for numerical columns:")
    print(df_titanic.describe())

    # Check the distribution of the target variable 'Survived'
    print("\nDistribution of 'Survived' column:")
    print(df_titanic['Survived'].value_counts())
    print("0: Not Survived, 1: Survived")



NameError: name 'titanic_train_path' is not defined

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

# Assuming df_titanic is the DataFrame loaded in Step 1

# Create a copy of the data to avoid modifying the original DataFrame
df_processed = df_titanic.copy()

# 1. Handle Missing Values

# 'Age' column: Impute with median, as age distribution might not be normal, and median is robust to outliers
age_imputer = SimpleImputer(strategy='median')
df_processed['Age'] = age_imputer.fit_transform(df_processed[['Age']])

# 'Embarked' column: Impute with mode, as it is a categorical feature
# First, find the mode
most_frequent_embarked = df_processed['Embarked'].mode()[0]
df_processed['Embarked'].fillna(most_frequent_embarked, inplace=True)

# 'Cabin' column: Too many missing values, and the feature itself might be too complex for simple models.
# Typically dropped or converted to a binary "has_cabin_info" feature.
# For this task, we choose to drop it directly.
df_processed.drop('Cabin', axis=1, inplace=True)

# 2. Feature Selection
# Remove 'PassengerId', 'Name', 'Ticket' as they are not directly relevant for classification
# 'Survived' is the target variable and should also be removed from features
features_to_drop = ['PassengerId', 'Name', 'Ticket']
df_features = df_processed.drop(columns=features_to_drop + ['Survived'])
target = df_processed['Survived']

# 3. Identify Numerical and Categorical Features
numeric_features = df_features.select_dtypes(include=np.number).columns.tolist()
categorical_features = df_features.select_dtypes(include='object').columns.tolist()

print(f"\nNumerical features before preprocessing: {numeric_features}")
print(f"Categorical features before preprocessing: {categorical_features}")

# 4. Build Preprocessing Pipelines
# Numerical feature processing: Standardization
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()) # Standardize numerical features
])

# Categorical feature processing: One-hot encoding
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore')) # One-hot encode categorical features
])

# Use ColumnTransformer to apply different preprocessing steps to different types of features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# 5. Split data into training and testing sets
# test_size=0.2 means 20% of the data is used for testing, random_state ensures reproducibility
# stratify=target ensures that the proportion of 'Survived' in training and testing sets is similar to the original data, which is important for imbalanced datasets
X_train, X_test, y_train, y_test = train_test_split(df_features, target, test_size=0.2, random_state=42, stratify=target)

print(f"\nTraining set features shape (original): {X_train.shape}")
print(f"Test set features shape (original): {X_test.shape}")

# Apply the preprocessor to the training data and transform it
X_train_processed = preprocessor.fit_transform(X_train)
# Apply the preprocessor to the test data (only transform, do not refit)
X_test_processed = preprocessor.transform(X_test)

# Get the names of the processed features
# For one-hot encoded categorical features, names will be expanded
processed_feature_names = numeric_features + \
                          list(preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_features))

print("\nData preprocessing complete!")
print(f"Processed training set features shape: {X_train_processed.shape}")
print(f"Number of processed feature names: {len(processed_feature_names)}")
print("Example of processed feature names (first 10):", processed_feature_names[:10])




Numerical features before preprocessing: ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
Categorical features before preprocessing: ['Sex', 'Embarked']

Training set features shape (original): (712, 7)
Test set features shape (original): (179, 7)

Data preprocessing complete!
Processed training set features shape: (712, 10)
Number of processed feature names: 10
Example of processed feature names (first 10): ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_processed['Embarked'].fillna(most_frequent_embarked, inplace=True)
