In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# 1. Data Loading & Selection
df = pd.read_csv('Titanic-Dataset.csv')
cols_to_keep = ['Survived', 'Pclass', 'Sex', 'Age', 'Fare', 'Embarked']
df = df[cols_to_keep]

# Identify features with missing values
print("Missing values per column:\n", df.isnull().sum())

# 2. Handling Missing Values
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# 3. Categorical Encoding
# Label Encoding for Sex (Binary)
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])

# One-Hot Encoding for Embarked (Binary Vectors)
df = pd.get_dummies(df, columns=['Embarked'], prefix='Embarked')

# 4. Feature Scaling
scaler = StandardScaler()
df[['Age', 'Fare']] = scaler.fit_transform(df[['Age', 'Fare']])

# 5. Train-Test Split
X = df.drop('Survived', axis=1)
y = df['Survived']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Display results
print("\nFinal Feature Set (First 5 rows):")
print(X.head())
print(f"\nTraining set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

Missing values per column:
 Survived      0
Pclass        0
Sex           0
Age         177
Fare          0
Embarked      2
dtype: int64

Final Feature Set (First 5 rows):
   Pclass  Sex       Age      Fare  Embarked_C  Embarked_Q  Embarked_S
0       3    1 -0.592481 -0.502445       False       False        True
1       1    0  0.638789  0.786845        True       False       False
2       3    0 -0.284663 -0.488854       False       False        True
3       1    0  0.407926  0.420730       False       False        True
4       3    1  0.407926 -0.486337       False       False        True

Training set shape: (712, 7)
Testing set shape: (179, 7)
