In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

# Load the dataset
df = pd.read_csv(r"C:\Users\gudek\OneDrive\Desktop\college notes\Titanic-Dataset.csv")

# Explore basic info
print("Dataset Shape:", df.shape)
print("\nDataset Info:")
df.info()
print("\nFirst 5 rows:")
print(df.head())
print("\nMissing Values:")
print(df.isnull().sum())

Dataset Shape: (891, 12)

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB

First 5 rows:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4        

In [2]:
# Explore categorical variables
print("Categorical variables:")
print(df_clean.select_dtypes(include=['object']).columns.tolist())

# One-hot encoding for nominal variables (no inherent order)
df_encoded = pd.get_dummies(df_clean, columns=['Sex', 'Embarked'], prefix=['Sex', 'Embarked'])

# Label encoding for ordinal variables (if any existed)
# For demonstration, let's encode 'Pclass' as it has ordinal nature
le = LabelEncoder()
df_encoded['Pclass_encoded'] = le.fit_transform(df_encoded['Pclass'])

# Drop original categorical columns that we've encoded
df_encoded.drop(['Name', 'Ticket', 'Pclass'], axis=1, inplace=True, errors='ignore')

print("\nAfter encoding - Dataset shape:", df_encoded.shape)
print("Columns after encoding:", df_encoded.columns.tolist())

Categorical variables:


NameError: name 'df_clean' is not defined

In [3]:
# Step 1: Import all required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

# Load the dataset
df = pd.read_csv(r"C:\Users\gudek\OneDrive\Desktop\college notes\Titanic-Dataset.csv")

print("=== STEP 1: INITIAL DATA EXPLORATION ===")
print("Dataset Shape:", df.shape)
print("\nDataset Info:")
df.info()
print("\nFirst 5 rows:")
print(df.head())
print("\nMissing Values:")
print(df.isnull().sum())
print("\nBasic Statistics:")
print(df.describe())

=== STEP 1: INITIAL DATA EXPLORATION ===
Dataset Shape: (891, 12)

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB

First 5 rows:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3   

In [4]:
# Step 2: Handle Missing Values
print("\n=== STEP 2: HANDLING MISSING VALUES ===")

# Create a copy for cleaning
df_clean = df.copy()

# Display missing values before handling
print("Missing values before handling:")
print(df_clean.isnull().sum())

# Handle missing values strategically
# Age - fill with median (less sensitive to outliers)
age_median = df_clean['Age'].median()
df_clean['Age'].fillna(age_median, inplace=True)
print(f"Filled {df['Age'].isnull().sum()} missing Age values with median: {age_median}")

# Embarked - fill with mode (most frequent value)
embarked_mode = df_clean['Embarked'].mode()[0]
df_clean['Embarked'].fillna(embarked_mode, inplace=True)
print(f"Filled {df['Embarked'].isnull().sum()} missing Embarked values with mode: {embarked_mode}")

# Cabin - too many missing values, drop the column
cabin_missing = df_clean['Cabin'].isnull().sum()
df_clean.drop('Cabin', axis=1, inplace=True)
print(f"Dropped Cabin column with {cabin_missing} missing values ({cabin_missing/len(df)*100:.1f}% missing)")

# Verify missing values after handling
print("\nMissing values after handling:")
print(df_clean.isnull().sum())


=== STEP 2: HANDLING MISSING VALUES ===
Missing values before handling:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
Filled 177 missing Age values with median: 28.0
Filled 2 missing Embarked values with mode: S
Dropped Cabin column with 687 missing values (77.1% missing)

Missing values after handling:
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64


In [5]:
# Step 3: Convert Categorical Features
print("\n=== STEP 3: HANDLING CATEGORICAL FEATURES ===")

# Explore categorical variables
print("Categorical variables:")
categorical_cols = df_clean.select_dtypes(include=['object']).columns.tolist()
print(categorical_cols)

print("\nUnique values in categorical columns:")
for col in categorical_cols:
    print(f"{col}: {df_clean[col].nunique()} unique values")
    if df_clean[col].nunique() < 10:  # Show values for columns with few categories
        print(f"   Values: {df_clean[col].unique()}")

# One-hot encoding for nominal variables
print("\nPerforming one-hot encoding...")
df_encoded = pd.get_dummies(df_clean, columns=['Sex', 'Embarked'], prefix=['Sex', 'Embarked'])

# For demonstration, let's also encode Pclass
le = LabelEncoder()
df_encoded['Pclass_encoded'] = le.fit_transform(df_encoded['Pclass'])

# Drop original categorical columns that are not needed
columns_to_drop = ['Name', 'Ticket']  # Keeping 'Pclass' for reference
df_encoded.drop(columns_to_drop, axis=1, inplace=True, errors='ignore')

print(f"\nAfter encoding - Dataset shape: {df_encoded.shape}")
print("Columns after encoding:")
print(df_encoded.columns.tolist())
print("\nFirst 3 rows after encoding:")
print(df_encoded.head(3))


=== STEP 3: HANDLING CATEGORICAL FEATURES ===
Categorical variables:
['Name', 'Sex', 'Ticket', 'Embarked']

Unique values in categorical columns:
Name: 891 unique values
Sex: 2 unique values
   Values: ['male' 'female']
Ticket: 681 unique values
Embarked: 3 unique values
   Values: ['S' 'C' 'Q']

Performing one-hot encoding...

After encoding - Dataset shape: (891, 13)
Columns after encoding:
['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Pclass_encoded']

First 3 rows after encoding:
   PassengerId  Survived  Pclass   Age  SibSp  Parch     Fare  Sex_female  \
0            1         0       3  22.0      1      0   7.2500       False   
1            2         1       1  38.0      1      0  71.2833        True   
2            3         1       3  26.0      0      0   7.9250        True   

   Sex_male  Embarked_C  Embarked_Q  Embarked_S  Pclass_encoded  
0      True       False       False      

In [6]:
# Step 4: Normalize/Standardize Numerical Features
print("\n=== STEP 4: FEATURE SCALING ===")

# Select numerical features for scaling
numerical_features = ['Age', 'SibSp', 'Parch', 'Fare']

print("Before standardization:")
print(df_encoded[numerical_features].describe())

# Standardization (mean=0, std=1)
scaler_standard = StandardScaler()
df_encoded[numerical_features] = scaler_standard.fit_transform(df_encoded[numerical_features])

print("\nAfter standardization:")
print(df_encoded[numerical_features].describe())

# Demonstrate the difference with Normalization
print("\n--- For comparison: Normalization ---")
df_normalized = df_clean.copy()
scaler_minmax = MinMaxScaler()
df_normalized[numerical_features] = scaler_minmax.fit_transform(df_normalized[numerical_features])
print("After normalization (min=0, max=1):")
print(df_normalized[numerical_features].describe())


=== STEP 4: FEATURE SCALING ===
Before standardization:
              Age       SibSp       Parch        Fare
count  891.000000  891.000000  891.000000  891.000000
mean    29.361582    0.523008    0.381594   32.204208
std     13.019697    1.102743    0.806057   49.693429
min      0.420000    0.000000    0.000000    0.000000
25%     22.000000    0.000000    0.000000    7.910400
50%     28.000000    0.000000    0.000000   14.454200
75%     35.000000    1.000000    0.000000   31.000000
max     80.000000    8.000000    6.000000  512.329200

After standardization:
                Age         SibSp         Parch          Fare
count  8.910000e+02  8.910000e+02  8.910000e+02  8.910000e+02
mean   2.272780e-16  4.386066e-17  5.382900e-17  3.987333e-18
std    1.000562e+00  1.000562e+00  1.000562e+00  1.000562e+00
min   -2.224156e+00 -4.745452e-01 -4.736736e-01 -6.484217e-01
25%   -5.657365e-01 -4.745452e-01 -4.736736e-01 -4.891482e-01
50%   -1.046374e-01 -4.745452e-01 -4.736736e-01 -3.573909e-01