### Garbage In, Garbage Out (GIGO): Cleaning Missing Data
**Description**: Load a dataset (e.g., Titanic dataset) and identify missing values. Use
appropriate techniques to handle these missing values.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Titanic dataset into `df`.
# The recorded output of this notebook shows that calling exit() here did not
# stop the cell: execution continued and failed later with
# "NameError: name 'df' is not defined". Re-raising the original exception
# halts the cell at the real cause (the missing file) instead.
try:
    df = pd.read_csv('titanic.csv')  # Replace 'titanic.csv' with the actual path if needed
except FileNotFoundError:
    print("Error: titanic.csv not found. Please make sure the file is in the correct directory.")
    raise
else:
    # Only reached when read_csv succeeded, so `df` is guaranteed to exist below.
    print("Titanic dataset loaded successfully.\n")

# --- Step 1: Identify Missing Values ---
print("--- Identifying Missing Values ---")
# Count null entries per column, then report only the columns that have any.
missing_data = df.isna().sum()
print("Number of missing values per column:")
print(missing_data.loc[missing_data.gt(0)])
print(f"\nTotal number of rows in the dataset: {df.shape[0]}")

# Visualize missing values: the boolean null-mask of the frame is drawn as a heatmap.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df.isna(), cbar=False, cmap='viridis', ax=ax)
ax.set_title('Missing Values Heatmap')
plt.show()

# Column dtypes, used to tell numerical from non-numerical columns.
print("\nData types of columns:")
print(df.dtypes)

# --- Step 2: Handle Missing Values ---
print("\n--- Handling Missing Values ---")

# --- Handling 'Age' ---
print("\nHandling 'Age' column:")
# 'Age' is imputed with a single summary statistic (mean or median).
# Plot the distribution of the non-missing ages to help decide between them.
plt.figure(figsize=(8, 6))
sns.histplot(df['Age'].dropna(), kde=True)
plt.title('Distribution of Age')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.show()

# The median is used because it is less sensitive to skew and outliers than the mean.
median_age = df['Age'].median()
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# df['Age']: the in-place call acts on an intermediate selection, which newer
# pandas versions warn about and which does not update `df` under Copy-on-Write.
df['Age'] = df['Age'].fillna(median_age)
print(f"Missing 'Age' values filled with the median: {median_age}")

# --- Handling 'Cabin' ---
print("\nHandling 'Cabin' column:")
# Report how much of 'Cabin' is missing before deciding how to treat it.
# Options are dropping the column or extracting coarser information from it
# (e.g., the first letter).
print(f"Percentage of missing 'Cabin' values: {df['Cabin'].isnull().mean() * 100:.2f}%")

# Extract the first character of the cabin string as a potential feature.
df['Cabin_Initial'] = df['Cabin'].str[0]
print("\nNew 'Cabin_Initial' column:")
print(df['Cabin_Initial'].value_counts(dropna=False))

# .str[0] propagates NaN, so every row with a missing 'Cabin' is still missing in
# 'Cabin_Initial'. Label those rows explicitly as 'Unknown'.
# The result is assigned back rather than using fillna(..., inplace=True) on the
# column selection, which newer pandas versions warn about and which does not
# update `df` under Copy-on-Write.
df['Cabin_Initial'] = df['Cabin_Initial'].fillna('Unknown')
print("\n'Cabin_Initial' value counts after handling missing values:")
print(df['Cabin_Initial'].value_counts())

# The original 'Cabin' column is left untouched here; it may be dropped later.

# --- Handling 'Embarked' ---
print("\nHandling 'Embarked' column:")
# Show how many 'Embarked' values are missing and the current category counts
# (including NaN) before imputing.
print(f"Number of missing 'Embarked' values: {df['Embarked'].isnull().sum()}")
print("Value counts of 'Embarked' before imputation:")
print(df['Embarked'].value_counts(dropna=False))

# Impute with the mode (most frequent value). mode() returns a Series because
# ties are possible; the first entry is taken.
mode_embarked = df['Embarked'].mode()[0]
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# df['Embarked']: the in-place call acts on an intermediate selection, which newer
# pandas versions warn about and which does not update `df` under Copy-on-Write.
df['Embarked'] = df['Embarked'].fillna(mode_embarked)
print(f"Missing 'Embarked' values filled with the mode: {mode_embarked}")
print("Value counts of 'Embarked' after imputation:")
print(df['Embarked'].value_counts(dropna=False))

# --- Handling 'Fare' ---
print("\nHandling 'Fare' column:")
missing_fare_count = df['Fare'].isnull().sum()
print(f"Number of missing 'Fare' values: {missing_fare_count}")
# Previously this section only reported the count, so any missing 'Fare' values
# would have been left in place. Impute them with the median when present;
# this is a no-op for a dataset without missing fares.
if missing_fare_count > 0:
    median_fare = df['Fare'].median()
    df['Fare'] = df['Fare'].fillna(median_fare)
    print(f"Missing 'Fare' values filled with the median: {median_fare}")

# --- Check Missing Values After Handling ---
print("\n--- Missing Values After Handling ---")
# Recount nulls per column; any column still listed here has not been fully handled.
final_missing_data = df.isna().sum()
print("Number of missing values per column after handling:")
print(final_missing_data.loc[final_missing_data.gt(0)])

# Visualize missing values after handling, using the same null-mask heatmap as before.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df.isna(), cbar=False, cmap='viridis', ax=ax)
ax.set_title('Missing Values Heatmap After Handling')
plt.show()

# Summary of the frame: column names, non-null counts and dtypes.
print("\nCleaned DataFrame Information:")
df.info()

# Optional: Drop the original 'Cabin' column if 'Cabin_Initial' is deemed sufficient
# df.drop('Cabin', axis=1, inplace=True)
# print("\nDataFrame after dropping 'Cabin' column:")
# df.info()

Error: titanic.csv not found. Please make sure the file is in the correct directory.
--- Identifying Missing Values ---


NameError: name 'df' is not defined

: 