<a href="https://colab.research.google.com/github/rachalanalytics/clean_titantic/blob/main/Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

# Load the Titanic dataset bundled with seaborn into a DataFrame
titanic_data = sns.load_dataset('titanic')

# Display the first few records
print(titanic_data.head())

# Review the structure of the dataset.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which would also emit "None").
titanic_data.info()

In [None]:
# Null/Missing values

# Count the missing entries in every column and keep only the columns
# that actually have at least one missing value
na_counts = titanic_data.isna().sum().loc[lambda counts: counts > 0]

# Show each affected column name alongside its missing-value count
print(na_counts)

In [None]:
# Solution:
# Age is missing in a considerable number of rows, so impute it with the median
# age of the passengers who share the same sex AND passenger class.
# transform('median') broadcasts each group's median back onto its rows; fillna
# then only touches the rows where age is missing, leaving known ages untouched.
group_median_age = titanic_data.groupby(['sex', 'pclass'])['age'].transform('median')
titanic_data['age'] = titanic_data['age'].fillna(group_median_age)

# Embarked has only 2 missing values, so fill them with the most frequent value
titanic_data['embarked'] = titanic_data['embarked'].fillna(titanic_data['embarked'].mode()[0])

# Deck is missing a significant portion of its values, so the column is dropped
# rather than imputed
titanic_data = titanic_data.drop('deck', axis=1)

# Embark town has only 2 missing values; handle it like the embarked column
titanic_data['embark_town'] = titanic_data['embark_town'].fillna(titanic_data['embark_town'].mode()[0])

# Confirm the remaining columns no longer report missing values
titanic_data.info()

In [None]:
# Duplicates
# Boolean mask that is True for every row repeating an earlier row
duplicates = titanic_data.duplicated()

# Summarise the mask: is there any duplicate at all, and how many are there?
has_duplicates = duplicates.any()
num_duplicates = duplicates.sum()
print("Are there duplicate rows? ", has_duplicates)
print("Number of duplicate rows: ", num_duplicates)

# Inspect the duplicate rows themselves, reusing the mask computed above
duplicates_data = titanic_data.loc[duplicates]
print(duplicates_data)

In [None]:
# Solution:
# Remove every row that repeats an earlier one, keeping the first occurrence.
# NOTE(review): if the data carries no unique passenger identifier, identical
# rows may describe different passengers - confirm that dropping them is intended.
titanic_data = titanic_data.drop_duplicates(keep='first')

# Verify that no duplicates remain
remaining_duplicates = titanic_data.duplicated().sum()
print("Number of duplicate rows after removal: ", remaining_duplicates)

In [None]:
# Now, let's see whether the data values contain any inconsistencies. We can inspect the unique values in each column to determine whether more cleaning is needed
# Function to print the unique values of each object/category column
def find_inconsistencies(data):
    """Print the distinct values of every object- or category-typed column.

    For each such column of ``data`` the column name is printed, followed by
    the result of ``unique()`` on that column and a blank line, so unexpected
    or inconsistently spelled labels can be spotted by eye. Nothing is
    returned.
    """
    label_columns = data.select_dtypes(include=['object', 'category']).columns
    for name in label_columns:
        distinct = data[name].unique()
        # One print call, two lines; the trailing "\n" adds the blank separator
        print(f"Column: {name}", f"Unique Values: {distinct}\n", sep="\n")

# Check for inconsistencies: print the unique values of each object/category
# column of the dataset so unexpected labels can be reviewed by eye
find_inconsistencies(titanic_data)

In [None]:
# Next, let's check the numeric data for outliers, i.e. values that are unusually large or small compared with the rest

# Collect the float64/int64 columns and draw one boxplot per column
numerical_cols = titanic_data.select_dtypes(include=['float64', 'int64']).columns

for col in numerical_cols:
    # A fresh 6x3 figure with its own axes for every column
    fig, ax = plt.subplots(figsize=(6, 3))
    sns.boxplot(x=titanic_data[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')
    plt.show()

In [None]:
# Fare shows an extreme outlier above 500
# Pull out the passengers whose fare is at least 500 and look at their details
is_expensive_fare = titanic_data['fare'] >= 500
expensive_fare_passenger = titanic_data.loc[is_expensive_fare]
print(expensive_fare_passenger)

# These passengers share similar characteristics and all travelled first class, where fares can be very high, so the outlier is kept

In [None]:
# Data is now cleaned

# Display the first few records
print(titanic_data.head())

# Review the structure of the dataset.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which would also emit "None").
titanic_data.info()