In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows',100)

%matplotlib inline
print("✓ All libraries imported successfully!")

✓ All libraries imported successfully!


In [16]:
import pandas as pd
train_df = pd.read_csv('../data/train.csv')


print(f"Data shape: {train_df.shape}")
print(f"Columns: {train_df.columns.tolist()}")


display(train_df.head())
print("\nFirst 5 rows:")

Data shape: (891, 12)
Columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S



First 5 rows:


In [17]:
import os
print(f"Current directory: {os.getcwd()}")
print(f"Files in data folder: {os.listdir('../data')}")

Current directory: C:\Users\kings\data_analytics_portfolio\titanic_analysis\notebooks
Files in data folder: ['gender_submission.csv', 'test.csv', 'train.csv']


In [18]:
print(train_df.head())

   PassengerId  Survived  Pclass                                               Name     Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked
0            1         0       3                            Braund, Mr. Owen Harris    male  22.0      1      0         A/5 21171   7.2500   NaN        S
1            2         1       1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1      0          PC 17599  71.2833   C85        C
2            3         1       3                             Heikkinen, Miss. Laina  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S
3            4         1       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1      0            113803  53.1000  C123        S
4            5         0       3                           Allen, Mr. William Henry    male  35.0      0      0            373450   8.0500   NaN        S


In [19]:
# Cell 3: Data verification
print("=== DATA VERIFICATION ===")
print(f"1. Total rows: {len(train_df)}")
print(f"2. Total columns: {len(train_df.columns)}")
print(f"3. Column names: {train_df.columns.tolist()}")
print(f"4. Data types:\n{train_df.dtypes}")
print(f"5. Missing values:\n{train_df.isnull().sum()}")

=== DATA VERIFICATION ===
1. Total rows: 891
2. Total columns: 12
3. Column names: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
4. Data types:
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object
5. Missing values:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


# Titanic Survival Analysis - EDA
## Project Goal
Identify factors that influenced survival rates on the Titanic

## Analysis Plan
1. Data Overview & Structure
2. Data Cleaning & Preprocessing
3. Univariate Analysis (Single Variables)
4. Bivariate Analysis (Two Variables)
5. Multivariate Analysis
6. Key Insights & Recommendations

In [20]:
# Cell 5: Comprehensive Data Overview
print("=" * 60)
print("DATA OVERVIEW")
print("=" * 60)

# 5.1 Basic information
print("\n1. DATASET INFORMATION:")
print(f"   • Shape: {train_df.shape} (rows, columns)")
print(f"   • Memory Usage: {train_df.memory_usage(deep=True).sum() / 1024:.2f} KB")

# 5.2 Column data types
print("\n2. DATA TYPES:")
print(train_df.dtypes.to_string())

# 5.3 Display first few rows with formatting
print("\n3. SAMPLE DATA (First 3 rows):")
display(train_df.head(3))

print("\n4. SAMPLE DATA (Last 3 rows):")
display(train_df.tail(3))

# 5.4 Random sample
print("\n5. RANDOM SAMPLE (5 rows):")
display(train_df.sample(5, random_state=42))

DATA OVERVIEW

1. DATASET INFORMATION:
   • Shape: (891, 12) (rows, columns)
   • Memory Usage: 285.61 KB

2. DATA TYPES:
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object

3. SAMPLE DATA (First 3 rows):


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S



4. SAMPLE DATA (Last 3 rows):


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q



5. RANDOM SAMPLE (5 rows):


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
709,710,1,3,"Moubarek, Master. Halim Gonios (""William George"")",male,,1,1,2661,15.2458,,C
439,440,0,2,"Kvillner, Mr. Johan Henrik Johannesson",male,31.0,0,0,C.A. 18723,10.5,,S
840,841,0,3,"Alhomaki, Mr. Ilmari Rudolf",male,20.0,0,0,SOTON/O2 3101287,7.925,,S
720,721,1,2,"Harper, Miss. Annie Jessie ""Nina""",female,6.0,0,1,248727,33.0,,S
39,40,1,3,"Nicola-Yarred, Miss. Jamila",female,14.0,1,0,2651,11.2417,,C


In [21]:
# Cell 6: Data Quality Assessment
print("=" * 60)
print("DATA QUALITY ASSESSMENT")
print("=" * 60)

# 6.1 Missing values analysis
missing_data = train_df.isnull().sum()
missing_percent = (missing_data / len(train_df)) * 100

print("\n1. MISSING VALUES:")
missing_df = pd.DataFrame({
    'Missing_Values': missing_data,
    'Percentage (%)': missing_percent.round(2)
})
print(missing_df[missing_df['Missing_Values'] > 0].sort_values('Percentage (%)', ascending=False))

# 6.2 Duplicate check
duplicates = train_df.duplicated().sum()
print(f"\n2. DUPLICATE ROWS: {duplicates}")

# 6.3 Unique values per column
print("\n3. UNIQUE VALUES PER COLUMN:")
for col in train_df.columns:
    unique_count = train_df[col].nunique()
    if unique_count < 15:  # Show values for low-cardinality columns
        print(f"   • {col}: {unique_count} unique → {train_df[col].unique()}")
    else:
        print(f"   • {col}: {unique_count} unique values")

DATA QUALITY ASSESSMENT

1. MISSING VALUES:
          Missing_Values  Percentage (%)
Cabin                687           77.10
Age                  177           19.87
Embarked               2            0.22

2. DUPLICATE ROWS: 0

3. UNIQUE VALUES PER COLUMN:
   • PassengerId: 891 unique values
   • Survived: 2 unique → [0 1]
   • Pclass: 3 unique → [3 1 2]
   • Name: 891 unique values
   • Sex: 2 unique → ['male' 'female']
   • Age: 88 unique values
   • SibSp: 7 unique → [1 0 3 4 2 5 8]
   • Parch: 7 unique → [0 1 2 5 3 4 6]
   • Ticket: 681 unique values
   • Fare: 248 unique values
   • Cabin: 147 unique values
   • Embarked: 3 unique → ['S' 'C' 'Q' nan]


In [22]:


# Cell 4: Basic Data Exploration
print("=" * 60)
print("BASIC DATA EXPLORATION")
print("=" * 60)

# 1. Dataset Overview
print("\n1. DATASET OVERVIEW:")
print(f"• Shape: {train_df.shape[0]} rows, {train_df.shape[1]} columns")
print(f"• Memory Usage: {train_df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# 2. Column Information
print("\n2. COLUMN INFORMATION:")
for col in train_df.columns:
    dtype = train_df[col].dtype
    unique = train_df[col].nunique()
    print(f"• {col:15} | Type: {str(dtype):10} | Unique Values: {unique:3}")

# 3. Statistical Summary
print("\n3. NUMERICAL FEATURES - STATISTICAL SUMMARY:")
numeric_cols = train_df.select_dtypes(include=[np.number]).columns
print(train_df[numeric_cols].describe().round(2))

# 4. Categorical Features Summary
print("\n4. CATEGORICAL FEATURES - VALUE COUNTS:")
categorical_cols = ['Survived', 'Pclass', 'Sex', 'Embarked', 'SibSp', 'Parch']
for col in categorical_cols:
    if col in train_df.columns:
        print(f"\n{col}:")
        print(train_df[col].value_counts().head())

# 5. Missing Values Summary
print("\n5. MISSING VALUES ANALYSIS:")
missing_df = pd.DataFrame({
    'Missing_Count': train_df.isnull().sum(),
    'Missing_Percentage': (train_df.isnull().sum() / len(train_df) * 100).round(2)
})
print(missing_df[missing_df['Missing_Count'] > 0])

# 6. Duplicate Check
duplicates = train_df.duplicated().sum()
print(f"\n6. DUPLICATE ROWS: {duplicates}")

print("\n" + "=" * 60)
print("EXPLORATION COMPLETE")
print("=" * 60)

BASIC DATA EXPLORATION

1. DATASET OVERVIEW:
• Shape: 891 rows, 12 columns
• Memory Usage: 0.28 MB

2. COLUMN INFORMATION:
• PassengerId     | Type: int64      | Unique Values: 891
• Survived        | Type: int64      | Unique Values:   2
• Pclass          | Type: int64      | Unique Values:   3
• Name            | Type: object     | Unique Values: 891
• Sex             | Type: object     | Unique Values:   2
• Age             | Type: float64    | Unique Values:  88
• SibSp           | Type: int64      | Unique Values:   7
• Parch           | Type: int64      | Unique Values:   7
• Ticket          | Type: object     | Unique Values: 681
• Fare            | Type: float64    | Unique Values: 248
• Cabin           | Type: object     | Unique Values: 147
• Embarked        | Type: object     | Unique Values:   3

3. NUMERICAL FEATURES - STATISTICAL SUMMARY:
       PassengerId  Survived  Pclass     Age   SibSp   Parch    Fare
count       891.00    891.00  891.00  714.00  891.00  891.00  891.