In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, Markdown

In [None]:
# Apply seaborn's "whitegrid" style globally so every figure drawn below
# shares the same look.
sns.set(style='whitegrid')

In [None]:
# 1) Load data
# Read the CSV once and keep that frame as an untouched reference
# (`df_original`); all exploration below works on the independent copy `df`.
df_original = pd.read_csv('train.csv')
df = df_original.copy()
print('Loaded dataset shape:', df.shape)

In [None]:
# Heading on one line, the list of column names on the next.
print('\nColumns:', df.columns.tolist(), sep='\n')

In [None]:
# 2) Quick info & summary statistics
#
# Each section prints a plain-text banner and then shows one summary of `df`.

print('--- INFO ---')
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

print('\n--- DESCRIBE (numeric) ---')
display(df.describe())

print('\n--- DESCRIBE (object) ---')
display(df.describe(include=['O']))


In [None]:
# Missing values
# Build the null mask once, then derive per-column counts and percentages
# from it, each sorted from most to least missing.
null_mask = df.isnull()
missing = null_mask.sum().sort_values(ascending=False)
missing_pct = null_mask.mean().mul(100).sort_values(ascending=False)

# Side-by-side table; concat aligns the two series on the column names.
missing_table = pd.concat(
    [missing, missing_pct],
    axis=1,
    keys=['missing_count', 'missing_pct'],
)
missing_table

In [None]:
# 3) Univariate analysis
#  distributions of key columns: `Survived`, `Pclass`, `Sex`, `Age`, `Fare`, `Embarked`.

# Number of rows per value of the `Survived` column.
fig, ax = plt.subplots(figsize=(6,4))
sns.countplot(x='Survived', data=df, ax=ax)
ax.set_title('Survival Count (0 = Died, 1 = Survived)')
ax.set_xlabel('Survived')
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()

In [None]:
# Mean of the 0/1 `Survived` column, scaled to a percentage.
survival_pct = df['Survived'].mean() * 100
print('Observation: Proportion survived = {:.2f}%'.format(survival_pct))


In [None]:
# Row count per passenger class, with the bars pinned to the order 1, 2, 3.
fig, ax = plt.subplots(figsize=(8,4))
sns.countplot(x='Pclass', data=df, order=[1,2,3], ax=ax)
ax.set_title('Passenger Class Distribution')
fig.tight_layout()
plt.show()

In [None]:
# Row count per value of the `Sex` column.
fig, ax = plt.subplots(figsize=(6,4))
sns.countplot(x='Sex', data=df, ax=ax)
ax.set_title('Gender Distribution')
fig.tight_layout()
plt.show()


In [None]:
# Quick reminder of the available column names before plotting them.
print(list(df.columns))


In [None]:
# Histogram of the `Age` column in 30 bins; rows with a missing Age are
# dropped first because they have no value to bin.
plt.figure(figsize=(8,5))
plt.hist(df['Age'].dropna(), bins=30)
plt.title('Age Distribution')
# plt.hist does not label the axes on its own, so name them explicitly to
# keep the figure readable without the surrounding code.
plt.xlabel('Age')
plt.ylabel('Count')
plt.tight_layout()
plt.show()

In [None]:
# Histogram of the `Fare` column in 40 bins (missing fares dropped).
plt.figure(figsize=(8,5))
plt.hist(df['Fare'].dropna(), bins=40)
plt.title('Fare Distribution')
# The bins are computed over the full fare range; xlim below only crops the
# view, so any fares above 200 are hidden rather than removed. The axis label
# states this so the crop is not mistaken for the true maximum.
plt.xlabel('Fare (x-axis limited to 0-200)')
plt.ylabel('Count')
plt.xlim(0, 200)
plt.tight_layout()
plt.show()



In [None]:
#  Bivariate Analysis

# Bar height is the mean of the 0/1 `Survived` column within each class,
# i.e. the survival rate for that class.
fig, ax = plt.subplots(figsize=(8,5))
sns.barplot(x='Pclass', y='Survived', data=df, estimator=np.mean, ax=ax)
ax.set_title('Survival Rate by Class')
plt.show()

In [None]:
# Mean of the 0/1 `Survived` column for each value of `Sex`.
fig, ax = plt.subplots(figsize=(6,4))
sns.barplot(x='Sex', y='Survived', data=df, estimator=np.mean, ax=ax)
ax.set_title('Survival Rate by Gender')
plt.show()

In [None]:
# One box of `Age` values per `Survived` group.
fig, ax = plt.subplots(figsize=(8,6))
sns.boxplot(x='Survived', y='Age', data=df, ax=ax)
ax.set_title('Age Distribution by Survival')
plt.show()


In [None]:
# One box of `Fare` values per `Survived` group. The y-axis is limited to
# 0-200, so points above 200 are not visible in this view.
fig, ax = plt.subplots(figsize=(8,6))
sns.boxplot(x='Survived', y='Fare', data=df, ax=ax)
ax.set_title('Fare Distribution by Survival')
ax.set_ylim(0,200)
plt.show()


In [None]:
#  Correlation & Pairplot

# Integer codes for the two categorical columns that get encoded; free-text
# columns such as Name, Ticket and Cabin are left as they are.
sex_codes = {'male':0, 'female':1}
embarked_codes = {'S':0, 'C':1, 'Q':2}

# assign() returns a new frame, so `df` itself keeps its original string
# values. Any value not listed in a mapping (missing values included) ends up
# as NaN in the encoded column.
# NOTE(review): the Embarked codes 0/1/2 are arbitrary labels, so a linear
# correlation computed on them depends on that ordering -- consider one-hot
# encoding before interpreting it.
encoded_df = df.assign(
    Sex=df['Sex'].map(sex_codes),
    Embarked=df['Embarked'].map(embarked_codes),
)


In [None]:
# Keep just the numeric-dtype columns; these feed the correlation matrix.
numeric_cols = encoded_df.select_dtypes(include='number')


In [None]:
# Pairwise correlation heatmap of the numeric / integer-encoded columns.
# PassengerId is left out when present: it is presumably just a row
# identifier (as in the Kaggle Titanic train.csv -- confirm), so its
# correlation with the other columns carries no meaning and only adds noise.
# errors='ignore' keeps the cell working if that column does not exist, and
# `numeric_cols` itself is left unchanged for later cells.
corr_input = numeric_cols.drop(columns=['PassengerId'], errors='ignore')

plt.figure(figsize=(10,8))
sns.heatmap(corr_input.corr(), annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.tight_layout()
plt.show()

In [None]:
# Pairwise scatter/histogram grid for a handful of columns, coloured by
# `Survived`. Rows with a missing value in any of these columns are dropped,
# and corner=True draws only the lower triangle of the grid.
pair_features = ['Survived','Pclass','Sex','Age','Fare']
pair_data = numeric_cols[pair_features].dropna()
sns.pairplot(pair_data, hue='Survived', diag_kind='hist', corner=True)
plt.show()