In [None]:
import numpy as np
import pandas as pd

# -----------------------------------------------------------------------------
# A. Data Loading and Exploration
# -----------------------------------------------------------------------------

# Read the training CSV from the current working directory into a DataFrame.
df = pd.read_csv("train.csv")

# Preview the top of the table (DataFrame.head() returns five rows by default).
preview = df.head()
print("First 5 rows:")
print(preview)

# Report the table dimensions and the column labels.
table_shape = df.shape
column_names = df.columns.tolist()
print("\nShape:", table_shape)
print("\nColumn names:", column_names)

# Count the missing entries in every column.
missing_per_column = df.isna().sum()
print("\nMissing values per column:")
print(missing_per_column)

# -----------------------------------------------------------------------------
# B. Data Cleaning
# -----------------------------------------------------------------------------

# Work out both fill values first: the mean of the observed ages (NaN entries
# are ignored by nanmean) and the most frequent Embarked value.
mean_age = np.nanmean(df["Age"])
most_freq_embarked = df["Embarked"].mode()[0]

# Fill each column with its own replacement value in a single pass.
df = df.fillna(value={"Age": mean_age, "Embarked": most_freq_embarked})

# Remove the Cabin column; errors="ignore" makes this a no-op if it is absent.
df = df.drop(columns=["Cabin"], errors="ignore")

# Re-count missing values to show the effect of the cleaning steps.
remaining_missing = df.isna().sum()
print("\nAfter cleaning, missing values:")
print(remaining_missing)

# -----------------------------------------------------------------------------
# C. Feature Engineering
# -----------------------------------------------------------------------------

# FamilySize: SibSp plus Parch, plus one (presumably the passenger themselves).
df["FamilySize"] = 1 + df["SibSp"] + df["Parch"]

# SexNum: numeric encoding of Sex; values outside the mapping become NaN.
sex_codes = {"male": 1, "female": 0}
df["SexNum"] = df["Sex"].map(sex_codes)

# Title: the text between the first comma and the following period in Name,
# e.g. "Mr" from "Braund, Mr. Owen Harris". expand=False yields a Series.
title_pattern = r",\s*([^\.]+)\."
df["Title"] = df["Name"].str.extract(title_pattern, expand=False)

# IsMinor: 1 when Age is strictly below 18, otherwise 0.
under_18 = df["Age"] < 18
df["IsMinor"] = under_18.astype(int)

print("\nFeature Engineering done. Columns now:")
print(df.columns.tolist())

# -----------------------------------------------------------------------------
# D. NumPy Practice
# -----------------------------------------------------------------------------

# Split the Fare column by survival outcome and average each group,
# ignoring NaN fares.
fares = df["Fare"]
survived_mask = df["Survived"] == 1
not_survived_mask = df["Survived"] == 0
avg_fare_survived = np.nanmean(fares[survived_mask])
avg_fare_not_survived = np.nanmean(fares[not_survived_mask])

print("\nAverage fare (Survived=1):", avg_fare_survived)
print("Average fare (Survived=0):", avg_fare_not_survived)

# NaN-ignoring standard deviation of Age (np.nanstd uses ddof=0 by default,
# i.e. the population standard deviation).
ages = df["Age"]
std_age = np.nanstd(ages)
print("Standard deviation of Age:", std_age)

# Count passengers whose Age lies in the closed interval [20, 40].
in_20_to_40 = ages.between(20, 40)
count_20_40 = np.sum(in_20_to_40)
print("Passengers with Age 20-40:", count_20_40)

# -----------------------------------------------------------------------------
# E. Analysis
# -----------------------------------------------------------------------------

# Mean of Survived within each Pclass group (the share of 1s when Survived
# holds only 0/1 values).
survival_by_pclass = df["Survived"].groupby(df["Pclass"]).mean()
print("\nAverage survival rate by Pclass:")
print(survival_by_pclass)

# Names on the three rows with the largest Fare; ties keep the earliest rows.
highest_fare_rows = df.nlargest(n=3, columns="Fare")
top_fares = highest_fare_rows["Name"]
print("\nTop 3 passengers by fare:")
print(top_fares)

# -----------------------------------------------------------------------------
# Save the cleaned dataset
# -----------------------------------------------------------------------------

# Write the DataFrame to disk; index=False leaves the row index out of the file.
cleaned_path = "titanic_cleaned.csv"
df.to_csv(cleaned_path, index=False)
print("\nCleaned dataset saved as titanic_cleaned.csv")


First 5 rows:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN