# Titanic Survival Prediction — Project Notebook

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.decomposition import PCA
from scipy import stats
import statsmodels.api as sm

In [None]:
# Load the Titanic training data, keep the target plus the four modelling
# columns, discard rows with any missing value among them, and encode Sex
# numerically (male -> 0, female -> 1).
df = (
    pd.read_csv("train.csv")
    .loc[:, ["Survived", "Pclass", "Sex", "Age", "Fare"]]
    .dropna()
    .assign(Sex=lambda frame: frame["Sex"].map({"male": 0, "female": 1}))
)
df.head()

In [None]:
# Two side-by-side panels: the Age distribution (with a KDE overlay) on the
# left and Fare grouped by survival outcome on the right.
fig, (age_ax, fare_ax) = plt.subplots(1, 2, figsize=(10, 4))

sns.histplot(df["Age"], kde=True, ax=age_ax)
age_ax.set_title("Histogram: Age")

sns.boxplot(x="Survived", y="Fare", data=df, ax=fare_ax)
fare_ax.set_title("Boxplot: Fare vs Survival")

fig.tight_layout()
plt.show()

In [None]:
# Visual normality check for Age: sample quantiles against a standardized
# reference line (line='s').
age_values = df["Age"]
qq_fig = sm.qqplot(age_values, line='s')
qq_fig.axes[0].set_title("QQ-Plot: Age")
plt.show()

# Shapiro-Wilk test on the same column; only the p-value is reported.
stat, p = stats.shapiro(age_values)
print(f"Shapiro-Wilk p-value: {p:.6f}")


In [None]:
# Feature matrix and target. The target is kept as a column vector because
# the custom logistic regression below works with (n_samples, 1) labels.
X = df[["Pclass", "Sex", "Age", "Fare"]].to_numpy()
y = df["Survived"].to_numpy().reshape(-1, 1)

# Standardise every feature before projecting.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Project the standardised features onto the first two principal components
# and colour each passenger by survival outcome.
X_pca = PCA(n_components=2).fit_transform(X_scaled)

fig, ax = plt.subplots(figsize=(6, 5))
ax.scatter(X_pca[:, 0], X_pca[:, 1], c=y.ravel(), cmap="coolwarm", alpha=0.6)
ax.set_title("PCA 2D Projection")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.grid(True)
plt.show()

In [None]:
class LogisticRegressionCustom:
    """Binary logistic regression trained with full-batch gradient descent.

    Parameters
    ----------
    lr : float
        Step size applied to every gradient update.
    epochs : int
        Number of full passes over the training data.

    Attributes
    ----------
    weights : numpy.ndarray of shape (n_features, 1)
        Learned coefficients, available after ``fit``.
    bias : float
        Learned intercept, available after ``fit``.
    """

    def __init__(self, lr=0.01, epochs=1000):
        self.lr = lr
        self.epochs = epochs

    def sigmoid(self, z):
        """Return the logistic function ``1 / (1 + exp(-z))`` element-wise.

        The value is computed from ``exp(-|z|)``, which always lies in
        (0, 1], so inputs of large magnitude cannot overflow.
        """
        z = np.asarray(z, dtype=float)
        decay = np.exp(-np.abs(z))
        # For z >= 0 this is the textbook form; for z < 0 it is the
        # algebraically equivalent exp(z) / (1 + exp(z)).
        return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def fit(self, X, y):
        """Learn ``weights`` and ``bias`` from the training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.
        y : array-like of shape (n_samples,) or (n_samples, 1)
            Binary labels (0 or 1).

        Raises
        ------
        ValueError
            If ``X`` is not a non-empty 2-D array or ``y`` does not contain
            exactly one label per row of ``X``.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2 or X.shape[0] == 0:
            raise ValueError("X must be a non-empty 2-D array")

        # Force a column vector: a 1-D y would broadcast `y_pred - y` to an
        # (n_samples, n_samples) matrix and break the weight update.
        y = np.asarray(y, dtype=float).reshape(-1, 1)
        if y.shape[0] != X.shape[0]:
            raise ValueError(
                f"X has {X.shape[0]} rows but y has {y.shape[0]} labels"
            )

        self.m, self.n = X.shape
        self.weights = np.zeros((self.n, 1))
        self.bias = 0.0

        for _ in range(self.epochs):
            y_pred = self.sigmoid(np.dot(X, self.weights) + self.bias)
            residual = y_pred - y

            # Gradients of the mean log-loss w.r.t. weights and bias.
            dw = np.dot(X.T, residual) / self.m
            db = np.sum(residual) / self.m

            self.weights -= self.lr * dw
            self.bias -= self.lr * db

    def predict(self, X):
        """Return the predicted probability of class 1 for each row of ``X``.

        The result has shape (n_samples, 1); use ``predict_classes`` for
        hard 0/1 labels.
        """
        linear_model = np.dot(np.asarray(X, dtype=float), self.weights) + self.bias
        return self.sigmoid(linear_model)

    def predict_classes(self, X, threshold=0.5):
        """Return 0/1 labels: 1 where the predicted probability >= ``threshold``."""
        return (self.predict(X) >= threshold).astype(int)

In [None]:
# Hold out 20% of the rows for evaluation. The split is made on the raw
# feature matrix so the scaler can be fitted on the training rows only;
# fitting it on every row (as X_scaled was for the PCA plot) would leak
# test-set statistics into training. A separate scaler object is used so the
# `scaler` fitted for the PCA cell is left untouched.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
eval_scaler = StandardScaler()
X_train = eval_scaler.fit_transform(X_train)
X_test = eval_scaler.transform(X_test)

model = LogisticRegressionCustom()
model.fit(X_train, y_train)
y_pred_proba = model.predict(X_test)    # probabilities of class 1
y_pred = model.predict_classes(X_test)  # hard 0/1 labels at threshold 0.5

# Classification metrics on the hard labels.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

# NOTE(review): the three metrics below are regression metrics applied to
# 0/1 labels. MAE here equals the misclassification rate. MAPE divides by the
# true value, so rows whose true label is 0 make it arbitrarily large; treat
# that number as uninformative.
print("R2 Score:", r2_score(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MAPE:", mean_absolute_percentage_error(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

In [None]:
# Reload the full dataset: the earlier cells reduced `df` to five columns and
# integer-encoded Sex, so the columns engineered below (Embarked, SibSp,
# Parch, ...) are no longer present in it.
df = pd.read_csv("train.csv")

# Impute missing values instead of dropping rows.
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Embarked'] = df['Embarked'].fillna('S')

# Family-based features: group size including the passenger, and a flag for
# passengers travelling alone.
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
df.drop(['Cabin', 'Name', 'Ticket', 'PassengerId'], axis=1, inplace=True)

# One-hot encode the categorical columns, dropping the first level of each.
df = pd.concat([
    df,
    pd.get_dummies(df['Sex'], drop_first=True),
    pd.get_dummies(df['Embarked'], drop_first=True),
    pd.get_dummies(df['Pclass'], prefix='Pclass', drop_first=True)
], axis=1)

df.drop(['Sex', 'Embarked', 'Pclass'], axis=1, inplace=True)

# get_dummies may produce boolean columns; cast so the feature matrix is a
# plain float array rather than an object array.
X = df.drop("Survived", axis=1).values.astype(float)
y = df["Survived"].values.reshape(-1, 1)

# Split first, then fit the scaler on the training rows only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Use the logistic regression class defined in this notebook. Its `fit` takes
# only the training data, and `predict` returns probabilities, so hard labels
# for the classification metrics come from `predict_classes`.
model = LogisticRegressionCustom(lr=0.1, epochs=1000)
model.fit(X_train, y_train)
y_pred = model.predict_classes(X_test)

print("Improved Accuracy:", accuracy_score(y_test, y_pred))
print("Improved Precision:", precision_score(y_test, y_pred))
print("Improved Recall:", recall_score(y_test, y_pred))
print("Improved F1 Score:", f1_score(y_test, y_pred))