In [15]:
import numpy as np

In [16]:
import pandas as pd

In [17]:
df = pd.read_csv("Data_Folder/titanic.csv")

In [18]:
df.columns = df.columns.str.strip()
df.columns = df.columns.str.lower()

In [54]:
df = df.drop(columns=['cabin'])

KeyError: "['cabin'] not found in axis"

In [None]:
# Filling the numerical gaps
median_age = df['age'].median()
df['age'] = df['age'].fillna(median_age)
# Droping missing values
df = df.dropna()

In [None]:
df['familysize'] = df['sibsp'] + df['parch']
df = df.drop(columns=['sibsp','parch'])

# 1.Binary Encoding for the sex column
df['sex'] = df['sex'].map({'male':0,'female':1})

# 2.One hot encoding for embarked column
df = pd.get_dummies(df,columns=['embarked'],drop_first=True)

In [None]:
cols_scale = ['age','fare','familysize']

for col in cols_scale:
    min_val = df[col].min()
    max_val = df[col].max()
    df[col] = (df[col] - min_val) / (max_val - min_val)

In [None]:
features = ['pclass','sex','age','fare','familysize','embarked_Q','embarked_S']
X = df[features].values
y = df['survived'].values

X = X.astype(float)
y = y.astype(float)

In [55]:
class MyLogisticRegression:
    def __init__(self,learning_rate=0.01,iterations=1000):
        self.lr = learning_rate
        self.iterations = iterations
        self.weights = None
        self.bias = None
        self.cost_history = []
    def _sigmoid(self,z):
        if not hasattr(np, "exp"):
            raise TypeError("np is not NumPy! Did you overwrite it?")
        return 1 / (1 + np.exp(-z))

    def fit(self,X,y):
        n_samples,n_features = X.shape
        y = y.reshape(-1,1)

        self.weights = np.zeros((n_features,1))
        self.bias = 0

        for i in range(self.iterations):
            linear_model = np.dot(X,self.weights) + self.bias

            # Applying sigmoid function
            y_pred = self._sigmoid(linear_model)

            # Cost Calculation (log loss)
            epsilon = 1e-15
            y_pred_clipped = np.clip(y_pred,epsilon,1-epsilon)
            cost = -(1/n_samples) * np.sum(y*np.log(y_pred_clipped)+(1-y) * np.log(1-y_pred_clipped))
            self.cost_history.append(cost)

            # Backward pass
            error = y_pred - y
            dw = (1/n_samples) * np.dot(X.T, error)
            db = (1/n_samples) * np.sum(error)

            # updating the parameters
            self.weights -= self.lr * dw
            self.bias -= self.lr * db

            if i % 100 == 0:
                print(f"Iter {i}: Cost {cost:.4f}")
    def predict(self, X):
        linear_model = np.dot(X, self.weights) + self.bias
        y_predicted = self._sigmoid(linear_model)
        # CHANGE 3: Convert Probability to Class (0 or 1)
        # If probability > 0.5, class is 1. Else 0.
        return [1 if i > 0.5 else 0 for i in y_predicted]            
        

In [56]:
split_idx = 700
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

In [57]:
print(f"Training on {len(X_train)} passengers...")

Training on 700 passengers...


Training the model

In [49]:
model = MyLogisticRegression(learning_rate=0.1,iterations=5000)
model.fit(X_train,y_train)

Iter 0: Cost 0.6931
Iter 100: Cost 0.5303
Iter 200: Cost 0.4922
Iter 300: Cost 0.4778
Iter 400: Cost 0.4715
Iter 500: Cost 0.4683
Iter 600: Cost 0.4665
Iter 700: Cost 0.4653
Iter 800: Cost 0.4644
Iter 900: Cost 0.4637
Iter 1000: Cost 0.4631
Iter 1100: Cost 0.4625
Iter 1200: Cost 0.4620
Iter 1300: Cost 0.4615
Iter 1400: Cost 0.4611
Iter 1500: Cost 0.4607
Iter 1600: Cost 0.4603
Iter 1700: Cost 0.4599
Iter 1800: Cost 0.4595
Iter 1900: Cost 0.4591
Iter 2000: Cost 0.4588
Iter 2100: Cost 0.4585
Iter 2200: Cost 0.4582
Iter 2300: Cost 0.4579
Iter 2400: Cost 0.4576
Iter 2500: Cost 0.4573
Iter 2600: Cost 0.4571
Iter 2700: Cost 0.4568
Iter 2800: Cost 0.4566
Iter 2900: Cost 0.4564
Iter 3000: Cost 0.4561
Iter 3100: Cost 0.4559
Iter 3200: Cost 0.4557
Iter 3300: Cost 0.4555
Iter 3400: Cost 0.4554
Iter 3500: Cost 0.4552
Iter 3600: Cost 0.4550
Iter 3700: Cost 0.4549
Iter 3800: Cost 0.4547
Iter 3900: Cost 0.4546
Iter 4000: Cost 0.4544
Iter 4100: Cost 0.4543
Iter 4200: Cost 0.4542
Iter 4300: Cost 0.4540


Evaluating the model



In [50]:
predictions = model.predict(X_test)

In [51]:
accuracy = np.mean(predictions == y_test.flatten())
print(f"\nAccuracy on Test Set: {accuracy * 100:.2f}%")


Accuracy on Test Set: 81.48%


In [52]:
from sklearn.linear_model import LogisticRegression
sk_model = LogisticRegression()
sk_model.fit(X_train, y_train.flatten())
sk_acc = sk_model.score(X_test, y_test.flatten())
print(f"Sklearn Accuracy: {sk_acc * 100:.2f}%")

if abs(accuracy - sk_acc) < 0.05:
    print("SUCCESS: You matched the industry standard.")
else:
    print("FAIL: Check your learning rate or normalization.")

Sklearn Accuracy: 81.48%
SUCCESS: You matched the industry standard.


In [53]:
print(model.weights)

[[-0.97441115]
 [ 2.6656903 ]
 [-1.69630815]
 [ 0.31456144]
 [-1.36099906]
 [ 0.07729933]
 [-0.33749188]]
