# Implementing Logistic Regression From Scratch on the Titanic Dataset

## Importing Libraries

In [17]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

## Loading the Data

In [18]:
# Read the Titanic train/test CSVs, then report (rows, columns) for each.
train_data = pd.read_csv("/content/train.csv")
test_data = pd.read_csv("/content/test.csv")
for frame in (train_data, test_data):
  print(frame.shape)

(891, 12)
(418, 11)


In [19]:
# Preview the first rows of the raw training frame to check how the columns parsed.
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [20]:
# Preview the raw test frame; it has the same columns as train minus "Survived".
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## Data Preprocessing

In [21]:
# NOTE: this cell rebinds train_data/test_data to feature-only frames, so it
# must be preceded by a fresh run of the loading cell when re-executed.

# Pull the labels out before the feature columns are selected below.
y_train = train_data["Survived"]

#important features
features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
# .copy() makes each frame own its data, so the column assignments in the loop
# are plain assignments rather than writes through a slice of the raw frame.
train_data = train_data[features].copy()
test_data = test_data[features].copy()

#mapping data and filling missing values
genders = {'male': 1, 'female': 0}
embark = {'S': 0, 'Q': 1, 'C': 2}
data = [train_data, test_data]
for d in data:
  d["Sex"] = d["Sex"].map(genders)
  d["Embarked"] = d["Embarked"].map(embark)
  # Assign the filled column back instead of d[col].fillna(..., inplace=True):
  # the in-place form acts on a temporary under pandas Copy-on-Write and
  # leaves the frame unchanged.
  d["Age"] = d["Age"].fillna(d["Age"].mean())
  d["Fare"] = d["Fare"].fillna(d["Fare"].mean())
  # Embarked is a nominal code (0/1/2); the mean of the codes is not a valid
  # category, so missing ports are filled with the most frequent one.
  d["Embarked"] = d["Embarked"].fillna(d["Embarked"].mode().iloc[0])

In [22]:
# Confirm the preprocessing: only the selected features remain and Sex/Embarked are numeric.
train_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,7.25,0.0
1,1,0,38.0,1,0,71.2833,2.0
2,3,0,26.0,0,0,7.925,0.0
3,1,0,35.0,1,0,53.1,0.0
4,3,1,35.0,0,0,8.05,0.0


In [23]:
# Hold out 10% of the labelled rows for validation. The fixed random_state
# keeps the shuffled split reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    train_data,
    y_train,
    test_size=0.1,
    random_state=42,
    shuffle=True,
)

## Sigmoid Function

In [24]:
def sigmoid(x):
  """Element-wise logistic function 1 / (1 + exp(-x)).

  Uses a numerically stable formulation: only exp(-|x|) is evaluated, which
  lies in (0, 1], so large-magnitude inputs saturate to exactly 0 or 1
  instead of overflowing inside np.exp.

  Args:
    x: scalar or array-like of real numbers.

  Returns:
    A float for scalar input, otherwise a float ndarray with the shape of x.
  """
  x = np.asarray(x, dtype=float)
  decay = np.exp(-np.abs(x))
  # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x). Same value, no overflow.
  out = np.where(x >= 0, 1. / (1. + decay), decay / (1. + decay))
  # Indexing with () unwraps a 0-d result to a scalar and is a no-op otherwise.
  return out[()]

## Forward Propagation (Cost and Gradients)

In [25]:
def forward_propagate(X, y, W, b):
  """Compute the logistic-regression cost and its gradients for one pass.

  Args:
    X: feature matrix with one sample per column (m = X.shape[1] samples).
    y: labels broadcastable against the activations; this notebook passes
      shape (1, m).
    W: weight column vector, multiplied as W.T @ X.
    b: bias added to every sample's linear score.

  Returns:
    (grads, cost), where grads is {"dW": ..., "db": ...} holding the mean
    gradients of the cost with respect to W and b, and cost is the mean
    binary cross-entropy over the m samples.
  """
  m = X.shape[1]

  z = np.dot(W.T, X) + b
  a = sigmoid(z)

  # Clip only inside the logs: a saturated activation (exactly 0 or 1) would
  # otherwise produce log(0) = -inf and turn the cost into inf/nan. The
  # gradients below keep using the unclipped activations.
  eps = 1e-15
  a_safe = np.clip(a, eps, 1 - eps)
  cost = -(1/m) * np.sum(y*np.log(a_safe) + (1-y)*np.log(1-a_safe))

  dW = (1/m) * np.dot(X, (a-y).T)
  db = (1/m) * np.sum(a-y)

  cost = np.squeeze(cost)

  grads = {"dW": dW,
           "db": db}

  return grads, cost

## Vanilla Gradient Descent Algorithm

In [26]:
def gradientDescent(X, y, W, b, num_iter, learning_rate):
  """Fit W and b with full-batch gradient descent.

  Args:
    X, y: training inputs and labels, forwarded to forward_propagate.
    W, b: initial weights and bias. They are rebound, not modified in place.
    num_iter: number of update steps to run.
    learning_rate: step size applied to each gradient.

  Returns:
    (params, grads, costs):
      params: {"W": ..., "b": ...} after the last update.
      grads: {"dW": ..., "db": ...} from the last iteration; both values are
        None when num_iter is 0.
      costs: the cost recorded at iterations 0, 100, 200, ...
  """
  costs = []
  # Bound up front so the return below is valid even if the loop never runs.
  dW, db = None, None
  for i in range(num_iter):

    grads, cost = forward_propagate(X, y, W, b)

    dW = grads["dW"]
    db = grads["db"]

    W = W - learning_rate * dW
    b = b - learning_rate * db

    # Sample the cost every 100 steps to keep the history short.
    if i % 100 == 0:
      costs.append(cost)

  params = {"W": W,
            "b": b}

  grads = {"dW": dW,
           "db": db}

  return params, grads, costs

## Predicting the Values

In [27]:
def predict(X, W, b):
  """Classify each column of X as 0 or 1 using the learned parameters.

  Args:
    X: feature matrix with one sample per column.
    W: weights; reshaped to a (X.shape[0], 1) column before use.
    b: bias term.

  Returns:
    Float array of shape (1, X.shape[1]) holding 1 where the sigmoid
    activation is at least 0.5 and 0 elsewhere.
  """
  weights = W.reshape((X.shape[0], 1))
  activations = sigmoid(np.dot(weights.T, X) + b)

  # Start from all zeros and switch on the samples that reach the threshold.
  y_preds = np.zeros((1, X.shape[1]))
  y_preds[0, activations[0] >= 0.5] = 1

  return y_preds

## Final Compilation of the Whole Model

In [28]:
def model(X_train, y_train, X_test, y_test, num_iter=1000, learning_rate=0.01):
  """Train logistic regression from zero-initialised parameters and score it.

  Args:
    X_train, y_train: training features (one sample per column) and labels.
    X_test, y_test: evaluation features and labels in the same layout.
    num_iter: number of gradient-descent steps.
    learning_rate: gradient-descent step size.

  Returns:
    Dict with the recorded costs, the train/test predictions, the learned
    "W" and "b", and the hyper-parameters used.

  Prints the train and test accuracy as percentages. The "test" figure is
  computed on whatever X_test/y_test the caller supplies.
  """
  initial_W = np.zeros((X_train.shape[0], 1))
  params, _, costs = gradientDescent(X_train, y_train, initial_W, 0, num_iter, learning_rate)
  W, b = params["W"], params["b"]

  y_preds_train = predict(X_train, W, b)
  y_preds_test = predict(X_test, W, b)

  # Labels and predictions are 0/1, so the mean absolute difference is the
  # misclassification rate.
  train_error = np.mean(np.abs(y_preds_train - y_train))
  test_error = np.mean(np.abs(y_preds_test - y_test))
  print("train accuracy: {} %".format(100 - train_error * 100))
  print("test accuracy: {} %".format(100 - test_error * 100))

  return {
      "costs": costs,
      "y_preds_train": y_preds_train,
      "y_preds_test": y_preds_test,
      "W": W,
      "b": b,
      "learning_rate": learning_rate,
      "num_iterations": num_iter,
  }

In [29]:
# The from-scratch functions expect one sample per column, so the feature
# frames are transposed and the label vectors become single-row matrices.
X_train, X_val = (np.array(frame).T for frame in (X_train, X_val))
y_train, y_val = (
    np.array(labels).reshape(len(labels), 1).T for labels in (y_train, y_val)
)

X_train.shape

(7, 801)

## Accuracy of the Model (Train vs. Held-Out Validation Split)

In [31]:
# Train on the 90% split and score on the held-out validation split; the
# validation score is printed under the "test accuracy" label.
d = model(X_train, y_train, X_val, y_val, num_iter=5000, learning_rate=0.001)

train accuracy: 70.78651685393258 %
test accuracy: 76.66666666666667 %
