In [7]:
import pandas as pd

# Load the Titanic dataset
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
titanic_data = pd.read_csv(url)

# Display the first few rows of the dataset
titanic_data.tail()
#titanic_data.info()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Select features and target variable
features = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked']
target = 'Survived'

X = titanic_data[features]
y = titanic_data[target]

# Handle missing values and encode categorical variables
numerical_features = ['Age', 'Fare']
categorical_features = ['Pclass', 'Sex', 'Embarked']

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply preprocessing
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)


In [16]:
print(X_train)

[[ 1.          1.2322632  -0.07868358 ...  0.          0.
   1.        ]
 [ 1.         -0.50048197 -0.37714494 ...  0.          0.
   1.        ]
 [ 1.          0.1926161  -0.47486697 ...  0.          0.
   1.        ]
 ...
 [ 1.          0.88571416 -0.35580399 ...  0.          0.
   1.        ]
 [ 1.         -1.19358003  1.68320121 ...  0.          0.
   1.        ]
 [ 1.         -0.65450376  0.86074761 ...  0.          0.
   1.        ]]


In [13]:
import numpy as np

# Initialize parameters
n_features = X_train.shape[1]
theta = np.zeros(n_features + 1)  # +1 for the intercept term

# Add intercept term to X
X_train = np.c_[np.ones(X_train.shape[0]), X_train]
X_test = np.c_[np.ones(X_test.shape[0]), X_test]

In [15]:
print(X_train)


[[ 1.          1.2322632  -0.07868358 ...  0.          0.
   1.        ]
 [ 1.         -0.50048197 -0.37714494 ...  0.          0.
   1.        ]
 [ 1.          0.1926161  -0.47486697 ...  0.          0.
   1.        ]
 ...
 [ 1.          0.88571416 -0.35580399 ...  0.          0.
   1.        ]
 [ 1.         -1.19358003  1.68320121 ...  0.          0.
   1.        ]
 [ 1.         -0.65450376  0.86074761 ...  0.          0.
   1.        ]]


In [17]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def compute_cost(X, y, theta):
    m = len(y)
    h = sigmoid(X.dot(theta))
    epsilon = 1e-5  # Small constant to prevent division by zero
    cost = -1/m * (y.dot(np.log(h + epsilon)) + (1 - y).dot(np.log(1 - h + epsilon)))
    return cost

In [18]:
def gradient_descent(X, y, theta, learning_rate, num_iterations):
    m = len(y)
    cost_history = np.zeros(num_iterations)

    for i in range(num_iterations):
        h = sigmoid(X.dot(theta))
        gradients = X.T.dot(h - y) / m
        theta = theta - learning_rate * gradients
        cost_history[i] = compute_cost(X, y, theta)

    return theta, cost_history

In [19]:
print(cost_history)

NameError: name 'cost_history' is not defined

In [21]:
# Define hyperparameters
learning_rate = 0.01
num_iterations = 1000

# Train the model
theta_optimal, cost_history = gradient_descent(X_train, y_train, theta, learning_rate, num_iterations)

print("Optimal parameters:", theta_optimal)
print("Cost history over iterations:", cost_history)

Optimal parameters: [-0.07090459 -0.15433616  0.32720924  0.23157942  0.14376593 -0.44624993
  0.68985525 -0.76075984  0.11954125 -0.00230822 -0.18813762]
Cost history over iterations: [0.69197511 0.69083386 0.68970332 0.68858335 0.68747383 0.68637464
 0.68528566 0.68420677 0.68313786 0.68207879 0.68102947 0.67998977
 0.67895959 0.6779388  0.67692731 0.675925   0.67493176 0.67394749
 0.67297208 0.67200543 0.67104744 0.670098   0.66915702 0.66822439
 0.66730002 0.66638382 0.66547568 0.66457551 0.66368321 0.66279871
 0.6619219  0.66105269 0.660191   0.65933674 0.65848983 0.65765017
 0.65681768 0.65599228 0.65517389 0.65436242 0.6535578  0.65275994
 0.65196877 0.65118421 0.65040618 0.64963461 0.64886942 0.64811054
 0.64735789 0.64661141 0.64587103 0.64513667 0.64440826 0.64368574
 0.64296903 0.64225808 0.64155281 0.64085317 0.64015908 0.63947048
 0.63878732 0.63810952 0.63743703 0.63676979 0.63610774 0.63545082
 0.63479897 0.63415213 0.63351025 0.63287328 0.63224115 0.63161381
 0.63099122

In [24]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict function
def predict(X, theta):
    probabilities = sigmoid(X.dot(theta))
    return probabilities >= 0.5

# Make predictions on the test set
y_pred = predict(X_test, theta_optimal)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)


print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

Accuracy: 0.7932960893854749
Confusion Matrix:
 [[93 12]
 [25 49]]
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.89      0.83       105
           1       0.80      0.66      0.73        74

    accuracy                           0.79       179
   macro avg       0.80      0.77      0.78       179
weighted avg       0.79      0.79      0.79       179

